eval-toolkit 0.27.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eval_toolkit/__init__.py +238 -0
- eval_toolkit/__main__.py +156 -0
- eval_toolkit/_version.py +5 -0
- eval_toolkit/analysis.py +196 -0
- eval_toolkit/artifacts.py +376 -0
- eval_toolkit/bootstrap.py +1344 -0
- eval_toolkit/calibration.py +1143 -0
- eval_toolkit/claims.py +670 -0
- eval_toolkit/config.py +112 -0
- eval_toolkit/docs.py +305 -0
- eval_toolkit/evidence.py +90 -0
- eval_toolkit/harness.py +1193 -0
- eval_toolkit/leakage.py +1052 -0
- eval_toolkit/loaders.py +424 -0
- eval_toolkit/manifest.py +622 -0
- eval_toolkit/metrics.py +1720 -0
- eval_toolkit/operating_points.py +192 -0
- eval_toolkit/paths.py +125 -0
- eval_toolkit/plotting.py +991 -0
- eval_toolkit/protocols.py +98 -0
- eval_toolkit/provenance.py +255 -0
- eval_toolkit/py.typed +0 -0
- eval_toolkit/schemas/manifest.v1.json +155 -0
- eval_toolkit/schemas/manifest.v2.json +186 -0
- eval_toolkit/schemas/manifest.v3.json +186 -0
- eval_toolkit/schemas/results.v1.json +87 -0
- eval_toolkit/schemas/results_full.v1.json +83 -0
- eval_toolkit/seeds.py +119 -0
- eval_toolkit/splits.py +520 -0
- eval_toolkit/text_dedup.py +1403 -0
- eval_toolkit/thresholds.py +819 -0
- eval_toolkit-0.27.1.dist-info/METADATA +314 -0
- eval_toolkit-0.27.1.dist-info/RECORD +36 -0
- eval_toolkit-0.27.1.dist-info/WHEEL +4 -0
- eval_toolkit-0.27.1.dist-info/entry_points.txt +2 -0
- eval_toolkit-0.27.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,1143 @@
|
|
|
1
|
+
r"""Calibration: reliability curves, Bayes-optimal thresholds, isotonic/Platt/temperature scaling.
|
|
2
|
+
|
|
3
|
+
Public surface:
|
|
4
|
+
|
|
5
|
+
- :func:`reliability_curve` — bin-level calibration data
|
|
6
|
+
(DeGroot & Fienberg 1983 [#degroot]_; Niculescu-Mizil & Caruana 2005 [#nm05]_)
|
|
7
|
+
- :func:`maximum_calibration_error` — worst-bin calibration gap
|
|
8
|
+
(Naeini & Cooper 2014 [#mce]_); companion scalar to the ECE summaries
|
|
9
|
+
surfaced inside :func:`reliability_curve`.
|
|
10
|
+
- :func:`bayes_optimal_threshold` — closed-form cost-sensitive decision boundary
|
|
11
|
+
(Elkan 2001 [#elkan]_); :class:`CostMatrix` packages prior + costs + abstain cost.
|
|
12
|
+
- :func:`fit_isotonic_calibrator` — Niculescu-Mizil & Caruana 2005 [#nm05]_
|
|
13
|
+
- :func:`fit_platt_calibrator` — Platt 1999 [#platt]_ sigmoid scaling; returns a
|
|
14
|
+
:class:`PlattFit` dataclass exposing the fitted ``(a, b)`` parameters alongside
|
|
15
|
+
the transform callable (frozen, ``__call__``-able for back-compat with v0.11).
|
|
16
|
+
- :func:`fit_temperature` — Guo et al. 2017 [#guo]_ — fits T on val *logits* (literature standard)
|
|
17
|
+
- :func:`fit_temperature_oracle` — Guo et al. 2017 [#guo]_ — fits T on *probabilities*; diagnostic
|
|
18
|
+
upper-bound only (T is fit on the data it then scores).
|
|
19
|
+
|
|
20
|
+
References
|
|
21
|
+
----------
|
|
22
|
+
.. [#degroot] DeGroot, M. H. & Fienberg, S. E. "The Comparison and Evaluation of Forecasters."
|
|
23
|
+
*The Statistician* 32 (1/2): 12-22, 1983.
|
|
24
|
+
.. [#elkan] Elkan, C. "The Foundations of Cost-Sensitive Learning." IJCAI 2001.
|
|
25
|
+
.. [#guo] Guo, C., Pleiss, G., Sun, Y. & Weinberger, K. "On Calibration of Modern Neural Networks."
|
|
26
|
+
ICML 2017. arXiv:1706.04599.
|
|
27
|
+
.. [#mce] Naeini, M. P. & Cooper, G. F. "Binary Classifier Calibration: A Bayesian Non-Parametric
|
|
28
|
+
Approach." SDM 2014.
|
|
29
|
+
.. [#nm05] Niculescu-Mizil, A. & Caruana, R. "Predicting Good Probabilities With Supervised
|
|
30
|
+
Learning." ICML 2005.
|
|
31
|
+
.. [#platt] Platt, J. "Probabilistic Outputs for Support Vector Machines and Comparisons to
|
|
32
|
+
Regularized Likelihood Methods." *Advances in Large Margin Classifiers*, 1999.
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
from __future__ import annotations
|
|
36
|
+
|
|
37
|
+
from collections.abc import Callable
|
|
38
|
+
from dataclasses import dataclass
|
|
39
|
+
from typing import Final, Literal
|
|
40
|
+
|
|
41
|
+
import numpy as np
|
|
42
|
+
from scipy.optimize import minimize, minimize_scalar
|
|
43
|
+
from scipy.special import log_softmax
|
|
44
|
+
from sklearn.calibration import calibration_curve
|
|
45
|
+
from sklearn.isotonic import IsotonicRegression
|
|
46
|
+
|
|
47
|
+
__all__ = [
|
|
48
|
+
"DEFAULT_FN_COST",
|
|
49
|
+
"DEFAULT_FP_COST",
|
|
50
|
+
"DEFAULT_N_BINS",
|
|
51
|
+
"DEFAULT_PRIOR",
|
|
52
|
+
"DEFAULT_STRATEGY",
|
|
53
|
+
"CostMatrix",
|
|
54
|
+
"PlattFit",
|
|
55
|
+
"bayes_optimal_threshold",
|
|
56
|
+
"fit_beta_calibrator",
|
|
57
|
+
"fit_isotonic_calibrator",
|
|
58
|
+
"fit_platt_calibrator",
|
|
59
|
+
"fit_temperature",
|
|
60
|
+
"fit_temperature_oracle",
|
|
61
|
+
"maximum_calibration_error",
|
|
62
|
+
"reliability_curve",
|
|
63
|
+
"reliability_diagram_data",
|
|
64
|
+
]
|
|
65
|
+
|
|
66
|
+
# Binning defaults shared by the reliability / calibration-error helpers below.
DEFAULT_N_BINS: Final[int] = 10
DEFAULT_STRATEGY: Final[Literal["uniform", "quantile"]] = "quantile"

# Illustrative cost-matrix defaults for a rare-positive deployment surface.
# They are scaffolding only: a real cost matrix should be obtained through
# stakeholder elicitation rather than taken from library defaults.
DEFAULT_PRIOR: Final[float] = 0.01
DEFAULT_FP_COST: Final[float] = 1.0
DEFAULT_FN_COST: Final[float] = 10.0
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def reliability_curve(
    y_true: np.ndarray,
    y_score: np.ndarray,
    *,
    n_bins: int = DEFAULT_N_BINS,
    strategy: Literal["uniform", "quantile"] = DEFAULT_STRATEGY,
) -> dict[str, object]:
    """Bin-level calibration data wrapping :func:`sklearn.calibration.calibration_curve`.

    Returns a JSON-friendly dict with per-bin mean predicted probabilities,
    observed positive rates, bin edges, per-bin counts, and both equal-width
    and equal-mass ECE summaries. Single-class slices are skipped with an
    explicit marker instead of raising.

    Parameters
    ----------
    y_true : np.ndarray, shape (n,)
        Binary labels in {0, 1}.
    y_score : np.ndarray, shape (n,)
        Predicted probabilities in [0, 1].
    n_bins : int, optional
        Number of bins (default 10).
    strategy : {"uniform", "quantile"}, optional
        Equal-width vs equal-mass binning. Default "quantile".

    Returns
    -------
    dict
        Either the calibration record with keys ``prob_true``, ``prob_pred``,
        ``bin_edges``, ``n_per_bin``, ``ece_equal_mass``, ``ece_equal_width``,
        ``n_bins``, ``strategy``, ``n``, ``n_positive``,
        or ``{"skipped": "...", "n", "n_positive"}`` for a single-class slice.

        ``bin_edges`` has ``n_bins + 1`` entries and ``n_per_bin`` has one
        entry per bin (empty bins included, with count 0). ``prob_true`` and
        ``prob_pred`` come straight from ``calibration_curve`` and may be
        shorter than ``n_bins``.

    Raises
    ------
    ValueError
        On shape mismatch, empty input, ``n_bins <= 1``, or unknown strategy.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(42)
    >>> y = rng.integers(0, 2, size=200)
    >>> s = (y + rng.normal(0, 0.5, size=200)).clip(0, 1)
    >>> result = reliability_curve(y, s, n_bins=5, strategy="uniform")
    >>> sorted(result.keys())[:5]
    ['bin_edges', 'ece_equal_mass', 'ece_equal_width', 'n', 'n_bins']
    """
    y_true_arr = np.asarray(y_true).astype(int)
    y_score_arr = np.asarray(y_score).astype(float)
    if y_true_arr.shape != y_score_arr.shape:
        raise ValueError(f"shape mismatch: y_true {y_true_arr.shape}, y_score {y_score_arr.shape}")
    if y_true_arr.size == 0:
        raise ValueError("y_true is empty")
    if n_bins <= 1:
        raise ValueError(f"n_bins must be > 1, got {n_bins}")
    if strategy not in {"uniform", "quantile"}:
        raise ValueError(f"strategy must be 'uniform' or 'quantile', got {strategy!r}")

    n = int(y_true_arr.size)
    n_positive = int(y_true_arr.sum())
    if n_positive == 0 or n_positive == n:
        # With one class absent there is nothing to calibrate against.
        return {
            "skipped": (
                "single-class slice; calibration is degenerate (per-bin observed "
                "rates are constant 0 or 1)."
            ),
            "n": n,
            "n_positive": n_positive,
        }

    prob_true, prob_pred = calibration_curve(
        y_true_arr, y_score_arr, n_bins=n_bins, strategy=strategy
    )

    if strategy == "uniform":
        bin_edges = np.linspace(0.0, 1.0, n_bins + 1)
    else:
        bin_edges = np.quantile(y_score_arr, np.linspace(0.0, 1.0, n_bins + 1))

    # Count samples with the bin-assignment rule calibration_curve is understood
    # to use (searchsorted on the interior edges, i.e. right-closed bins), so
    # that every count describes the same samples as its prob_true/prob_pred
    # entry. np.histogram uses left-closed bins and puts scores that sit exactly
    # on an edge (routine for quantile edges) into the neighbouring bin.
    # NOTE(review): confirm the rule against the pinned scikit-learn version.
    bin_ids = np.searchsorted(bin_edges[1:-1], y_score_arr.ravel())
    n_per_bin = np.bincount(bin_ids, minlength=n_bins)

    ece_equal_mass = _ece_via_calibration_curve(y_true_arr, y_score_arr, n_bins, "quantile")
    ece_equal_width = _ece_via_calibration_curve(y_true_arr, y_score_arr, n_bins, "uniform")

    return {
        "n": n,
        "n_positive": n_positive,
        "n_bins": int(n_bins),
        "strategy": strategy,
        "prob_true": [float(x) for x in prob_true],
        "prob_pred": [float(x) for x in prob_pred],
        "bin_edges": [float(x) for x in bin_edges],
        "n_per_bin": [int(x) for x in n_per_bin],
        "ece_equal_mass": float(ece_equal_mass),
        "ece_equal_width": float(ece_equal_width),
    }
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def reliability_diagram_data(
|
|
175
|
+
y_true: np.ndarray,
|
|
176
|
+
y_score: np.ndarray,
|
|
177
|
+
*,
|
|
178
|
+
n_bins: int = 10,
|
|
179
|
+
strategy: Literal["uniform", "quantile"] = "quantile",
|
|
180
|
+
) -> list[dict[str, float | int]]:
|
|
181
|
+
"""Structured per-bin reliability rows for serialization or plotting.
|
|
182
|
+
|
|
183
|
+
Wraps :func:`reliability_curve` and reshapes its outputs into a list
|
|
184
|
+
of bin records suitable for direct parquet / JSON serialization or
|
|
185
|
+
for handing to :func:`eval_toolkit.plotting.plot_reliability_diagram`.
|
|
186
|
+
|
|
187
|
+
Schema (each dict):
|
|
188
|
+
- ``bin_lower``, ``bin_upper`` — bin edges (float).
|
|
189
|
+
- ``mean_pred`` — mean predicted probability inside the bin.
|
|
190
|
+
- ``frac_positive`` — fraction of positives inside the bin.
|
|
191
|
+
- ``n`` — number of rows in the bin (int).
|
|
192
|
+
|
|
193
|
+
Matplotlib is *not* required: the helper lives in
|
|
194
|
+
:mod:`eval_toolkit.calibration` so it can be imported by serializing
|
|
195
|
+
callers that don't pull in plotting deps.
|
|
196
|
+
|
|
197
|
+
Parameters
|
|
198
|
+
----------
|
|
199
|
+
y_true, y_score : np.ndarray
|
|
200
|
+
Binary labels and predicted probabilities.
|
|
201
|
+
n_bins : int, optional
|
|
202
|
+
Number of bins. Default 10.
|
|
203
|
+
strategy : {"uniform", "quantile"}, optional
|
|
204
|
+
Quantile (equal-mass; default) or uniform (equal-width).
|
|
205
|
+
|
|
206
|
+
Returns
|
|
207
|
+
-------
|
|
208
|
+
list[dict[str, float | int]]
|
|
209
|
+
Empty list for degenerate slices (single-class or empty;
|
|
210
|
+
:func:`reliability_curve` returns a ``skipped`` sentinel which
|
|
211
|
+
this helper flattens to ``[]``).
|
|
212
|
+
|
|
213
|
+
Examples
|
|
214
|
+
--------
|
|
215
|
+
>>> import numpy as np
|
|
216
|
+
>>> rng = np.random.default_rng(0)
|
|
217
|
+
>>> y = rng.integers(0, 2, size=200)
|
|
218
|
+
>>> s = rng.uniform(0, 1, size=200)
|
|
219
|
+
>>> rows = reliability_diagram_data(y, s, n_bins=5)
|
|
220
|
+
>>> sorted(rows[0].keys())
|
|
221
|
+
['bin_lower', 'bin_upper', 'frac_positive', 'mean_pred', 'n']
|
|
222
|
+
"""
|
|
223
|
+
if len(y_true) == 0 or len(np.unique(y_true)) < 2:
|
|
224
|
+
return []
|
|
225
|
+
rc = reliability_curve(y_true, y_score, n_bins=n_bins, strategy=strategy)
|
|
226
|
+
if "skipped" in rc:
|
|
227
|
+
return []
|
|
228
|
+
prob_true = np.asarray(rc["prob_true"])
|
|
229
|
+
prob_pred = np.asarray(rc["prob_pred"])
|
|
230
|
+
bin_edges = np.asarray(rc["bin_edges"])
|
|
231
|
+
n_per_bin = np.asarray(rc["n_per_bin"])
|
|
232
|
+
rows: list[dict[str, float | int]] = []
|
|
233
|
+
for i in range(len(prob_true)):
|
|
234
|
+
lo = float(bin_edges[i])
|
|
235
|
+
hi = float(bin_edges[i + 1]) if i + 1 < len(bin_edges) else float(bin_edges[-1])
|
|
236
|
+
rows.append(
|
|
237
|
+
{
|
|
238
|
+
"bin_lower": lo,
|
|
239
|
+
"bin_upper": hi,
|
|
240
|
+
"mean_pred": float(prob_pred[i]),
|
|
241
|
+
"frac_positive": float(prob_true[i]),
|
|
242
|
+
"n": int(n_per_bin[i]),
|
|
243
|
+
}
|
|
244
|
+
)
|
|
245
|
+
return rows
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def _ece_via_calibration_curve(
    y_true: np.ndarray,
    y_score: np.ndarray,
    n_bins: int,
    strategy: Literal["uniform", "quantile"],
) -> float:
    """ECE computed via sklearn's ``calibration_curve`` (handles empty bins).

    The per-bin gaps ``|prob_true - prob_pred|`` are averaged with weights
    proportional to the number of samples in each non-empty bin.

    Used internally by :func:`reliability_curve`. For metric-only ECE in
    bootstrap contexts, use ``eval_toolkit.metrics.expected_calibration_error``.

    Parameters
    ----------
    y_true : np.ndarray
        Binary labels, passed through to ``calibration_curve``.
    y_score : np.ndarray
        Predicted probabilities, passed through to ``calibration_curve``.
    n_bins : int
        Number of bins.
    strategy : {"uniform", "quantile"}
        Equal-width vs equal-mass binning.

    Returns
    -------
    float
        Weighted mean absolute calibration gap.
    """
    prob_true, prob_pred = calibration_curve(y_true, y_score, n_bins=n_bins, strategy=strategy)
    scores = np.asarray(y_score, dtype=float).ravel()
    if strategy == "uniform":
        bin_edges = np.linspace(0.0, 1.0, n_bins + 1)
    else:
        bin_edges = np.quantile(scores, np.linspace(0.0, 1.0, n_bins + 1))

    # Count samples with the bin-assignment rule calibration_curve is understood
    # to use (searchsorted on the interior edges), so each weight belongs to the
    # same samples as its gap. np.histogram uses left-closed bins and disagrees
    # for scores that lie exactly on an edge.
    # NOTE(review): confirm the rule against the pinned scikit-learn version.
    bin_ids = np.searchsorted(bin_edges[1:-1], scores)
    counts = np.bincount(bin_ids, minlength=n_bins)
    bin_mass = counts[counts > 0].astype(float)

    if len(bin_mass) != len(prob_true):
        # Defensive fallback: cannot attribute counts to curve points, so weight
        # every reported bin equally.
        bin_mass = np.full(len(prob_true), scores.size / max(len(prob_true), 1))

    total = float(bin_mass.sum())
    if total <= 0.0:
        return 0.0
    weights = bin_mass / total
    return float((weights * np.abs(prob_true - prob_pred)).sum())
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def maximum_calibration_error(
    y_true: np.ndarray,
    y_score: np.ndarray,
    *,
    n_bins: int = DEFAULT_N_BINS,
    strategy: Literal["uniform", "quantile"] = DEFAULT_STRATEGY,
) -> float | None:
    r"""Maximum Calibration Error — largest per-bin |observed_rate − mean_predicted|.

    ECE reports the *weighted-average* calibration gap across bins; MCE
    reports the gap of the single *worst* bin, so a model whose average looks
    fine but which has one badly calibrated bin is still flagged
    (Naeini & Cooper 2014 [#mce]_).

    When only one class is present the function returns ``None``: per-bin
    observed rates are then constant 0 or 1 and calibration is degenerate.

    Parameters
    ----------
    y_true : np.ndarray, shape (n,)
        Binary labels in {0, 1}.
    y_score : np.ndarray, shape (n,)
        Predicted probabilities in [0, 1].
    n_bins : int, optional
        Number of bins (default 10).
    strategy : {"uniform", "quantile"}, optional
        Equal-width vs equal-mass binning. Default "quantile" (matches
        :func:`reliability_curve` and yields more robust per-bin estimates on
        imbalanced score distributions).

    Returns
    -------
    float | None
        Worst-bin calibration gap in [0, 1], or ``None`` for single-class
        slices.

    Raises
    ------
    ValueError
        On shape mismatch, empty input, ``n_bins <= 1``, or unknown strategy.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(42)
    >>> y = rng.integers(0, 2, size=200)
    >>> s = (y + rng.normal(0, 0.5, size=200)).clip(0, 1)
    >>> mce = maximum_calibration_error(y, s, n_bins=5, strategy="quantile")
    >>> 0.0 <= mce <= 1.0
    True
    """
    labels = np.asarray(y_true).astype(int)
    scores = np.asarray(y_score).astype(float)

    # Argument validation, in the same order as reliability_curve.
    if labels.shape != scores.shape:
        raise ValueError(f"shape mismatch: y_true {labels.shape}, y_score {scores.shape}")
    if labels.size == 0:
        raise ValueError("y_true is empty")
    if n_bins <= 1:
        raise ValueError(f"n_bins must be > 1, got {n_bins}")
    allowed_strategies = {"uniform", "quantile"}
    if strategy not in allowed_strategies:
        raise ValueError(f"strategy must be 'uniform' or 'quantile', got {strategy!r}")

    # Degenerate slice: all negatives or all positives.
    positives = int(labels.sum())
    if positives in (0, labels.size):
        return None

    observed, predicted = calibration_curve(
        labels, scores, n_bins=n_bins, strategy=strategy
    )
    if not len(observed):
        return None
    gaps = np.abs(observed - predicted)
    return float(gaps.max())
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def bayes_optimal_threshold(π: float, c_fp: float, c_fn: float) -> float:
    r"""Bayes-optimal threshold per Elkan 2001 [#elkan]_ cost-sensitive derivation.

    For a calibrated probabilistic classifier P(y=1 | x), the cost-minimizing
    decision rule is "predict 1 iff score ≥ t*" with:

    .. math:: t^* = \frac{c_{FP} \cdot (1 - π)}{c_{FP} \cdot (1 - π) + c_{FN} \cdot π}

    Parameters
    ----------
    π : float
        Deployment positive-class prior P(y=1) ∈ [0, 1].
        (π = empirical positive prior; English alias on first appearance per the
        Unicode-identifier convention in STYLE.md.)
    c_fp : float
        Cost of a false positive. Must be > 0 (NaN is rejected).
    c_fn : float
        Cost of a false negative. Must be > 0 (NaN is rejected).

    Returns
    -------
    float
        Optimal threshold ∈ [0, 1].

    Raises
    ------
    ValueError
        If π is outside [0, 1] (including NaN) or a cost is NaN or non-positive.

    Examples
    --------
    Symmetric costs at prior=0.5: threshold should equal the prior.

    >>> bayes_optimal_threshold(0.5, c_fp=1.0, c_fn=1.0)
    0.5

    Rare-positive case with FN 10× more expensive than FP:

    >>> round(bayes_optimal_threshold(0.01, c_fp=1.0, c_fn=10.0), 4)
    0.9083

    Edge cases:

    >>> bayes_optimal_threshold(0.0, c_fp=1.0, c_fn=1.0)
    1.0
    >>> bayes_optimal_threshold(1.0, c_fp=1.0, c_fn=1.0)
    0.0

    Notes
    -----
    Symmetric costs (c_fp == c_fn) collapse the formula to t* = 1 - π.
    Equivalently, when costs are equal the optimal threshold is the *negative*
    prior — predicting 1 whenever P(y=1 | x) > P(y=0).

    Attribution caveat: Elkan 2001 §4 derives the **prior-independent**
    posterior-formula ``t* = c_fp / (c_fp + c_fn)`` for thresholding a
    *Bayes-calibrated* posterior P(y=1 | x). The formula implemented here
    is the **prior-corrected** form for thresholding raw scores at a known
    deployment prior π, which agrees with Elkan only under symmetric costs.
    For our intended use (deployment prior + asymmetric costs) the
    prior-corrected form is what the user wants — but the citation should
    be read as "Elkan 2001 cost-sensitive framework", not literal §4.

    References
    ----------
    .. [#elkan] Elkan, C. "The foundations of cost-sensitive learning." IJCAI
       2001.
    """
    # The chained comparison is False for NaN, so a NaN prior is rejected too.
    if not 0.0 <= π <= 1.0:
        raise ValueError(f"π (prior) must be in [0, 1], got {π}")
    # ``not c > 0`` (rather than ``c <= 0``) also rejects NaN, which would
    # otherwise propagate silently into a NaN threshold.
    if not c_fp > 0:
        raise ValueError(f"c_fp must be > 0, got {c_fp}")
    if not c_fn > 0:
        raise ValueError(f"c_fn must be > 0, got {c_fn}")

    # Degenerate priors: never / always predict positive.
    if π == 0.0:
        return 1.0
    if π == 1.0:
        return 0.0
    numerator = c_fp * (1.0 - π)
    denominator = numerator + c_fn * π
    return float(numerator / denominator)
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
@dataclass(frozen=True, slots=True)
class CostMatrix:
    r"""Immutable bundle of a deployment prior with FP / FN / abstain costs.

    Groups an assumed prevalence with the two misclassification costs and an
    optional abstain cost (for selective classification). The
    :attr:`bayes_threshold` property feeds the prior and the FP/FN costs into
    :func:`bayes_optimal_threshold`.

    Parameters
    ----------
    prior : float, optional
        Assumed deployment prevalence P(y=1). Default 0.01.
    fp_cost : float, optional
        Cost of a false positive. Default 1.0.
    fn_cost : float, optional
        Cost of a false negative. Default 10.0.
    abstain_cost : float or None, optional
        Optional cost of abstaining/escalating. ``None`` means abstention is
        not allowed in this policy.
    notes : str, optional
        Free-form annotation.

    Examples
    --------
    >>> cm = CostMatrix(prior=0.5, fp_cost=1.0, fn_cost=1.0)
    >>> cm.bayes_threshold
    0.5
    """

    prior: float = DEFAULT_PRIOR
    fp_cost: float = DEFAULT_FP_COST
    fn_cost: float = DEFAULT_FN_COST
    abstain_cost: float | None = None
    notes: str = ""

    def __post_init__(self) -> None:
        """Reject out-of-range priors and non-positive / negative costs."""
        prior = self.prior
        fp, fn = self.fp_cost, self.fn_cost
        abstain = self.abstain_cost
        if not 0.0 <= prior <= 1.0:
            raise ValueError(f"prior must be in [0, 1], got {prior}")
        if fp <= 0:
            raise ValueError(f"fp_cost must be > 0, got {fp}")
        if fn <= 0:
            raise ValueError(f"fn_cost must be > 0, got {fn}")
        if abstain is not None and abstain < 0:
            raise ValueError(f"abstain_cost must be >= 0 if set, got {abstain}")

    @property
    def bayes_threshold(self) -> float:
        """Threshold from :func:`bayes_optimal_threshold` for this prior and costs."""
        return bayes_optimal_threshold(self.prior, self.fp_cost, self.fn_cost)

    def expected_cost(
        self, y_true: np.ndarray, y_score: np.ndarray, threshold: float | None = None
    ) -> float:
        r"""Mean per-row cost on labeled data at a given (or Bayes-optimal) threshold.

        A row is predicted positive when its score is ``>= threshold``. Each
        row then contributes:

        - True positive (y=1, pred=1): 0
        - True negative (y=0, pred=0): 0
        - False positive (y=0, pred=1): ``fp_cost``
        - False negative (y=1, pred=0): ``fn_cost``

        The total is divided by the number of rows (every row weighted equally).

        Parameters
        ----------
        y_true : np.ndarray, shape (n,)
            Binary labels in {0, 1}.
        y_score : np.ndarray, shape (n,)
            Predicted probabilities in [0, 1].
        threshold : float or None, optional
            Decision threshold; if ``None``, uses :attr:`bayes_threshold`.

        Returns
        -------
        float
            Mean cost per row.

        Raises
        ------
        ValueError
            If ``y_true`` and ``y_score`` have mismatched shapes.

        Examples
        --------
        >>> cm = CostMatrix(prior=0.5, fp_cost=1.0, fn_cost=10.0)
        >>> y = np.array([0, 1, 0, 1])
        >>> s = np.array([0.6, 0.4, 0.1, 0.9])  # 1 FP, 1 FN
        >>> # At threshold=0.5: pred = [1, 0, 0, 1]; FP at idx 0, FN at idx 1
        >>> cm.expected_cost(y, s, threshold=0.5)
        2.75
        """
        cutoff = self.bayes_threshold if threshold is None else threshold
        labels = np.asarray(y_true).astype(int)
        scores = np.asarray(y_score, dtype=float)
        if labels.shape != scores.shape:
            raise ValueError(f"y_true shape {labels.shape} != y_score shape {scores.shape}")

        flagged = scores >= cutoff
        false_pos = (flagged & (labels == 0)).sum()
        false_neg = (~flagged & (labels == 1)).sum()
        # Guard the denominator so empty input yields 0.0 instead of dividing by zero.
        denom = max(len(labels), 1)
        return float((false_pos * self.fp_cost + false_neg * self.fn_cost) / denom)

    def to_dict(self) -> dict[str, object]:
        """Return the fields plus the derived ``bayes_threshold`` as a plain dict."""
        field_names = ("prior", "fp_cost", "fn_cost", "abstain_cost", "notes")
        payload: dict[str, object] = {name: getattr(self, name) for name in field_names}
        payload["bayes_threshold"] = self.bayes_threshold
        return payload
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
# Symmetric clip bounds just inside the open interval (0, 1).
# NOTE(review): the consumers are outside this view — presumably used to keep
# scores away from exactly 0 and 1 before log / logit transforms; confirm.
_SCORE_CLIP_LO = 1e-7
_SCORE_CLIP_HI = 1.0 - _SCORE_CLIP_LO
|
|
549
|
+
|
|
550
|
+
|
|
551
|
+
def _validate_calibrator_inputs(
    y_true: np.ndarray, y_score: np.ndarray
) -> tuple[np.ndarray, np.ndarray]:
    """Coerce and sanity-check ``(y_true, y_score)`` for the calibrator fitters.

    Labels are cast to ``int`` and scores to ``float``. The checks run in this
    order: matching shapes, non-empty input, finite scores, both classes
    present.

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        The converted ``(labels, scores)`` arrays.

    Raises
    ------
    ValueError
        If any of the checks above fails.
    """
    labels = np.asarray(y_true).astype(int)
    scores = np.asarray(y_score).astype(float)

    if labels.shape != scores.shape:
        raise ValueError(f"shape mismatch: y_true {labels.shape}, y_score {scores.shape}")
    if labels.size == 0:
        raise ValueError("y_true is empty")

    all_finite = bool(np.isfinite(scores).all())
    if not all_finite:
        raise ValueError("y_score contains NaN or inf")

    positives = int(labels.sum())
    if positives in (0, labels.size):
        # A calibrator cannot be fitted when one class is missing.
        raise ValueError(
            f"y_true must contain both classes; got n={labels.size}, n_positive={positives}"
        )
    return labels, scores
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
def fit_isotonic_calibrator(
    y_true: np.ndarray, y_score: np.ndarray
) -> Callable[[np.ndarray], np.ndarray]:
    """Fit an isotonic-regression calibrator (Niculescu-Mizil & Caruana 2005 [#nm05]_).

    Parameters
    ----------
    y_true : np.ndarray, shape (n,)
        Binary labels in {0, 1}.
    y_score : np.ndarray, shape (n,)
        Predicted probabilities in [0, 1].

    Returns
    -------
    callable
        Maps raw scores to monotonically calibrated probabilities,
        clipped to [0, 1] via ``out_of_bounds="clip"``. The callable flattens
        its input with ``ravel()`` and raises ``ValueError`` on non-finite
        scores.

    Raises
    ------
    ValueError
        On shape mismatch, empty input, non-finite scores, or single-class
        ``y_true`` (calibration is degenerate).

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(42)
    >>> y = rng.integers(0, 2, size=200)
    >>> s = (y + rng.normal(0, 0.5, size=200)).clip(0, 1)
    >>> g = fit_isotonic_calibrator(y, s)
    >>> calibrated = g(s)
    >>> bool(calibrated.min() >= 0.0 and calibrated.max() <= 1.0)
    True

    Notes
    -----
    Isotonic regression fits a monotonic step function from raw scores to
    calibrated probabilities. The fit is non-parametric; on small fitting
    sets it can overfit. Niculescu-Mizil & Caruana 2005 §5 finds isotonic
    competitive with Platt only at **n ≳ 1000**; below ~1000 Platt scaling
    (or :func:`fit_beta_calibrator`) typically generalizes better. Prefer
    Platt / Beta for small calibration sets.

    References
    ----------
    .. [1] Niculescu-Mizil, A. & Caruana, R. "Predicting good probabilities
       with supervised learning." ICML 2005.
    .. [2] Zadrozny, B. & Elkan, C. "Transforming classifier scores into
       accurate multiclass probability estimates." KDD 2002.
    """
    labels, raw_scores = _validate_calibrator_inputs(y_true, y_score)
    model = IsotonicRegression(out_of_bounds="clip", y_min=0.0, y_max=1.0)
    model.fit(raw_scores, labels)

    def apply(scores: np.ndarray) -> np.ndarray:
        flat = np.asarray(scores, dtype=float).ravel()
        if not np.isfinite(flat).all():
            raise ValueError("scores contains NaN or inf")
        return np.asarray(model.predict(flat), dtype=float)

    return apply
|
|
634
|
+
|
|
635
|
+
|
|
636
|
+
def _platt_loss_grad(
    ab: np.ndarray, scores: np.ndarray, smoothed_targets: np.ndarray
) -> tuple[float, np.ndarray]:
    r"""Binomial NLL + gradient under Laplace-smoothed targets (Lin 2007 §2).

    Both the loss and the gradient are evaluated through ``np.logaddexp`` so
    that very large ``|a * s + b|`` neither overflows nor emits overflow
    warnings.

    Parameters
    ----------
    ab : np.ndarray, shape (2,)
        Sigmoid parameters ``(a, b)``; the calibrated score is
        :math:`\sigma(a \cdot s + b)`.
    scores : np.ndarray, shape (n,)
        Raw scores ``F`` (Platt's notation).
    smoothed_targets : np.ndarray, shape (n,)
        Laplace-smoothed targets ``T`` per Lin 2007 (avoids
        log-of-zero singularities under MLE).

    Returns
    -------
    loss : float
        Total NLL.
    grad : np.ndarray, shape (2,)
        Gradient w.r.t. ``(a, b)``.
    """
    a, b = ab
    z = a * scores + b
    # Stable: NLL_i = T·log(1+exp(-z)) + (1-T)·log(1+exp(z))
    softplus_neg_z = np.logaddexp(0.0, -z)
    pos_part = smoothed_targets * softplus_neg_z
    neg_part = (1.0 - smoothed_targets) * np.logaddexp(0.0, z)
    loss = float((pos_part + neg_part).sum())
    # dNLL/dz_i = σ(z_i) - T_i, with σ(z) = exp(-log(1 + exp(-z))). Reusing the
    # softplus term avoids the overflow of a naive 1 / (1 + exp(-z)) when z is
    # strongly negative.
    sigmoid_z = np.exp(-softplus_neg_z)
    err = sigmoid_z - smoothed_targets
    grad = np.array([float(np.dot(err, scores)), float(err.sum())])
    return loss, grad
|
|
670
|
+
|
|
671
|
+
|
|
672
|
+
@dataclass(frozen=True, slots=True)
|
|
673
|
+
class PlattFit:
|
|
674
|
+
r"""Fitted Platt sigmoid calibrator: ``(a, b)`` parameters + transform.
|
|
675
|
+
|
|
676
|
+
Returned by :func:`fit_platt_calibrator` so callers can both apply the
|
|
677
|
+
calibrator (via ``__call__``) and serialize / inspect / reuse the fitted
|
|
678
|
+
parameters without reverse-engineering them from the closure.
|
|
679
|
+
|
|
680
|
+
The ``__call__`` delegation preserves back-compat with eval-toolkit ≤ 0.11.0,
|
|
681
|
+
where ``fit_platt_calibrator`` returned a plain ``Callable``: any caller
|
|
682
|
+
that used the return value as ``g(scores)`` continues to work unchanged.
|
|
683
|
+
|
|
684
|
+
Attributes
|
|
685
|
+
----------
|
|
686
|
+
transform : Callable[[np.ndarray], np.ndarray]
|
|
687
|
+
Maps raw scores to calibrated probabilities via :math:`\sigma(a s + b)`.
|
|
688
|
+
a : float
|
|
689
|
+
Fitted slope.
|
|
690
|
+
b : float
|
|
691
|
+
Fitted intercept.
|
|
692
|
+
|
|
693
|
+
Examples
|
|
694
|
+
--------
|
|
695
|
+
>>> import numpy as np
|
|
696
|
+
>>> rng = np.random.default_rng(42)
|
|
697
|
+
>>> y = rng.integers(0, 2, size=200)
|
|
698
|
+
>>> s = (y + rng.normal(0, 0.5, size=200))
|
|
699
|
+
>>> fit = fit_platt_calibrator(y, s)
|
|
700
|
+
>>> bool(fit.a > 0.0) # well-separated → positive slope
|
|
701
|
+
True
|
|
702
|
+
>>> calibrated = fit(s) # __call__ delegates to transform
|
|
703
|
+
>>> bool(calibrated.min() > 0.0 and calibrated.max() < 1.0)
|
|
704
|
+
True
|
|
705
|
+
"""
|
|
706
|
+
|
|
707
|
+
transform: Callable[[np.ndarray], np.ndarray]
|
|
708
|
+
a: float
|
|
709
|
+
b: float
|
|
710
|
+
|
|
711
|
+
def __call__(self, scores: np.ndarray) -> np.ndarray:
|
|
712
|
+
return self.transform(scores)
|
|
713
|
+
|
|
714
|
+
|
|
715
|
+
def fit_platt_calibrator(y_true: np.ndarray, y_score: np.ndarray) -> PlattFit:
    r"""Platt 1999 [#platt]_ sigmoid scaling with Lin 2007 [#lin]_ Laplace-smoothed targets.

    Canonical Platt scaling: fits :math:`\sigma(a \cdot s + b)` to maximize
    the binomial likelihood under Laplace-smoothed targets

    .. math::

        T_i = \frac{n_+ + 1}{n_+ + 2} \quad \text{if } y_i = 1, \qquad
        T_i = \frac{1}{n_- + 2} \quad \text{if } y_i = 0,

    where :math:`n_+` and :math:`n_-` are the positive and negative counts.
    The smoothing avoids the MLE singularity at zero/one counts and matches
    :class:`sklearn.calibration._SigmoidCalibration` to within optimizer
    tolerance.

    Parameters
    ----------
    y_true : np.ndarray, shape (n,)
        Binary labels in {0, 1}.
    y_score : np.ndarray, shape (n,)
        Predicted probabilities or scores.

    Returns
    -------
    PlattFit
        Frozen dataclass exposing the fitted ``a`` (slope) and ``b``
        (intercept) parameters alongside a ``transform`` callable. The
        instance itself is ``__call__``-able for back-compat with v0.11
        and earlier (a plain ``Callable`` annotation accepts a ``PlattFit``).

    Raises
    ------
    ValueError
        On shape mismatch, empty input, non-finite scores, or single-class
        ``y_true``.
    RuntimeError
        If the L-BFGS-B optimizer fails to converge. The error message
        includes the SciPy optimizer message for diagnostics.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(42)
    >>> y = rng.integers(0, 2, size=200)
    >>> s = (y + rng.normal(0, 0.5, size=200))
    >>> fit = fit_platt_calibrator(y, s)
    >>> isinstance(fit.a, float) and isinstance(fit.b, float)
    True
    >>> out = fit(s)  # __call__ delegates to transform
    >>> bool(out.min() > 0.0 and out.max() < 1.0)
    True

    Notes
    -----
    Platt scaling fits the two-parameter sigmoid

    .. math:: P(y=1 \mid s) = \sigma(a \cdot s + b) = \frac{1}{1 + \exp(-(a s + b))}

    by maximum-likelihood under Lin 2007's Laplace-smoothed targets. Unlike
    isotonic, the parametric form regularizes small samples but cannot
    correct strongly non-monotone miscalibration.

    Initialization follows sklearn / Lin 2007 translated into this
    function's sign convention. Lin 2007 and sklearn model
    :math:`1 / (1 + \exp(A f + B))` and start at
    ``B₀ = log((n_- + 1) / (n_+ + 1))``; here the model is
    :math:`\sigma(a s + b)`, i.e. ``b = -B``, so the equivalent start is
    ``a₀ = 0``, ``b₀ = log((n_+ + 1) / (n_- + 1))``, which makes the initial
    prediction the smoothed positive-class prior. The optimizer is L-BFGS-B
    with analytic gradient.

    Behavior change vs eval-toolkit ≤ 0.2.0: previous versions wrapped
    :class:`sklearn.linear_model.LogisticRegression` with default L2
    regularization. v0.3.0 implements canonical Platt directly to match
    :class:`sklearn.calibration._SigmoidCalibration` (Lin 2007). Empirical
    delta on imbalanced data is ~1–3% ECE.

    Return-type change vs eval-toolkit ≤ 0.11.0: previously returned a plain
    ``Callable[[np.ndarray], np.ndarray]``. v0.12.0 returns a :class:`PlattFit`
    dataclass that exposes the fitted ``(a, b)`` and delegates ``__call__``
    to the transform. Existing callers typed as ``Callable`` keep working;
    new callers can read ``fit.a`` / ``fit.b`` directly (no logit-probe).

    References
    ----------
    .. [#platt] Platt, J. "Probabilistic outputs for support vector machines
       and comparisons to regularized likelihood methods." Advances in Large
       Margin Classifiers, 1999.
    .. [#lin] Lin, H. T., Lin, C. J., & Weng, R. C. "A note on Platt's
       probabilistic outputs for support vector machines." Machine Learning
       68(3), 2007.
    """
    y_true_arr, y_score_arr = _validate_calibrator_inputs(y_true, y_score)

    is_pos = y_true_arr > 0
    n_pos = float(np.count_nonzero(is_pos))
    n_neg = float(y_true_arr.size - n_pos)
    # np.where always yields a float array here; allocating with
    # empty_like(y_score_arr) would truncate the targets to 0 if the scores
    # ever arrived with an integer dtype.
    smoothed = np.where(is_pos, (n_pos + 1.0) / (n_pos + 2.0), 1.0 / (n_neg + 2.0))

    # Start at the smoothed positive prior: σ(b₀) = (n_+ + 1) / (n + 2).
    # (The Lin/sklearn formula log((n_- + 1)/(n_+ + 1)) is for the opposite
    # sign convention 1/(1+exp(A f + B)).)
    ab_init = np.array([0.0, float(np.log((n_pos + 1.0) / (n_neg + 1.0)))])
    result = minimize(
        _platt_loss_grad,
        ab_init,
        args=(y_score_arr, smoothed),
        method="L-BFGS-B",
        jac=True,
    )
    if not result.success:
        raise RuntimeError(f"Platt calibration optimization failed: {result.message}")
    a_fit, b_fit = float(result.x[0]), float(result.x[1])

    def apply(scores: np.ndarray) -> np.ndarray:
        arr = np.asarray(scores, dtype=float).ravel()
        if not np.isfinite(arr).all():
            raise ValueError("scores contains NaN or inf")
        z = a_fit * arr + b_fit
        # σ(z) = exp(-log(1 + exp(-z))): same value as 1/(1+exp(-z)) but
        # without the overflow warning for very negative z.
        out: np.ndarray = np.asarray(np.exp(-np.logaddexp(0.0, -z)), dtype=float)
        return out

    return PlattFit(transform=apply, a=a_fit, b=b_fit)
|
|
831
|
+
|
|
832
|
+
|
|
833
|
+
def fit_beta_calibrator(
    y_true: np.ndarray, y_score: np.ndarray
) -> Callable[[np.ndarray], np.ndarray]:
    r"""Fit a Beta calibrator (Kull et al. 2017 [#kull]_) and return it as a callable.

    Beta calibration extends Platt scaling with a third parameter drawn
    from the Beta family:

    .. math::

        P(y=1 \mid s) = \frac{1}{1 + \exp(-(a \log s - b \log (1 - s) + c))}

    This is a logistic regression on the two features
    :math:`(\log s, \log(1 - s))`: the coefficient on :math:`\log s` is
    ``a``, the negated coefficient on :math:`\log(1-s)` is ``b``, and the
    intercept is ``c``.

    Kull et al. 2017 §5 report that Beta calibration empirically beats
    Platt scaling on most real-world classifiers, at the price of one more
    parameter and a slightly richer feature map. Because ``a`` and ``b``
    are fitted without sign constraints, the resulting map is *not*
    guaranteed to be monotone in the score (unlike Platt and isotonic); use
    :func:`fit_isotonic_calibrator` when monotonicity is required.

    Parameters
    ----------
    y_true : np.ndarray, shape (n,)
        Binary labels in {0, 1}.
    y_score : np.ndarray, shape (n,)
        Predicted probabilities in (0, 1). Scores at the extremes
        ``{0, 1}`` are clipped to ``[1e-7, 1 - 1e-7]`` so the log-link
        is finite.

    Returns
    -------
    callable
        Function mapping raw scores to calibrated probabilities through the
        fitted 3-parameter Beta sigmoid. It flattens its input, rejects
        NaN/inf with ``ValueError``, and applies the same clipping as the fit.

    Raises
    ------
    ValueError
        On shape mismatch, empty input, non-finite scores, or
        single-class ``y_true``.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(42)
    >>> y = rng.integers(0, 2, size=300)
    >>> s = (y + rng.normal(0, 0.4, size=300)).clip(0.01, 0.99)
    >>> g = fit_beta_calibrator(y, s)
    >>> out = g(s)
    >>> bool(out.min() >= 0.0 and out.max() <= 1.0)
    True

    See Also
    --------
    eval_toolkit.calibration.fit_platt_calibrator :
        2-parameter sigmoid; Beta is a strict generalization.
    eval_toolkit.calibration.fit_isotonic_calibrator :
        Non-parametric monotone alternative.

    Notes
    -----
    Implementation: the features ``(log s, log(1 - s))`` are fed to sklearn
    ``LogisticRegression`` with regularization effectively switched off
    (``C=1e9``), which recovers the Beta-calibration MLE. This matches the
    reference `betacal <https://github.com/betacal/python>`_ implementation
    up to optimizer tolerance for typical data.

    References
    ----------
    .. [#kull] Kull, M., Filho, T. S., & Flach, P. "Beta calibration: a
       well-founded and easily implemented improvement on logistic
       calibration for binary classifiers." AISTATS 2017.
       arXiv:1607.06770.
    """
    from sklearn.linear_model import LogisticRegression  # noqa: PLC0415

    def _log_features(probs: np.ndarray) -> np.ndarray:
        # Clip first so that log(s) and log(1 - s) are both finite.
        bounded = np.clip(probs, _SCORE_CLIP_LO, _SCORE_CLIP_HI)
        return np.column_stack([np.log(bounded), np.log(1.0 - bounded)])

    labels, raw_scores = _validate_calibrator_inputs(y_true, y_score)
    # Effectively unregularized (C very large) — matches Kull 2017 MLE.
    model = LogisticRegression(C=1e9, solver="lbfgs", max_iter=2000)
    model.fit(_log_features(raw_scores), labels)

    def apply(scores: np.ndarray) -> np.ndarray:
        flat = np.asarray(scores, dtype=float).ravel()
        if not np.isfinite(flat).all():
            raise ValueError("scores contains NaN or inf")
        positive_proba = model.predict_proba(_log_features(flat))[:, 1]
        out: np.ndarray = positive_proba.astype(float)
        return out

    return apply
|
|
928
|
+
|
|
929
|
+
|
|
930
|
+
def fit_temperature(
    val_logits: np.ndarray,
    val_labels: np.ndarray,
    bounds: tuple[float, float] = (0.05, 20.0),
) -> dict[str, float]:
    r"""Single-parameter temperature scaling per Guo et al. 2017 [#guo]_.

    Fits a scalar T > 0 on validation logits to minimize negative log-likelihood:

    .. math:: T^* = \arg\min_T - \frac{1}{n}\sum_i \log p_{y_i}(x_i / T)

    where :math:`p_y(x / T) = \mathrm{softmax}(x/T)_y`. T scales the entire
    logit vector before softmax, so accuracy (argmax) is preserved exactly
    while the confidence distribution flattens (T > 1) or sharpens (T < 1).

    Parameters
    ----------
    val_logits : array-like, shape (n, 2)
        Validation logits for the binary classifier (column 0 = negative class,
        column 1 = positive class). Converted with ``np.asarray(..., dtype=float)``.
    val_labels : array-like, shape (n,)
        Binary labels in {0, 1}. Integer, boolean and float (``0.0`` / ``1.0``)
        encodings are accepted; they are cast to integer class indices.
    bounds : tuple of float, optional
        ``(lo, hi)`` bracket for ``T``. Default ``(0.05, 20.0)``.

    Returns
    -------
    dict
        Keys: ``temperature`` (T*), ``nll_pre`` (NLL at T=1), ``nll_post``
        (NLL at T=T*), ``improvement`` (nll_pre - nll_post). ``improvement``
        is ≥ 0 whenever ``T = 1`` lies inside ``bounds``: if the optimizer
        returns a point that is worse than ``T = 1``, ``T = 1`` is reported
        instead.

    Raises
    ------
    ValueError
        If ``val_logits`` shape is not (n, 2), ``val_labels`` is not 1-D,
        lengths mismatch, input is empty, logits are non-finite, labels are
        not binary, or only one class is present.
    RuntimeError
        If the bounded scalar optimizer fails to converge.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(42)
    >>> # synthesize uncalibrated logits with known T_true = 3.0
    >>> base = rng.normal(size=(500, 2))
    >>> labels = (base[:, 1] > base[:, 0]).astype(int)
    >>> logits = base * 3.0  # makes scores overconfident
    >>> result = fit_temperature(logits, labels)
    >>> 0.05 <= result['temperature'] <= 20.0
    True
    >>> result['nll_post'] <= result['nll_pre']  # always non-increasing
    True

    Notes
    -----
    Temperature scaling preserves accuracy exactly because dividing all
    logits by the same scalar does not change the argmax. It only rescales
    the *confidence* (max softmax probability), which is what miscalibration
    in modern overconfident networks measures.

    References
    ----------
    .. [#guo] Guo, C., Pleiss, G., Sun, Y., & Weinberger, K. Q. "On
       calibration of modern neural networks." ICML 2017. arXiv:1706.04599.
    """
    logits_arr = np.asarray(val_logits, dtype=float)
    labels_arr = np.asarray(val_labels)
    if logits_arr.ndim != 2 or logits_arr.shape[1] != 2:
        raise ValueError(f"val_logits must be (n, 2), got shape {logits_arr.shape}")
    # A (n, 1) label array would broadcast to an (n, n) index inside the NLL
    # and silently average the wrong entries, so reject anything but 1-D.
    if labels_arr.ndim != 1:
        raise ValueError(f"val_labels must be 1-D, got shape {labels_arr.shape}")
    if logits_arr.shape[0] != labels_arr.shape[0]:
        raise ValueError(
            f"length mismatch: logits {logits_arr.shape[0]} vs labels {labels_arr.shape[0]}"
        )
    if logits_arr.shape[0] == 0:
        raise ValueError("val_logits is empty")
    if not np.isfinite(logits_arr).all():
        raise ValueError("val_logits contains NaN or inf")
    if not set(np.unique(labels_arr).tolist()).issubset({0, 1}):
        raise ValueError("val_labels must be binary (0/1)")
    # Values are now known to be exactly 0/1, so the cast is lossless; it is
    # needed because float or bool labels cannot be used as column indices.
    labels_idx = labels_arr.astype(np.intp)
    n_pos = int(labels_idx.sum())
    if n_pos == 0 or n_pos == labels_idx.shape[0]:
        raise ValueError(
            f"val_labels must contain both classes; got n={labels_idx.shape[0]}, "
            f"n_positive={n_pos}"
        )

    nll_pre = _negative_log_likelihood(1.0, logits_arr, labels_idx)
    res = minimize_scalar(
        _negative_log_likelihood,
        bounds=bounds,
        method="bounded",
        args=(logits_arr, labels_idx),
    )
    if not res.success:
        raise RuntimeError(f"temperature optimization failed: {res.message}")
    t_opt = float(res.x)
    nll_post = _negative_log_likelihood(t_opt, logits_arr, labels_idx)
    # The bounded search stops within a tolerance of the minimum, so when the
    # optimum is essentially T = 1 it can land marginally above nll_pre.
    # Prefer the identity temperature in that case (if it is admissible).
    lo, hi = bounds
    if nll_post > nll_pre and lo <= 1.0 <= hi:
        t_opt, nll_post = 1.0, nll_pre
    return {
        "temperature": t_opt,
        "nll_pre": nll_pre,
        "nll_post": nll_post,
        "improvement": nll_pre - nll_post,
    }


def _negative_log_likelihood(t: float, logits: np.ndarray, labels: np.ndarray) -> float:
    """Mean NLL of ``softmax(logits / t)`` against the true labels.

    Parameters
    ----------
    t : float
        Temperature. Non-positive values return ``inf`` so that an optimizer
        probing them is pushed back into the valid region.
    logits : np.ndarray, shape (n, k)
        Raw logits, one row per sample.
    labels : np.ndarray, shape (n,)
        Class index per sample. Cast to an integer index dtype so float or
        boolean encodings of the class ids select columns correctly.

    Returns
    -------
    float
        Average negative log-probability assigned to the true class.
    """
    if t <= 0:
        return float("inf")
    class_idx = np.asarray(labels).astype(np.intp, copy=False)
    log_probs = log_softmax(np.asarray(logits) / t, axis=-1)
    return float(-log_probs[np.arange(len(class_idx)), class_idx].mean())
|
|
1039
|
+
|
|
1040
|
+
|
|
1041
|
+
def fit_temperature_oracle(
    y_true: np.ndarray, y_score: np.ndarray
) -> tuple[float, Callable[[np.ndarray], np.ndarray]]:
    r"""**DIAGNOSTIC ONLY** — fit-on-test oracle T-scaling per Guo et al. 2017 [#guo]_.

    .. warning::

        **Do not use this function as a deployment policy.** It fits ``T``
        on the same data the returned callable scores — the canonical
        "fit-on-test" methodological pitfall. ECE measured on the fitted
        scores is systematically **under**-estimated, sometimes by 50% or
        more (Vaicenavicius 2019 §3, Kumar 2019 §5, Roelofs 2022). Use
        :func:`fit_temperature` (fit on a separate validation set) for
        deployment; use this function only to compute a diagnostic
        upper bound on what *any* single-T recalibration could achieve
        if T were chosen optimally per slice.

    Internally inverts probabilities to logits via :math:`\log(p / (1 - p))`,
    fits T to minimize NLL on the T-scaled logits, then exposes a callable
    that applies :math:`\sigma(\mathrm{logit} / T)`.

    Parameters
    ----------
    y_true : np.ndarray, shape (n,)
        Binary labels in {0, 1}.
    y_score : np.ndarray, shape (n,)
        Predicted probabilities in (0, 1). Scores at the extremes {0, 1} are
        clipped to [1e-7, 1-1e-7] so the logit inversion is finite.

    Returns
    -------
    tuple
        ``(T_optimal, apply)`` where ``apply`` maps any input probability array
        through :math:`\sigma(\mathrm{logit}(p) / T_{optimal})`. ``T_optimal``
        is searched in the fixed bracket ``[0.05, 20.0]``.

    Raises
    ------
    ValueError
        On shape mismatch, empty input, non-finite scores, or single-class
        ``y_true``.
    RuntimeError
        If the bounded scalar optimizer fails to converge (same contract as
        :func:`fit_temperature`).

    Warns
    -----
    UserWarning
        Always, on every call, as a reminder that the result is fit-on-test.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(42)
    >>> y = rng.integers(0, 2, size=200)
    >>> s = (y + rng.normal(0, 0.5, size=200)).clip(0.01, 0.99)
    >>> import warnings
    >>> with warnings.catch_warnings():
    ...     warnings.simplefilter("ignore", UserWarning)
    ...     T_opt, apply = fit_temperature_oracle(y, s)
    >>> T_opt > 0
    True
    """
    import warnings as _warnings  # noqa: PLC0415 (deferred to keep top of file lean)

    _warnings.warn(
        "fit_temperature_oracle is fit-on-test and produces an under-estimated "
        "ECE; use fit_temperature with a held-out validation set for deployment. "
        "This warning may be suppressed in test contexts: "
        "`warnings.simplefilter('ignore', UserWarning)`.",
        UserWarning,
        stacklevel=2,
    )
    y_true_arr, y_score_arr = _validate_calibrator_inputs(y_true, y_score)

    def _logits_from_probs(p: np.ndarray) -> np.ndarray:
        # Clip away exact 0/1 so the log-odds stay finite.
        clipped = np.clip(p, _SCORE_CLIP_LO, _SCORE_CLIP_HI)
        out: np.ndarray = np.log(clipped / (1.0 - clipped))
        return out

    def _sigmoid(z: np.ndarray) -> np.ndarray:
        out: np.ndarray = 1.0 / (1.0 + np.exp(-z))
        return out

    logits = _logits_from_probs(y_score_arr)

    def nll_at_t(t: float) -> float:
        # Summed (not averaged) binary NLL of σ(logit / t) against y_true.
        if t <= 0:
            return float("inf")
        scaled = logits / t
        # Stable log-sigmoid via softplus identity.
        log_p1 = -np.logaddexp(0.0, -scaled)
        log_p0 = -np.logaddexp(0.0, scaled)
        return float(-(y_true_arr * log_p1 + (1 - y_true_arr) * log_p0).sum())

    result = minimize_scalar(
        nll_at_t,
        bounds=(0.05, 20.0),
        method="bounded",
        options={"xatol": 1e-4},
    )
    # Surface optimizer failures instead of silently returning an
    # unconverged temperature; mirrors the check in fit_temperature.
    if not result.success:
        raise RuntimeError(f"temperature optimization failed: {result.message}")
    t_optimal = float(result.x)

    def apply(scores: np.ndarray) -> np.ndarray:
        arr = np.asarray(scores, dtype=float).ravel()
        if not np.isfinite(arr).all():
            raise ValueError("scores contains NaN or inf")
        scaled = _logits_from_probs(arr) / t_optimal
        out: np.ndarray = _sigmoid(scaled).astype(float)
        return out

    return t_optimal, apply
|