eval-toolkit 0.27.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1143 @@
1
+ r"""Calibration: reliability curves, Bayes-optimal thresholds, isotonic/Platt/temperature scaling.
2
+
3
+ Public surface:
4
+
5
+ - :func:`reliability_curve` — bin-level calibration data
6
+ (DeGroot & Fienberg 1983 [#degroot]_; Niculescu-Mizil & Caruana 2005 [#nm05]_)
7
+ - :func:`maximum_calibration_error` — worst-bin calibration gap
8
+ (Naeini & Cooper 2014 [#mce]_); companion scalar to the ECE summaries
9
+ surfaced inside :func:`reliability_curve`.
10
+ - :func:`bayes_optimal_threshold` — closed-form cost-sensitive decision boundary
11
+ (Elkan 2001 [#elkan]_); :class:`CostMatrix` packages prior + costs + abstain cost.
12
+ - :func:`fit_isotonic_calibrator` — Niculescu-Mizil & Caruana 2005 [#nm05]_
13
+ - :func:`fit_platt_calibrator` — Platt 1999 [#platt]_ sigmoid scaling; returns a
14
+ :class:`PlattFit` dataclass exposing the fitted ``(a, b)`` parameters alongside
15
+ the transform callable (frozen, ``__call__``-able for back-compat with v0.11).
16
+ - :func:`fit_temperature` — Guo et al. 2017 [#guo]_ — fits T on val *logits* (literature standard)
17
+ - :func:`fit_temperature_oracle` — Guo et al. 2017 [#guo]_ — fits T on *probabilities*; diagnostic
18
+ upper-bound only (T is fit on the data it then scores).
19
+
20
+ References
21
+ ----------
22
+ .. [#degroot] DeGroot, M. H. & Fienberg, S. E. "The Comparison and Evaluation of Forecasters."
23
+ *The Statistician* 32 (1/2): 12-22, 1983.
24
+ .. [#elkan] Elkan, C. "The Foundations of Cost-Sensitive Learning." IJCAI 2001.
25
+ .. [#guo] Guo, C., Pleiss, G., Sun, Y. & Weinberger, K. "On Calibration of Modern Neural Networks."
26
+ ICML 2017. arXiv:1706.04599.
27
+ .. [#mce] Naeini, M. P. & Cooper, G. F. "Binary Classifier Calibration: A Bayesian Non-Parametric
28
+ Approach." SDM 2014.
29
+ .. [#nm05] Niculescu-Mizil, A. & Caruana, R. "Predicting Good Probabilities With Supervised
30
+ Learning." ICML 2005.
31
+ .. [#platt] Platt, J. "Probabilistic Outputs for Support Vector Machines and Comparisons to
32
+ Regularized Likelihood Methods." *Advances in Large Margin Classifiers*, 1999.
33
+ """
34
+
35
+ from __future__ import annotations
36
+
37
+ from collections.abc import Callable
38
+ from dataclasses import dataclass
39
+ from typing import Final, Literal
40
+
41
+ import numpy as np
42
+ from scipy.optimize import minimize, minimize_scalar
43
+ from scipy.special import log_softmax
44
+ from sklearn.calibration import calibration_curve
45
+ from sklearn.isotonic import IsotonicRegression
46
+
47
+ __all__ = [
48
+ "DEFAULT_FN_COST",
49
+ "DEFAULT_FP_COST",
50
+ "DEFAULT_N_BINS",
51
+ "DEFAULT_PRIOR",
52
+ "DEFAULT_STRATEGY",
53
+ "CostMatrix",
54
+ "PlattFit",
55
+ "bayes_optimal_threshold",
56
+ "fit_beta_calibrator",
57
+ "fit_isotonic_calibrator",
58
+ "fit_platt_calibrator",
59
+ "fit_temperature",
60
+ "fit_temperature_oracle",
61
+ "maximum_calibration_error",
62
+ "reliability_curve",
63
+ "reliability_diagram_data",
64
+ ]
65
+
66
+ DEFAULT_N_BINS: Final[int] = 10
67
+ DEFAULT_STRATEGY: Final[Literal["uniform", "quantile"]] = "quantile"
68
+
69
+ # Example cost-matrix defaults (rare-positive deployment surface). These are
70
+ # illustrative scaffolding; a real cost matrix should come from stakeholder
71
+ # elicitation, not library defaults.
72
+ DEFAULT_PRIOR: Final[float] = 0.01
73
+ DEFAULT_FP_COST: Final[float] = 1.0
74
+ DEFAULT_FN_COST: Final[float] = 10.0
75
+
76
+
77
def reliability_curve(
    y_true: np.ndarray,
    y_score: np.ndarray,
    *,
    n_bins: int = DEFAULT_N_BINS,
    strategy: Literal["uniform", "quantile"] = DEFAULT_STRATEGY,
) -> dict[str, object]:
    """Bin-level calibration data wrapping :func:`sklearn.calibration.calibration_curve`.

    Returns a JSON-friendly dict with per-bin observed positive rates, per-bin
    mean predicted probabilities, bin edges, per-bin counts, and both
    equal-width and equal-mass ECE summaries. Single-class slices are skipped
    with an explicit marker.

    Parameters
    ----------
    y_true : np.ndarray, shape (n,)
        Binary labels in {0, 1}.
    y_score : np.ndarray, shape (n,)
        Predicted probabilities in [0, 1].
    n_bins : int, optional
        Number of bins (default 10).
    strategy : {"uniform", "quantile"}, optional
        Equal-width vs equal-mass binning. Default "quantile".

    Returns
    -------
    dict
        Either the calibration record with keys ``prob_true``, ``prob_pred``,
        ``bin_edges``, ``n_per_bin``, ``ece_equal_mass``, ``ece_equal_width``,
        ``n_bins``, ``strategy``, ``n``, ``n_positive``,
        or ``{"skipped": "...", "n", "n_positive"}`` for a single-class slice.

    Raises
    ------
    ValueError
        On shape mismatch, empty input, ``n_bins <= 1``, or unknown strategy.

    Notes
    -----
    ``bin_edges`` always has ``n_bins + 1`` entries and ``n_per_bin`` always
    has ``n_bins`` entries. ``prob_true`` / ``prob_pred`` come straight from
    ``calibration_curve``, which omits empty bins, so they can be shorter:
    their k-th entry belongs to the k-th bin whose ``n_per_bin`` is non-zero.
    Counts use the same assignment rule as ``calibration_curve`` (a score
    equal to an interior edge is counted in the lower bin) so that they agree
    with the bins the rates were computed over.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(42)
    >>> y = rng.integers(0, 2, size=200)
    >>> s = (y + rng.normal(0, 0.5, size=200)).clip(0, 1)
    >>> result = reliability_curve(y, s, n_bins=5, strategy="uniform")
    >>> sorted(result.keys())[:5]
    ['bin_edges', 'ece_equal_mass', 'ece_equal_width', 'n', 'n_bins']
    """
    y_true_arr = np.asarray(y_true).astype(int)
    y_score_arr = np.asarray(y_score).astype(float)
    if y_true_arr.shape != y_score_arr.shape:
        raise ValueError(f"shape mismatch: y_true {y_true_arr.shape}, y_score {y_score_arr.shape}")
    if y_true_arr.size == 0:
        raise ValueError("y_true is empty")
    if n_bins <= 1:
        raise ValueError(f"n_bins must be > 1, got {n_bins}")
    if strategy not in {"uniform", "quantile"}:
        raise ValueError(f"strategy must be 'uniform' or 'quantile', got {strategy!r}")

    n = int(y_true_arr.size)
    n_positive = int(y_true_arr.sum())
    if n_positive == 0 or n_positive == n:
        # With one class absent every bin's observed rate is 0 or 1, so the
        # curve carries no information; report the slice instead of numbers.
        return {
            "skipped": (
                "single-class slice; calibration is degenerate (per-bin observed "
                "rates are constant 0 or 1)."
            ),
            "n": n,
            "n_positive": n_positive,
        }

    prob_true, prob_pred = calibration_curve(
        y_true_arr, y_score_arr, n_bins=n_bins, strategy=strategy
    )

    if strategy == "uniform":
        bin_edges = np.linspace(0.0, 1.0, n_bins + 1)
    else:
        bin_edges = np.quantile(y_score_arr, np.linspace(0.0, 1.0, n_bins + 1))

    # Count with searchsorted on the interior edges rather than np.histogram:
    # np.histogram uses half-open [lo, hi) bins, which puts scores that sit
    # exactly on an edge into a different bin than calibration_curve does and
    # makes the counts disagree with prob_true / prob_pred.
    bin_ids = np.searchsorted(bin_edges[1:-1], y_score_arr.ravel())
    n_per_bin = np.bincount(bin_ids, minlength=n_bins)

    ece_equal_mass = _ece_via_calibration_curve(y_true_arr, y_score_arr, n_bins, "quantile")
    ece_equal_width = _ece_via_calibration_curve(y_true_arr, y_score_arr, n_bins, "uniform")

    return {
        "n": n,
        "n_positive": n_positive,
        "n_bins": int(n_bins),
        "strategy": strategy,
        "prob_true": [float(x) for x in prob_true],
        "prob_pred": [float(x) for x in prob_pred],
        "bin_edges": [float(x) for x in bin_edges],
        "n_per_bin": [int(x) for x in n_per_bin],
        "ece_equal_mass": float(ece_equal_mass),
        "ece_equal_width": float(ece_equal_width),
    }
172
+
173
+
174
def reliability_diagram_data(
    y_true: np.ndarray,
    y_score: np.ndarray,
    *,
    n_bins: int = 10,
    strategy: Literal["uniform", "quantile"] = "quantile",
) -> list[dict[str, float | int]]:
    """Structured per-bin reliability rows for serialization or plotting.

    Wraps :func:`reliability_curve` and reshapes its outputs into a list
    of bin records suitable for direct parquet / JSON serialization or
    for handing to :func:`eval_toolkit.plotting.plot_reliability_diagram`.

    Schema (each dict):
      - ``bin_lower``, ``bin_upper`` — bin edges (float).
      - ``mean_pred`` — mean predicted probability inside the bin.
      - ``frac_positive`` — fraction of positives inside the bin.
      - ``n`` — number of rows in the bin (int).

    Matplotlib is *not* required: the helper lives in
    :mod:`eval_toolkit.calibration` so it can be imported by serializing
    callers that don't pull in plotting deps.

    Parameters
    ----------
    y_true, y_score : np.ndarray
        Binary labels and predicted probabilities.
    n_bins : int, optional
        Number of bins. Default 10.
    strategy : {"uniform", "quantile"}, optional
        Quantile (equal-mass; default) or uniform (equal-width).

    Returns
    -------
    list[dict[str, float | int]]
        One row per *non-empty* bin, in increasing bin order. Empty list for
        degenerate slices (single-class or empty;
        :func:`reliability_curve` returns a ``skipped`` sentinel which
        this helper flattens to ``[]``).

    Notes
    -----
    ``calibration_curve`` (used by :func:`reliability_curve`) drops empty
    bins, so the rate arrays can be shorter than the number of bins. Each
    rate is therefore matched to the bin it was computed over rather than
    paired with the edges by position; otherwise one empty bin would shift
    every later row onto the wrong ``bin_lower`` / ``bin_upper`` / ``n``.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> y = rng.integers(0, 2, size=200)
    >>> s = rng.uniform(0, 1, size=200)
    >>> rows = reliability_diagram_data(y, s, n_bins=5)
    >>> sorted(rows[0].keys())
    ['bin_lower', 'bin_upper', 'frac_positive', 'mean_pred', 'n']
    """
    # reliability_curve raises on empty input, so short-circuit here to keep
    # the "degenerate slice -> []" contract.
    if len(y_true) == 0 or len(np.unique(y_true)) < 2:
        return []
    rc = reliability_curve(y_true, y_score, n_bins=n_bins, strategy=strategy)
    if "skipped" in rc:
        return []

    prob_true = np.asarray(rc["prob_true"], dtype=float)
    prob_pred = np.asarray(rc["prob_pred"], dtype=float)
    bin_edges = np.asarray(rc["bin_edges"], dtype=float)

    # Re-derive bin membership with the rule calibration_curve uses (a score
    # equal to an interior edge belongs to the lower bin) to learn which bins
    # are occupied and how many rows each holds.
    scores = np.asarray(y_score, dtype=float).ravel()
    bin_ids = np.searchsorted(bin_edges[1:-1], scores)
    counts = np.bincount(bin_ids, minlength=len(bin_edges) - 1)
    occupied = np.flatnonzero(counts)

    if len(occupied) != len(prob_true):
        # Defensive fallback: if the occupied-bin count does not match the
        # rates we were given, keep the historical positional pairing rather
        # than guessing at an alignment.
        counts = np.asarray(rc["n_per_bin"])
        occupied = np.arange(len(prob_true))

    return [
        {
            "bin_lower": float(bin_edges[bin_idx]),
            "bin_upper": float(bin_edges[bin_idx + 1]),
            "mean_pred": float(prob_pred[row]),
            "frac_positive": float(prob_true[row]),
            "n": int(counts[bin_idx]),
        }
        for row, bin_idx in enumerate(occupied)
    ]
246
+
247
+
248
def _ece_via_calibration_curve(
    y_true: np.ndarray,
    y_score: np.ndarray,
    n_bins: int,
    strategy: Literal["uniform", "quantile"],
) -> float:
    """ECE computed via sklearn's ``calibration_curve`` (handles empty bins).

    Used internally by :func:`reliability_curve`. For metric-only ECE in
    bootstrap contexts, use ``eval_toolkit.metrics.expected_calibration_error``.

    Parameters
    ----------
    y_true : np.ndarray, shape (n,)
        Binary labels.
    y_score : np.ndarray, shape (n,)
        Predicted probabilities.
    n_bins : int
        Number of bins requested from ``calibration_curve``.
    strategy : {"uniform", "quantile"}
        Binning strategy forwarded to ``calibration_curve``.

    Returns
    -------
    float
        Sum over non-empty bins of ``weight * |observed_rate - mean_predicted|``,
        where ``weight`` is the bin's share of the rows.
    """
    prob_true, prob_pred = calibration_curve(y_true, y_score, n_bins=n_bins, strategy=strategy)

    scores = np.asarray(y_score, dtype=float).ravel()
    if strategy == "uniform":
        bin_edges = np.linspace(0.0, 1.0, n_bins + 1)
    else:
        bin_edges = np.quantile(scores, np.linspace(0.0, 1.0, n_bins + 1))

    # Bin sizes must follow the rule calibration_curve uses (a score equal to
    # an interior edge goes to the lower bin). np.histogram's half-open
    # [lo, hi) bins disagree on exactly those scores, which attached the
    # wrong weights to the per-bin gaps.
    bin_ids = np.searchsorted(bin_edges[1:-1], scores)
    counts = np.bincount(bin_ids, minlength=n_bins)
    occupied = counts[counts > 0].astype(float)

    if len(occupied) != len(prob_true):
        # Defensive fallback: without a trustworthy size for each returned
        # bin, weight the bins equally.
        occupied = np.ones(len(prob_true), dtype=float)

    # Normalise in floating point (no int truncation) so the weights sum to 1.
    weights = occupied / max(float(occupied.sum()), 1.0)
    return float((weights * np.abs(prob_true - prob_pred)).sum())
271
+
272
+
273
def maximum_calibration_error(
    y_true: np.ndarray,
    y_score: np.ndarray,
    *,
    n_bins: int = DEFAULT_N_BINS,
    strategy: Literal["uniform", "quantile"] = DEFAULT_STRATEGY,
) -> float | None:
    r"""Maximum Calibration Error — the largest per-bin |observed_rate − mean_predicted|.

    ECE averages the calibration gap across bins (weighted by bin size); MCE
    reports only the single worst bin, so one badly calibrated bin cannot hide
    behind an otherwise low average (Naeini & Cooper 2014 [#mce]_).

    When only one class is present the per-bin observed rates are constant
    0 or 1 and calibration is degenerate; ``None`` is returned in that case.

    Parameters
    ----------
    y_true : np.ndarray, shape (n,)
        Binary labels in {0, 1}.
    y_score : np.ndarray, shape (n,)
        Predicted probabilities in [0, 1].
    n_bins : int, optional
        Number of bins (default 10).
    strategy : {"uniform", "quantile"}, optional
        Equal-width vs equal-mass binning. Default "quantile", the same
        default as :func:`reliability_curve`.

    Returns
    -------
    float | None
        Worst-bin calibration gap in [0, 1], or ``None`` for single-class
        slices.

    Raises
    ------
    ValueError
        On shape mismatch, empty input, ``n_bins <= 1``, or unknown strategy.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(42)
    >>> y = rng.integers(0, 2, size=200)
    >>> s = (y + rng.normal(0, 0.5, size=200)).clip(0, 1)
    >>> mce = maximum_calibration_error(y, s, n_bins=5, strategy="quantile")
    >>> 0.0 <= mce <= 1.0
    True
    """
    labels = np.asarray(y_true).astype(int)
    scores = np.asarray(y_score).astype(float)

    # Argument checks, in the same order callers have always seen them.
    if labels.shape != scores.shape:
        raise ValueError(f"shape mismatch: y_true {labels.shape}, y_score {scores.shape}")
    if labels.size == 0:
        raise ValueError("y_true is empty")
    if n_bins <= 1:
        raise ValueError(f"n_bins must be > 1, got {n_bins}")
    if strategy not in {"uniform", "quantile"}:
        raise ValueError(f"strategy must be 'uniform' or 'quantile', got {strategy!r}")

    # A slice with no positives or no negatives has no meaningful gap.
    positives = int(labels.sum())
    if positives in (0, labels.size):
        return None

    observed, predicted = calibration_curve(labels, scores, n_bins=n_bins, strategy=strategy)
    if len(observed) == 0:
        return None

    gaps = np.abs(observed - predicted)
    return float(gaps.max())
345
+
346
+
347
def bayes_optimal_threshold(π: float, c_fp: float, c_fn: float) -> float:
    r"""Prior-corrected cost-sensitive threshold (Elkan 2001 [#elkan]_ framework).

    The decision rule is "predict 1 iff score ≥ t*" with:

    .. math:: t^* = \frac{c_{FP} \cdot (1 - π)}{c_{FP} \cdot (1 - π) + c_{FN} \cdot π}

    Parameters
    ----------
    π : float
        Deployment positive-class prior P(y=1) ∈ [0, 1].
        (π = empirical positive prior; English alias on first appearance per the
        Unicode-identifier convention in STYLE.md.)
    c_fp : float
        Cost of a false positive. Must be > 0.
    c_fn : float
        Cost of a false negative. Must be > 0.

    Returns
    -------
    float
        Optimal threshold ∈ [0, 1].

    Raises
    ------
    ValueError
        If π is outside [0, 1] (including NaN), or if either cost is
        non-positive or NaN.

    Examples
    --------
    Symmetric costs at prior=0.5: threshold should equal the prior.

    >>> bayes_optimal_threshold(0.5, c_fp=1.0, c_fn=1.0)
    0.5

    Rare-positive case with FN 10× more expensive than FP:

    >>> round(bayes_optimal_threshold(0.01, c_fp=1.0, c_fn=10.0), 4)
    0.9083

    Edge cases:

    >>> bayes_optimal_threshold(0.0, c_fp=1.0, c_fn=1.0)
    1.0
    >>> bayes_optimal_threshold(1.0, c_fp=1.0, c_fn=1.0)
    0.0

    Notes
    -----
    Symmetric costs (c_fp == c_fn) collapse the formula to t* = 1 - π, i.e.
    predict 1 whenever the score is at least the *negative* prior P(y=0).

    Where the formula comes from: if the score is a posterior calibrated at a
    balanced (50/50) class prior, re-weighting its odds by π / (1 - π) and
    applying the cost-ratio rule "predict 1 iff posterior odds ≥ c_fp / c_fn"
    gives exactly the expression above.

    Attribution caveat: Elkan 2001 §4 derives the **prior-independent**
    posterior-formula ``t* = c_fp / (c_fp + c_fn)`` for thresholding a
    *Bayes-calibrated* posterior P(y=1 | x) at the deployment prior. The
    formula implemented here is the **prior-corrected** form for thresholding
    raw scores at a known deployment prior π; the two coincide at π = 0.5.
    The citation should therefore be read as "Elkan 2001 cost-sensitive
    framework", not literal §4.

    References
    ----------
    .. [#elkan] Elkan, C. "The foundations of cost-sensitive learning." IJCAI
       2001.
    """
    if not 0.0 <= π <= 1.0:
        raise ValueError(f"π (prior) must be in [0, 1], got {π}")
    # "not c > 0" (rather than "c <= 0") also rejects NaN, which would
    # otherwise slip through and come back as a NaN threshold.
    if not c_fp > 0:
        raise ValueError(f"c_fp must be > 0, got {c_fp}")
    if not c_fn > 0:
        raise ValueError(f"c_fn must be > 0, got {c_fn}")

    # Return the exact limits at the degenerate priors.
    if π == 0.0:
        return 1.0
    if π == 1.0:
        return 0.0
    numerator = c_fp * (1.0 - π)
    denominator = numerator + c_fn * π
    return float(numerator / denominator)
429
+
430
+
431
@dataclass(frozen=True, slots=True)
class CostMatrix:
    r"""Frozen scaffolding for FP/FN/abstain costs at an assumed prior.

    Pairs a deployment prior with FP/FN costs (and optionally an abstain cost
    for selective classification). The :attr:`bayes_threshold` property
    composes :func:`bayes_optimal_threshold`.

    Parameters
    ----------
    prior : float, optional
        Assumed deployment prevalence P(y=1). Default 0.01.
    fp_cost : float, optional
        Cost of a false positive. Default 1.0.
    fn_cost : float, optional
        Cost of a false negative. Default 10.0.
    abstain_cost : float or None, optional
        Optional cost of abstaining/escalating. ``None`` means abstention is
        not allowed in this policy.
    notes : str, optional
        Free-form annotation.

    Raises
    ------
    ValueError
        If ``prior`` is outside [0, 1], a cost is non-positive, ``abstain_cost``
        is negative, or any of these values is NaN.

    Examples
    --------
    >>> cm = CostMatrix(prior=0.5, fp_cost=1.0, fn_cost=1.0)
    >>> cm.bayes_threshold
    0.5
    """

    prior: float = DEFAULT_PRIOR
    fp_cost: float = DEFAULT_FP_COST
    fn_cost: float = DEFAULT_FN_COST
    abstain_cost: float | None = None
    notes: str = ""

    def __post_init__(self) -> None:
        """Validate the prior and the costs."""
        if not 0.0 <= self.prior <= 1.0:
            raise ValueError(f"prior must be in [0, 1], got {self.prior}")
        # The "not x > 0" / "not x >= 0" forms also reject NaN, which the
        # plain "<=" / "<" comparisons would let through.
        if not self.fp_cost > 0:
            raise ValueError(f"fp_cost must be > 0, got {self.fp_cost}")
        if not self.fn_cost > 0:
            raise ValueError(f"fn_cost must be > 0, got {self.fn_cost}")
        if self.abstain_cost is not None and not self.abstain_cost >= 0:
            raise ValueError(f"abstain_cost must be >= 0 if set, got {self.abstain_cost}")

    @property
    def bayes_threshold(self) -> float:
        """Compose :func:`bayes_optimal_threshold` using this matrix's fields."""
        return bayes_optimal_threshold(self.prior, self.fp_cost, self.fn_cost)

    def expected_cost(
        self, y_true: np.ndarray, y_score: np.ndarray, threshold: float | None = None
    ) -> float:
        r"""Empirical expected cost on labeled data at a given (or Bayes-optimal) threshold.

        For each row, the cost of the prediction at ``threshold`` is:

        - True positive (y=1, pred=1): 0
        - True negative (y=0, pred=0): 0
        - False positive (y=0, pred=1): ``fp_cost``
        - False negative (y=1, pred=0): ``fn_cost``

        Returns the mean cost across the dataset, weighted equally per row.
        A row is predicted positive when its score is ``>= threshold``.

        Parameters
        ----------
        y_true : np.ndarray, shape (n,)
            Binary labels in {0, 1}.
        y_score : np.ndarray, shape (n,)
            Predicted probabilities in [0, 1].
        threshold : float or None, optional
            Decision threshold; if ``None``, uses :attr:`bayes_threshold`.

        Returns
        -------
        float
            Mean cost per row (``0.0`` for empty input).

        Raises
        ------
        ValueError
            If ``y_true`` and ``y_score`` have mismatched shapes.

        Examples
        --------
        >>> cm = CostMatrix(prior=0.5, fp_cost=1.0, fn_cost=10.0)
        >>> y = np.array([0, 1, 0, 1])
        >>> s = np.array([0.6, 0.4, 0.1, 0.9])  # 1 FP, 1 FN
        >>> # At threshold=0.5: pred = [1, 0, 0, 1]; FP at idx 0, FN at idx 1
        >>> cm.expected_cost(y, s, threshold=0.5)
        2.75
        """
        t = threshold if threshold is not None else self.bayes_threshold
        y_arr = np.asarray(y_true).astype(int)
        s_arr = np.asarray(y_score, dtype=float)
        if y_arr.shape != s_arr.shape:
            raise ValueError(f"y_true shape {y_arr.shape} != y_score shape {s_arr.shape}")
        y_pred = (s_arr >= t).astype(int)
        fp = ((y_pred == 1) & (y_arr == 0)).sum()
        fn = ((y_pred == 0) & (y_arr == 1)).sum()
        # Divide by the element count, not len(): the error counts above sum
        # over every element, so len() (first dimension only) would inflate
        # the mean for inputs with more than one dimension. max(..., 1) keeps
        # empty input at 0.0 instead of dividing by zero.
        n = max(int(y_arr.size), 1)
        return float((fp * self.fp_cost + fn * self.fn_cost) / n)

    def to_dict(self) -> dict[str, object]:
        """JSON-serializable form with the derived threshold."""
        return {
            "prior": self.prior,
            "fp_cost": self.fp_cost,
            "fn_cost": self.fn_cost,
            "abstain_cost": self.abstain_cost,
            "notes": self.notes,
            "bayes_threshold": self.bayes_threshold,
        }
545
+
546
+
547
+ _SCORE_CLIP_LO = 1e-7
548
+ _SCORE_CLIP_HI = 1.0 - 1e-7
549
+
550
+
551
+ def _validate_calibrator_inputs(
552
+ y_true: np.ndarray, y_score: np.ndarray
553
+ ) -> tuple[np.ndarray, np.ndarray]:
554
+ """Shared input validation for the three calibrator fitters."""
555
+ y_true_arr = np.asarray(y_true).astype(int)
556
+ y_score_arr = np.asarray(y_score).astype(float)
557
+ if y_true_arr.shape != y_score_arr.shape:
558
+ raise ValueError(f"shape mismatch: y_true {y_true_arr.shape}, y_score {y_score_arr.shape}")
559
+ if y_true_arr.size == 0:
560
+ raise ValueError("y_true is empty")
561
+ if not np.isfinite(y_score_arr).all():
562
+ raise ValueError("y_score contains NaN or inf")
563
+ n_pos = int(y_true_arr.sum())
564
+ if n_pos == 0 or n_pos == y_true_arr.size:
565
+ raise ValueError(
566
+ f"y_true must contain both classes; got n={y_true_arr.size}, n_positive={n_pos}"
567
+ )
568
+ return y_true_arr, y_score_arr
569
+
570
+
571
def fit_isotonic_calibrator(
    y_true: np.ndarray, y_score: np.ndarray
) -> Callable[[np.ndarray], np.ndarray]:
    """Fit an isotonic-regression calibrator (Niculescu-Mizil & Caruana 2005 [#nm05]_).

    Parameters
    ----------
    y_true : np.ndarray, shape (n,)
        Binary labels in {0, 1}.
    y_score : np.ndarray, shape (n,)
        Predicted probabilities in [0, 1].

    Returns
    -------
    callable
        Maps raw scores to monotonically calibrated probabilities. Outputs
        are bounded to [0, 1] (``y_min`` / ``y_max``) and scores outside the
        fitted range are handled with ``out_of_bounds="clip"``. The callable
        flattens its input, so the result is always 1-D, and it raises
        ``ValueError`` on NaN / inf scores.

    Raises
    ------
    ValueError
        On shape mismatch, empty input, non-finite scores, or single-class
        ``y_true`` (calibration is degenerate).

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(42)
    >>> y = rng.integers(0, 2, size=200)
    >>> s = (y + rng.normal(0, 0.5, size=200)).clip(0, 1)
    >>> g = fit_isotonic_calibrator(y, s)
    >>> calibrated = g(s)
    >>> bool(calibrated.min() >= 0.0 and calibrated.max() <= 1.0)
    True

    Notes
    -----
    Isotonic regression fits a monotonic step function from raw scores to
    calibrated probabilities. The fit is non-parametric, so it can overfit on
    small fitting sets. Niculescu-Mizil & Caruana 2005 §5 finds isotonic
    competitive with Platt only at **n ≳ 1000**; below ~1000 Platt scaling
    (or :func:`fit_beta_calibrator`) typically generalizes better. Prefer
    Platt / Beta for small calibration sets.

    References
    ----------
    .. [#nm05] Niculescu-Mizil, A. & Caruana, R. "Predicting good probabilities
       with supervised learning." ICML 2005.
    .. [#ze02] Zadrozny, B. & Elkan, C. "Transforming classifier scores into
       accurate multiclass probability estimates." KDD 2002.
    """
    labels, raw_scores = _validate_calibrator_inputs(y_true, y_score)
    regressor = IsotonicRegression(out_of_bounds="clip", y_min=0.0, y_max=1.0)
    regressor.fit(raw_scores, labels)

    def apply(scores: np.ndarray) -> np.ndarray:
        """Apply the fitted step function to ``scores`` (flattened to 1-D)."""
        flat = np.asarray(scores, dtype=float).ravel()
        if not np.all(np.isfinite(flat)):
            raise ValueError("scores contains NaN or inf")
        calibrated: np.ndarray = np.asarray(regressor.predict(flat), dtype=float)
        return calibrated

    return apply
634
+
635
+
636
+ def _platt_loss_grad(
637
+ ab: np.ndarray, scores: np.ndarray, smoothed_targets: np.ndarray
638
+ ) -> tuple[float, np.ndarray]:
639
+ """Binomial NLL + gradient under Laplace-smoothed targets (Lin 2007 §2).
640
+
641
+ Parameters
642
+ ----------
643
+ ab : np.ndarray, shape (2,)
644
+ Sigmoid parameters ``(a, b)``; the calibrated score is
645
+ :math:`\\sigma(a \\cdot s + b)`.
646
+ scores : np.ndarray, shape (n,)
647
+ Raw scores ``F`` (Platt's notation).
648
+ smoothed_targets : np.ndarray, shape (n,)
649
+ Laplace-smoothed targets ``T`` per Lin 2007 (avoids
650
+ log-of-zero singularities under MLE).
651
+
652
+ Returns
653
+ -------
654
+ loss : float
655
+ Total NLL.
656
+ grad : np.ndarray, shape (2,)
657
+ Gradient w.r.t. ``(a, b)``.
658
+ """
659
+ a, b = ab
660
+ z = a * scores + b
661
+ # Stable: NLL_i = T·log(1+exp(-z)) + (1-T)·log(1+exp(z))
662
+ pos_part = smoothed_targets * np.logaddexp(0.0, -z)
663
+ neg_part = (1.0 - smoothed_targets) * np.logaddexp(0.0, z)
664
+ loss = float((pos_part + neg_part).sum())
665
+ # dNLL/dz_i = σ(z_i) - T_i
666
+ sigmoid_z = 1.0 / (1.0 + np.exp(-z))
667
+ err = sigmoid_z - smoothed_targets
668
+ grad = np.array([float(np.dot(err, scores)), float(err.sum())])
669
+ return loss, grad
670
+
671
+
672
+ @dataclass(frozen=True, slots=True)
673
+ class PlattFit:
674
+ r"""Fitted Platt sigmoid calibrator: ``(a, b)`` parameters + transform.
675
+
676
+ Returned by :func:`fit_platt_calibrator` so callers can both apply the
677
+ calibrator (via ``__call__``) and serialize / inspect / reuse the fitted
678
+ parameters without reverse-engineering them from the closure.
679
+
680
+ The ``__call__`` delegation preserves back-compat with eval-toolkit ≤ 0.11.0,
681
+ where ``fit_platt_calibrator`` returned a plain ``Callable``: any caller
682
+ that used the return value as ``g(scores)`` continues to work unchanged.
683
+
684
+ Attributes
685
+ ----------
686
+ transform : Callable[[np.ndarray], np.ndarray]
687
+ Maps raw scores to calibrated probabilities via :math:`\sigma(a s + b)`.
688
+ a : float
689
+ Fitted slope.
690
+ b : float
691
+ Fitted intercept.
692
+
693
+ Examples
694
+ --------
695
+ >>> import numpy as np
696
+ >>> rng = np.random.default_rng(42)
697
+ >>> y = rng.integers(0, 2, size=200)
698
+ >>> s = (y + rng.normal(0, 0.5, size=200))
699
+ >>> fit = fit_platt_calibrator(y, s)
700
+ >>> bool(fit.a > 0.0) # well-separated → positive slope
701
+ True
702
+ >>> calibrated = fit(s) # __call__ delegates to transform
703
+ >>> bool(calibrated.min() > 0.0 and calibrated.max() < 1.0)
704
+ True
705
+ """
706
+
707
+ transform: Callable[[np.ndarray], np.ndarray]
708
+ a: float
709
+ b: float
710
+
711
+ def __call__(self, scores: np.ndarray) -> np.ndarray:
712
+ return self.transform(scores)
713
+
714
+
715
def fit_platt_calibrator(y_true: np.ndarray, y_score: np.ndarray) -> PlattFit:
    r"""Platt 1999 [#platt]_ sigmoid scaling with Lin 2007 [#lin]_ Laplace-smoothed targets.

    Canonical Platt scaling: fits :math:`\sigma(a \cdot s + b)` to maximize
    the binomial likelihood under Laplace-smoothed targets

    .. math::

        T_i = \frac{n_+ + 1}{n_+ + 2} \quad \text{if } y_i = 1, \qquad
        T_i = \frac{1}{n_- + 2} \quad \text{if } y_i = 0,

    where :math:`n_+` and :math:`n_-` are the positive and negative counts.
    The smoothing avoids the MLE singularity at zero/one counts and matches
    :class:`sklearn.calibration._SigmoidCalibration` to within optimizer
    tolerance (up to the sign convention described in the Notes).

    Parameters
    ----------
    y_true : np.ndarray, shape (n,)
        Binary labels in {0, 1}.
    y_score : np.ndarray, shape (n,)
        Predicted probabilities or scores.

    Returns
    -------
    PlattFit
        Frozen dataclass exposing the fitted ``a`` (slope) and ``b``
        (intercept) parameters alongside a ``transform`` callable. The
        instance itself is ``__call__``-able for back-compat with v0.11
        and earlier (a plain ``Callable`` annotation accepts a ``PlattFit``).
        The transform flattens its input (1-D output) and raises
        ``ValueError`` on NaN / inf scores.

    Raises
    ------
    ValueError
        On shape mismatch, empty input, non-finite scores, or single-class
        ``y_true``.
    RuntimeError
        If the L-BFGS-B optimizer fails to converge. The error message
        includes the SciPy optimizer message for diagnostics.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(42)
    >>> y = rng.integers(0, 2, size=200)
    >>> s = (y + rng.normal(0, 0.5, size=200))
    >>> fit = fit_platt_calibrator(y, s)
    >>> isinstance(fit.a, float) and isinstance(fit.b, float)
    True
    >>> out = fit(s)  # __call__ delegates to transform
    >>> bool(out.min() > 0.0 and out.max() < 1.0)
    True

    Notes
    -----
    Platt scaling fits the two-parameter sigmoid

    .. math:: P(y=1 \mid s) = \sigma(a \cdot s + b) = \frac{1}{1 + \exp(-(a s + b))}

    by maximum-likelihood under Lin 2007's Laplace-smoothed targets. Unlike
    isotonic, the parametric form regularizes small samples but cannot
    correct strongly non-monotone miscalibration.

    Initialization: ``a₀ = 0``, ``b₀ = log((n_+ + 1) / (n_- + 1))``. This is
    the Lin 2007 / sklearn starting point translated to this module's sign
    convention: they parameterize :math:`1 / (1 + \exp(A f + B))` and start
    at ``B₀ = log((n_- + 1) / (n_+ + 1))``, so the intercept flips sign under
    :math:`\sigma(a s + b)`. With this start the initial constant prediction
    is the smoothed positive base rate ``(n_+ + 1) / (n + 2)``. The objective
    is convex, so the start affects only how quickly (and how reliably)
    L-BFGS-B converges, not the optimum. The optimizer uses the analytic
    gradient.

    Behavior change vs eval-toolkit ≤ 0.2.0: previous versions wrapped
    :class:`sklearn.linear_model.LogisticRegression` with default L2
    regularization. v0.3.0 implements canonical Platt directly to match
    :class:`sklearn.calibration._SigmoidCalibration` (Lin 2007). Empirical
    delta on imbalanced data is ~1–3% ECE.

    Return-type change vs eval-toolkit ≤ 0.11.0: previously returned a plain
    ``Callable[[np.ndarray], np.ndarray]``. v0.12.0 returns a :class:`PlattFit`
    dataclass that exposes the fitted ``(a, b)`` and delegates ``__call__``
    to the transform. Existing callers typed as ``Callable`` keep working;
    new callers can read ``fit.a`` / ``fit.b`` directly (no logit-probe).

    References
    ----------
    .. [#platt] Platt, J. "Probabilistic outputs for support vector machines
       and comparisons to regularized likelihood methods." Advances in Large
       Margin Classifiers, 1999.
    .. [#lin] Lin, H. T., Lin, C. J., & Weng, R. C. "A note on Platt's
       probabilistic outputs for support vector machines." Machine Learning
       68(3), 2007.
    """
    y_true_arr, y_score_arr = _validate_calibrator_inputs(y_true, y_score)

    is_positive = y_true_arr > 0
    n_pos = float(np.sum(is_positive))
    n_neg = float(y_true_arr.size - n_pos)
    smoothed = np.empty_like(y_score_arr)
    smoothed[is_positive] = (n_pos + 1.0) / (n_pos + 2.0)
    smoothed[~is_positive] = 1.0 / (n_neg + 2.0)

    # Start at the constant model that predicts the smoothed positive base
    # rate. Under sigma(a*s + b) that is b0 = log((n_pos+1)/(n_neg+1)); the
    # opposite sign (Lin 2007's B0, defined for 1/(1+exp(A*f+B))) would start
    # the search at the *negative* base rate instead.
    ab_init = np.array([0.0, float(np.log((n_pos + 1.0) / (n_neg + 1.0)))])
    result = minimize(
        _platt_loss_grad,
        ab_init,
        args=(y_score_arr, smoothed),
        method="L-BFGS-B",
        jac=True,
    )
    if not result.success:
        raise RuntimeError(f"Platt calibration optimization failed: {result.message}")
    a_fit, b_fit = float(result.x[0]), float(result.x[1])

    def apply(scores: np.ndarray) -> np.ndarray:
        """Map raw scores to calibrated probabilities with the fitted sigmoid."""
        arr = np.asarray(scores, dtype=float).ravel()
        if not np.isfinite(arr).all():
            raise ValueError("scores contains NaN or inf")
        z = a_fit * arr + b_fit
        # sigma(z) = exp(-log(1 + exp(-z))): same value as 1 / (1 + exp(-z))
        # but without the exp overflow warning for large negative z.
        out: np.ndarray = np.exp(-np.logaddexp(0.0, -z)).astype(float)
        return out

    return PlattFit(transform=apply, a=a_fit, b=b_fit)
831
+
832
+
833
def fit_beta_calibrator(
    y_true: np.ndarray, y_score: np.ndarray
) -> Callable[[np.ndarray], np.ndarray]:
    r"""Fit a Beta calibration map (Kull et al. 2017 [#kull]_).

    The calibrated probability is a three-parameter sigmoid in the
    log-score and log-complement-score:

    .. math::

        P(y=1 \mid s) = \frac{1}{1 + \exp(-(a \log s - b \log (1 - s) + c))}

    In practice this is a logistic regression on the two features
    :math:`(\log s, \log(1 - s))`: ``a`` is the coefficient on
    :math:`\log s`, ``b`` is the negated coefficient on
    :math:`\log(1 - s)`, and ``c`` is the intercept.

    Kull et al. 2017 §5 report that Beta calibration empirically
    dominates Platt scaling on most real-world classifiers; the price
    is one additional parameter and a slightly richer feature
    transform. The coefficients are fitted without sign constraints,
    so the resulting map is *not* guaranteed to be monotone in the
    score (unlike Platt and isotonic). When monotonicity is required
    use :func:`fit_isotonic_calibrator`.

    Parameters
    ----------
    y_true : np.ndarray, shape (n,)
        Binary labels in {0, 1}.
    y_score : np.ndarray, shape (n,)
        Predicted probabilities in (0, 1). Values at the extremes
        ``{0, 1}`` are clipped to ``[1e-7, 1 - 1e-7]`` so that both
        logarithms stay finite.

    Returns
    -------
    callable
        Function mapping raw scores to calibrated probabilities through
        the fitted 3-parameter Beta sigmoid. It flattens its input,
        rejects NaN/inf with ``ValueError``, and applies the same
        clipping used during fitting.

    Raises
    ------
    ValueError
        On shape mismatch, empty input, non-finite scores, or
        single-class ``y_true``.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(42)
    >>> y = rng.integers(0, 2, size=300)
    >>> s = (y + rng.normal(0, 0.4, size=300)).clip(0.01, 0.99)
    >>> g = fit_beta_calibrator(y, s)
    >>> out = g(s)
    >>> bool(out.min() >= 0.0 and out.max() <= 1.0)
    True

    See Also
    --------
    eval_toolkit.calibration.fit_platt_calibrator :
        2-parameter sigmoid; Beta is a strict generalization.
    eval_toolkit.calibration.fit_isotonic_calibrator :
        Non-parametric monotone alternative.

    Notes
    -----
    The fit is an sklearn ``LogisticRegression`` on the features
    ``(log s, log(1 - s))`` with regularization effectively disabled
    (``C=1e9``), which recovers the Beta-calibration MLE. This matches
    the reference `betacal <https://github.com/betacal/python>`_
    implementation up to optimizer tolerance for typical data.

    References
    ----------
    .. [#kull] Kull, M., Filho, T. S., & Flach, P. "Beta calibration: a
       well-founded and easily implemented improvement on logistic
       calibration for binary classifiers." AISTATS 2017.
       arXiv:1607.06770.
    """
    from sklearn.linear_model import LogisticRegression  # noqa: PLC0415

    def _design_matrix(probs: np.ndarray) -> np.ndarray:
        # Clip first so neither log() sees an exact 0.
        bounded = np.clip(probs, _SCORE_CLIP_LO, _SCORE_CLIP_HI)
        log_s = np.log(bounded)
        log_one_minus_s = np.log(1.0 - bounded)
        return np.column_stack([log_s, log_one_minus_s])

    labels, raw_scores = _validate_calibrator_inputs(y_true, y_score)

    # A very large C makes the L2 penalty negligible (Kull 2017 MLE).
    model = LogisticRegression(C=1e9, solver="lbfgs", max_iter=2000)
    model.fit(_design_matrix(raw_scores), labels)

    def apply(scores: np.ndarray) -> np.ndarray:
        flat = np.asarray(scores, dtype=float).ravel()
        if not np.isfinite(flat).all():
            raise ValueError("scores contains NaN or inf")
        positive_proba = model.predict_proba(_design_matrix(flat))[:, 1]
        calibrated: np.ndarray = positive_proba.astype(float)
        return calibrated

    return apply
928
+
929
+
930
def fit_temperature(
    val_logits: np.ndarray,
    val_labels: np.ndarray,
    bounds: tuple[float, float] = (0.05, 20.0),
) -> dict[str, float]:
    r"""Single-parameter temperature scaling per Guo et al. 2017 [#guo]_.

    Fits a scalar T > 0 on validation logits to minimize negative log-likelihood:

    .. math:: T^* = \arg\min_T - \frac{1}{n}\sum_i \log p_{y_i}(x_i / T)

    where :math:`p_y(x / T) = \mathrm{softmax}(x/T)_y`. T scales the entire
    logit vector before softmax, so accuracy (argmax) is preserved exactly
    while the confidence distribution flattens (T > 1) or sharpens (T < 1).

    Parameters
    ----------
    val_logits : array-like, shape (n, 2)
        Validation logits for the binary classifier (column 0 = negative class,
        column 1 = positive class). Converted with ``np.asarray(..., dtype=float)``.
    val_labels : array-like, shape (n,)
        Binary labels in {0, 1}. Integer, boolean and float ``0.0/1.0``
        encodings are accepted; they are cast to an integer index dtype
        before being used to select the true-class log-probability.
    bounds : tuple of float, optional
        ``(lo, hi)`` bracket for ``T``. Default ``(0.05, 20.0)``. Must be
        finite with ``lo <= hi`` and ``hi > 0``.

    Returns
    -------
    dict
        Keys: ``temperature`` (T*), ``nll_pre`` (NLL at T=1), ``nll_post``
        (NLL at T=T*), ``improvement`` (nll_pre - nll_post). ``improvement``
        is guaranteed ≥ 0 whenever ``bounds`` contains ``T = 1``; if the
        optimizer's answer is worse than the identity, ``T = 1`` is returned.

    Raises
    ------
    ValueError
        If ``val_logits`` shape is not (n, 2), ``val_labels`` is not 1-D,
        shapes mismatch, input is empty or non-finite, labels are not
        binary / single-class, or ``bounds`` is invalid.
    RuntimeError
        If the bounded scalar optimizer fails to converge.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(42)
    >>> # synthesize uncalibrated logits with known T_true = 3.0
    >>> base = rng.normal(size=(500, 2))
    >>> labels = (base[:, 1] > base[:, 0]).astype(int)
    >>> logits = base * 3.0  # makes scores overconfident
    >>> result = fit_temperature(logits, labels)
    >>> 0.05 <= result['temperature'] <= 20.0
    True
    >>> result['nll_post'] <= result['nll_pre']  # always non-increasing
    True

    Notes
    -----
    Temperature scaling preserves accuracy exactly because dividing all
    logits by the same scalar does not change the argmax. It only rescales
    the *confidence* (max softmax probability), which is what miscalibration
    in modern overconfident networks measures.

    References
    ----------
    .. [#guo] Guo, C., Pleiss, G., Sun, Y., & Weinberger, K. Q. "On
       calibration of modern neural networks." ICML 2017. arXiv:1706.04599.
    """
    logits = np.asarray(val_logits, dtype=float)
    labels = np.asarray(val_labels)

    if logits.ndim != 2 or logits.shape[1] != 2:
        raise ValueError(f"val_logits must be (n, 2), got shape {logits.shape}")
    # A (n, 1) label column would pass the length check below and then
    # broadcast into an (n, n) gather, silently producing a wrong NLL.
    if labels.ndim != 1:
        raise ValueError(f"val_labels must be 1-D, got shape {labels.shape}")
    if logits.shape[0] != labels.shape[0]:
        raise ValueError(
            f"length mismatch: logits {logits.shape[0]} vs labels {labels.shape[0]}"
        )
    if logits.shape[0] == 0:
        raise ValueError("val_logits is empty")
    if not np.isfinite(logits).all():
        raise ValueError("val_logits contains NaN or inf")
    if not set(np.unique(labels).tolist()).issubset({0, 1}):
        raise ValueError("val_labels must be binary (0/1)")

    # Values are now known to be exactly 0/1, so the cast is lossless.
    # It matters because bool arrays index as masks and float arrays
    # cannot be used as indices at all.
    labels = labels.astype(np.intp)

    n_total = labels.shape[0]
    n_pos = int(labels.sum())
    if n_pos == 0 or n_pos == n_total:
        raise ValueError(
            f"val_labels must contain both classes; got n={n_total}, "
            f"n_positive={n_pos}"
        )

    lo, hi = bounds
    lo, hi = float(lo), float(hi)
    if not (np.isfinite(lo) and np.isfinite(hi)) or lo > hi or hi <= 0.0:
        raise ValueError(
            f"bounds must be finite with lo <= hi and hi > 0, got {bounds!r}"
        )

    nll_pre = _negative_log_likelihood(1.0, logits, labels)
    res = minimize_scalar(
        _negative_log_likelihood,
        bounds=(lo, hi),
        method="bounded",
        args=(logits, labels),
    )
    if not res.success:
        raise RuntimeError(f"temperature optimization failed: {res.message}")
    t_opt = float(res.x)
    nll_post = _negative_log_likelihood(t_opt, logits, labels)

    # The optimizer stops within a tolerance of the minimizer; when T=1 is
    # admissible and happens to score better, keep the identity so that the
    # documented "improvement >= 0" contract actually holds.
    if lo <= 1.0 <= hi and nll_post > nll_pre:
        t_opt, nll_post = 1.0, nll_pre

    return {
        "temperature": t_opt,
        "nll_pre": nll_pre,
        "nll_post": nll_post,
        "improvement": nll_pre - nll_post,
    }


def _negative_log_likelihood(t: float, logits: np.ndarray, labels: np.ndarray) -> float:
    """Mean NLL of ``softmax(logits / t)`` evaluated at the true labels.

    Returns ``inf`` for ``t <= 0`` so that an optimizer probing a
    non-positive temperature sees an infinitely bad objective instead of a
    division error or a sign-flipped softmax.
    """
    if t <= 0:
        return float("inf")
    # Force an integer index dtype: a boolean array would be interpreted as
    # a mask and a float array would raise IndexError.
    true_class = np.asarray(labels).astype(np.intp, copy=False)
    log_probs = log_softmax(np.asarray(logits, dtype=float) / t, axis=-1)
    rows = np.arange(true_class.shape[0])
    return float(-log_probs[rows, true_class].mean())
1039
+
1040
+
1041
def fit_temperature_oracle(
    y_true: np.ndarray, y_score: np.ndarray
) -> tuple[float, Callable[[np.ndarray], np.ndarray]]:
    r"""**DIAGNOSTIC ONLY** — fit-on-test oracle T-scaling per Guo et al. 2017 [#guo]_.

    .. warning::

        **Do not use this function as a deployment policy.** It fits ``T``
        on the same data the returned callable scores — the canonical
        "fit-on-test" methodological pitfall. ECE measured on the fitted
        scores is systematically **under**-estimated, sometimes by 50% or
        more (Vaicenavicius 2019 §3, Kumar 2019 §5, Roelofs 2022). Use
        :func:`fit_temperature` (fit on a separate validation set) for
        deployment; use this function only to compute a diagnostic
        upper bound on what *any* single-T recalibration could achieve
        if T were chosen optimally per slice.

    Internally inverts probabilities to logits via :math:`\log(p / (1 - p))`,
    fits T to minimize NLL on the T-scaled logits, then exposes a callable
    that applies :math:`\sigma(\mathrm{logit} / T)`. The search for T is a
    bounded scalar minimization over ``(0.05, 20.0)``.

    A ``UserWarning`` is emitted on every call as a reminder of the
    fit-on-test caveat.

    Parameters
    ----------
    y_true : np.ndarray, shape (n,)
        Binary labels in {0, 1}.
    y_score : np.ndarray, shape (n,)
        Predicted probabilities in (0, 1). Scores at the extremes {0, 1} are
        clipped to [1e-7, 1-1e-7] so the logit inversion is finite.

    Returns
    -------
    tuple
        ``(T_optimal, apply)`` where ``apply`` maps any input probability array
        through :math:`\sigma(\mathrm{logit}(p) / T_{optimal})`. ``apply``
        flattens its input and raises ``ValueError`` on NaN/inf.

    Raises
    ------
    ValueError
        On shape mismatch, empty input, non-finite scores, or single-class
        ``y_true``.
    RuntimeError
        If the bounded scalar optimizer reports failure, mirroring
        :func:`fit_temperature`.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(42)
    >>> y = rng.integers(0, 2, size=200)
    >>> s = (y + rng.normal(0, 0.5, size=200)).clip(0.01, 0.99)
    >>> import warnings
    >>> with warnings.catch_warnings():
    ...     warnings.simplefilter("ignore", UserWarning)
    ...     T_opt, apply = fit_temperature_oracle(y, s)
    >>> T_opt > 0
    True
    """
    import warnings as _warnings  # noqa: PLC0415 (deferred to keep top of file lean)

    _warnings.warn(
        "fit_temperature_oracle is fit-on-test and produces an under-estimated "
        "ECE; use fit_temperature with a held-out validation set for deployment. "
        "This warning may be suppressed in test contexts: "
        "`warnings.simplefilter('ignore', UserWarning)`.",
        UserWarning,
        stacklevel=2,  # attribute the warning to the caller, not this module
    )
    y_true_arr, y_score_arr = _validate_calibrator_inputs(y_true, y_score)

    def _logits_from_probs(p: np.ndarray) -> np.ndarray:
        # Clip so that p in {0, 1} does not produce +/-inf logits.
        clipped = np.clip(p, _SCORE_CLIP_LO, _SCORE_CLIP_HI)
        out: np.ndarray = np.log(clipped / (1.0 - clipped))
        return out

    def _sigmoid(z: np.ndarray) -> np.ndarray:
        out: np.ndarray = 1.0 / (1.0 + np.exp(-z))
        return out

    logits = _logits_from_probs(y_score_arr)

    def nll_at_t(t: float) -> float:
        # Summed (not averaged) binary NLL of sigmoid(logits / t).
        if t <= 0:
            return float("inf")
        scaled = logits / t
        # Stable log-sigmoid via softplus identity.
        log_p1 = -np.logaddexp(0.0, -scaled)
        log_p0 = -np.logaddexp(0.0, scaled)
        return float(-(y_true_arr * log_p1 + (1 - y_true_arr) * log_p0).sum())

    result = minimize_scalar(
        nll_at_t,
        bounds=(0.05, 20.0),
        method="bounded",
        options={"xatol": 1e-4},
    )
    # Surface optimizer failure instead of returning a non-converged T,
    # consistent with the other fitters in this module.
    if not result.success:
        raise RuntimeError(f"temperature optimization failed: {result.message}")
    t_optimal = float(result.x)

    def apply(scores: np.ndarray) -> np.ndarray:
        arr = np.asarray(scores, dtype=float).ravel()
        if not np.isfinite(arr).all():
            raise ValueError("scores contains NaN or inf")
        scaled = _logits_from_probs(arr) / t_optimal
        out: np.ndarray = _sigmoid(scaled).astype(float)
        return out

    return t_optimal, apply