eval-toolkit 0.27.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1344 @@
1
+ r"""Bootstrap confidence intervals: BCa per-condition, paired-difference, MDE.
2
+
3
+ - :class:`BootstrapCI` — 95% CI on a single metric on one condition (BCa or percentile)
4
+ - :class:`PairedBootstrapCI` — paired CI on metric(B) − metric(A) using shared resample indices
5
+ - :func:`paired_bootstrap_op_point_diff` — two-level bootstrap that re-fits operating-point
6
+ thresholds within each resample (correctly accounts for threshold-selection variance)
7
+ - :func:`paired_bootstrap_ece_diff` — paired CI on ECE deltas; metric-agnostic via dependency
8
+ injection (caller supplies an ``ece_fn`` callable)
9
+ - :class:`MDEEstimate` and :func:`paired_mde` — minimum detectable Δ at requested (α, power)
10
+
11
+ The math kernels depend only on numpy + scipy.stats; no other module in this toolkit imports
12
+ into bootstrap.
13
+
14
+ References
15
+ ----------
16
+ .. [1] Efron, B. & Tibshirani, R. "An Introduction to the Bootstrap." Chapman & Hall, 1993.
17
+ .. [2] DiCiccio, T. & Efron, B. "Bootstrap Confidence Intervals." Statistical Science, 1996.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ from collections.abc import Callable
23
+ from dataclasses import dataclass
24
+ from typing import Final, Literal
25
+
26
+ import numpy as np
27
+ from scipy.stats import bootstrap as _scipy_bootstrap
28
+ from scipy.stats import norm as _scipy_norm
29
+ from scipy.stats import rankdata as _scipy_rankdata
30
+
31
# Names exported by ``from <module> import *``.
# NOTE(review): DeLongResult, cross_validate_metric, cv_clt_ci and
# delong_roc_variance are not defined in the portion of the file reviewed
# here — presumably they live further down; confirm before trimming this list.
__all__ = [
    "DEFAULT_CONFIDENCE",
    "DEFAULT_METHOD",
    "DEFAULT_N_RESAMPLES",
    "DEFAULT_SEED",
    "BootstrapCI",
    "DeLongResult",
    "MDEEstimate",
    "MetricFn",
    "PairedBootstrapCI",
    "ThresholdFn",
    "ThresholdedMetricFn",
    "bootstrap_ci",
    "cross_validate_metric",
    "cv_clt_ci",
    "delong_roc_variance",
    "mde_from_ci",
    "paired_bootstrap_diff",
    "paired_bootstrap_ece_diff",
    "paired_bootstrap_op_point_diff",
    "paired_mde",
]
53
+
54
# Defaults used as keyword-argument defaults by the bootstrap functions below.
DEFAULT_N_RESAMPLES: Final[int] = 1000  # bootstrap resamples per interval
DEFAULT_CONFIDENCE: Final[float] = 0.95  # two-sided confidence level
DEFAULT_METHOD: Final[Literal["BCa", "percentile"]] = "BCa"  # default for bootstrap_ci
DEFAULT_SEED: Final[int] = 42  # seed passed to np.random.default_rng

# Callable contracts accepted by the bootstrap functions.
MetricFn = Callable[[np.ndarray, np.ndarray], float]  # (y_true, y_score) -> metric value
ThresholdFn = Callable[[np.ndarray, np.ndarray], float]  # (y_true, y_score) -> threshold
ThresholdedMetricFn = Callable[[np.ndarray, np.ndarray, float], float]  # (y_true, y_score, threshold) -> metric value
62
+
63
+
64
+ @dataclass(frozen=True, slots=True)
65
+ class BootstrapCI:
66
+ """95% CI for a metric on a single condition.
67
+
68
+ Parameters
69
+ ----------
70
+ point_estimate : float
71
+ Metric value on the original (non-resampled) data.
72
+ ci_low, ci_high : float
73
+ Lower / upper bound of the confidence interval.
74
+ confidence : float
75
+ Two-sided confidence level ∈ (0, 1) (typically 0.95).
76
+ n_resamples : int
77
+ Number of bootstrap resamples used.
78
+ method : str
79
+ Either ``"BCa"`` (bias-corrected accelerated) or ``"percentile"``.
80
+
81
+ Examples
82
+ --------
83
+ >>> ci = BootstrapCI(
84
+ ... point_estimate=0.85, ci_low=0.78, ci_high=0.91,
85
+ ... confidence=0.95, n_resamples=1000, method="BCa",
86
+ ... )
87
+ >>> ci.ci_low <= ci.point_estimate <= ci.ci_high
88
+ True
89
+
90
+ Notes
91
+ -----
92
+ Frozen value-type. The BCa interval does **not** guarantee
93
+ ``ci_low ≤ point_estimate ≤ ci_high`` — the bias correction can shift
94
+ the interval off-center. Callers that need that invariant should use
95
+ ``method="percentile"``.
96
+
97
+ References
98
+ ----------
99
+ .. [1] Efron, B. & Tibshirani, R. "An Introduction to the Bootstrap."
100
+ Chapman & Hall, 1993. (Chapter 14: BCa.)
101
+ """
102
+
103
+ point_estimate: float
104
+ ci_low: float
105
+ ci_high: float
106
+ confidence: float
107
+ n_resamples: int
108
+ method: str
109
+
110
+ def to_dict(self) -> dict[str, object]:
111
+ """Serialize to a stable dict schema for JSON output."""
112
+ return {
113
+ "point_estimate": self.point_estimate,
114
+ "ci_95": [self.ci_low, self.ci_high],
115
+ "confidence": self.confidence,
116
+ "n_resamples": self.n_resamples,
117
+ "method": self.method,
118
+ }
119
+
120
+
121
+ @dataclass(frozen=True, slots=True)
122
+ class PairedBootstrapCI:
123
+ """95% CI for ``metric(B) − metric(A)`` on shared resample indices.
124
+
125
+ The lift Δ is the headline statistic for an anti-overengineering stopping
126
+ rule: if ``ci_low <= 0 <= ci_high`` (``overlaps_zero`` is True), the
127
+ improvement is not statistically significant.
128
+
129
+ Parameters
130
+ ----------
131
+ delta : float
132
+ Point estimate of ``metric(B) − metric(A)`` on the original data.
133
+ ci_low, ci_high : float
134
+ Lower / upper paired-bootstrap CI bounds on the difference.
135
+ overlaps_zero : bool
136
+ True iff ``ci_low <= 0 <= ci_high`` (inclusive). Encodes the
137
+ zero-effect null result, including the degenerate case where
138
+ ``ci_low == ci_high == 0``.
139
+ confidence : float
140
+ Two-sided confidence level ∈ (0, 1).
141
+ n_resamples : int
142
+ Number of paired bootstrap resamples.
143
+
144
+ Examples
145
+ --------
146
+ >>> pci = PairedBootstrapCI(
147
+ ... delta=0.05, ci_low=0.02, ci_high=0.08,
148
+ ... overlaps_zero=False, confidence=0.95, n_resamples=1000,
149
+ ... )
150
+ >>> pci.overlaps_zero, pci.delta
151
+ (False, 0.05)
152
+
153
+ Notes
154
+ -----
155
+ Paired resampling shares the resample indices between the A and B score
156
+ arrays, so the variance of the difference is reduced by the
157
+ cross-condition correlation — typically a much tighter CI than
158
+ differencing two unpaired CIs would produce.
159
+
160
+ References
161
+ ----------
162
+ .. [1] Efron, B. "Bootstrap methods: Another look at the jackknife."
163
+ Annals of Statistics 7(1), 1979.
164
+ .. [2] Efron, B. & Tibshirani, R. "An Introduction to the Bootstrap."
165
+ Chapman & Hall, 1993. (§10.3 paired bootstrap.)
166
+ """
167
+
168
+ delta: float
169
+ ci_low: float
170
+ ci_high: float
171
+ overlaps_zero: bool
172
+ confidence: float
173
+ n_resamples: int
174
+
175
+ def to_dict(self) -> dict[str, object]:
176
+ """Serialize to a stable dict schema for JSON output."""
177
+ return {
178
+ "delta": self.delta,
179
+ "ci_95": [self.ci_low, self.ci_high],
180
+ "overlaps_zero": self.overlaps_zero,
181
+ "confidence": self.confidence,
182
+ "n_resamples": self.n_resamples,
183
+ }
184
+
185
+
186
def bootstrap_ci(
    y_true: np.ndarray,
    y_score: np.ndarray,
    metric: MetricFn,
    *,
    n_resamples: int = DEFAULT_N_RESAMPLES,
    confidence: float = DEFAULT_CONFIDENCE,
    method: Literal["BCa", "percentile", "studentized"] = DEFAULT_METHOD,
    seed: int = DEFAULT_SEED,
) -> BootstrapCI:
    """Per-condition CI via :func:`scipy.stats.bootstrap` or bootstrap-t.

    Resamples paired ``(y_true, y_score)`` indices with replacement. Standard
    BCa unless ``method='percentile'`` is forced (recommended fallback for
    very small slices where BCa jackknife may misbehave) or
    ``method='studentized'`` is requested, in which case the interval comes
    from :func:`_bootstrap_t_ci` instead of scipy.

    Parameters
    ----------
    y_true, y_score : np.ndarray, shape (n,)
        Labels and scores.
    metric : callable ``(y_true, y_score) -> float``
        Any metric. ``pr_auc``, ``roc_auc``, etc.
    n_resamples : int, optional
        Number of bootstrap resamples; must be ≥ 1. Default 1000.
    confidence : float, optional
        Two-sided confidence level (default 0.95).
    method : {"BCa", "percentile", "studentized"}, optional
        Default "BCa". ``"studentized"`` runs a per-resample jackknife and is
        therefore roughly a factor ``n`` more expensive than the other two.
    seed : int, optional
        RNG seed for reproducibility.

    Returns
    -------
    BootstrapCI
        ``n_resamples`` in the result is the requested count.

    Raises
    ------
    ValueError
        If shapes mismatch, ``n < 10``, ``confidence ∉ (0, 1)``, or
        ``n_resamples < 1``. The studentized path additionally raises when
        more than 5% of resamples are degenerate.

    Examples
    --------
    >>> import numpy as np
    >>> from eval_toolkit.metrics import pr_auc
    >>> rng = np.random.default_rng(42)
    >>> y = rng.integers(0, 2, size=200)
    >>> s = y + rng.normal(0, 0.3, size=200)
    >>> ci = bootstrap_ci(y, s, metric=pr_auc, n_resamples=200, seed=42)
    >>> ci.ci_low <= ci.point_estimate <= ci.ci_high
    True

    Notes
    -----
    The bias-corrected and accelerated (BCa) interval [1]_ is recommended over
    plain percentile for asymmetric statistics. For very small samples, BCa
    jackknife can degenerate; percentile is the safe fallback.

    References
    ----------
    .. [1] Efron, B. "Better bootstrap confidence intervals." JASA 82(397),
       1987.
    .. [2] DiCiccio, T. J. & Efron, B. "Bootstrap confidence intervals."
       Statistical Science 11(3), 1996.
    """
    y_true_arr = np.asarray(y_true)
    y_score_arr = np.asarray(y_score)
    if y_true_arr.shape != y_score_arr.shape:
        raise ValueError(f"y_true shape {y_true_arr.shape} != y_score shape {y_score_arr.shape}")
    n = len(y_true_arr)
    if n < 10:
        raise ValueError(f"n={n} too small for bootstrap; need ≥ 10")
    if not 0 < confidence < 1:
        raise ValueError(f"confidence must be in (0, 1), got {confidence}")
    # Fail fast with a clear message instead of an obscure downstream error
    # (empty quantiles in the studentized path, scipy complaints otherwise).
    if n_resamples < 1:
        raise ValueError(f"n_resamples must be ≥ 1, got {n_resamples}")

    point = float(metric(y_true_arr, y_score_arr))

    def _statistic(yt: np.ndarray, ys: np.ndarray) -> float:
        # Thin adapter: scipy gets a plain float whatever the metric returns.
        return float(metric(yt, ys))

    rng = np.random.default_rng(seed)
    if method == "studentized":
        ci_low, ci_high = _bootstrap_t_ci(
            y_true_arr,
            y_score_arr,
            metric,
            point,
            n_resamples=n_resamples,
            confidence=confidence,
            rng=rng,
        )
    else:
        # paired=True keeps each (label, score) row together when resampling.
        res = _scipy_bootstrap(
            (y_true_arr, y_score_arr),
            statistic=_statistic,
            n_resamples=n_resamples,
            confidence_level=confidence,
            method=method,
            paired=True,
            random_state=rng,
        )
        ci_low = float(res.confidence_interval.low)
        ci_high = float(res.confidence_interval.high)
    return BootstrapCI(
        point_estimate=point,
        ci_low=ci_low,
        ci_high=ci_high,
        confidence=confidence,
        n_resamples=n_resamples,
        method=method,
    )
296
+
297
+
298
+ def _bootstrap_t_ci(
299
+ y_true: np.ndarray,
300
+ y_score: np.ndarray,
301
+ metric: MetricFn,
302
+ point: float,
303
+ *,
304
+ n_resamples: int,
305
+ confidence: float,
306
+ rng: np.random.Generator,
307
+ ) -> tuple[float, float]:
308
+ r"""Studentized bootstrap-t CI per Algeshiemer 2024 / Davison & Hinkley §5.2.
309
+
310
+ Outer loop: B bootstrap resamples → ``θ̂_b`` per resample.
311
+ Inner loop: jackknife within each resample → ``SE_b`` per resample.
312
+ Pivot: ``T_b = (θ̂_b - θ̂) / SE_b``.
313
+ CI: ``[θ̂ - q_{1-α/2}(T) · SE, θ̂ - q_{α/2}(T) · SE]`` where ``SE`` is
314
+ the bootstrap standard error of ``θ̂``.
315
+
316
+ Best CI coverage of any non-nested method per Algeshiemer 2024
317
+ simulations, at the cost of an extra factor ~n compute (per-resample
318
+ jackknife). Use for high-stakes inference where coverage matters.
319
+
320
+ Skips degenerate resamples (single-class draws causing the metric to
321
+ raise); raises if > 5% of resamples are degenerate.
322
+ """
323
+ n = int(len(y_true))
324
+ theta_stars = np.full(n_resamples, np.nan, dtype=np.float64)
325
+ se_stars = np.full(n_resamples, np.nan, dtype=np.float64)
326
+ # Capture first underlying exception so the n_valid raise can name it
327
+ # (was silent contextlib.suppress; per-resample logging would be noise
328
+ # in a thousands-iteration loop, but aggregate diagnostic is essential).
329
+ first_failure: str | None = None
330
+
331
+ for b in range(n_resamples):
332
+ idx = rng.integers(0, n, size=n)
333
+ y_b = y_true[idx]
334
+ s_b = y_score[idx]
335
+ try:
336
+ theta_b = float(metric(y_b, s_b))
337
+ except (ValueError, RuntimeError) as exc:
338
+ if first_failure is None:
339
+ first_failure = f"{type(exc).__name__}: {exc}"
340
+ continue
341
+ # Inner jackknife: leave-one-out within the resample.
342
+ loo = np.full(n, np.nan, dtype=np.float64)
343
+ for i in range(n):
344
+ try:
345
+ loo[i] = float(metric(np.delete(y_b, i), np.delete(s_b, i)))
346
+ except (ValueError, RuntimeError) as exc:
347
+ if first_failure is None:
348
+ first_failure = f"{type(exc).__name__}: {exc}"
349
+ valid = ~np.isnan(loo)
350
+ if int(valid.sum()) < 2:
351
+ continue
352
+ loo_mean = float(np.nanmean(loo))
353
+ jack_var = (n - 1.0) / n * float(np.nansum((loo[valid] - loo_mean) ** 2))
354
+ if jack_var <= 0.0:
355
+ continue
356
+ theta_stars[b] = theta_b
357
+ se_stars[b] = float(np.sqrt(jack_var))
358
+
359
+ valid_mask = ~np.isnan(theta_stars) & ~np.isnan(se_stars) & (se_stars > 0.0)
360
+ n_valid = int(valid_mask.sum())
361
+ if n_valid < n_resamples * 0.95:
362
+ first_msg = (
363
+ f"; first underlying failure: {first_failure}" if first_failure is not None else ""
364
+ )
365
+ raise ValueError(
366
+ f"_bootstrap_t_ci: {n_resamples - n_valid}/{n_resamples} resamples "
367
+ f"degenerate (single-class draws or zero jackknife variance); "
368
+ f"refusing to compute studentized CI on > 5% degenerate resamples"
369
+ f"{first_msg}"
370
+ )
371
+
372
+ theta_v = theta_stars[valid_mask]
373
+ se_v = se_stars[valid_mask]
374
+ pivots = (theta_v - point) / se_v
375
+ se_overall = float(np.std(theta_v, ddof=1))
376
+ alpha = (1.0 - confidence) / 2.0
377
+ q_lo = float(np.quantile(pivots, alpha))
378
+ q_hi = float(np.quantile(pivots, 1.0 - alpha))
379
+ # CI is asymmetric — pivot quantiles are subtracted in reverse order.
380
+ return point - q_hi * se_overall, point - q_lo * se_overall
381
+
382
+
383
def paired_bootstrap_diff(
    y_true: np.ndarray,
    y_score_a: np.ndarray,
    y_score_b: np.ndarray,
    metric: MetricFn,
    *,
    n_resamples: int = DEFAULT_N_RESAMPLES,
    confidence: float = DEFAULT_CONFIDENCE,
    seed: int = DEFAULT_SEED,
) -> PairedBootstrapCI:
    """Paired-bootstrap CI on ``metric(B) − metric(A)`` using the same resample indices.

    Parameters
    ----------
    y_true : np.ndarray, shape (n,)
        Binary labels.
    y_score_a, y_score_b : np.ndarray, shape (n,)
        Scores from two scorers on the same rows.
    metric : callable ``(y_true, y_score) -> float``
    n_resamples, confidence, seed : standard bootstrap params.
        ``confidence`` must lie strictly inside (0, 1).

    Returns
    -------
    PairedBootstrapCI
        Percentile interval on Δ. ``n_resamples`` in the result is the number
        of *usable* resamples, which can be lower than the number requested.

    Raises
    ------
    ValueError
        If ``y_true``, ``y_score_a``, ``y_score_b`` do not share the same
        shape; if ``n < 10`` (too small for paired bootstrap); if
        ``confidence ∉ (0, 1)``; if more than 5% of resamples raised in
        ``metric`` or produced a NaN Δ (rare-positive degeneracy); or if no
        resamples produced a usable Δ.

    Examples
    --------
    >>> import numpy as np
    >>> from eval_toolkit.metrics import pr_auc
    >>> rng = np.random.default_rng(42)
    >>> y = rng.integers(0, 2, size=200)
    >>> s_a = rng.normal(0, 1, size=200)        # random scorer
    >>> s_b = y + rng.normal(0, 0.3, size=200)  # signal scorer
    >>> diff = paired_bootstrap_diff(y, s_a, s_b, pr_auc, n_resamples=200, seed=42)
    >>> diff.delta > 0  # B beats A
    True

    Notes
    -----
    Resampling indices once and computing both metrics on the same resample
    correlates the two bootstrap distributions, producing a tighter CI on Δ
    than independent unpaired bootstraps would.

    References
    ----------
    .. [1] Efron, B. "Bootstrap methods: Another look at the jackknife."
       Annals of Statistics 7(1), 1979.
    .. [2] Efron, B. & Tibshirani, R. "An Introduction to the Bootstrap."
       Chapman & Hall, 1993. (§10.3.)
    """
    y_true_arr = np.asarray(y_true)
    a = np.asarray(y_score_a)
    b = np.asarray(y_score_b)
    if not (y_true_arr.shape == a.shape == b.shape):
        raise ValueError(f"shapes mismatch: y_true {y_true_arr.shape}, a {a.shape}, b {b.shape}")
    n = len(y_true_arr)
    if n < 10:
        raise ValueError(f"n={n} too small for paired bootstrap; need ≥ 10")
    # Validate before the expensive loop: 0 or 1 would otherwise yield a
    # silently degenerate interval, and out-of-range values would only fail
    # inside np.quantile after all resamples had been computed.
    if not 0 < confidence < 1:
        raise ValueError(f"confidence must be in (0, 1), got {confidence}")

    delta_point = float(metric(y_true_arr, b)) - float(metric(y_true_arr, a))
    rng = np.random.default_rng(seed)
    deltas: list[float] = []
    failures = 0
    for _ in range(n_resamples):
        idx = rng.integers(0, n, size=n)
        try:
            metric_b = float(metric(y_true_arr[idx], b[idx]))
            metric_a = float(metric(y_true_arr[idx], a[idx]))
        except (ValueError, RuntimeError):
            # Single-class resamples raise ValueError on PR/ROC-AUC; rare-positive
            # data can also trigger sklearn's UndefinedMetric. Skip + audit.
            failures += 1
            continue
        delta = metric_b - metric_a
        if np.isnan(delta):
            # A metric that signals degeneracy by returning NaN would poison
            # the quantiles and make ``overlaps_zero`` False (NaN comparisons),
            # i.e. look "significant". Treat it like a raised failure.
            failures += 1
            continue
        deltas.append(delta)

    if failures > 0.05 * n_resamples:
        raise ValueError(
            f"paired_bootstrap_diff: {failures}/{n_resamples} resamples raised "
            "in the metric function or produced a NaN delta (likely single-class "
            "draws on rare-positive data); refusing to compute CI on > 5% "
            "degenerate resamples"
        )
    if not deltas:
        raise ValueError("paired_bootstrap_diff: no usable resamples")

    deltas_arr = np.asarray(deltas, dtype=np.float64)
    alpha = (1.0 - confidence) / 2.0
    ci_low = float(np.quantile(deltas_arr, alpha))
    ci_high = float(np.quantile(deltas_arr, 1.0 - alpha))
    return PairedBootstrapCI(
        delta=delta_point,
        ci_low=ci_low,
        ci_high=ci_high,
        overlaps_zero=ci_low <= 0.0 <= ci_high,
        confidence=confidence,
        n_resamples=int(len(deltas_arr)),
    )
487
+
488
+
489
def paired_bootstrap_ece_diff(
    y_true: np.ndarray,
    y_score_a: np.ndarray,
    y_score_b: np.ndarray,
    *,
    ece_fn: Callable[[np.ndarray, np.ndarray, int], float],
    n_resamples: int = DEFAULT_N_RESAMPLES,
    confidence: float = DEFAULT_CONFIDENCE,
    seed: int = DEFAULT_SEED,
    n_bins: int = 10,
) -> PairedBootstrapCI:
    r"""Paired-bootstrap CI on ``ECE(B) − ECE(A)`` for two calibrated outputs.

    Uses the same resample indices for both calibrators so the Δ is paired
    across calibration methods (correlated → tighter CI). Skips degenerate
    single-class resamples, resamples on which ``ece_fn`` raises, and
    resamples whose Δ is NaN; raises if the failure rate exceeds 5%.

    Parameters
    ----------
    y_true : np.ndarray, shape (n,)
        Binary labels (cast to ``int`` internally).
    y_score_a, y_score_b : np.ndarray, shape (n,)
        Calibrated probabilities from method A and method B.
    ece_fn : callable ``(y_true, y_score, n_bins) -> float``
        ECE function to use. Bootstrap is metric-agnostic; caller injects the
        specific ECE variant (equal-width, equal-mass, etc.) so this module
        does not depend on calibration. Typical use:
        ``from eval_toolkit.metrics import expected_calibration_error``,
        then pass ``ece_fn=expected_calibration_error``.
    n_resamples, confidence, seed : standard bootstrap params.
        ``confidence`` must lie strictly inside (0, 1).
    n_bins : int, optional
        Number of ECE bins (passed through to ``ece_fn``).

    Returns
    -------
    PairedBootstrapCI
        ``Δ = ECE_B − ECE_A`` with paired-percentile CI. Lower delta means
        method B is *better calibrated* than A. ``n_resamples`` in the result
        is the number of usable resamples.

    Raises
    ------
    ValueError
        On shape mismatch, ``n < 10``, ``confidence ∉ (0, 1)``, > 5%
        degenerate resamples, or no usable resamples at all.

    Notes
    -----
    The dependency injection of ``ece_fn`` is intentional: bootstrap math is
    independent of which ECE variant is being compared, so this module stays
    metric-agnostic and depends only on numpy + scipy.
    """
    y_true_arr = np.asarray(y_true).astype(int)
    a = np.asarray(y_score_a, dtype=float)
    b = np.asarray(y_score_b, dtype=float)
    if not (y_true_arr.shape == a.shape == b.shape):
        raise ValueError(f"shapes mismatch: y_true {y_true_arr.shape}, a {a.shape}, b {b.shape}")
    n = int(y_true_arr.size)
    if n < 10:
        raise ValueError(f"n={n} too small for paired bootstrap; need >= 10")
    # Validate before the expensive loop: 0 or 1 would otherwise yield a
    # silently degenerate interval, and out-of-range values would only fail
    # inside np.quantile after all resamples had been computed.
    if not 0 < confidence < 1:
        raise ValueError(f"confidence must be in (0, 1), got {confidence}")

    delta_point = float(ece_fn(y_true_arr, b, n_bins)) - float(ece_fn(y_true_arr, a, n_bins))
    rng = np.random.default_rng(seed)
    deltas: list[float] = []
    failures = 0
    for _ in range(n_resamples):
        idx = rng.integers(0, n, size=n)
        y_re = y_true_arr[idx]
        n_pos = int(y_re.sum())
        # Single-class draw (all negatives or all positives): count as degenerate.
        if n_pos == 0 or n_pos == n:
            failures += 1
            continue
        try:
            ece_a = ece_fn(y_re, a[idx], n_bins)
            ece_b = ece_fn(y_re, b[idx], n_bins)
        except (ValueError, ZeroDivisionError):
            failures += 1
            continue
        delta = float(ece_b - ece_a)
        if np.isnan(delta):
            # NaN would poison the quantiles and make ``overlaps_zero`` False
            # (NaN comparisons); treat it as one more degenerate resample.
            failures += 1
            continue
        deltas.append(delta)

    if failures > 0.05 * n_resamples:
        raise ValueError(
            f"paired_bootstrap_ece_diff: {failures}/{n_resamples} resamples degenerate; "
            "input may be too small or too imbalanced"
        )
    if not deltas:
        raise ValueError("paired_bootstrap_ece_diff: no usable resamples")

    deltas_arr = np.asarray(deltas, dtype=float)
    alpha = (1.0 - confidence) / 2.0
    ci_low = float(np.quantile(deltas_arr, alpha))
    ci_high = float(np.quantile(deltas_arr, 1.0 - alpha))
    return PairedBootstrapCI(
        delta=delta_point,
        ci_low=ci_low,
        ci_high=ci_high,
        overlaps_zero=ci_low <= 0.0 <= ci_high,
        confidence=confidence,
        n_resamples=len(deltas),
    )
588
+
589
+
590
def paired_bootstrap_op_point_diff(
    val_y: np.ndarray,
    val_score_a: np.ndarray,
    val_score_b: np.ndarray,
    test_y: np.ndarray,
    test_score_a: np.ndarray,
    test_score_b: np.ndarray,
    threshold_fn: ThresholdFn,
    metric_fn: ThresholdedMetricFn,
    *,
    n_resamples: int = DEFAULT_N_RESAMPLES,
    confidence: float = DEFAULT_CONFIDENCE,
    seed: int = DEFAULT_SEED,
) -> PairedBootstrapCI:
    r"""Two-level paired bootstrap for operating-point lifts.

    Operating-point metrics (F1@threshold, precision@threshold, recall@
    threshold) depend on a threshold *chosen on val*. The single-level
    paired bootstrap re-uses one fixed val-derived threshold across all
    resamples, which under-counts variance from threshold selection. This
    helper resamples val + test independently per iteration, refits the
    threshold on the val resample (via ``threshold_fn``), then evaluates
    ``metric_fn`` at that threshold on the test resample for both scorers.

    Both scorers share the val resample (so threshold differences stay
    apples-to-apples); each scorer fits its *own* threshold from that
    shared resample. Test indices are likewise shared across scorers.

    Parameters
    ----------
    val_y, val_score_a, val_score_b : np.ndarray
        Validation labels and scores for both scorers.
    test_y, test_score_a, test_score_b : np.ndarray
        Test labels and scores for both scorers.
    threshold_fn : callable ``(y_true, y_score) -> threshold``
        Typically wraps ``ThresholdSelector.select(...).threshold`` (e.g.
        ``lambda y, s: MaxF1Selector().select(y, s).threshold``).
    metric_fn : callable ``(y_true, y_score, threshold) -> float``
        Operating-point metric (e.g., F1, precision) at the given threshold.
    n_resamples, confidence, seed : standard bootstrap params.
        ``confidence`` must lie strictly inside (0, 1).

    Returns
    -------
    PairedBootstrapCI
        ``Δ = metric_B(test) − metric_A(test)`` with both val-threshold variance
        and test-metric variance baked in. ``n_resamples`` in the result is
        the number of usable resamples.

    Raises
    ------
    ValueError
        On shape mismatch, insufficient sample size, or
        ``confidence ∉ (0, 1)``.
    RuntimeError
        If no resample is usable, or fewer than ``n_resamples // 2`` are
        (e.g., single-class val draws). A resample is unusable when
        ``threshold_fn`` / ``metric_fn`` raises ``ValueError`` or
        ``RuntimeError``, or when its Δ is NaN.

    Notes
    -----
    Two-level structure: outer level resamples val + test indices; inner level
    refits the threshold on the val resample, then evaluates on the test
    resample. The combined CI is wider than the fixed-threshold paired CI
    because it absorbs threshold-selection noise.

    Methodological caveats:

    1. **Variance-only simplification**: this is a *variance-correction*
       nested bootstrap — it does not implement the double-bootstrap bias
       correction in Davison & Hinkley §4.2 eq. 4.6. Acceptable for most
       ML applications but matters at small val sets or near boundary
       prevalences (e.g., precision@99% recall).
    2. **Independent val/test resampling**: deliberately drops any
       correlation structure between val and test (correct under i.i.d.
       splits; conservative under deliberate-OOD splits).
    3. **Replicability caveat**: paired bootstrap tests with re-used data
       have lower replicability than naive degrees-of-freedom suggest
       (Bouckaert 2003).

    References
    ----------
    .. [1] Davison, A. C. & Hinkley, D. V. "Bootstrap Methods and their
       Application." Cambridge, 1997. (§4.2 Nested bootstrap.)
    .. [2] Bouckaert, R. R. "Choosing between two learning algorithms
       based on calibrated tests." ICML 2003.
    """
    val_y_arr = np.asarray(val_y)
    val_a, val_b = np.asarray(val_score_a), np.asarray(val_score_b)
    test_y_arr = np.asarray(test_y)
    test_a, test_b = np.asarray(test_score_a), np.asarray(test_score_b)
    if not (val_y_arr.shape == val_a.shape == val_b.shape):
        raise ValueError(
            f"val shape mismatch: y={val_y_arr.shape}, a={val_a.shape}, b={val_b.shape}"
        )
    if not (test_y_arr.shape == test_a.shape == test_b.shape):
        raise ValueError(
            f"test shape mismatch: y={test_y_arr.shape}, a={test_a.shape}, b={test_b.shape}"
        )
    n_val, n_test = len(val_y_arr), len(test_y_arr)
    if n_val < 10 or n_test < 10:
        raise ValueError(f"need ≥ 10 rows in val and test; got val={n_val}, test={n_test}")
    # Validate before the expensive loop rather than failing late (or
    # silently returning a degenerate interval for 0 / 1).
    if not 0 < confidence < 1:
        raise ValueError(f"confidence must be in (0, 1), got {confidence}")

    # Point estimate: thresholds fitted on the full val set, metrics on full test.
    thr_a_full = float(threshold_fn(val_y_arr, val_a))
    thr_b_full = float(threshold_fn(val_y_arr, val_b))
    delta_point = float(metric_fn(test_y_arr, test_b, thr_b_full)) - float(
        metric_fn(test_y_arr, test_a, thr_a_full)
    )

    rng = np.random.default_rng(seed)
    # NaN marks an unusable resample; slots are overwritten only on success.
    deltas = np.full(n_resamples, np.nan, dtype=np.float64)
    for r in range(n_resamples):
        val_idx = rng.integers(0, n_val, size=n_val)
        test_idx = rng.integers(0, n_test, size=n_test)
        try:
            thr_a = float(threshold_fn(val_y_arr[val_idx], val_a[val_idx]))
            thr_b = float(threshold_fn(val_y_arr[val_idx], val_b[val_idx]))
            m_a = float(metric_fn(test_y_arr[test_idx], test_a[test_idx], thr_a))
            m_b = float(metric_fn(test_y_arr[test_idx], test_b[test_idx], thr_b))
        except (ValueError, RuntimeError):
            continue
        deltas[r] = m_b - m_a

    valid = deltas[~np.isnan(deltas)]
    # Count every dropped resample, including those whose Δ came back NaN
    # without raising, so the error message matches what was filtered out.
    failures = n_resamples - int(valid.size)
    # The explicit emptiness check covers n_resamples of 0 or 1, where
    # ``n_resamples // 2`` is 0 and the ratio guard alone can never fire.
    if valid.size == 0 or valid.size < n_resamples // 2:
        raise RuntimeError(
            f"paired_bootstrap_op_point_diff: {failures}/{n_resamples} resamples degenerate; "
            "refusing to compute CI on < 50% of requested resamples"
        )

    alpha = (1.0 - confidence) / 2.0
    ci_low = float(np.quantile(valid, alpha))
    ci_high = float(np.quantile(valid, 1.0 - alpha))
    return PairedBootstrapCI(
        delta=delta_point,
        ci_low=ci_low,
        ci_high=ci_high,
        overlaps_zero=ci_low <= 0.0 <= ci_high,
        confidence=confidence,
        n_resamples=int(len(valid)),
    )
727
+
728
+
729
+ @dataclass(frozen=True, slots=True)
730
+ class MDEEstimate:
731
+ r"""Minimum detectable Δ at the requested (α, 1-β).
732
+
733
+ ``mde`` is the smallest true Δ that the paired bootstrap on this
734
+ ``(y, a, b)`` configuration would detect with probability ≥ ``power`` at
735
+ significance ``alpha`` (two-sided). Computed analytically from the
736
+ bootstrap-estimated standard error of Δ:
737
+
738
+ .. math::
739
+
740
+ \mathrm{MDE} = (z_{\alpha/2} + z_{\beta}) \cdot \sigma_\Delta
741
+
742
+ where :math:`\sigma_\Delta = (\mathrm{ci\_high} - \mathrm{ci\_low}) / (2 \cdot 1.96)`.
743
+ Assumes asymptotic normality of the bootstrap distribution; for small N
744
+ this is a reasonable but not exact approximation.
745
+
746
+ Parameters
747
+ ----------
748
+ mde : float
749
+ Minimum detectable difference at the configured (α, power).
750
+ sigma_delta : float
751
+ Standard error of Δ inferred from the paired-bootstrap CI half-width.
752
+ delta_observed : float
753
+ Observed point estimate of Δ on the original data.
754
+ alpha : float
755
+ Two-sided significance level used in the MDE calculation (typically
756
+ 0.05).
757
+ power : float
758
+ Detection probability used in the MDE calculation (typically 0.80).
759
+ n_resamples : int
760
+ Number of paired-bootstrap resamples that produced the source CI.
761
+ n : int
762
+ Sample size used in the paired bootstrap (-1 if unknown — see
763
+ :func:`mde_from_ci`).
764
+
765
+ Examples
766
+ --------
767
+ >>> est = MDEEstimate(
768
+ ... mde=0.04, sigma_delta=0.014, delta_observed=0.02,
769
+ ... alpha=0.05, power=0.8, n_resamples=1000, n=500,
770
+ ... )
771
+ >>> est.delta_observed < est.mde # observed < MDE → underpowered
772
+ True
773
+
774
+ Notes
775
+ -----
776
+ The MDE is the minimum *true* effect size detectable; the *observed*
777
+ delta can be smaller than MDE (in which case the experiment is
778
+ underpowered) or larger (in which case the result is interpretable).
779
+
780
+ References
781
+ ----------
782
+ .. [1] Cohen, J. "Statistical Power Analysis for the Behavioral Sciences."
783
+ 2nd ed., Lawrence Erlbaum, 1988.
784
+ """
785
+
786
+ mde: float
787
+ sigma_delta: float
788
+ delta_observed: float
789
+ alpha: float
790
+ power: float
791
+ n_resamples: int
792
+ n: int
793
+
794
+ def to_dict(self) -> dict[str, object]:
795
+ """Serialize the MDE estimate."""
796
+ return {
797
+ "mde": self.mde,
798
+ "sigma_delta": self.sigma_delta,
799
+ "delta_observed": self.delta_observed,
800
+ "alpha": self.alpha,
801
+ "power": self.power,
802
+ "n_resamples": self.n_resamples,
803
+ "n": self.n,
804
+ }
805
+
806
+
807
def mde_from_ci(
    paired: PairedBootstrapCI,
    *,
    alpha: float = 0.05,
    power: float = 0.80,
) -> MDEEstimate:
    r"""Derive MDE from an existing ``PairedBootstrapCI`` (no second bootstrap).

    Reuses the bootstrap distribution implicit in the paired CI: the
    half-width at the CI's own confidence level ``c`` gives
    :math:`\sigma_\Delta \approx (\mathrm{ci\_high} - \mathrm{ci\_low}) / (2 \cdot z_{(1+c)/2})`
    (the divisor is :math:`2 \cdot 1.96` for a 95% CI), and the standard
    two-sided MDE formula gives
    :math:`\mathrm{MDE} = (z_{\alpha/2} + z_{\beta}) \cdot \sigma_\Delta`.

    Parameters
    ----------
    paired : PairedBootstrapCI
        Source interval; its ``confidence`` field selects the normal quantile
        used to convert the width into a standard error.
    alpha : float, optional
        Two-sided significance level (default 0.05).
    power : float, optional
        Detection probability at true Δ = MDE (default 0.80).

    Returns
    -------
    MDEEstimate
        ``n`` is set to -1 (unknown without source arrays).

    Raises
    ------
    ValueError
        If ``alpha``, ``power`` or ``paired.confidence`` is not in (0, 1).
    RuntimeError
        If the supplied CI has non-finite or non-positive width (paired
        bootstrap degenerate; no usable variance signal).

    Notes
    -----
    Limitations of the analytical σ̂ from CI half-width:

    1. **Normality assumption**: ``σ̂_Δ = width / (2 · z_{α/2})`` assumes
       the bootstrap distribution of Δ is approximately normal and
       symmetric. For small ``n_resamples`` (< 200) or skewed metrics
       (PR-AUC under extreme imbalance), σ̂ is biased.
    2. **Boundary-effect bias on bounded metrics**: when the true Δ is
       near 0 or near the metric's max (e.g., AUC ≈ 1), the CI is
       asymmetric and the half-width approximation under-estimates σ.
    3. **Skew bias**: for heavy-tailed Δ distributions the percentile-CI
       half-width over-estimates σ. Use :func:`paired_mde` (which
       computes σ from the deltas directly) when these effects matter.

    References
    ----------
    .. [1] Cohen, J. "Statistical Power Analysis for the Behavioral
       Sciences." 2nd ed., Lawrence Erlbaum, 1988.
    """
    if not 0.0 < alpha < 1.0:
        raise ValueError(f"alpha must be in (0, 1), got {alpha}")
    if not 0.0 < power < 1.0:
        raise ValueError(f"power must be in (0, 1), got {power}")
    # A confidence of 0 would give a zero quantile (division by zero below)
    # and 1 an infinite one (σ collapses to 0), so reject both explicitly.
    if not 0.0 < paired.confidence < 1.0:
        raise ValueError(f"paired.confidence must be in (0, 1), got {paired.confidence}")
    width = paired.ci_high - paired.ci_low
    # NaN compares False against 0, so it needs its own check; otherwise a
    # NaN bound would propagate silently into a NaN MDE.
    if not np.isfinite(width):
        raise RuntimeError(f"non-finite CI width ({width}); paired bootstrap likely degenerate")
    if width <= 0:
        raise RuntimeError(f"non-positive CI width ({width}); paired bootstrap likely degenerate")
    # Convert the full CI width into a standard error using the normal
    # quantile that matches the CI's own two-sided confidence level.
    z_at_paired_conf = _normal_quantile((1.0 + paired.confidence) / 2.0)
    sigma = width / (2.0 * z_at_paired_conf)
    z_alpha = _normal_quantile(1.0 - alpha / 2.0)
    z_power = _normal_quantile(power)
    mde = float((z_alpha + z_power) * sigma)
    return MDEEstimate(
        mde=mde,
        sigma_delta=float(sigma),
        delta_observed=float(paired.delta),
        alpha=alpha,
        power=power,
        n_resamples=int(paired.n_resamples),
        n=-1,
    )
882
+
883
+
884
def paired_mde(
    y_true: np.ndarray,
    y_score_a: np.ndarray,
    y_score_b: np.ndarray,
    metric: MetricFn,
    *,
    alpha: float = 0.05,
    power: float = 0.80,
    n_resamples: int = DEFAULT_N_RESAMPLES,
    seed: int = DEFAULT_SEED,
) -> MDEEstimate:
    r"""Minimum detectable paired Δ at (α, power), computed from raw arrays.

    Answers "how large would the true lift have to be for this test to
    catch it?": runs :func:`paired_bootstrap_diff` on the inputs, hands the
    resulting interval to :func:`mde_from_ci`, and reports

    .. math::

        \mathrm{MDE} = (z_{\alpha/2} + z_{\beta}) \cdot \sigma_\Delta

    With α=0.05 and power=0.80 the multiplier is ≈ 2.80
    (:math:`z_{0.025} \approx 1.96`, :math:`z_{0.20} \approx 0.842`).

    Parameters
    ----------
    y_true, y_score_a, y_score_b : np.ndarray
        Labels and the two scorers' outputs on the same rows.
    metric : MetricFn
        Metric forwarded to :func:`paired_bootstrap_diff`.
    alpha : float, optional
        Two-sided significance (default 0.05).
    power : float, optional
        1 − β; probability of detection at true Δ = MDE (default 0.80).
    n_resamples : int, optional
        Number of bootstrap resamples, forwarded to
        :func:`paired_bootstrap_diff`.
    seed : int, optional
        Seed forwarded to :func:`paired_bootstrap_diff`.

    Returns
    -------
    MDEEstimate
        Same values as :func:`mde_from_ci` produces, except that ``n`` is
        the length of ``y_true`` instead of the ``-1`` placeholder.
    """
    # The bootstrap interval is always requested at the 95% level;
    # mde_from_ci reads the level back from the CI object it is given.
    ci_on_delta = paired_bootstrap_diff(
        y_true,
        y_score_a,
        y_score_b,
        metric,
        n_resamples=n_resamples,
        confidence=0.95,
        seed=seed,
    )
    analytic = mde_from_ci(ci_on_delta, alpha=alpha, power=power)
    n_rows = int(len(np.asarray(y_true)))
    # Rebuild the estimate so the real row count replaces the placeholder.
    return MDEEstimate(
        mde=analytic.mde,
        sigma_delta=analytic.sigma_delta,
        delta_observed=analytic.delta_observed,
        alpha=analytic.alpha,
        power=analytic.power,
        n_resamples=analytic.n_resamples,
        n=n_rows,
    )
941
+
942
+
943
def _normal_quantile(p: float) -> float:
    """Standard-normal quantile (inverse CDF) at ``p`` via :func:`scipy.stats.norm.ppf`.

    Raises ``ValueError`` unless ``0 < p < 1``. NaN is rejected too, because
    the chained comparison below is false for it.
    """
    if 0.0 < p < 1.0:
        return float(_scipy_norm.ppf(p))
    raise ValueError(f"p must be in (0, 1), got {p}")
948
+
949
+
950
def cv_clt_ci(
    fold_metrics: np.ndarray,
    *,
    confidence: float = DEFAULT_CONFIDENCE,
) -> BootstrapCI:
    r"""Normal-approximation CI on a cross-validation mean from per-fold metrics.

    Given ``K`` per-fold metric estimates :math:`\widehat{\theta}_f`, the
    interval returned is
    :math:`\bar{\theta} \pm z_{\alpha/2} \cdot \widehat{\sigma} / \sqrt{K}`,
    where :math:`\bar{\theta}` is the mean over folds and

    .. math::

        \widehat{\sigma}^2 = \frac{1}{K - 1}
            \sum_{f=1}^{K} (\widehat{\theta}_f - \bar{\theta})^2

    is the sample variance of the fold metrics. The construction is
    motivated by the cross-validation central limit theorem of Bayle et
    al. 2020 [#bayle]_.

    .. note::

        NOTE(review): the arithmetic here is the fold-level "sample std
        over sqrt(K)" interval. Confirm against the cited paper that this
        is the intended variance estimator before relying on its coverage
        guarantee. A normal (not Student-t) quantile is used regardless
        of ``K``.

    This helper does **not** run the CV — callers supply the per-fold
    metric estimates, e.g. from :func:`cross_validate_metric` or from
    their own CV runner.

    Parameters
    ----------
    fold_metrics : np.ndarray, shape (K,)
        Per-fold metric estimates. Need ≥ 2 folds, all finite.
    confidence : float, optional
        Two-sided confidence level (default 0.95).

    Returns
    -------
    BootstrapCI
        With ``method="cv_clt"`` and ``n_resamples=K``. ``point_estimate``
        is the mean of ``fold_metrics``. All numeric fields are plain
        Python ``float``.

    Raises
    ------
    ValueError
        If ``fold_metrics`` is not 1-D, has fewer than 2 entries, contains
        non-finite values, or ``confidence`` is outside (0, 1).

    Examples
    --------
    >>> import numpy as np
    >>> # 5-fold CV PR-AUC estimates (already computed externally):
    >>> folds = np.array([0.83, 0.81, 0.85, 0.79, 0.84])
    >>> ci = cv_clt_ci(folds, confidence=0.95)
    >>> ci.method
    'cv_clt'
    >>> bool(ci.ci_low <= ci.point_estimate <= ci.ci_high)
    True

    See Also
    --------
    eval_toolkit.bootstrap.bootstrap_ci :
        Single-test-set CI (no CV); use that for typical eval workflows.

    References
    ----------
    .. [#bayle] Bayle, P., Bayle, A., Janson, L., & Mackey, L.
       "Cross-validation Confidence Intervals for Test Error." Advances
       in Neural Information Processing Systems 33 (NeurIPS), 2020.
    """
    arr = np.asarray(fold_metrics, dtype=float)
    if arr.ndim != 1:
        raise ValueError(f"fold_metrics must be 1-D, got shape {arr.shape}")
    n_folds = int(arr.size)
    if n_folds < 2:
        raise ValueError(f"fold_metrics must have ≥ 2 entries, got K={n_folds}")
    if not np.isfinite(arr).all():
        raise ValueError("fold_metrics contains NaN or inf")
    if not 0.0 < confidence < 1.0:
        raise ValueError(f"confidence must be in (0, 1), got {confidence}")

    point = float(arr.mean())
    # Sample standard deviation of the fold metrics ((K - 1) denominator).
    sigma_hat = float(np.std(arr, ddof=1))
    z = _normal_quantile(0.5 + confidence / 2.0)
    # np.sqrt returns np.float64; cast so ci_low / ci_high are plain floats
    # like point_estimate rather than NumPy scalars.
    margin = float(z * sigma_hat / np.sqrt(n_folds))
    return BootstrapCI(
        point_estimate=point,
        ci_low=point - margin,
        ci_high=point + margin,
        confidence=confidence,
        n_resamples=n_folds,
        method="cv_clt",
    )
1049
+
1050
+
1051
def cross_validate_metric(
    y_true: np.ndarray,
    y_score: np.ndarray,
    *,
    metric: MetricFn,
    k: int = 5,
    stratified: bool = True,
    seed: int = DEFAULT_SEED,
) -> np.ndarray:
    r"""Evaluate ``metric`` on each of K shuffled folds of pre-computed scores.

    Eval-only cross-validation: the caller already holds ``(y_true,
    y_score)`` for the full dataset (typically from a trained model). The
    rows are partitioned into ``k`` folds, ``metric`` is computed on each
    fold's held-out rows, and the per-fold values are returned. Feed the
    result to :func:`cv_clt_ci` for a confidence interval on the CV mean.

    .. note::

        No model is re-trained per fold. For train+eval cross-validation
        use :func:`sklearn.model_selection.cross_validate` directly and
        pass its per-fold metric values to :func:`cv_clt_ci`.

    Parameters
    ----------
    y_true : np.ndarray, shape (n,)
        Binary labels in {0, 1}. Cast to ``int`` before use.
    y_score : np.ndarray, shape (n,)
        Scores aligned with ``y_true``. Cast to ``float`` before use.
    metric : callable ``(y_true, y_score) -> float``
        Any metric. A fold on which it raises ``ValueError`` or
        ``RuntimeError`` is left as NaN in the result; filter NaNs before
        calling ``cv_clt_ci``. Other exception types propagate.
    k : int, optional
        Number of folds. Default ``5``. Must be ≥ 2 and ≤ ``n``.
    stratified : bool, optional
        If ``True`` (default), split with ``StratifiedKFold`` on the
        labels; otherwise with plain ``KFold``. Both shuffle.
    seed : int, optional
        ``random_state`` for the shuffled fold assignment.

    Returns
    -------
    np.ndarray, shape (k,)
        Per-fold metric values, NaN where the metric raised on that fold.

    Raises
    ------
    ValueError
        On shape mismatch, ``k < 2``, ``k > n``, or when more than half of
        the folds are NaN (the message quotes the first captured failure,
        if any).

    Examples
    --------
    >>> import numpy as np
    >>> from eval_toolkit.metrics import pr_auc
    >>> rng = np.random.default_rng(42)
    >>> n = 200
    >>> y = rng.binomial(1, 0.3, size=n).astype(int)
    >>> s = np.clip(y * 0.6 + rng.normal(0, 0.3, n), 0, 1)
    >>> folds = cross_validate_metric(y, s, metric=pr_auc, k=5, seed=42)
    >>> folds.shape
    (5,)
    >>> bool(np.all(0.0 <= folds[~np.isnan(folds)]))
    True

    See Also
    --------
    eval_toolkit.bootstrap.cv_clt_ci :
        Turns the per-fold values returned here into a confidence
        interval on the CV mean.
    """
    from sklearn.model_selection import KFold, StratifiedKFold  # noqa: PLC0415

    labels = np.asarray(y_true).astype(int)
    scores = np.asarray(y_score, dtype=float)
    if labels.shape != scores.shape:
        raise ValueError(f"y_true shape {labels.shape} != y_score shape {scores.shape}")
    n = int(labels.size)
    if k < 2:
        raise ValueError(f"k must be ≥ 2, got {k}")
    if k > n:
        raise ValueError(f"k={k} exceeds n={n}")

    # The splitters only need an array of the right length for X.
    if stratified:
        splits = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed).split(
            np.zeros(n), labels
        )
    else:
        splits = KFold(n_splits=k, shuffle=True, random_state=seed).split(np.zeros(n))

    per_fold = np.full(k, np.nan, dtype=np.float64)
    # Keep the first underlying exception so the "too many failures" error
    # can quote the real cause rather than only guessing at it.
    first_failure: str | None = None
    for fold_no, (_unused_train, held_out) in enumerate(splits):
        try:
            value = float(metric(labels[held_out], scores[held_out]))
        except (ValueError, RuntimeError) as exc:
            if first_failure is None:
                first_failure = f"fold {fold_no}: {type(exc).__name__}: {exc}"
        else:
            per_fold[fold_no] = value

    n_failed = int(np.isnan(per_fold).sum())
    if n_failed <= k // 2:
        return per_fold

    first_msg = "" if first_failure is None else f"; first underlying failure: {first_failure}"
    raise ValueError(
        f"cross_validate_metric: {n_failed}/{k} folds raised the metric "
        f"(likely single-class folds on rare-positive data); refusing to "
        f"return CV result with > 50% degenerate folds"
        f"{first_msg}"
    )
1169
+
1170
+
1171
+ # ---------------------------------------------------------------------------
1172
+ # DeLong correlated-ROC ΔAUC z-test (v0.20.0)
1173
+ # ---------------------------------------------------------------------------
1174
+
1175
+
1176
+ @dataclass(frozen=True, slots=True)
1177
+ class DeLongResult:
1178
+ """Result of a DeLong paired ROC-AUC comparison.
1179
+
1180
+ Returned by :func:`delong_roc_variance`. Carries point AUCs for both
1181
+ conditions, the variance of their difference, a two-sided ``z`` and
1182
+ ``p_value`` against the null ``AUC_a == AUC_b``, and a 95% CI on the
1183
+ delta.
1184
+
1185
+ Parameters
1186
+ ----------
1187
+ auc_a, auc_b : float
1188
+ Per-condition ROC-AUC point estimates.
1189
+ delta_auc : float
1190
+ ``auc_a - auc_b``.
1191
+ var : float
1192
+ DeLong variance estimate of ``delta_auc``.
1193
+ z : float
1194
+ ``delta_auc / sqrt(var)`` (NaN if var is zero).
1195
+ p_value : float
1196
+ Two-sided p-value against ``H0: delta_auc == 0``.
1197
+ ci_low, ci_high : float
1198
+ 95% normal-approx CI on ``delta_auc``
1199
+ (``delta_auc ± 1.96 * sqrt(var)``).
1200
+ """
1201
+
1202
+ auc_a: float
1203
+ auc_b: float
1204
+ delta_auc: float
1205
+ var: float
1206
+ z: float
1207
+ p_value: float
1208
+ ci_low: float
1209
+ ci_high: float
1210
+
1211
+ def to_dict(self) -> dict[str, float]:
1212
+ """JSON-serializable dict; NaN/Inf become :func:`float`."""
1213
+ return {
1214
+ "auc_a": self.auc_a,
1215
+ "auc_b": self.auc_b,
1216
+ "delta_auc": self.delta_auc,
1217
+ "var": self.var,
1218
+ "z": self.z,
1219
+ "p_value": self.p_value,
1220
+ "ci_low": self.ci_low,
1221
+ "ci_high": self.ci_high,
1222
+ }
1223
+
1224
+
1225
def _delong_structural(
    pos_scores: np.ndarray, neg_scores: np.ndarray
) -> tuple[np.ndarray, np.ndarray, float]:
    """Sun & Xu 2014 structural components of the AUC for one condition.

    Returns ``(V10, V01, auc)``. ``V10`` has one entry per positive
    (length ``m = len(pos_scores)``) and ``V01`` one entry per negative
    (length ``n = len(neg_scores)``). ``auc`` is ``mean(V10)``; with the
    definition used here ``mean(V01)`` is the same quantity. Ties are
    handled through midranks (``scipy.stats.rankdata`` with
    ``method="average"``).

    Raises ``ValueError`` if either score array is empty.
    """
    n_pos, n_neg = len(pos_scores), len(neg_scores)
    if not (n_pos and n_neg):
        raise ValueError("delong_roc_variance requires at least one positive and one negative")

    # Midranks within the pooled sample (positives first) and within each class.
    pooled_ranks = _scipy_rankdata(np.concatenate([pos_scores, neg_scores]), method="average")
    pos_own_ranks = _scipy_rankdata(pos_scores, method="average")
    neg_own_ranks = _scipy_rankdata(neg_scores, method="average")

    # Pooled rank minus within-class rank counts the opposite-class scores
    # ranked below each item (ties counted as one half).
    pos_component = (pooled_ranks[:n_pos] - pos_own_ranks) / n_neg
    neg_component = 1.0 - (pooled_ranks[n_pos:] - neg_own_ranks) / n_pos
    return pos_component, neg_component, float(np.mean(pos_component))
1249
+
1250
+
1251
def delong_roc_variance(
    y_true: np.ndarray,
    y_score_a: np.ndarray,
    y_score_b: np.ndarray,
) -> DeLongResult:
    """DeLong's variance of the paired ROC-AUC difference.

    Implements the Sun & Xu 2014 fast variant of DeLong's correlated-AUC
    test. Returns a :class:`DeLongResult` with point AUCs, ``delta_auc``
    (``auc_a - auc_b``), DeLong variance, z, two-sided p-value, and a 95%
    normal-approx CI on the delta.

    Parameters
    ----------
    y_true : np.ndarray
        Binary labels in ``{0, 1}``. Must contain at least one of each.
        Any other value (including non-integers such as ``0.5``, or NaN)
        is rejected.
    y_score_a, y_score_b : np.ndarray
        Scores for the two conditions on the SAME rows (paired).

    Returns
    -------
    DeLongResult
        When the estimated variance is exactly zero, ``z`` and ``p_value``
        are NaN and the CI collapses to ``delta_auc``.

    Raises
    ------
    ValueError
        If shapes mismatch, labels are not binary, or there is no positive
        or no negative row.

    Notes
    -----
    The p-value is computed with the normal survival function rather than
    ``1 - cdf`` so that it does not round to exactly zero for large
    ``|z|``.

    NOTE(review): with a single positive or a single negative row the
    ``ddof=1`` sample covariance has zero degrees of freedom, so the
    variance and everything derived from it is expected to be NaN; confirm
    whether such inputs should be rejected instead.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(42)
    >>> y = np.array([0]*50 + [1]*50)
    >>> sa = np.concatenate([rng.normal(0, 1, 50), rng.normal(1.0, 1, 50)])
    >>> sb = np.concatenate([rng.normal(0, 1, 50), rng.normal(1.2, 1, 50)])
    >>> result = delong_roc_variance(y, sa, sb)
    >>> bool(result.delta_auc <= 0)  # B is stronger
    True
    """
    y_true_arr = np.asarray(y_true)
    y_a = np.asarray(y_score_a, dtype=float)
    y_b = np.asarray(y_score_b, dtype=float)
    if y_true_arr.shape != y_a.shape or y_a.shape != y_b.shape:
        raise ValueError(
            "delong_roc_variance: y_true, y_score_a, y_score_b must share shape "
            f"(got {y_true_arr.shape}, {y_a.shape}, {y_b.shape})"
        )
    # Compare label values directly. Casting through int() would truncate
    # e.g. 0.5 to 0, pass validation, and then silently drop those rows from
    # both the positive and the negative mask below.
    labels_seen = np.unique(y_true_arr)
    if not np.isin(labels_seen, (0, 1)).all():
        raise ValueError(
            "delong_roc_variance: y_true must be binary {0, 1}, "
            f"got {set(labels_seen.tolist())}"
        )
    pos_mask = y_true_arr == 1
    neg_mask = y_true_arr == 0
    m = int(pos_mask.sum())
    n = int(neg_mask.sum())
    if m == 0 or n == 0:
        raise ValueError(
            "delong_roc_variance: need at least one positive and one negative "
            f"row (got m={m}, n={n})"
        )
    v10_a, v01_a, auc_a = _delong_structural(y_a[pos_mask], y_a[neg_mask])
    v10_b, v01_b, auc_b = _delong_structural(y_b[pos_mask], y_b[neg_mask])
    delta = auc_a - auc_b

    # 2x2 covariance matrices for V10 and V01 (between A and B).
    s10 = np.cov(np.vstack([v10_a, v10_b]), ddof=1)
    s01 = np.cov(np.vstack([v01_a, v01_b]), ddof=1)
    # Var(AUC_A - AUC_B) = (s10[0,0] - 2*s10[0,1] + s10[1,1])/m
    #                    + (s01[0,0] - 2*s01[0,1] + s01[1,1])/n
    var_delta = (s10[0, 0] - 2.0 * s10[0, 1] + s10[1, 1]) / m + (
        s01[0, 0] - 2.0 * s01[0, 1] + s01[1, 1]
    ) / n
    var_delta = float(max(var_delta, 0.0))  # clamp tiny negative FP noise

    if var_delta == 0.0:
        # Identical structural components: no usable variance, so the test
        # statistic is undefined and the interval has zero width.
        z = float("nan")
        p_value = float("nan")
        half_ci = 0.0
    else:
        se = float(np.sqrt(var_delta))
        z = delta / se
        # sf(|z|) keeps precision in the far tail, where 1 - cdf(|z|)
        # cancels to exactly 0.0.
        p_value = 2.0 * float(_scipy_norm.sf(abs(z)))
        half_ci = 1.959963984540054 * se  # 1.96 to higher precision

    return DeLongResult(
        auc_a=float(auc_a),
        auc_b=float(auc_b),
        delta_auc=float(delta),
        var=var_delta,
        z=float(z),
        p_value=float(p_value),
        ci_low=float(delta - half_ci),
        ci_high=float(delta + half_ci),
    )