eval-toolkit 0.27.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1193 @@
1
+ """Slice-aware evaluation harness for binary scorers.
2
+
3
+ Public surface:
4
+
5
+ - :class:`Scorer` Protocol — anything with ``predict_proba(X) -> np.ndarray``
6
+ - :class:`SliceAwareScorer` Protocol — optional ``should_score_slice(name)`` hook
7
+ - :class:`EvalSlice` — DataFrame wrapper with configurable column names
8
+ - :class:`RunResult` — JSON-serializable run container (versioned schema)
9
+ - :func:`evaluate_scorer_on_slice` — score one model on one slice
10
+ - :func:`evaluate` — pure orchestrator: scores × slices → RunResult (no IO)
11
+ - :func:`evaluate_folded` — fold aggregator: Splitter × seeds → RunResult
12
+ with ``by_fold`` and auto-CV-CI ``fold_summary``
13
+ - :func:`with_claim_report` — attach generic claim-gate evidence to a
14
+ frozen ``RunResult``
15
+ - :func:`write_run_result` — IO wrapper: write RunResult to ``run_dir/results.json``
16
+
17
+ The pure/IO split lets callers test :func:`evaluate` deterministically without
18
+ touching the filesystem; :func:`write_run_result` is the only IO sink.
19
+
20
+ v0.7.0 additions:
21
+
22
+ - ``leakage_checks`` / ``on_leakage`` params on :func:`evaluate`: run a
23
+ sequence of :class:`~eval_toolkit.leakage.LeakageCheck` over the input
24
+ slices before evaluation; raise on error-severity findings by default.
25
+ - ``on_scorer_error`` param: when ``"record"``, captures any
26
+ ``Scorer.predict_proba`` exception per (slice, scorer) instead of failing
27
+ the whole run.
28
+ - ``RunResult.by_fold`` / ``fold_summary`` / ``schema_version="v1"`` fields
29
+ (additive; defaults preserve backward compat).
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import logging
35
+ import time
36
+ import traceback
37
+ from collections.abc import Mapping, Sequence
38
+ from dataclasses import dataclass, field
39
+ from pathlib import Path
40
+ from typing import TYPE_CHECKING, Final, Literal, cast
41
+
42
+ import numpy as np
43
+ import pandas as pd
44
+
45
+ from eval_toolkit.artifacts import (
46
+ error_metric,
47
+ sanitize_for_json,
48
+ skipped_metric,
49
+ write_json_strict,
50
+ )
51
+ from eval_toolkit.bootstrap import (
52
+ bootstrap_ci,
53
+ cv_clt_ci,
54
+ mde_from_ci,
55
+ paired_bootstrap_diff,
56
+ )
57
+ from eval_toolkit.calibration import PlattFit, maximum_calibration_error
58
+ from eval_toolkit.metrics import brier_score, headline_metrics, pr_auc, roc_auc
59
+ from eval_toolkit.operating_points import (
60
+ FittedOperatingPoint,
61
+ OperatingPointSpec,
62
+ apply_operating_points,
63
+ fit_operating_points,
64
+ )
65
+ from eval_toolkit.protocols import Scorer, SliceAwareScorer
66
+ from eval_toolkit.thresholds import TargetFPRSelector
67
+
68
+ if TYPE_CHECKING:
69
+ from eval_toolkit.leakage import LeakageCheck
70
+ from eval_toolkit.splits import Splitter
71
+
72
# Names exported by ``from <this module> import *``.
__all__ = [
    "DEFAULT_BOOTSTRAP_RESAMPLES",
    "RUN_RESULT_SCHEMA_VERSION",
    "EvalSlice",
    "RunResult",
    "Scorer",
    "SliceAwareScorer",
    "evaluate",
    "evaluate_folded",
    "evaluate_scorer_on_slice",
    "with_claim_report",
    "write_run_result",
]

# Default ``n_resamples`` for :func:`evaluate` and :func:`evaluate_scorer_on_slice`.
DEFAULT_BOOTSTRAP_RESAMPLES: Final[int] = 1000
# Default value of ``RunResult.schema_version``.
RUN_RESULT_SCHEMA_VERSION: Final[str] = "v1"

# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)
90
+
91
+
92
+ @dataclass(frozen=True, slots=True)
93
+ class EvalSlice:
94
+ """A single eval slice (dev test, OOD slice, ablation slice, etc.).
95
+
96
+ Parameters
97
+ ----------
98
+ name : str
99
+ Slice identifier.
100
+ df : pandas.DataFrame
101
+ Must contain ``feature_col`` and ``label_col``; ``strata_col`` if set.
102
+ description : str, optional
103
+ Human-readable slice description.
104
+ feature_col : str, optional
105
+ Column holding the feature passed to ``Scorer.predict_proba``.
106
+ Default ``"text"``.
107
+ label_col : str, optional
108
+ Column holding binary labels in {0, 1}. Default ``"label"``.
109
+ strata_col : str or None, optional
110
+ Optional categorical column for stratified recall reporting.
111
+ Default ``None``.
112
+ """
113
+
114
+ name: str
115
+ df: pd.DataFrame
116
+ description: str = ""
117
+ feature_col: str = "text"
118
+ label_col: str = "label"
119
+ strata_col: str | None = None
120
+
121
+ def __post_init__(self) -> None:
122
+ """Validate the minimum column and label contract."""
123
+ for col in (self.feature_col, self.label_col):
124
+ if col not in self.df.columns:
125
+ raise KeyError(f"slice {self.name!r}: missing column {col!r}")
126
+ if self.strata_col is not None and self.strata_col not in self.df.columns:
127
+ raise KeyError(f"slice {self.name!r}: missing strata column {self.strata_col!r}")
128
+ if (~self.df[self.label_col].isin({0, 1})).any():
129
+ raise ValueError(f"slice {self.name!r}: labels must be in {{0, 1}}")
130
+
131
+ @property
132
+ def y_true(self) -> np.ndarray:
133
+ """Binary labels as a 1-D NumPy array."""
134
+ arr: np.ndarray = self.df[self.label_col].astype(int).to_numpy()
135
+ return arr
136
+
137
+ @property
138
+ def features(self) -> list[str]:
139
+ """Feature column as a plain list for scorer compatibility."""
140
+ out: list[str] = self.df[self.feature_col].tolist()
141
+ return out
142
+
143
+ @property
144
+ def strata(self) -> np.ndarray | None:
145
+ """Stratifier column as np.ndarray, or None if unset."""
146
+ if self.strata_col is None:
147
+ return None
148
+ out: np.ndarray = self.df[self.strata_col].to_numpy()
149
+ return out
150
+
151
+
152
@dataclass(frozen=True, slots=True)
class RunResult:
    """Immutable container for the outcome of an evaluation run.

    The dataclass is frozen, so every field has to be final when the
    instance is built; accumulate into local dicts first and hand them to
    the constructor.

    Parameters
    ----------
    run_id : str
        Run identifier supplied by the caller.
    git_sha : str or None
        Commit SHA recorded for provenance, or ``None``.
    config : dict[str, object]
        Evaluation-time configuration recorded with the run.
    by_slice : dict[str, dict[str, object]], optional
        Per-slice results keyed by slice name. Default empty.
    by_fold : dict[str, RunResult], optional
        Nested per-fold results; each value is serialized recursively by
        :meth:`to_dict`. Default empty.
    fold_summary : dict[str, dict[str, object]], optional
        Cross-fold summary block. Default empty.
    claim_report : dict[str, object], optional
        Claim-gate payload, e.g. as attached by :func:`with_claim_report`.
        Default empty.
    prediction_artifacts, evidence_axes : list[dict[str, object]], optional
        Serialized verbatim by :meth:`to_dict`. Default empty.
        NOTE(review): semantics are not visible in this module — confirm
        against the producer.
    pairing_metadata, aggregate_evidence, threshold_policy : dict[str, object], optional
        Serialized verbatim by :meth:`to_dict`. Default empty.
        NOTE(review): semantics are not visible in this module — confirm
        against the producer.
    schema_version : str, optional
        Schema tag written to the payload. Defaults to
        ``RUN_RESULT_SCHEMA_VERSION``.
    """

    run_id: str
    git_sha: str | None
    config: dict[str, object]
    by_slice: dict[str, dict[str, object]] = field(default_factory=dict)
    by_fold: dict[str, RunResult] = field(default_factory=dict)
    fold_summary: dict[str, dict[str, object]] = field(default_factory=dict)
    claim_report: dict[str, object] = field(default_factory=dict)
    prediction_artifacts: list[dict[str, object]] = field(default_factory=list)
    evidence_axes: list[dict[str, object]] = field(default_factory=list)
    pairing_metadata: dict[str, object] = field(default_factory=dict)
    aggregate_evidence: dict[str, object] = field(default_factory=dict)
    threshold_policy: dict[str, object] = field(default_factory=dict)
    schema_version: str = RUN_RESULT_SCHEMA_VERSION

    def to_dict(self) -> dict[str, object]:
        """Return the run as a JSON-sanitized dict.

        Nested ``by_fold`` results are converted with their own
        :meth:`to_dict` before the whole payload goes through
        ``sanitize_for_json``.

        Raises
        ------
        TypeError
            If ``sanitize_for_json`` hands back something other than a dict.
        """
        serialized_folds = {
            fold_id: fold_result.to_dict() for fold_id, fold_result in self.by_fold.items()
        }
        payload: dict[str, object] = {
            "schema_version": self.schema_version,
            "run_id": self.run_id,
            "git_sha": self.git_sha,
            "config": self.config,
            "by_slice": self.by_slice,
            "by_fold": serialized_folds,
            "fold_summary": self.fold_summary,
            "claim_report": self.claim_report,
            "prediction_artifacts": self.prediction_artifacts,
            "evidence_axes": self.evidence_axes,
            "pairing_metadata": self.pairing_metadata,
            "aggregate_evidence": self.aggregate_evidence,
            "threshold_policy": self.threshold_policy,
        }
        sanitized = sanitize_for_json(payload)
        if isinstance(sanitized, dict):
            return sanitized
        raise TypeError("RunResult.to_dict expected a mapping payload")
235
+
236
+
237
def with_claim_report(result: RunResult, report: object) -> RunResult:
    """Return a copy of ``result`` with a serialized claim report attached.

    ``RunResult`` is frozen, so the report is attached by building a new
    instance rather than by mutation. Every other field is carried over
    unchanged (the containers are shared with ``result``, not deep-copied).

    Parameters
    ----------
    result : RunResult
        Run to copy.
    report : object
        A mapping, or any object whose ``to_dict()`` returns a mapping.

    Returns
    -------
    RunResult
        New instance whose ``claim_report`` is a plain-dict copy of ``report``.

    Raises
    ------
    TypeError
        If ``report`` is neither a mapping nor an object whose ``to_dict()``
        returns a mapping.
    """
    # Function-scope import: the module header only imports ``dataclass`` and ``field``.
    from dataclasses import replace

    claim_report = _object_to_dict(report, what="claim report")
    # ``replace`` copies every init field automatically, so fields added to
    # RunResult later cannot be silently dropped by a hand-maintained list.
    return replace(result, claim_report=claim_report)
260
+
261
+
262
+ def _object_to_dict(obj: object, *, what: str) -> dict[str, object]:
263
+ """Normalize a mapping or ``to_dict`` object to a plain dict."""
264
+ if isinstance(obj, Mapping):
265
+ return dict(obj)
266
+ to_dict = getattr(obj, "to_dict", None)
267
+ if callable(to_dict):
268
+ out = to_dict()
269
+ if isinstance(out, Mapping):
270
+ return dict(out)
271
+ raise TypeError(f"expected {what} mapping or object with to_dict(), got {type(obj).__name__}")
272
+
273
+
274
+ def _should_score_slice(scorer: Scorer, slice_name: str) -> bool:
275
+ """Honor optional slice-aware scorer hooks without widening the base Protocol."""
276
+ should_score = getattr(scorer, "should_score_slice", None)
277
+ if should_score is None:
278
+ return True
279
+ result = should_score(slice_name)
280
+ if not isinstance(result, bool):
281
+ raise TypeError(
282
+ f"{type(scorer).__name__}.should_score_slice() must return bool, "
283
+ f"got {type(result).__name__}"
284
+ )
285
+ return result
286
+
287
+
288
+ def _skipped_scorer_result(slice_: EvalSlice, reason: str) -> dict[str, object]:
289
+ """Schema-compatible placeholder for a scorer intentionally skipped on a slice."""
290
+ return {
291
+ "skipped": reason,
292
+ "n": int(len(slice_.df)),
293
+ "n_positive": int(slice_.y_true.sum()),
294
+ "scores": [],
295
+ }
296
+
297
+
298
def _bootstrap_auc_ci(
    y_true: np.ndarray,
    y_score: np.ndarray,
    metric_fn: object,
    *,
    n_resamples: int,
    seed: int,
) -> dict[str, object]:
    """Bootstrap a BCa confidence interval for ``metric_fn``, or return a sentinel.

    Returns ``skipped_metric(...)`` when ``y_true`` holds fewer than two
    distinct labels or fewer than 30 rows. Otherwise returns the ``to_dict()``
    of the ``bootstrap_ci`` result, or ``error_metric(...)`` if that
    computation raises ``ValueError`` or ``RuntimeError``.
    """
    distinct_labels = {int(label) for label in y_true}
    if len(distinct_labels) < 2:
        return skipped_metric("single-class slice; AUC CI is not meaningful")
    sample_size = len(y_true)
    if sample_size < 30:
        return skipped_metric(f"n={sample_size} < 30")
    try:
        interval = bootstrap_ci(
            y_true,
            y_score,
            metric_fn,  # type: ignore[arg-type]
            n_resamples=n_resamples,
            method="BCa",
            seed=seed,
        )
        # Serialization stays inside the ``try`` so its failures are reported too.
        return interval.to_dict()
    except (ValueError, RuntimeError) as failure:
        return error_metric(str(failure))
328
+
329
+
330
def _evaluate_scores(
    y_true: np.ndarray,
    y_score: np.ndarray,
    *,
    strata: np.ndarray | None,
    n_resamples: int,
    seed: int,
    fpr_ladder: list[float] | None,
    compute_mce: bool,
    compute_brier: bool,
    bootstrap_roc_auc: bool,
) -> dict[str, object]:
    """Assemble the metric block for one ``(y_true, y_score)`` pair.

    The block starts from ``headline_metrics`` and always gains
    ``is_single_class``, ``pr_auc_ci`` and ``scores``. The remaining keys are
    opt-in:

    - ``roc_auc_ci`` when ``bootstrap_roc_auc`` is true;
    - ``tpr_at_fpr`` when ``fpr_ladder`` is given, keyed by the formatted
      target FPR. Values are ``None`` on single-class data or when the
      selector raises ``RuntimeError``;
    - ``brier_score`` / ``mce`` when the matching flag is true, with an
      ``error_metric`` sentinel on ``ValueError`` / ``RuntimeError``.

    :func:`evaluate_scorer_on_slice` calls this for the raw scores and again
    for calibrated scores when a calibrator is supplied.
    """
    block = headline_metrics(y_true, y_score, strata=strata)
    single_class = len({int(label) for label in y_true}) == 1
    block["is_single_class"] = single_class

    ci_options = {"n_resamples": n_resamples, "seed": seed}
    block["pr_auc_ci"] = _bootstrap_auc_ci(y_true, y_score, pr_auc, **ci_options)
    if bootstrap_roc_auc:
        block["roc_auc_ci"] = _bootstrap_auc_ci(y_true, y_score, roc_auc, **ci_options)

    if fpr_ladder is not None:
        ladder: dict[str, object] = {}
        for target in fpr_ladder:
            ladder_key = f"{target}"
            if single_class:
                # The selector is not attempted on single-class data.
                ladder[ladder_key] = None
                continue
            try:
                selection = TargetFPRSelector(fpr=target).select(y_true, y_score)
                ladder[ladder_key] = float(selection.recall)
            except RuntimeError:
                ladder[ladder_key] = None
        block["tpr_at_fpr"] = ladder

    if compute_brier:
        try:
            block["brier_score"] = brier_score(y_true, y_score, empty_strategy="return_none")
        except (ValueError, RuntimeError) as failure:
            block["brier_score"] = error_metric(str(failure))

    if compute_mce:
        try:
            block["mce"] = maximum_calibration_error(y_true, y_score)
        except (ValueError, RuntimeError) as failure:
            block["mce"] = error_metric(str(failure))

    block["scores"] = y_score.tolist()
    return dict(block)
388
+
389
+
390
+ def _resolve_y_score(
391
+ scorer: Scorer,
392
+ slice_: EvalSlice,
393
+ precomputed_scores: np.ndarray | None,
394
+ *,
395
+ on_scorer_error: Literal["raise", "record"],
396
+ attack_style: str | None,
397
+ ) -> np.ndarray | dict[str, object]:
398
+ """Resolve ``y_score`` for a (scorer, slice) pair.
399
+
400
+ Returns the ndarray on success. When ``on_scorer_error='record'``
401
+ and the scorer raises, returns the full error-dict that
402
+ :func:`evaluate_scorer_on_slice` would have returned (same shape
403
+ downstream consumers expect).
404
+
405
+ ``MemoryError`` and ``AssertionError`` propagate even in
406
+ ``'record'`` mode. The former signals an environment failure
407
+ (OOM, resource exhaustion); the latter signals an internal-invariant
408
+ violation. Neither belongs in per-scorer error recording —
409
+ surfacing them lets the run fail loudly with the correct cause.
410
+
411
+ Raises
412
+ ------
413
+ ValueError
414
+ If ``precomputed_scores`` shape does not match the slice length.
415
+ MemoryError
416
+ Always re-raised (environment failure).
417
+ AssertionError
418
+ Always re-raised (internal-invariant violation).
419
+ Exception
420
+ Re-raised when ``on_scorer_error='raise'``; otherwise returned
421
+ as an error-dict.
422
+ """
423
+ y_true = slice_.y_true
424
+ if precomputed_scores is not None:
425
+ if precomputed_scores.shape != (len(slice_.df),):
426
+ raise ValueError(
427
+ f"precomputed_scores shape {precomputed_scores.shape} does not "
428
+ f"match slice length {len(slice_.df)}"
429
+ )
430
+ return np.asarray(precomputed_scores)
431
+ try:
432
+ return scorer.predict_proba(slice_.features)
433
+ except MemoryError:
434
+ raise
435
+ except AssertionError:
436
+ raise
437
+ except Exception as exc:
438
+ if on_scorer_error == "raise":
439
+ raise
440
+ err: dict[str, object] = {
441
+ "error": str(exc),
442
+ "error_state": error_metric(str(exc), exc_type=type(exc).__name__),
443
+ "exc_type": type(exc).__name__,
444
+ "traceback": traceback.format_exc(),
445
+ "n": int(len(slice_.df)),
446
+ "n_positive": int(y_true.sum()),
447
+ "scores": [],
448
+ }
449
+ if attack_style is not None:
450
+ err["attack_style"] = attack_style
451
+ return err
452
+
453
+
454
+ def _compute_paired_diffs(
455
+ slice_: EvalSlice,
456
+ scores_by_scorer: Mapping[str, np.ndarray],
457
+ scorers: Mapping[str, Scorer],
458
+ paired_diffs: list[tuple[str, str]],
459
+ *,
460
+ n_resamples: int,
461
+ seed: int,
462
+ ) -> dict[str, dict[str, object]]:
463
+ """Per-slice paired bootstrap on ``pr_auc(b) - pr_auc(a)``.
464
+
465
+ Returns a dict keyed by ``f"{b}_minus_{a}"`` with either a
466
+ :class:`~eval_toolkit.bootstrap.PairedDiff` payload (point/ci/mde)
467
+ or a ``{"skipped": <reason>}`` marker.
468
+
469
+ Skip conditions (checked in order):
470
+
471
+ 1. Either scorer not in ``scorers`` — silently skipped (no entry).
472
+ 2. Either scorer has no scores for this slice (skipped/errored)
473
+ → ``{"skipped": "one or both scorers skipped this slice"}``.
474
+ 3. Single-class slice (PR-AUC Δ degenerate)
475
+ → ``{"skipped": "single-class slice; PR-AUC Δ degenerate"}``.
476
+ 4. ``len(slice_.y_true) < 30`` → ``{"skipped": "n=N < 30"}``.
477
+
478
+ Otherwise: ``paired_bootstrap_diff`` payload plus
479
+ ``mde_at_80_power`` (or error sentinel if MDE estimation fails).
480
+ Pure: no caches mutated, no side effects.
481
+ """
482
+ diffs: dict[str, dict[str, object]] = {}
483
+ is_single_class = len({int(v) for v in slice_.y_true}) == 1
484
+ for a, b in paired_diffs:
485
+ if a not in scorers or b not in scorers:
486
+ continue
487
+ if a not in scores_by_scorer or b not in scores_by_scorer:
488
+ diffs[f"{b}_minus_{a}"] = {"skipped": "one or both scorers skipped this slice"}
489
+ continue
490
+ if is_single_class:
491
+ diffs[f"{b}_minus_{a}"] = {"skipped": "single-class slice; PR-AUC Δ degenerate"}
492
+ continue
493
+ if len(slice_.y_true) < 30:
494
+ diffs[f"{b}_minus_{a}"] = {"skipped": f"n={len(slice_.y_true)} < 30"}
495
+ continue
496
+ pdiff = paired_bootstrap_diff(
497
+ slice_.y_true,
498
+ scores_by_scorer[a],
499
+ scores_by_scorer[b],
500
+ pr_auc,
501
+ n_resamples=n_resamples,
502
+ seed=seed,
503
+ )
504
+ pdiff_dict = pdiff.to_dict()
505
+ try:
506
+ pdiff_dict["mde_at_80_power"] = mde_from_ci(pdiff, alpha=0.05, power=0.80).to_dict()
507
+ except (ValueError, RuntimeError) as exc:
508
+ pdiff_dict["mde_at_80_power"] = {"error": str(exc)}
509
+ diffs[f"{b}_minus_{a}"] = pdiff_dict
510
+ return diffs
511
+
512
+
513
def evaluate_scorer_on_slice(
    scorer: Scorer,
    slice_: EvalSlice,
    *,
    n_resamples: int = DEFAULT_BOOTSTRAP_RESAMPLES,
    seed: int = 42,
    on_scorer_error: Literal["raise", "record"] = "raise",
    precomputed_scores: np.ndarray | None = None,
    attack_style: str | None = None,
    fpr_ladder: list[float] | None = None,
    compute_mce: bool = False,
    compute_brier: bool = False,
    calibrator: PlattFit | None = None,
    bootstrap_roc_auc: bool = False,
) -> dict[str, object]:
    """Evaluate a single scorer on a single slice and return its metric dict.

    Scores come from ``precomputed_scores`` when given, otherwise from
    ``scorer.predict_proba``. The metric block is then built by
    :func:`_evaluate_scores`; on single-class slices the AUC confidence
    intervals are skipped sentinels and every ``tpr_at_fpr`` value is ``None``.

    Parameters
    ----------
    scorer : Scorer
        Scorer under evaluation.
    slice_ : EvalSlice
        Slice providing features, labels and optional strata.
    n_resamples : int, optional
        Bootstrap resamples for the confidence intervals. Default 1000.
    seed : int, optional
        RNG seed forwarded to the bootstrap. Default 42.
    on_scorer_error : {"raise", "record"}, optional
        With ``"record"``, a scorer exception is returned as an error dict
        instead of propagating. Default ``"raise"``.
    precomputed_scores : np.ndarray or None, optional
        When provided, used as ``y_score`` and the scorer is not called.
        Must have one entry per slice row.
    attack_style : str or None, optional
        Pass-through label copied into the result under ``attack_style``;
        it does not affect any metric.
    fpr_ladder : list[float] or None, optional
        When set, TPR at each listed FPR is emitted under ``tpr_at_fpr``
        as ``{str(fpr): tpr_or_None}``.
    compute_mce : bool, optional
        When true, emit maximum calibration error under ``mce``.
    compute_brier : bool, optional
        When true, emit the Brier score under ``brier_score``.
    calibrator : PlattFit or None, optional
        When provided, it is applied to ``y_score`` and the metric block is
        recomputed on the calibrated scores. Each calibrated entry is merged
        under ``<key>_calibrated``, except ``n``, ``n_positive``,
        ``is_single_class`` and ``metric_note``, which are not duplicated.
    bootstrap_roc_auc : bool, optional
        When true, also emit a bootstrap CI for ROC-AUC under ``roc_auc_ci``.
        Default ``False``.

    Returns
    -------
    dict
        Metric block including ``pr_auc_ci`` and the raw ``scores``. When a
        scorer error was recorded, the error dict from
        :func:`_resolve_y_score` is returned unchanged.

    Raises
    ------
    ValueError
        If ``precomputed_scores`` shape does not match the slice length.
    MemoryError
        Always re-raised, even with ``on_scorer_error="record"``.
    AssertionError
        Always re-raised, even with ``on_scorer_error="record"``.
    Exception
        Any other scorer exception when ``on_scorer_error="raise"``.
        ``KeyboardInterrupt`` and ``SystemExit`` are never caught because
        they derive from ``BaseException``.
    """
    labels = slice_.y_true
    outcome = _resolve_y_score(
        scorer,
        slice_,
        precomputed_scores,
        on_scorer_error=on_scorer_error,
        attack_style=attack_style,
    )
    if isinstance(outcome, dict):
        # Recorded scorer failure: hand the error dict straight back.
        return outcome
    raw_scores = outcome

    metrics = _evaluate_scores(
        labels,
        raw_scores,
        strata=slice_.strata,
        n_resamples=n_resamples,
        seed=seed,
        fpr_ladder=fpr_ladder,
        compute_mce=compute_mce,
        compute_brier=compute_brier,
        bootstrap_roc_auc=bootstrap_roc_auc,
    )

    if calibrator is not None:
        calibrated_scores = np.asarray(calibrator(raw_scores))
        calibrated_block = _evaluate_scores(
            labels,
            calibrated_scores,
            strata=slice_.strata,
            n_resamples=n_resamples,
            seed=seed,
            fpr_ladder=fpr_ladder,
            compute_mce=compute_mce,
            compute_brier=compute_brier,
            bootstrap_roc_auc=bootstrap_roc_auc,
        )
        # These keys are identical for raw and calibrated scores; keep one copy.
        shared_keys = ("n", "n_positive", "is_single_class", "metric_note")
        metrics.update(
            {
                f"{key}_calibrated": value
                for key, value in calibrated_block.items()
                if key not in shared_keys
            }
        )

    if attack_style is not None:
        metrics["attack_style"] = attack_style
    return metrics
650
+
651
+
652
def evaluate(
    scorers: dict[str, Scorer],
    slices: Sequence[EvalSlice],
    *,
    run_id: str,
    git_sha: str | None = None,
    n_resamples: int = DEFAULT_BOOTSTRAP_RESAMPLES,
    paired_diffs: list[tuple[str, str]] | None = None,
    seed: int = 42,
    extra_config: Mapping[str, object] | None = None,
    leakage_checks: Sequence[LeakageCheck] = (),
    on_leakage: Literal["raise", "record", "skip"] = "raise",
    on_scorer_error: Literal["raise", "record"] = "raise",
    operating_point_specs: Sequence[OperatingPointSpec] = (),
) -> RunResult:
    """Run every scorer on every slice; return a :class:`RunResult` (no file IO).

    Parameters
    ----------
    scorers : dict[str, Scorer]
        Named scorers to evaluate.
    slices : sequence of EvalSlice
        Slices to evaluate on.
    run_id : str
        Caller-supplied run identifier, stored on the result.
    git_sha : str or None, optional
        Optional git SHA for provenance, stored on the result.
    n_resamples : int, optional
        Bootstrap resamples per CI. Default 1000.
    paired_diffs : list of (str, str) tuples, optional
        Pairs ``(a, b)`` for which to compute a paired bootstrap on
        ``pr_auc(b) - pr_auc(a)`` per slice.
    seed : int, optional
        RNG seed. Default 42.
    extra_config : Mapping or None, optional
        Additional keys merged into the recorded config (they override the
        built-in keys on collision).
    leakage_checks : sequence of LeakageCheck, optional
        Validators run over a ``{slice.name: slice}`` mapping before any
        scoring. Default empty (no checks run).
    on_leakage : {"raise", "record", "skip"}, optional
        Only consulted when ``leakage_checks`` is non-empty. Unless the value
        is ``"skip"``, the report is stored in ``config["leakage_report"]``.
        ``"raise"`` (default) additionally raises ``RuntimeError`` when the
        report has error findings.
    on_scorer_error : {"raise", "record"}, optional
        Forwarded to every :func:`evaluate_scorer_on_slice` call.
        ``"record"`` captures scorer exceptions per (slice, scorer) instead
        of failing the whole run.
    operating_point_specs : sequence of OperatingPointSpec, optional
        When non-empty, handed to
        :func:`_attach_transferred_operating_points` together with the
        per-(slice, scorer) score cache. Default empty (skip).

    Returns
    -------
    RunResult
        Result with ``by_slice[name] = {"n", "n_positive", "by_scorer",
        "paired_diffs"}``. Pass to :func:`write_run_result` to persist.

    Raises
    ------
    ValueError
        If ``scorers`` or ``slices`` is empty.
    RuntimeError
        If ``on_leakage="raise"`` and the leakage report has errors.
    """
    if not scorers:
        raise ValueError("at least one scorer required")
    if not slices:
        raise ValueError("at least one slice required")

    config: dict[str, object] = {
        "n_resamples": n_resamples,
        "seed": seed,
        "scorers": list(scorers.keys()),
        "slices": [s.name for s in slices],
        "paired_diffs": paired_diffs or [],
        "on_scorer_error": on_scorer_error,
    }
    if extra_config:
        config.update(dict(extra_config))

    slices_by_name = {s.name: s for s in slices}

    # Leakage checks run before any scoring.
    if leakage_checks:
        # Late import to avoid circular dependency: leakage.py imports EvalSlice.
        from eval_toolkit.leakage import run_leakage_checks

        report = run_leakage_checks(list(leakage_checks), dict(slices_by_name))
        config["on_leakage"] = on_leakage
        if on_leakage != "skip":
            config["leakage_report"] = report.to_dict()
        if on_leakage == "raise" and report.has_errors():
            errors_summary = "; ".join(f"{f.check_name}: {f.message}" for f in report.errors())
            raise RuntimeError(
                f"Leakage checks produced {len(report.errors())} error finding(s): "
                f"{errors_summary}. Pass on_leakage='record' to continue with the "
                "report captured in RunResult.config, or 'skip' to drop the report."
            )

    by_slice: dict[str, dict[str, object]] = {}
    score_cache: dict[tuple[str, str], np.ndarray] = {}

    for slice_ in slices:
        n_rows = len(slice_.df)
        n_positive = int(slice_.y_true.sum())
        _logger.info("[slice %s] n=%d, positives=%d", slice_.name, n_rows, n_positive)

        slice_data: dict[str, dict[str, object]] = {}
        scores_by_scorer: dict[str, np.ndarray] = {}
        for sname, scorer in scorers.items():
            if not _should_score_slice(scorer, slice_.name):
                reason = f"slice {slice_.name!r} not in scorer allow-list"
                slice_data[sname] = _skipped_scorer_result(slice_, reason)
                _logger.info(" skipped %s: %s", sname, reason)
                continue
            # perf_counter is monotonic, so the elapsed time cannot be skewed
            # by wall-clock adjustments.
            started = time.perf_counter()
            result = evaluate_scorer_on_slice(
                scorer,
                slice_,
                n_resamples=n_resamples,
                seed=seed,
                on_scorer_error=on_scorer_error,
            )
            slice_data[sname] = result
            scores = np.asarray(result["scores"], dtype=np.float64)
            # The cache keeps whatever was produced (an empty array for a
            # recorded scorer error), as before.
            score_cache[(slice_.name, sname)] = scores
            # Only full-length score arrays take part in paired diffs. A
            # scorer that errored under on_scorer_error="record" has
            # scores == [] and is reported as skipped there instead of being
            # fed into the bootstrap.
            if len(scores) == n_rows:
                scores_by_scorer[sname] = scores
            elapsed = time.perf_counter() - started
            pr = result.get("pr_auc")
            pr_display = f"{pr:.4f}" if isinstance(pr, float) else "N/A"
            _logger.info(" %s: PR-AUC=%s (%.1fs)", sname, pr_display, elapsed)

        diffs = (
            _compute_paired_diffs(
                slice_,
                scores_by_scorer,
                scorers,
                paired_diffs,
                n_resamples=n_resamples,
                seed=seed,
            )
            if paired_diffs
            else {}
        )

        by_slice[slice_.name] = {
            "n": int(n_rows),
            "n_positive": n_positive,
            "by_scorer": slice_data,
            "paired_diffs": diffs,
        }

    if operating_point_specs:
        _attach_transferred_operating_points(
            by_slice=by_slice,
            slices_by_name=slices_by_name,
            score_cache=score_cache,
            scorer_names=list(scorers.keys()),
            specs=operating_point_specs,
        )

    return RunResult(run_id=run_id, git_sha=git_sha, config=config, by_slice=by_slice)
824
+
825
+
826
def _attach_transferred_operating_points(
    *,
    by_slice: dict[str, dict[str, object]],
    slices_by_name: Mapping[str, EvalSlice],
    score_cache: Mapping[tuple[str, str], np.ndarray],
    scorer_names: Sequence[str],
    specs: Sequence[OperatingPointSpec],
) -> None:
    """Attach cross-slice operating-point results to ``by_slice`` in place.

    For every spec, thresholds are fitted per scorer on the cached scores of
    ``spec.fit_slice`` and then applied to the cached scores of each slice in
    ``spec.apply_slices``. The outcome for one (target slice, scorer, spec)
    triple is stored at
    ``by_slice[target]["by_scorer"][scorer]["transferred_operating_points"][spec.name]``
    and is one of:

    - ``{"error": ...}`` when the fit slice or apply slice is unknown, when
      fitting failed, or when applying raised ``ValueError``/``RuntimeError``;
    - ``{"skipped": ...}`` when the target has no usable cached scores;
    - the mapping returned by ``apply_operating_points`` otherwise.

    Parameters
    ----------
    by_slice : dict
        Per-slice result blocks; mutated.
    slices_by_name : Mapping[str, EvalSlice]
        Slices available for fitting and applying.
    score_cache : Mapping[tuple[str, str], np.ndarray]
        Scores keyed by ``(slice_name, scorer_name)``.
    scorer_names : Sequence[str]
        Fallback scorer list used when a spec names no scorers itself.
    specs : Sequence[OperatingPointSpec]
        Specs to process, in order.
    """
    for spec in specs:
        wanted = list(spec.scorer_names) if spec.scorer_names else list(scorer_names)
        if spec.fit_slice not in slices_by_name:
            _record_spec_error(by_slice, spec, wanted, f"fit slice {spec.fit_slice!r} not found")
            continue

        source = slices_by_name[spec.fit_slice]

        # Phase 1: fit thresholds per scorer. Failures are stored in-band as
        # ``{"error": ...}`` so phase 2 can report them per target.
        fits: dict[str, object] = {}
        for scorer_name in wanted:
            cached = score_cache.get((spec.fit_slice, scorer_name))
            usable = cached is not None and len(cached) == len(source.y_true)
            if not usable:
                fits[scorer_name] = {
                    "error": "fit scorer skipped, errored, or produced no scores"
                }
                continue
            try:
                fits[scorer_name] = fit_operating_points(
                    source.y_true,
                    cached,
                    spec.selectors,
                    fitted_on_slice=spec.fit_slice,
                    scorer_name=scorer_name,
                )
            except (ValueError, RuntimeError) as exc:
                fits[scorer_name] = {"error": str(exc)}

        # Phase 2: apply the fitted thresholds to every requested target slice.
        for target_name in spec.apply_slices:
            if target_name not in slices_by_name:
                _record_spec_error(
                    by_slice,
                    spec,
                    wanted,
                    f"apply slice {target_name!r} not found",
                    target_slice=target_name,
                )
                continue
            target = slices_by_name[target_name]
            for scorer_name in wanted:
                result_block = _scorer_result_block(by_slice, target_name, scorer_name)
                transfers = _transfer_result_block(result_block)
                outcome: dict[str, object] = {}
                # Registered before it is filled so every exit path below
                # leaves an entry for this spec.
                transfers[spec.name] = outcome

                fitted = fits.get(scorer_name)
                if not isinstance(fitted, dict):
                    outcome["error"] = "threshold fitting failed"
                    continue
                if "error" in fitted:
                    outcome["error"] = str(fitted["error"])
                    continue

                scores = score_cache.get((target_name, scorer_name))
                if scores is None or len(scores) != len(target.y_true):
                    outcome["skipped"] = "target scorer skipped, errored, or produced no scores"
                    continue
                try:
                    outcome.update(
                        apply_operating_points(
                            target.y_true,
                            scores,
                            cast(Mapping[str, FittedOperatingPoint], fitted),
                            applied_to_slice=target_name,
                            scorer_name=scorer_name,
                        )
                    )
                except (ValueError, RuntimeError) as exc:
                    outcome["error"] = str(exc)
903
+
904
+
905
def _scorer_result_block(
    by_slice: dict[str, dict[str, object]],
    slice_name: str,
    scorer_name: str,
) -> dict[str, object]:
    """Fetch ``by_slice[slice_name]["by_scorer"][scorer_name]``, creating it if needed.

    Any level that is missing or is not a ``dict`` is replaced by a fresh
    empty ``dict`` stored back into its parent, so the returned block is
    always the live object reachable from ``by_slice`` and may be mutated by
    the caller.
    """
    slice_entry = by_slice.get(slice_name)
    if not isinstance(slice_entry, dict):
        slice_entry = by_slice[slice_name] = {}

    scorers_entry = slice_entry.get("by_scorer")
    if not isinstance(scorers_entry, dict):
        scorers_entry = {}
        slice_entry["by_scorer"] = scorers_entry
    scorer_map = cast(dict[str, object], scorers_entry)

    block = scorer_map.get(scorer_name)
    if not isinstance(block, dict):
        block = {}
        scorer_map[scorer_name] = block
    return cast(dict[str, object], block)
928
+
929
+
930
def _transfer_result_block(scorer_block: dict[str, object]) -> dict[str, object]:
    """Return ``scorer_block["transferred_operating_points"]``, creating it if needed.

    An existing ``dict`` is returned as-is; a missing or non-``dict`` value is
    replaced by a fresh empty ``dict`` that is stored back into
    ``scorer_block`` before being returned.
    """
    existing = scorer_block.get("transferred_operating_points")
    if isinstance(existing, dict):
        return existing
    created: dict[str, object] = {}
    scorer_block["transferred_operating_points"] = created
    return created
938
+
939
+
940
def _record_spec_error(
    by_slice: dict[str, dict[str, object]],
    spec: OperatingPointSpec,
    scorer_names: Sequence[str],
    message: str,
    *,
    target_slice: str | None = None,
) -> None:
    """Record ``{"error": message}`` for ``spec`` under each affected scorer block.

    The affected slices are ``[target_slice]`` when it is given, otherwise
    every slice in ``spec.apply_slices``. A slice that has no entry in
    ``by_slice`` yet receives a zero-count placeholder entry first, so the
    error always has somewhere to live.
    """
    if target_slice is None:
        affected = list(spec.apply_slices)
    else:
        affected = [target_slice]

    for slice_name in affected:
        if slice_name not in by_slice:
            by_slice[slice_name] = {
                "n": 0,
                "n_positive": 0,
                "by_scorer": {},
                "paired_diffs": {},
            }
        for scorer_name in scorer_names:
            transfers = _transfer_result_block(
                _scorer_result_block(by_slice, slice_name, scorer_name)
            )
            # A fresh dict per scorer so blocks never share one error object.
            transfers[spec.name] = {"error": message}
959
+
960
+
961
def _extract_metric_value(slice_dict: object, metric: str) -> float | None:
    """Pull a numeric metric from one ``by_slice[scorer]`` dict, or ``None``.

    Parameters
    ----------
    slice_dict : object
        Expected to be a scorer result ``dict``; anything else yields ``None``.
    metric : str
        Key to look up in ``slice_dict``.

    Returns
    -------
    float or None
        The value converted with ``float()`` when it is a builtin ``int`` /
        ``float`` or a NumPy integer / floating scalar. ``None`` when the key
        is absent, or when the value is a boolean (builtin or NumPy) or any
        other non-numeric type.
    """
    if not isinstance(slice_dict, dict):
        return None
    val = slice_dict.get(metric)
    # Booleans are rejected explicitly: ``bool`` is an ``int`` subclass and
    # would otherwise be reported as 0.0 / 1.0.
    if isinstance(val, (bool, np.bool_)):
        return None
    # NumPy scalars such as ``np.float32`` / ``np.int64`` do not subclass the
    # builtin numeric types, so they are listed separately.
    if isinstance(val, (int, float, np.integer, np.floating)):
        return float(val)
    return None
969
+
970
+
971
def _build_fold_summary(
    by_fold: Mapping[str, RunResult],
    slice_names: Sequence[str],
    scorer_names: Sequence[str],
    summary_metrics: Sequence[str] = ("pr_auc", "roc_auc"),
) -> dict[str, dict[str, object]]:
    """Summarise per-fold metrics as ``summary[slice][scorer][metric]``.

    For each (slice, scorer, metric) triple the per-fold values are gathered
    from ``by_fold`` and handed to :func:`eval_toolkit.bootstrap.cv_clt_ci`.

    - A fold whose scorer entry has no numeric value for the metric
      contributes ``nan``.
    - A fold whose slice block or ``by_scorer`` block is not a ``dict``
      contributes nothing.
    - With fewer than two non-NaN values, or when ``cv_clt_ci`` raises
      ``ValueError``/``RuntimeError``, the entry is ``{"skipped": "<reason>"}``.
    - Otherwise the entry is ``{"mean", "ci_low", "ci_high", "n_folds"}``,
      where ``n_folds`` counts the non-NaN values.
    """

    def _summarise(slice_name: str, scorer_name: str, metric: str) -> dict[str, object]:
        # Collect one value per usable fold, in ``by_fold`` iteration order.
        observed: list[float] = []
        for fold_result in by_fold.values():
            slice_block = fold_result.by_slice.get(slice_name, {})
            if not isinstance(slice_block, dict):
                continue
            scorer_blocks = slice_block.get("by_scorer", {})
            if not isinstance(scorer_blocks, dict):
                continue
            value = _extract_metric_value(scorer_blocks.get(scorer_name), metric)
            observed.append(float("nan") if value is None else value)

        arr = np.asarray(observed, dtype=np.float64)
        numeric = arr[~np.isnan(arr)]
        if len(numeric) < 2:
            return {"skipped": f"only {len(numeric)} numeric fold(s); CV-CI needs >=2"}
        try:
            # The full array (NaNs included) is what gets passed on.
            ci = cv_clt_ci(arr)
            return {
                "mean": ci.point_estimate,
                "ci_low": ci.ci_low,
                "ci_high": ci.ci_high,
                "n_folds": int(len(numeric)),
            }
        except (ValueError, RuntimeError) as exc:
            return {"skipped": str(exc)}

    return {
        slice_name: {
            scorer_name: {
                metric: _summarise(slice_name, scorer_name, metric)
                for metric in summary_metrics
            }
            for scorer_name in scorer_names
        }
        for slice_name in slice_names
    }
1021
+
1022
+
1023
def evaluate_folded(
    scorers: dict[str, Scorer],
    splitter: Splitter,
    slice_: EvalSlice,
    *,
    run_id: str,
    git_sha: str | None = None,
    seeds: Sequence[int] = (42,),
    n_resamples: int = DEFAULT_BOOTSTRAP_RESAMPLES,
    paired_diffs: list[tuple[str, str]] | None = None,
    leakage_checks: Sequence[LeakageCheck] = (),
    on_leakage: Literal["raise", "record", "skip"] = "raise",
    on_scorer_error: Literal["raise", "record"] = "raise",
    eval_split_names: Sequence[str] = ("test",),
    summary_metrics: Sequence[str] = ("pr_auc", "roc_auc"),
) -> RunResult:
    """Evaluate the scorers on every fold of ``slice_`` and summarise across folds.

    For each seed, the folds yielded by ``splitter.iter_folds(slice_)`` are
    walked in order. From each fold only the splits named in
    ``eval_split_names`` are kept and passed to :func:`evaluate`. The returned
    :class:`RunResult` carries:

    - ``by_fold["seed=<seed>/fold=<i>"]`` — the raw per-fold
      :class:`RunResult`.
    - ``fold_summary[slice_name][scorer_name][metric]`` — the cross-fold
      summary built by ``_build_fold_summary``: either
      ``{mean, ci_low, ci_high, n_folds}`` or ``{"skipped": "<reason>"}``.

    Parameters
    ----------
    scorers : dict[str, Scorer]
        Named scorers; must be non-empty.
    splitter : Splitter
        Object providing ``iter_folds(slice_)``; each fold is a mapping from
        split name to slice.
    slice_ : EvalSlice
        Parent dataset handed to the splitter.
    run_id : str
        Identifier stored on the aggregated result.
    git_sha : str or None
        Stored on the aggregated result and forwarded to each per-fold run.
    seeds : sequence of int, optional
        One pass over the folds is made per seed. The seed is forwarded to
        :func:`evaluate`; ``splitter.iter_folds`` is called with ``slice_``
        only. Default ``(42,)``.
    n_resamples, paired_diffs, leakage_checks, on_leakage, on_scorer_error :
        Forwarded unchanged to :func:`evaluate` for every fold.
    eval_split_names : sequence of str, optional
        Split names to evaluate in each fold; names absent from a fold are
        ignored. Default ``("test",)``.
    summary_metrics : sequence of str, optional
        Metrics aggregated into ``fold_summary``. Default
        ``("pr_auc", "roc_auc")``.

    Returns
    -------
    RunResult
        ``by_slice`` is empty; per-fold detail lives in ``by_fold`` and the
        aggregate in ``fold_summary``.

    Raises
    ------
    ValueError
        If ``scorers`` is empty, or if a fold contains none of the splits
        named in ``eval_split_names``.

    Notes
    -----
    The same ``scorers`` mapping is used for every fold, so this is eval-only
    K-fold: nothing is re-trained between folds here.
    """
    if not scorers:
        raise ValueError("at least one scorer required")

    per_fold: dict[str, RunResult] = {}
    seen_slice_names: set[str] = set()
    for seed in seeds:
        for fold_idx, fold_dict in enumerate(splitter.iter_folds(slice_)):
            fold_id = f"seed={seed}/fold={fold_idx}"
            selected = [fold_dict[name] for name in eval_split_names if name in fold_dict]
            if not selected:
                raise ValueError(
                    f"fold {fold_id}: none of eval_split_names={list(eval_split_names)} "
                    f"present in fold keys={list(fold_dict.keys())}"
                )
            # Remember which slice names occurred so the summary covers them all.
            seen_slice_names.update(part.name for part in selected)
            per_fold[fold_id] = evaluate(
                scorers,
                selected,
                run_id=fold_id,
                git_sha=git_sha,
                n_resamples=n_resamples,
                paired_diffs=paired_diffs,
                seed=seed,
                leakage_checks=leakage_checks,
                on_leakage=on_leakage,
                on_scorer_error=on_scorer_error,
            )

    scorer_keys = list(scorers.keys())
    summary = _build_fold_summary(
        per_fold,
        slice_names=sorted(seen_slice_names),
        scorer_names=scorer_keys,
        summary_metrics=summary_metrics,
    )

    config: dict[str, object] = {
        "n_resamples": n_resamples,
        "seeds": list(seeds),
        "scorers": list(scorers.keys()),
        "splitter": type(splitter).__name__,
        "eval_split_names": list(eval_split_names),
        "summary_metrics": list(summary_metrics),
        "n_folds": int(len(per_fold)),
    }
    return RunResult(
        run_id=run_id,
        git_sha=git_sha,
        config=config,
        by_slice={},
        by_fold=per_fold,
        fold_summary=summary,
    )
1149
+
1150
+
1151
def write_run_result(result: RunResult, run_dir: Path) -> tuple[Path, Path]:
    """Serialise ``result`` into ``run_dir`` as a full and a compact JSON file.

    Parameters
    ----------
    result : RunResult
        Run to serialise via ``result.to_dict()``.
    run_dir : pathlib.Path
        Output directory; created (with parents) when missing.

    Returns
    -------
    tuple[pathlib.Path, pathlib.Path]
        ``(results_json_path, results_full_json_path)`` — the compact file
        first, then the full one.

    Notes
    -----
    ``results_full.json`` holds the complete payload. ``results.json`` holds
    the payload after ``_strip_scores`` has removed the ``scores`` arrays, so
    the headline file stays small.
    """
    run_dir.mkdir(parents=True, exist_ok=True)
    compact_path = run_dir.joinpath("results.json")
    full_path = run_dir.joinpath("results_full.json")

    # The full file is written first; the compact payload is derived from a
    # second, independent ``to_dict()`` call.
    full_payload = result.to_dict()
    write_json_strict(full_payload, full_path)

    compact_payload = _strip_scores(result.to_dict())
    write_json_strict(compact_payload, compact_path)
    return compact_path, full_path
1177
+
1178
+
1179
def _strip_scores(d: dict[str, object]) -> dict[str, object]:
    """Drop the per-row ``scores`` arrays from the headline JSON.

    The payload is first passed through ``sanitize_for_json``. ``scores`` is
    then removed from every ``by_slice[*]["by_scorer"][*]`` block. The same is
    done inside each nested payload under ``by_fold``, when those are
    mappings with the same layout, so folded runs are compacted too.

    Parameters
    ----------
    d : dict
        Serialised run result.

    Returns
    -------
    dict
        The sanitised payload with ``scores`` entries removed.

    Raises
    ------
    TypeError
        If ``sanitize_for_json`` does not return a ``dict``.
    """
    out = sanitize_for_json(d)
    if not isinstance(out, dict):
        raise TypeError("_strip_scores expected a mapping payload")
    _drop_scores_in_place(out)
    return out


def _drop_scores_in_place(payload: dict[str, object]) -> None:
    """Remove ``scores`` from scorer blocks in ``payload`` and its fold payloads."""
    by_slice = payload.get("by_slice")
    if isinstance(by_slice, dict):
        for slice_data in by_slice.values():
            if not isinstance(slice_data, dict):
                continue
            by_scorer = slice_data.get("by_scorer")
            if not isinstance(by_scorer, dict):
                continue
            for scorer_data in by_scorer.values():
                if isinstance(scorer_data, dict):
                    scorer_data.pop("scores", None)

    # Folded runs keep their per-row scores inside the per-fold payloads,
    # which the top-level ``by_slice`` walk above never reaches.
    by_fold = payload.get("by_fold")
    if isinstance(by_fold, dict):
        for fold_payload in by_fold.values():
            if isinstance(fold_payload, dict):
                _drop_scores_in_place(fold_payload)