eval-toolkit 0.34.0__tar.gz → 0.36.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/CHANGELOG.md +83 -0
  2. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/PKG-INFO +1 -1
  3. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/__init__.py +1 -0
  4. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/_version.py +1 -1
  5. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/calibration.py +97 -0
  6. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/harness.py +227 -35
  7. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/protocols.py +10 -0
  8. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/golden/public_api/snapshot.json +9 -3
  9. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_calibration_unit.py +126 -0
  10. eval_toolkit-0.36.0/tests/test_harness_parallelism.py +266 -0
  11. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/.gitignore +0 -0
  12. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/LICENSE +0 -0
  13. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/README.md +0 -0
  14. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/STYLE.md +0 -0
  15. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/docs/archive/README.md +0 -0
  16. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/docs/research/README.md +0 -0
  17. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/docs/research/datasets/README.md +0 -0
  18. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/docs/research/papers/data-integrity/README.md +0 -0
  19. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  20. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/docs/research/papers/inference/README.md +0 -0
  21. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/docs/research/papers/prompt-injection/README.md +0 -0
  22. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/docs/source/methodology/README.md +0 -0
  23. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/pyproject.toml +0 -0
  24. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/__main__.py +0 -0
  25. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/_deprecated.py +0 -0
  26. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/_parallel.py +0 -0
  27. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/analysis.py +0 -0
  28. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/artifacts.py +0 -0
  29. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/bootstrap.py +0 -0
  30. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/claims.py +0 -0
  31. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/config.py +0 -0
  32. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/docs.py +0 -0
  33. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/embeddings.py +0 -0
  34. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/evidence.py +0 -0
  35. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/leakage.py +0 -0
  36. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/loaders.py +0 -0
  37. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/manifest.py +0 -0
  38. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/metrics.py +0 -0
  39. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/operating_points.py +0 -0
  40. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/paths.py +0 -0
  41. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/plotting.py +0 -0
  42. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/provenance.py +0 -0
  43. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/py.typed +0 -0
  44. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  45. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  46. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  47. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  48. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  49. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/seeds.py +0 -0
  50. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/splits.py +0 -0
  51. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/text_dedup.py +0 -0
  52. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/src/eval_toolkit/thresholds.py +0 -0
  53. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  54. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  55. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  56. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  57. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  58. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  59. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  60. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  61. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  62. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  63. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/benchmarks/__init__.py +0 -0
  64. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  65. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/conftest.py +0 -0
  66. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  67. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  68. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  69. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  70. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/golden/docs/expected.md +0 -0
  71. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/golden/docs/input.md +0 -0
  72. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/golden/docs/metrics.json +0 -0
  73. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  74. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/strategies.py +0 -0
  75. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_analysis.py +0 -0
  76. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_artifacts.py +0 -0
  77. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  78. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  79. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_bootstrap_edge_cases.py +0 -0
  80. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_bootstrap_golden.py +0 -0
  81. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_bootstrap_njobs.py +0 -0
  82. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_bootstrap_props.py +0 -0
  83. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_bootstrap_research_grounded.py +0 -0
  84. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_bootstrap_unit.py +0 -0
  85. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  86. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_calibration_determinism.py +0 -0
  87. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_calibration_optimization_failures.py +0 -0
  88. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_calibration_props.py +0 -0
  89. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_calibration_research_grounded.py +0 -0
  90. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_claims.py +0 -0
  91. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_claims_coverage.py +0 -0
  92. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_claims_props.py +0 -0
  93. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_cli.py +0 -0
  94. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_config.py +0 -0
  95. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_coverage_bootstrap.py +0 -0
  96. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_coverage_calibration.py +0 -0
  97. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_coverage_harness.py +0 -0
  98. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_coverage_metrics.py +0 -0
  99. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_coverage_plotting.py +0 -0
  100. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  101. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_deprecations.py +0 -0
  102. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_docs_golden.py +0 -0
  103. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_docs_props.py +0 -0
  104. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_embeddings.py +0 -0
  105. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_evidence_validators.py +0 -0
  106. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_harness_edge_cases.py +0 -0
  107. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_harness_fault_injection.py +0 -0
  108. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_harness_folded.py +0 -0
  109. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_harness_internals.py +0 -0
  110. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_harness_metric_options.py +0 -0
  111. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_harness_smoke.py +0 -0
  112. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_import_boundaries.py +0 -0
  113. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_leakage.py +0 -0
  114. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_leakage_error_paths.py +0 -0
  115. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_leakage_props.py +0 -0
  116. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_loaders.py +0 -0
  117. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_loaders_coverage.py +0 -0
  118. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_loaders_props.py +0 -0
  119. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_logging.py +0 -0
  120. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_manifest.py +0 -0
  121. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  122. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_manifest_props.py +0 -0
  123. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_manifest_validation.py +0 -0
  124. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_metrics_props.py +0 -0
  125. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_metrics_stratified_subsets.py +0 -0
  126. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_metrics_unit.py +0 -0
  127. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_misc_coverage.py +0 -0
  128. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_numeric_edge_cases.py +0 -0
  129. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_operating_points.py +0 -0
  130. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_operating_points_props.py +0 -0
  131. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_parallel.py +0 -0
  132. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_paths.py +0 -0
  133. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_pipeline_e2e.py +0 -0
  134. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_plotting_edge.py +0 -0
  135. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_plotting_smoke.py +0 -0
  136. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_plotting_visual.py +0 -0
  137. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_protocol_conformance.py +0 -0
  138. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_provenance.py +0 -0
  139. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_public_api.py +0 -0
  140. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_recall_at_fpr.py +0 -0
  141. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_reference_equivalence.py +0 -0
  142. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_reproducibility_integration.py +0 -0
  143. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_schemas.py +0 -0
  144. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_seeds.py +0 -0
  145. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_splits.py +0 -0
  146. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_splits_leakage_integration.py +0 -0
  147. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_splits_props.py +0 -0
  148. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_text_dedup.py +0 -0
  149. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_text_dedup_coverage.py +0 -0
  150. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_text_dedup_props.py +0 -0
  151. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_text_dedup_strategies.py +0 -0
  152. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_thresholds.py +0 -0
  153. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_thresholds_constant_score.py +0 -0
  154. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_thresholds_coverage.py +0 -0
  155. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_thresholds_props.py +0 -0
  156. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_thresholds_research_grounded.py +0 -0
  157. {eval_toolkit-0.34.0 → eval_toolkit-0.36.0}/tests/test_v09_contracts.py +0 -0
@@ -7,6 +7,89 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.36.0] — 2026-05-18 — harness parallelization (#29, #30) + Node 24 actions
11
+
12
+ Wires the v0.34.0 unified parallelism pattern into the harness evaluation
13
+ loop. `evaluate()` and `evaluate_folded()` now accept an `n_jobs` kwarg
14
+ (default `1` preserves bit-identical sequential behavior); under
15
+ `n_jobs != 1`, the `(slice × scorer)` work-unit loop in
16
+ `_score_all_slices` and the `(spec × scorer)` fit phase in
17
+ `_attach_transferred_operating_points` dispatch through joblib loky via
18
+ the existing `_parallel.parallel_map` helper.
19
+
20
+ ### Added
21
+
22
+ - `evaluate(..., n_jobs: int = 1)` and `evaluate_folded(..., n_jobs: int = 1)`
23
+ — keyword-only argument per Principle #3 of `methodology/parallelism.md`.
24
+ `n_jobs=1` (default) runs the existing pure-Python sequential loop
25
+ (Principle #4 — bit-identical to v0.35). `n_jobs > 1` uses joblib loky;
26
+ `n_jobs=-1` uses all cores; `n_jobs=0` is rejected. Closes #29, #30.
27
+ - Strict-pickle Scorer sniff at `evaluate()` entry when `n_jobs != 1`:
28
+ raises a clean `TypeError` referencing
29
+ `methodology/parallelism.md#scorer-picklability` with the underlying
30
+ pickle error attached. Reuses the v0.35 ADR contract; no new exception
31
+ class. Catches non-picklable scorers up front rather than relying on
32
+ joblib's more permissive cloudpickle path (which would silently absorb
33
+ closures and obscure the contract documented in v0.35).
34
+
35
+ ### Internal
36
+
37
+ - New module-scope step functions `_score_one_pair` and
38
+ `_fit_one_op_point_pair` in `harness.py` (picklable; required by loky).
39
+ - `_score_all_slices` and `_attach_transferred_operating_points`
40
+ refactored to use flat work-unit dispatch via `parallel_map`.
41
+
42
+ ### Tests
43
+
44
+ - New `tests/test_harness_parallelism.py` (7 tests): bit-identical
45
+ reproducibility across `n_jobs=1` vs `n_jobs=2` for `evaluate`
46
+ (basic, paired-diffs, operating-points), `evaluate_folded`,
47
+ picklability rejection (closure scorer), `n_jobs=0` rejection,
48
+ `n_jobs=-1` smoke. All 66 harness tests pass (7 new + 59 existing).
49
+
50
+ ### Infrastructure
51
+
52
+ - Bumped `actions/upload-artifact` and `actions/download-artifact` from
53
+ `@v5` → `@v6` across `publish.yml` / `nightly-mc.yml` /
54
+ `nightly-benchmarks.yml`. The v6 majors run on Node.js 24
55
+ (GitHub deprecates Node 20 actions from 2026-06-02). Other pinned
56
+ actions (`checkout@v6`, `setup-uv@v8.1.0`, `codeql-action@v3`,
57
+ `deploy-pages@v4`, `upload-pages-artifact@v3`) were not flagged in
58
+ the v0.35 publish annotation and are deferred to a separate audit.
59
+
60
+ ## [0.35.0] — 2026-05-18 — `fit_temperature_binary` + Scorer picklability ADR
61
+
62
+ Small, additive release. Adds a binary-classification calibration helper
63
+ that lets consumers drop the ~50 LOC scalar-proba adapter many were
64
+ carrying, plus a design ADR that unblocks the v0.36 harness /
65
+ operating-point parallelization work (#29, #30) without re-litigating picklability.
66
+
67
+ ### Added
68
+
69
+ - `eval_toolkit.fit_temperature_binary(y_true, y_score)` — scalar-proba
70
+ adapter for the multi-class `fit_temperature` fitter. Converts `(n,)`
71
+ probabilities of class 1 to a 2-column logit array via clipped logit
72
+ (`[0, logit(p)]` so softmax column 1 reproduces `p`), delegates to the
73
+ deployment-quality fitter, and returns `(T_opt, apply)` where
74
+ `apply: (n,) -> (n,)` does scalar-in / scalar-out T-scaling. Unlike
75
+ `fit_temperature_oracle`, it emits no warning — the contract assumes val / test
76
+ separation (deployment-quality calibration, not fit-on-test). Closes
77
+ #28.
78
+
79
+ ### Documentation
80
+
81
+ - `docs/source/methodology/parallelism.md` — new `## Scorer picklability`
82
+ sub-section documenting the Scorer protocol's picklability contract
83
+ for `n_jobs > 1` usage. Includes worked picklable / broken-closure /
84
+ fix examples plus a list of common non-picklable patterns to watch for
85
+ in user-supplied Scorers (closures, lambdas on instances, local-scope
86
+ classes, attributes holding live sockets / file handles). Anchors on
87
+ the existing v0.34.0 `parallel_map` pickle sniff + `TypeError`
88
+ channel — no new exception class. Unblocks v0.36 implementation of
89
+ #29 and #30.
90
+ - `eval_toolkit.protocols.Scorer` docstring — Notes block pointing at
91
+ the new methodology section.
92
+
10
93
  ## [0.34.0] — 2026-05-17 — Phase 4 stats unblockers + unified parallelism + cookbook (BREAKING)
11
94
 
12
95
  Closes all 7 open backlog issues in one consumer-closing release. Also
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.34.0
3
+ Version: 0.36.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -87,6 +87,7 @@ _EXPORTS: dict[str, str] = {
87
87
  "fit_isotonic_calibrator": "eval_toolkit.calibration",
88
88
  "fit_platt_calibrator": "eval_toolkit.calibration",
89
89
  "fit_temperature": "eval_toolkit.calibration",
90
+ "fit_temperature_binary": "eval_toolkit.calibration",
90
91
  "fit_temperature_oracle": "eval_toolkit.calibration",
91
92
  "reliability_curve": "eval_toolkit.calibration",
92
93
  "reliability_diagram_data": "eval_toolkit.calibration",
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "0.34.0"
5
+ __version__ = "0.36.0"
@@ -57,6 +57,7 @@ __all__ = [
57
57
  "fit_isotonic_calibrator",
58
58
  "fit_platt_calibrator",
59
59
  "fit_temperature",
60
+ "fit_temperature_binary",
60
61
  "fit_temperature_oracle",
61
62
  "maximum_calibration_error",
62
63
  "reliability_curve",
@@ -1038,6 +1039,102 @@ def _negative_log_likelihood(t: float, logits: np.ndarray, labels: np.ndarray) -
1038
1039
  return float(-log_probs[np.arange(len(labels)), labels].mean())
1039
1040
 
1040
1041
 
1042
+ def fit_temperature_binary(
1043
+ y_true: np.ndarray,
1044
+ y_score: np.ndarray,
1045
+ *,
1046
+ bounds: tuple[float, float] = (0.05, 20.0),
1047
+ ) -> tuple[float, Callable[[np.ndarray], np.ndarray]]:
1048
+ r"""Binary-probability adapter for :func:`fit_temperature` (Guo et al. 2017 [#guo]_).
1049
+
1050
+ Fits a scalar T > 0 on *validation* probabilities of class 1 and returns
1051
+ both T and a callable that applies the same T-scaling to test
1052
+ probabilities. Internally:
1053
+
1054
+ 1. Clips ``y_score`` to ``[1e-7, 1-1e-7]`` for finite logit inversion.
1055
+ 2. Builds a 2-column logit array ``[0, logit(p)]`` so softmax column 1
1056
+ reproduces ``p`` exactly.
1057
+ 3. Delegates to :func:`fit_temperature` for the bounded NLL minimization.
1058
+ 4. Returns ``(T, apply)`` where ``apply(p_test) = sigmoid(logit(p_test)/T)``.
1059
+
1060
+ Unlike :func:`fit_temperature_oracle`, this does NOT emit a warning — the
1061
+ contract is that ``y_true`` / ``y_score`` come from a held-out validation
1062
+ set and ``apply`` is invoked on a separate test set (deployment-quality
1063
+ calibration, not fit-on-test).
1064
+
1065
+ Parameters
1066
+ ----------
1067
+ y_true : np.ndarray, shape (n,)
1068
+ Binary validation labels in {0, 1}.
1069
+ y_score : np.ndarray, shape (n,)
1070
+ Validation predicted probabilities of class 1, in [0, 1]. Values at
1071
+ the extremes are clipped to ``[1e-7, 1 - 1e-7]``.
1072
+ bounds : tuple of float, optional
1073
+ ``(lo, hi)`` bracket for T. Default ``(0.05, 20.0)``, matches
1074
+ :func:`fit_temperature`.
1075
+
1076
+ Returns
1077
+ -------
1078
+ tuple
1079
+ ``(T_optimal, apply)`` where ``apply: (n,) -> (n,)`` maps any input
1080
+ probability array through :math:`\sigma(\mathrm{logit}(p) / T)`.
1081
+
1082
+ Raises
1083
+ ------
1084
+ ValueError
1085
+ On shape mismatch, empty input, non-finite scores, or single-class
1086
+ ``y_true``.
1087
+ RuntimeError
1088
+ If the bounded scalar optimizer fails to converge.
1089
+
1090
+ Examples
1091
+ --------
1092
+ >>> import numpy as np
1093
+ >>> rng = np.random.default_rng(0)
1094
+ >>> n = 500
1095
+ >>> y_val = rng.binomial(1, 0.3, size=n).astype(int)
1096
+ >>> p_val = np.clip(y_val * 0.6 + rng.normal(0, 0.2, n), 0.01, 0.99)
1097
+ >>> T, apply = fit_temperature_binary(y_val, p_val)
1098
+ >>> T > 0
1099
+ True
1100
+ >>> p_test = np.array([0.1, 0.5, 0.9])
1101
+ >>> apply(p_test).shape == (3,)
1102
+ True
1103
+
1104
+ See Also
1105
+ --------
1106
+ fit_temperature : underlying multi-class fitter (operates on 2-col logits)
1107
+ fit_temperature_oracle : diagnostic-only variant that fits T on the same
1108
+ probabilities it scores
1109
+
1110
+ References
1111
+ ----------
1112
+ .. [#guo] Guo, C., Pleiss, G., Sun, Y., & Weinberger, K. Q. "On
1113
+ calibration of modern neural networks." ICML 2017. arXiv:1706.04599.
1114
+ """
1115
+ y_true_arr, y_score_arr = _validate_calibrator_inputs(y_true, y_score)
1116
+
1117
+ # Build 2-col logits [0, logit(p)] so softmax([0, logit(p)])[1] == p exactly.
1118
+ s_clipped = np.clip(y_score_arr, _SCORE_CLIP_LO, _SCORE_CLIP_HI)
1119
+ logit_pos = np.log(s_clipped / (1.0 - s_clipped))
1120
+ val_logits_2col = np.column_stack([np.zeros_like(logit_pos), logit_pos])
1121
+
1122
+ result = fit_temperature(val_logits_2col, y_true_arr, bounds=bounds)
1123
+ t_optimal = float(result["temperature"])
1124
+
1125
+ def apply(scores: np.ndarray) -> np.ndarray:
1126
+ arr = np.asarray(scores, dtype=float).ravel()
1127
+ if not np.isfinite(arr).all():
1128
+ raise ValueError("scores contains NaN or inf")
1129
+ clipped = np.clip(arr, _SCORE_CLIP_LO, _SCORE_CLIP_HI)
1130
+ logit = np.log(clipped / (1.0 - clipped))
1131
+ scaled = logit / t_optimal
1132
+ out: np.ndarray = (1.0 / (1.0 + np.exp(-scaled))).astype(float)
1133
+ return out
1134
+
1135
+ return t_optimal, apply
1136
+
1137
+
1041
1138
  def fit_temperature_oracle(
1042
1139
  y_true: np.ndarray, y_score: np.ndarray
1043
1140
  ) -> tuple[float, Callable[[np.ndarray], np.ndarray]]:
@@ -32,6 +32,7 @@ v0.7.0 additions:
32
32
  from __future__ import annotations
33
33
 
34
34
  import logging
35
+ import pickle
35
36
  import time
36
37
  import traceback
37
38
  from collections.abc import Mapping, Sequence
@@ -41,6 +42,7 @@ from typing import TYPE_CHECKING, Final, Literal, cast
41
42
 
42
43
  import numpy as np
43
44
 
45
+ from eval_toolkit._parallel import parallel_map
44
46
  from eval_toolkit.artifacts import (
45
47
  error_metric,
46
48
  sanitize_for_json,
@@ -62,7 +64,7 @@ from eval_toolkit.operating_points import (
62
64
  fit_operating_points,
63
65
  )
64
66
  from eval_toolkit.protocols import Scorer, SliceAwareScorer
65
- from eval_toolkit.thresholds import TargetFPRSelector
67
+ from eval_toolkit.thresholds import TargetFPRSelector, ThresholdSelector
66
68
 
67
69
  if TYPE_CHECKING:
68
70
  import pandas as pd
@@ -278,6 +280,31 @@ def _object_to_dict(obj: object, *, what: str) -> dict[str, object]:
278
280
  raise TypeError(f"expected {what} mapping or object with to_dict(), got {type(obj).__name__}")
279
281
 
280
282
 
283
+ def _assert_scorers_picklable(scorers: Mapping[str, Scorer]) -> None:
284
+ """Strict-pickle sniff for Scorer args when ``n_jobs != 1``.
285
+
286
+ joblib's loky backend uses cloudpickle (which absorbs closures + local
287
+ classes), but the v0.35 Scorer picklability ADR
288
+ (``methodology/parallelism.md#scorer-picklability``) is a *strict* pickle
289
+ contract — cloudpickle behavior is platform-dependent and the more
290
+ permissive failure modes are harder to debug. Fail fast at
291
+ :func:`evaluate` entry with the same ``TypeError`` style as
292
+ :func:`eval_toolkit._parallel.parallel_map`'s fn-sniff (no new exception
293
+ class — single channel for the picklability contract).
294
+ """
295
+ for sname, scorer in scorers.items():
296
+ try:
297
+ pickle.dumps(scorer)
298
+ except (pickle.PicklingError, AttributeError, TypeError) as exc:
299
+ raise TypeError(
300
+ f"evaluate(n_jobs != 1): scorer {sname!r} "
301
+ f"({type(scorer).__name__}) is not picklable. See "
302
+ f"methodology/parallelism.md#scorer-picklability for the "
303
+ f"contract and worked picklable / broken / fix examples. "
304
+ f"Underlying error: {exc}"
305
+ ) from exc
306
+
307
+
281
308
  def _should_score_slice(scorer: Scorer, slice_name: str) -> bool:
282
309
  """Honor optional slice-aware scorer hooks without widening the base Protocol."""
283
310
  should_score = getattr(scorer, "should_score_slice", None)
@@ -696,6 +723,36 @@ def _run_leakage_phase(
696
723
  )
697
724
 
698
725
 
726
+ # Tuple shape for the flat `(slice × scorer)` work-unit dispatched to
727
+ # parallel_map by `_score_all_slices`. Defined at module scope so workers
728
+ # can pickle the function reference.
729
+ _ScoreOnePairItem = tuple[EvalSlice, str, Scorer, int, int, Literal["raise", "record"]]
730
+ _ScoreOnePairResult = tuple[str, str, dict[str, object], np.ndarray]
731
+
732
+
733
+ def _score_one_pair(item: _ScoreOnePairItem) -> _ScoreOnePairResult:
734
+ """Picklable step function for ``(slice × scorer)`` parallel dispatch.
735
+
736
+ Module-scope so loky workers can serialize the reference (closures over
737
+ enclosing locals would fail :func:`parallel_map`'s pickle sniff). All
738
+ inputs flow through the ``item`` tuple — no captured state.
739
+
740
+ Returns ``(slice_name, scorer_name, result_dict, scores_array)`` so the
741
+ caller can reassemble ``by_slice`` + ``score_cache`` in the original
742
+ iteration order.
743
+ """
744
+ slice_, sname, scorer, n_resamples, seed, on_scorer_error = item
745
+ result = evaluate_scorer_on_slice(
746
+ scorer,
747
+ slice_,
748
+ n_resamples=n_resamples,
749
+ seed=seed,
750
+ on_scorer_error=on_scorer_error,
751
+ )
752
+ scores = np.asarray(result["scores"], dtype=np.float64)
753
+ return slice_.name, sname, result, scores
754
+
755
+
699
756
  def _score_all_slices(
700
757
  scorers: dict[str, Scorer],
701
758
  slices: Sequence[EvalSlice],
@@ -704,6 +761,7 @@ def _score_all_slices(
704
761
  seed: int,
705
762
  paired_diffs: list[tuple[str, str]] | None,
706
763
  on_scorer_error: Literal["raise", "record"],
764
+ n_jobs: int = 1,
707
765
  ) -> tuple[dict[str, dict[str, object]], dict[tuple[str, str], np.ndarray]]:
708
766
  """Score every ``(slice, scorer)`` pair; return ``(by_slice, score_cache)``.
709
767
 
@@ -714,10 +772,17 @@ def _score_all_slices(
714
772
  ``score_cache`` is keyed ``(slice.name, scorer.name)`` and carries the
715
773
  raw score arrays so :func:`_attach_transferred_operating_points` can
716
774
  re-use them without re-calling scorers.
717
- """
718
- by_slice: dict[str, dict[str, object]] = {}
719
- score_cache: dict[tuple[str, str], np.ndarray] = {}
720
775
 
776
+ v0.36 added ``n_jobs``: a flat ``(slice × scorer)`` parallel dispatch
777
+ via :func:`eval_toolkit._parallel.parallel_map`. Default ``1`` preserves
778
+ bit-identical sequential behavior. ``n_jobs != 1`` requires picklable
779
+ scorers per the v0.35 ADR
780
+ (``docs/source/methodology/parallelism.md#scorer-picklability``).
781
+ """
782
+ # Pre-filter skipped pairs (allow-list miss) before dispatching parallel
783
+ # work-units. Logs the same skip messages as the pre-parallel version.
784
+ work_units: list[_ScoreOnePairItem] = []
785
+ skipped: dict[tuple[str, str], dict[str, object]] = {}
721
786
  for slice_ in slices:
722
787
  _logger.info(
723
788
  "[slice %s] n=%d, positives=%d",
@@ -725,32 +790,61 @@ def _score_all_slices(
725
790
  len(slice_.df),
726
791
  int(slice_.y_true.sum()),
727
792
  )
728
- slice_data: dict[str, dict[str, object]] = {}
729
- scores_by_scorer: dict[str, np.ndarray] = {}
730
793
  for sname, scorer in scorers.items():
731
794
  if not _should_score_slice(scorer, slice_.name):
732
795
  reason = f"slice {slice_.name!r} not in scorer allow-list"
733
- slice_data[sname] = _skipped_scorer_result(slice_, reason)
796
+ skipped[(slice_.name, sname)] = _skipped_scorer_result(slice_, reason)
734
797
  _logger.info(" skipped %s: %s", sname, reason)
735
798
  continue
736
- t0 = time.time()
737
- slice_data[sname] = evaluate_scorer_on_slice(
738
- scorer,
739
- slice_,
740
- n_resamples=n_resamples,
741
- seed=seed,
742
- on_scorer_error=on_scorer_error,
743
- )
744
- # If the scorer raised under on_scorer_error="record", scores is [].
745
- # Subsequent paired-diff machinery sees the empty array and will
746
- # short-circuit on the same len-check it already does for skipped
747
- # scorers; no special-case needed.
748
- scores_by_scorer[sname] = np.asarray(slice_data[sname]["scores"], dtype=np.float64)
749
- score_cache[(slice_.name, sname)] = scores_by_scorer[sname]
750
- elapsed = time.time() - t0
751
- pr = slice_data[sname].get("pr_auc")
799
+ work_units.append((slice_, sname, scorer, n_resamples, seed, on_scorer_error))
800
+
801
+ # Parallel scoring. parallel_map at n_jobs=1 is a pure-Python for-loop
802
+ # (Principle #4) — bit-identical to the pre-v0.36 sequential code.
803
+ if work_units:
804
+ t0_total = time.time()
805
+ results = parallel_map(
806
+ _score_one_pair,
807
+ work_units,
808
+ n_jobs=n_jobs,
809
+ description="harness _score_all_slices",
810
+ )
811
+ elapsed_total = time.time() - t0_total
812
+ _logger.info(
813
+ " scored %d (slice, scorer) pairs in %.1fs (n_jobs=%d)",
814
+ len(work_units),
815
+ elapsed_total,
816
+ n_jobs,
817
+ )
818
+ else:
819
+ results = []
820
+
821
+ # Index results for O(1) lookup during reassembly.
822
+ results_by_key: dict[tuple[str, str], _ScoreOnePairResult] = {
823
+ (slice_name, sname): (slice_name, sname, result_dict, scores_arr)
824
+ for slice_name, sname, result_dict, scores_arr in results
825
+ }
826
+
827
+ # Reassemble in the original (slices × scorers.items()) iteration order.
828
+ by_slice: dict[str, dict[str, object]] = {}
829
+ score_cache: dict[tuple[str, str], np.ndarray] = {}
830
+ for slice_ in slices:
831
+ slice_data: dict[str, dict[str, object]] = {}
832
+ scores_by_scorer: dict[str, np.ndarray] = {}
833
+ for sname in scorers:
834
+ key = (slice_.name, sname)
835
+ if key in skipped:
836
+ slice_data[sname] = skipped[key]
837
+ continue
838
+ _, _, result_dict, scores_arr = results_by_key[key]
839
+ slice_data[sname] = result_dict
840
+ # If the scorer raised under on_scorer_error="record", scores_arr is [].
841
+ # Paired-diff machinery short-circuits on the same len-check it uses
842
+ # for skipped scorers; no special-case needed.
843
+ scores_by_scorer[sname] = scores_arr
844
+ score_cache[key] = scores_arr
845
+ pr = result_dict.get("pr_auc")
752
846
  pr_display = f"{pr:.4f}" if isinstance(pr, float) else "N/A"
753
- _logger.info(" %s: PR-AUC=%s (%.1fs)", sname, pr_display, elapsed)
847
+ _logger.info(" %s: PR-AUC=%s", sname, pr_display)
754
848
 
755
849
  diffs = (
756
850
  _compute_paired_diffs(
@@ -789,6 +883,7 @@ def evaluate(
789
883
  on_leakage: Literal["raise", "record", "skip"] = "raise",
790
884
  on_scorer_error: Literal["raise", "record"] = "raise",
791
885
  operating_point_specs: Sequence[OperatingPointSpec] = (),
886
+ n_jobs: int = 1,
792
887
  ) -> RunResult:
793
888
  """Run every scorer on every slice; return a pure :class:`RunResult` (no IO).
794
889
 
@@ -830,6 +925,15 @@ def evaluate(
830
925
  Fit thresholds on one mixed-class slice and apply them to named target
831
926
  slices. Results are attached under each scorer's
832
927
  ``"transferred_operating_points"`` block. Default empty (skip).
928
+ n_jobs : int, optional
929
+ Parallel workers (default 1 — sequential). ``n_jobs > 1`` uses
930
+ joblib loky to parallelize the flat ``(slice × scorer)`` work-unit
931
+ loop in :func:`_score_all_slices` (and the operating-point fit
932
+ phase when ``operating_point_specs`` is non-empty). ``n_jobs=-1``
933
+ uses all cores; ``n_jobs=0`` is rejected. Scorers must be picklable
934
+ when ``n_jobs != 1`` — see
935
+ :doc:`methodology/parallelism` § Scorer picklability for the
936
+ contract + worked examples.
833
937
 
834
938
  Returns
835
939
  -------
@@ -850,6 +954,9 @@ def evaluate(
850
954
  if not slices:
851
955
  raise ValueError("at least one slice required")
852
956
 
957
+ if n_jobs != 1:
958
+ _assert_scorers_picklable(scorers)
959
+
853
960
  config: dict[str, object] = {
854
961
  "n_resamples": n_resamples,
855
962
  "seed": seed,
@@ -872,6 +979,7 @@ def evaluate(
872
979
  seed=seed,
873
980
  paired_diffs=paired_diffs,
874
981
  on_scorer_error=on_scorer_error,
982
+ n_jobs=n_jobs,
875
983
  )
876
984
 
877
985
  if operating_point_specs:
@@ -882,11 +990,45 @@ def evaluate(
882
990
  score_cache=score_cache,
883
991
  scorer_names=list(scorers.keys()),
884
992
  specs=operating_point_specs,
993
+ n_jobs=n_jobs,
885
994
  )
886
995
 
887
996
  return RunResult(run_id=run_id, git_sha=git_sha, config=config, by_slice=by_slice)
888
997
 
889
998
 
999
+ _OpPointFitItem = tuple[
1000
+ str, # spec_name (for reassembly key)
1001
+ str, # fit_slice_name (passed through to fit_operating_points)
1002
+ str, # scorer_name
1003
+ np.ndarray, # fit_y_true
1004
+ np.ndarray, # fit_scores
1005
+ Sequence[ThresholdSelector], # spec.selectors (passed through to fit_operating_points)
1006
+ ]
1007
+ _OpPointFitResult = tuple[str, str, object] # (spec_name, scorer_name, fitted | error_dict)
1008
+
1009
+
1010
+ def _fit_one_op_point_pair(item: _OpPointFitItem) -> _OpPointFitResult:
1011
+ """Picklable step function for ``(spec × scorer)`` operating-point fitting.
1012
+
1013
+ Module-scope so loky workers can serialize the reference. All inputs flow
1014
+ through the ``item`` tuple. Returns ``(spec_name, scorer_name, fitted)``
1015
+ where ``fitted`` is either the :func:`fit_operating_points` result or a
1016
+ ``{"error": str}`` dict matching the sequential code path.
1017
+ """
1018
+ spec_name, fit_slice_name, scorer_name, y_true, fit_scores, selectors = item
1019
+ try:
1020
+ fitted = fit_operating_points(
1021
+ y_true,
1022
+ fit_scores,
1023
+ selectors,
1024
+ fitted_on_slice=fit_slice_name,
1025
+ scorer_name=scorer_name,
1026
+ )
1027
+ except (ValueError, RuntimeError) as exc:
1028
+ return spec_name, scorer_name, {"error": str(exc)}
1029
+ return spec_name, scorer_name, fitted
1030
+
1031
+
890
1032
  def _attach_transferred_operating_points(
891
1033
  *,
892
1034
  by_slice: dict[str, dict[str, object]],
@@ -894,34 +1036,73 @@ def _attach_transferred_operating_points(
894
1036
  score_cache: Mapping[tuple[str, str], np.ndarray],
895
1037
  scorer_names: Sequence[str],
896
1038
  specs: Sequence[OperatingPointSpec],
1039
+ n_jobs: int = 1,
897
1040
  ) -> None:
898
- """Mutate ``by_slice`` to attach opt-in cross-slice operating-point metrics."""
1041
+ """Mutate ``by_slice`` to attach opt-in cross-slice operating-point metrics.
1042
+
1043
+ v0.36 added ``n_jobs``: parallelizes the ``(spec × scorer)`` fit phase
1044
+ via :func:`eval_toolkit._parallel.parallel_map`. The apply phase
1045
+ (writing into ``by_slice``) stays sequential — fitting dominates runtime.
1046
+ Default ``n_jobs=1`` preserves bit-identical sequential behavior.
1047
+ """
1048
+ # Pre-flight: handle "fit slice not found" errors (these short-circuit the
1049
+ # entire spec) + collect valid fit work-units. Tracks pre-conditions
1050
+ # ("fit scorer skipped") as separate state so the parallel dispatch only
1051
+ # carries actual work.
1052
+ fit_work: list[_OpPointFitItem] = []
1053
+ fit_skip_reasons: dict[tuple[str, str], dict[str, object]] = {}
1054
+ specs_with_valid_fit: list[OperatingPointSpec] = []
1055
+ names_per_spec: dict[str, list[str]] = {}
1056
+
899
1057
  for spec in specs:
900
1058
  names = list(spec.scorer_names) if spec.scorer_names else list(scorer_names)
1059
+ names_per_spec[spec.name] = names
901
1060
  if spec.fit_slice not in slices_by_name:
902
1061
  _record_spec_error(by_slice, spec, names, f"fit slice {spec.fit_slice!r} not found")
903
1062
  continue
904
-
1063
+ specs_with_valid_fit.append(spec)
905
1064
  fit_slice = slices_by_name[spec.fit_slice]
906
- fitted_by_scorer: dict[str, object] = {}
907
1065
  for scorer_name in names:
908
1066
  fit_scores = score_cache.get((spec.fit_slice, scorer_name))
909
1067
  if fit_scores is None or len(fit_scores) != len(fit_slice.y_true):
910
- fitted_by_scorer[scorer_name] = {
1068
+ fit_skip_reasons[(spec.name, scorer_name)] = {
911
1069
  "error": "fit scorer skipped, errored, or produced no scores"
912
1070
  }
913
1071
  continue
914
- try:
915
- fitted_by_scorer[scorer_name] = fit_operating_points(
1072
+ fit_work.append(
1073
+ (
1074
+ spec.name,
1075
+ spec.fit_slice,
1076
+ scorer_name,
916
1077
  fit_slice.y_true,
917
1078
  fit_scores,
918
1079
  spec.selectors,
919
- fitted_on_slice=spec.fit_slice,
920
- scorer_name=scorer_name,
921
1080
  )
922
- except (ValueError, RuntimeError) as exc:
923
- fitted_by_scorer[scorer_name] = {"error": str(exc)}
1081
+ )
1082
+
1083
+ # Parallel fit phase. parallel_map at n_jobs=1 is a pure-Python for-loop
1084
+ # (Principle #4) — bit-identical to the pre-v0.36 sequential code.
1085
+ fit_results: list[_OpPointFitResult] = (
1086
+ parallel_map(
1087
+ _fit_one_op_point_pair,
1088
+ fit_work,
1089
+ n_jobs=n_jobs,
1090
+ description="harness _attach_transferred_operating_points (fit)",
1091
+ )
1092
+ if fit_work
1093
+ else []
1094
+ )
1095
+
1096
+ # Index by (spec_name, scorer_name) for O(1) lookup in the apply phase.
1097
+ fitted_by_pair: dict[tuple[str, str], object] = {
1098
+ (spec_name, scorer_name): fitted for spec_name, scorer_name, fitted in fit_results
1099
+ }
1100
+ fitted_by_pair.update(fit_skip_reasons)
924
1101
 
1102
+ # Sequential apply phase — preserves the original by_slice mutation order
1103
+ # and the schema of error / skipped markers.
1104
+ for spec in specs_with_valid_fit:
1105
+ names = names_per_spec[spec.name]
925
1106
  for target_name in spec.apply_slices:
926
1107
  if target_name not in slices_by_name:
927
1108
  _record_spec_error(
@@ -939,7 +1120,7 @@ def _attach_transferred_operating_points(
939
1120
  spec_block: dict[str, object] = {}
940
1121
  transfer_block[spec.name] = spec_block
941
1122
 
942
- fitted = fitted_by_scorer.get(scorer_name)
1123
+ fitted = fitted_by_pair.get((spec.name, scorer_name))
943
1124
  if not isinstance(fitted, dict) or "error" in fitted:
944
1125
  spec_block["error"] = (
945
1126
  str(fitted.get("error", "threshold fitting failed"))
@@ -1099,6 +1280,7 @@ def evaluate_folded(
1099
1280
  on_scorer_error: Literal["raise", "record"] = "raise",
1100
1281
  eval_split_names: Sequence[str] = ("test",),
1101
1282
  summary_metrics: Sequence[str] = ("pr_auc", "roc_auc"),
1283
+ n_jobs: int = 1,
1102
1284
  ) -> RunResult:
1103
1285
  """Run a fold aggregator: ``Splitter × seeds → RunResult`` with CV-CI summary.
1104
1286
 
@@ -1128,6 +1310,15 @@ def evaluate_folded(
1128
1310
  RNG seeds for multi-seed × CV. Default ``(42,)`` (single seed).
1129
1311
  n_resamples, paired_diffs, leakage_checks, on_leakage, on_scorer_error :
1130
1312
  Forwarded to :func:`evaluate` per fold.
1313
+ n_jobs : int, optional
1314
+ Parallel workers (default 1 — sequential). Forwarded to
1315
+ :func:`evaluate` per fold; parallelizes the inner
1316
+ ``(slice × scorer)`` work-unit loop within each fold. Folds
1317
+ themselves run sequentially to keep determinism + traceback
1318
+ fidelity simple; for fold-level parallelism, consider an external
1319
+ ``joblib.Parallel`` wrapper at the call site. See
1320
+ :doc:`methodology/parallelism` § Scorer picklability for the
1321
+ contract that applies when ``n_jobs != 1``.
1131
1322
  eval_split_names : sequence of str, optional
1132
1323
  Subset of each fold-dict's keys to actually evaluate. Default
1133
1324
  ``("test",)`` — train sets are skipped (eval-only K-fold). Pass
@@ -1183,6 +1374,7 @@ def evaluate_folded(
1183
1374
  leakage_checks=leakage_checks,
1184
1375
  on_leakage=on_leakage,
1185
1376
  on_scorer_error=on_scorer_error,
1377
+ n_jobs=n_jobs,
1186
1378
  )
1187
1379
  by_fold[fold_id] = fold_result
1188
1380
 
@@ -31,6 +31,16 @@ class Scorer(Protocol):
31
31
  Accepts ``list[str]``, ``np.ndarray``, or ``pd.Series`` of features.
32
32
  Pandas is imported under ``TYPE_CHECKING`` only, so this Protocol
33
33
  has no runtime pandas dependency.
34
+
35
+ Notes
36
+ -----
37
+ When passed to a parallel-capable harness call (``n_jobs != 1``), Scorer
38
+ instances MUST be picklable — joblib's loky backend serializes the entire
39
+ delayed call (function plus bound arguments) before worker dispatch.
40
+ Closures, lambdas, local-scope classes, and attributes holding live
41
+ sockets / file handles break pickling. See
42
+ ``docs/source/methodology/parallelism.md#scorer-picklability`` for the
43
+ full contract and worked examples.
34
44
  """
35
45
 
36
46
  def predict_proba( # pragma: no cover