eval-toolkit 0.34.0__tar.gz → 0.35.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/CHANGELOG.md +33 -0
  2. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/PKG-INFO +1 -1
  3. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/__init__.py +1 -0
  4. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/_version.py +1 -1
  5. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/calibration.py +97 -0
  6. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/protocols.py +10 -0
  7. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/golden/public_api/snapshot.json +7 -1
  8. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_calibration_unit.py +126 -0
  9. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/.gitignore +0 -0
  10. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/LICENSE +0 -0
  11. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/README.md +0 -0
  12. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/STYLE.md +0 -0
  13. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/docs/archive/README.md +0 -0
  14. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/docs/research/README.md +0 -0
  15. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/docs/research/datasets/README.md +0 -0
  16. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/docs/research/papers/data-integrity/README.md +0 -0
  17. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  18. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/docs/research/papers/inference/README.md +0 -0
  19. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/docs/research/papers/prompt-injection/README.md +0 -0
  20. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/docs/source/methodology/README.md +0 -0
  21. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/pyproject.toml +0 -0
  22. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/__main__.py +0 -0
  23. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/_deprecated.py +0 -0
  24. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/_parallel.py +0 -0
  25. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/analysis.py +0 -0
  26. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/artifacts.py +0 -0
  27. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/bootstrap.py +0 -0
  28. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/claims.py +0 -0
  29. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/config.py +0 -0
  30. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/docs.py +0 -0
  31. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/embeddings.py +0 -0
  32. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/evidence.py +0 -0
  33. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/harness.py +0 -0
  34. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/leakage.py +0 -0
  35. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/loaders.py +0 -0
  36. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/manifest.py +0 -0
  37. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/metrics.py +0 -0
  38. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/operating_points.py +0 -0
  39. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/paths.py +0 -0
  40. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/plotting.py +0 -0
  41. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/provenance.py +0 -0
  42. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/py.typed +0 -0
  43. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  44. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  45. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  46. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  47. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  48. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/seeds.py +0 -0
  49. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/splits.py +0 -0
  50. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/text_dedup.py +0 -0
  51. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/thresholds.py +0 -0
  52. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  53. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  54. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  55. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  56. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  57. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  58. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  59. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  60. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  61. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  62. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/benchmarks/__init__.py +0 -0
  63. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  64. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/conftest.py +0 -0
  65. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  66. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  67. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  68. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  69. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/golden/docs/expected.md +0 -0
  70. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/golden/docs/input.md +0 -0
  71. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/golden/docs/metrics.json +0 -0
  72. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  73. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/strategies.py +0 -0
  74. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_analysis.py +0 -0
  75. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_artifacts.py +0 -0
  76. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  77. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  78. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_bootstrap_edge_cases.py +0 -0
  79. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_bootstrap_golden.py +0 -0
  80. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_bootstrap_njobs.py +0 -0
  81. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_bootstrap_props.py +0 -0
  82. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_bootstrap_research_grounded.py +0 -0
  83. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_bootstrap_unit.py +0 -0
  84. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  85. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_calibration_determinism.py +0 -0
  86. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_calibration_optimization_failures.py +0 -0
  87. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_calibration_props.py +0 -0
  88. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_calibration_research_grounded.py +0 -0
  89. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_claims.py +0 -0
  90. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_claims_coverage.py +0 -0
  91. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_claims_props.py +0 -0
  92. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_cli.py +0 -0
  93. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_config.py +0 -0
  94. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_coverage_bootstrap.py +0 -0
  95. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_coverage_calibration.py +0 -0
  96. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_coverage_harness.py +0 -0
  97. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_coverage_metrics.py +0 -0
  98. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_coverage_plotting.py +0 -0
  99. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  100. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_deprecations.py +0 -0
  101. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_docs_golden.py +0 -0
  102. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_docs_props.py +0 -0
  103. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_embeddings.py +0 -0
  104. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_evidence_validators.py +0 -0
  105. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_harness_edge_cases.py +0 -0
  106. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_harness_fault_injection.py +0 -0
  107. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_harness_folded.py +0 -0
  108. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_harness_internals.py +0 -0
  109. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_harness_metric_options.py +0 -0
  110. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_harness_smoke.py +0 -0
  111. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_import_boundaries.py +0 -0
  112. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_leakage.py +0 -0
  113. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_leakage_error_paths.py +0 -0
  114. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_leakage_props.py +0 -0
  115. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_loaders.py +0 -0
  116. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_loaders_coverage.py +0 -0
  117. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_loaders_props.py +0 -0
  118. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_logging.py +0 -0
  119. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_manifest.py +0 -0
  120. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  121. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_manifest_props.py +0 -0
  122. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_manifest_validation.py +0 -0
  123. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_metrics_props.py +0 -0
  124. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_metrics_stratified_subsets.py +0 -0
  125. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_metrics_unit.py +0 -0
  126. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_misc_coverage.py +0 -0
  127. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_numeric_edge_cases.py +0 -0
  128. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_operating_points.py +0 -0
  129. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_operating_points_props.py +0 -0
  130. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_parallel.py +0 -0
  131. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_paths.py +0 -0
  132. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_pipeline_e2e.py +0 -0
  133. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_plotting_edge.py +0 -0
  134. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_plotting_smoke.py +0 -0
  135. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_plotting_visual.py +0 -0
  136. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_protocol_conformance.py +0 -0
  137. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_provenance.py +0 -0
  138. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_public_api.py +0 -0
  139. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_recall_at_fpr.py +0 -0
  140. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_reference_equivalence.py +0 -0
  141. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_reproducibility_integration.py +0 -0
  142. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_schemas.py +0 -0
  143. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_seeds.py +0 -0
  144. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_splits.py +0 -0
  145. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_splits_leakage_integration.py +0 -0
  146. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_splits_props.py +0 -0
  147. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_text_dedup.py +0 -0
  148. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_text_dedup_coverage.py +0 -0
  149. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_text_dedup_props.py +0 -0
  150. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_text_dedup_strategies.py +0 -0
  151. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_thresholds.py +0 -0
  152. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_thresholds_constant_score.py +0 -0
  153. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_thresholds_coverage.py +0 -0
  154. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_thresholds_props.py +0 -0
  155. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_thresholds_research_grounded.py +0 -0
  156. {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_v09_contracts.py +0 -0
@@ -7,6 +7,39 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.35.0] — 2026-05-18 — `fit_temperature_binary` + Scorer picklability ADR
11
+
12
+ Small, additive release. Adds a binary-classification calibration helper
13
+ that lets consumers drop the ~50 LOC scalar-proba adapter many were
14
+ carrying, plus a design ADR that unblocks the v0.36 harness / operating-
15
+ point parallelization work (#29, #30) without re-litigating picklability.
16
+
17
+ ### Added
18
+
19
+ - `eval_toolkit.fit_temperature_binary(y_true, y_score)` — scalar-proba
20
+ adapter for the multi-class `fit_temperature` fitter. Converts `(n,)`
21
+ probabilities of class 1 to a 2-column logit array via clipped logit
22
+ (`[0, logit(p)]` so softmax row 1 reproduces `p`), delegates to the
23
+ deployment-quality fitter, and returns `(T_opt, apply)` where
24
+ `apply: (n,) -> (n,)` does scalar-in / scalar-out T-scaling. Unlike
25
+ `fit_temperature_oracle`, no warning — the contract assumes val / test
26
+ separation (deployment-quality calibration, not fit-on-test). Closes
27
+ #28.
28
+
29
+ ### Documentation
30
+
31
+ - `docs/source/methodology/parallelism.md` — new `## Scorer picklability`
32
+ sub-section documenting the Scorer protocol's picklability contract
33
+ for `n_jobs > 1` usage. Includes worked picklable / broken-closure /
34
+ fix examples plus a list of common non-picklable patterns to watch for
35
+ in user-supplied Scorers (closures, lambdas on instances, local-scope
36
+ classes, attributes holding live sockets / file handles). Anchors on
37
+ the existing v0.34.0 `parallel_map` pickle sniff + `TypeError`
38
+ channel — no new exception class. Unblocks v0.36 implementation of
39
+ #29 and #30.
40
+ - `eval_toolkit.protocols.Scorer` docstring — Notes block pointing at
41
+ the new methodology section.
42
+
10
43
  ## [0.34.0] — 2026-05-17 — Phase 4 stats unblockers + unified parallelism + cookbook (BREAKING)
11
44
 
12
45
  Closes all 7 open backlog issues in one consumer-closing release. Also
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.34.0
3
+ Version: 0.35.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -87,6 +87,7 @@ _EXPORTS: dict[str, str] = {
87
87
  "fit_isotonic_calibrator": "eval_toolkit.calibration",
88
88
  "fit_platt_calibrator": "eval_toolkit.calibration",
89
89
  "fit_temperature": "eval_toolkit.calibration",
90
+ "fit_temperature_binary": "eval_toolkit.calibration",
90
91
  "fit_temperature_oracle": "eval_toolkit.calibration",
91
92
  "reliability_curve": "eval_toolkit.calibration",
92
93
  "reliability_diagram_data": "eval_toolkit.calibration",
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "0.34.0"
5
+ __version__ = "0.35.0"
@@ -57,6 +57,7 @@ __all__ = [
57
57
  "fit_isotonic_calibrator",
58
58
  "fit_platt_calibrator",
59
59
  "fit_temperature",
60
+ "fit_temperature_binary",
60
61
  "fit_temperature_oracle",
61
62
  "maximum_calibration_error",
62
63
  "reliability_curve",
@@ -1038,6 +1039,102 @@ def _negative_log_likelihood(t: float, logits: np.ndarray, labels: np.ndarray) -
1038
1039
  return float(-log_probs[np.arange(len(labels)), labels].mean())
1039
1040
 
1040
1041
 
1042
+ def fit_temperature_binary(
1043
+ y_true: np.ndarray,
1044
+ y_score: np.ndarray,
1045
+ *,
1046
+ bounds: tuple[float, float] = (0.05, 20.0),
1047
+ ) -> tuple[float, Callable[[np.ndarray], np.ndarray]]:
1048
+ r"""Binary-probability adapter for :func:`fit_temperature` (Guo et al. 2017 [#guo]_).
1049
+
1050
+ Fits a scalar T > 0 on *validation* probabilities of class 1 and returns
1051
+ both T and a callable that applies the same T-scaling to test
1052
+ probabilities. Internally:
1053
+
1054
+ 1. Clips ``y_score`` to ``[1e-7, 1-1e-7]`` for finite logit inversion.
1055
+ 2. Builds a 2-column logit array ``[0, logit(p)]`` so softmax row 1
1056
+ reproduces ``p`` exactly.
1057
+ 3. Delegates to :func:`fit_temperature` for the bounded NLL minimization.
1058
+ 4. Returns ``(T, apply)`` where ``apply(p_test) = sigmoid(logit(p_test)/T)``.
1059
+
1060
+ Unlike :func:`fit_temperature_oracle`, this does NOT emit a warning — the
1061
+ contract is that ``y_true`` / ``y_score`` come from a held-out validation
1062
+ set and ``apply`` is invoked on a separate test set (deployment-quality
1063
+ calibration, not fit-on-test).
1064
+
1065
+ Parameters
1066
+ ----------
1067
+ y_true : np.ndarray, shape (n,)
1068
+ Binary validation labels in {0, 1}.
1069
+ y_score : np.ndarray, shape (n,)
1070
+ Validation predicted probabilities of class 1, in [0, 1]. Values at
1071
+ the extremes are clipped to ``[1e-7, 1 - 1e-7]``.
1072
+ bounds : tuple of float, optional
1073
+ ``(lo, hi)`` bracket for T. Default ``(0.05, 20.0)``, matches
1074
+ :func:`fit_temperature`.
1075
+
1076
+ Returns
1077
+ -------
1078
+ tuple
1079
+ ``(T_optimal, apply)`` where ``apply: (n,) -> (n,)`` maps any input
1080
+ probability array through :math:`\sigma(\mathrm{logit}(p) / T)`.
1081
+
1082
+ Raises
1083
+ ------
1084
+ ValueError
1085
+ On shape mismatch, empty input, non-finite scores, or single-class
1086
+ ``y_true``.
1087
+ RuntimeError
1088
+ If the bounded scalar optimizer fails to converge.
1089
+
1090
+ Examples
1091
+ --------
1092
+ >>> import numpy as np
1093
+ >>> rng = np.random.default_rng(0)
1094
+ >>> n = 500
1095
+ >>> y_val = rng.binomial(1, 0.3, size=n).astype(int)
1096
+ >>> p_val = np.clip(y_val * 0.6 + rng.normal(0, 0.2, n), 0.01, 0.99)
1097
+ >>> T, apply = fit_temperature_binary(y_val, p_val)
1098
+ >>> T > 0
1099
+ True
1100
+ >>> p_test = np.array([0.1, 0.5, 0.9])
1101
+ >>> apply(p_test).shape == (3,)
1102
+ True
1103
+
1104
+ See Also
1105
+ --------
1106
+ fit_temperature : underlying multi-class fitter (operates on 2-col logits)
1107
+ fit_temperature_oracle : diagnostic-only variant that fits T on the same
1108
+ probabilities it scores
1109
+
1110
+ References
1111
+ ----------
1112
+ .. [#guo] Guo, C., Pleiss, G., Sun, Y., & Weinberger, K. Q. "On
1113
+ calibration of modern neural networks." ICML 2017. arXiv:1706.04599.
1114
+ """
1115
+ y_true_arr, y_score_arr = _validate_calibrator_inputs(y_true, y_score)
1116
+
1117
+ # Build 2-col logits [0, logit(p)] so softmax([0, logit(p)])[1] == p exactly.
1118
+ s_clipped = np.clip(y_score_arr, _SCORE_CLIP_LO, _SCORE_CLIP_HI)
1119
+ logit_pos = np.log(s_clipped / (1.0 - s_clipped))
1120
+ val_logits_2col = np.column_stack([np.zeros_like(logit_pos), logit_pos])
1121
+
1122
+ result = fit_temperature(val_logits_2col, y_true_arr, bounds=bounds)
1123
+ t_optimal = float(result["temperature"])
1124
+
1125
+ def apply(scores: np.ndarray) -> np.ndarray:
1126
+ arr = np.asarray(scores, dtype=float).ravel()
1127
+ if not np.isfinite(arr).all():
1128
+ raise ValueError("scores contains NaN or inf")
1129
+ clipped = np.clip(arr, _SCORE_CLIP_LO, _SCORE_CLIP_HI)
1130
+ logit = np.log(clipped / (1.0 - clipped))
1131
+ scaled = logit / t_optimal
1132
+ out: np.ndarray = (1.0 / (1.0 + np.exp(-scaled))).astype(float)
1133
+ return out
1134
+
1135
+ return t_optimal, apply
1136
+
1137
+
1041
1138
  def fit_temperature_oracle(
1042
1139
  y_true: np.ndarray, y_score: np.ndarray
1043
1140
  ) -> tuple[float, Callable[[np.ndarray], np.ndarray]]:
@@ -31,6 +31,16 @@ class Scorer(Protocol):
31
31
  Accepts ``list[str]``, ``np.ndarray``, or ``pd.Series`` of features.
32
32
  Pandas is imported under ``TYPE_CHECKING`` only, so this Protocol
33
33
  has no runtime pandas dependency.
34
+
35
+ Notes
36
+ -----
37
+ When passed to a parallel-capable harness call (``n_jobs > 1``), Scorer
38
+ instances MUST be picklable — joblib's loky backend serializes the entire
39
+ delayed call (function plus bound arguments) before worker dispatch.
40
+ Closures, lambdas, local-scope classes, and attributes holding live
41
+ sockets / file handles break pickling. See
42
+ ``docs/source/methodology/parallelism.md#scorer-picklability`` for the
43
+ full contract and worked examples.
34
44
  """
35
45
 
36
46
  def predict_proba( # pragma: no cover
@@ -137,6 +137,7 @@
137
137
  "fit_operating_points",
138
138
  "fit_platt_calibrator",
139
139
  "fit_temperature",
140
+ "fit_temperature_binary",
140
141
  "fit_temperature_oracle",
141
142
  "from_yaml",
142
143
  "frozen_config",
@@ -1016,7 +1017,7 @@
1016
1017
  "doc_first_line": "str(object='') -> str",
1017
1018
  "kind": "value",
1018
1019
  "type": "str",
1019
- "value": "'0.34.0'"
1020
+ "value": "'0.35.0'"
1020
1021
  },
1021
1022
  "apply_operating_points": {
1022
1023
  "doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
@@ -1203,6 +1204,11 @@
1203
1204
  "kind": "function",
1204
1205
  "signature": "(val_logits: 'np.ndarray', val_labels: 'np.ndarray', bounds: 'tuple[float, float]' = (0.05, 20.0)) -> 'dict[str, float]'"
1205
1206
  },
1207
+ "fit_temperature_binary": {
1208
+ "doc_first_line": "Binary-probability adapter for :func:`fit_temperature` (Guo et al. 2017 [#guo]_).",
1209
+ "kind": "function",
1210
+ "signature": "(y_true: 'np.ndarray', y_score: 'np.ndarray', *, bounds: 'tuple[float, float]' = (0.05, 20.0)) -> 'tuple[float, Callable[[np.ndarray], np.ndarray]]'"
1211
+ },
1206
1212
  "fit_temperature_oracle": {
1207
1213
  "doc_first_line": "**DIAGNOSTIC ONLY** \u2014 fit-on-test oracle T-scaling per Guo et al. 2017 [#guo]_.",
1208
1214
  "kind": "function",
@@ -16,6 +16,7 @@ from eval_toolkit.calibration import (
16
16
  fit_isotonic_calibrator,
17
17
  fit_platt_calibrator,
18
18
  fit_temperature,
19
+ fit_temperature_binary,
19
20
  fit_temperature_oracle,
20
21
  maximum_calibration_error,
21
22
  reliability_curve,
@@ -361,3 +362,128 @@ def test_fit_platt_matches_sklearn_canonical() -> None:
361
362
  ours_out = ours(grid)
362
363
  sk_out = sk_cal.predict(grid)
363
364
  np.testing.assert_allclose(ours_out, sk_out, atol=1e-6, rtol=1e-6)
365
+
366
+
367
+ # --- fit_temperature_binary (#28) -------------------------------------------------
368
+
369
+
370
+ @pytest.mark.unit
371
+ def test_fit_temperature_binary_runs(well_separated: tuple[np.ndarray, np.ndarray]) -> None:
372
+ """Smoke test: returns positive T + callable; calibrated outputs in (0, 1)."""
373
+ y, s = well_separated
374
+ s_clipped = np.clip(s, 0.01, 0.99)
375
+ T, apply = fit_temperature_binary(y, s_clipped)
376
+ assert T > 0
377
+ out = apply(s_clipped)
378
+ assert out.shape == s_clipped.shape # scalar (n,) in/out contract
379
+ assert (out > 0.0).all() and (out < 1.0).all()
380
+
381
+
382
+ @pytest.mark.unit
383
+ def test_fit_temperature_binary_shape_contract() -> None:
384
+ """Apply returns shape (n,), never (n, 2). Guards against 2-col regressions."""
385
+ rng = np.random.default_rng(0)
386
+ y = rng.binomial(1, 0.3, size=200).astype(int)
387
+ s = np.clip(y * 0.6 + rng.normal(0, 0.2, 200), 0.01, 0.99)
388
+ _, apply = fit_temperature_binary(y, s)
389
+ for shape in [(1,), (3,), (50,)]:
390
+ p_test = rng.uniform(0.05, 0.95, size=shape)
391
+ assert apply(p_test).shape == shape
392
+
393
+
394
+ @pytest.mark.unit
395
+ def test_fit_temperature_binary_handles_extremes() -> None:
396
+ """Probas at exactly 0 and 1 produce finite outputs (clipping covers the logit pole).
397
+
398
+ Contract: ``logit(0)`` and ``logit(1)`` are infinite, but the internal
399
+ clipping to ``[1e-7, 1-1e-7]`` keeps the math finite. Outputs may hit the
400
+ float64 boundary (0.0 or 1.0) at extreme inputs with small T — that is
401
+ correct behavior, not a violation. The real failure mode this test guards
402
+ against is ``inf`` / ``nan`` in either fit or apply.
403
+ """
404
+ rng = np.random.default_rng(0)
405
+ n = 200
406
+ y = rng.binomial(1, 0.5, size=n).astype(int)
407
+ s = y.astype(float) # exact 0s and 1s in val data
408
+ T, apply = fit_temperature_binary(y, s)
409
+ assert np.isfinite(T)
410
+ # Apply to extremes — must be finite + in [0, 1] (boundary-inclusive)
411
+ p_test = np.array([0.0, 0.5, 1.0])
412
+ out = apply(p_test)
413
+ assert np.isfinite(out).all()
414
+ assert (out >= 0.0).all() and (out <= 1.0).all()
415
+
416
+
417
+ @pytest.mark.unit
418
+ def test_fit_temperature_binary_parity_with_multiclass() -> None:
419
+ """fit_temperature_binary(y, p) matches manual fit_temperature(2-col-logits, y).
420
+
421
+ Establishes the contract that the binary adapter is a thin wrapper, not a
422
+ re-implementation: identical T, identical applied probabilities.
423
+ """
424
+ rng = np.random.default_rng(7)
425
+ n = 400
426
+ y = rng.binomial(1, 0.4, size=n).astype(int)
427
+ p_val = np.clip(y * 0.5 + rng.normal(0, 0.25, n), 0.01, 0.99)
428
+ p_test = rng.uniform(0.05, 0.95, size=50)
429
+
430
+ T_binary, apply_binary = fit_temperature_binary(y, p_val)
431
+
432
+ # Manual multi-class path: build 2-col logits, fit T, apply via softmax row 1.
433
+ logit_val = np.log(p_val / (1.0 - p_val))
434
+ val_logits_2col = np.column_stack([np.zeros_like(logit_val), logit_val])
435
+ result_mc = fit_temperature(val_logits_2col, y)
436
+ T_mc = result_mc["temperature"]
437
+
438
+ logit_test = np.log(p_test / (1.0 - p_test))
439
+ test_logits_2col = np.column_stack([np.zeros_like(logit_test), logit_test]) / T_mc
440
+ # softmax row 1 = exp(z1) / (exp(0) + exp(z1)) = sigmoid(z1)
441
+ expected = 1.0 / (1.0 + np.exp(-test_logits_2col[:, 1]))
442
+
443
+ assert T_binary == pytest.approx(T_mc, rel=1e-9)
444
+ np.testing.assert_allclose(apply_binary(p_test), expected, rtol=1e-9, atol=1e-12)
445
+
446
+
447
+ @pytest.mark.unit
448
+ def test_fit_temperature_binary_improves_nll() -> None:
449
+ """T_post NLL ≤ T_pre NLL (T=1 is always a feasible point in the bracket)."""
450
+ rng = np.random.default_rng(0)
451
+ n = 500
452
+ y = rng.binomial(1, 0.4, size=n).astype(int)
453
+ # Overconfident probabilities: push away from 0.5
454
+ raw = y * 0.7 + rng.normal(0, 0.15, n)
455
+ p = np.clip(0.5 + 2.5 * (raw - 0.5), 0.01, 0.99)
456
+ T, apply = fit_temperature_binary(y, p)
457
+ eps = 1e-12
458
+
459
+ def _binary_nll(probs: np.ndarray, labels: np.ndarray) -> float:
460
+ c = np.clip(probs, eps, 1 - eps)
461
+ return float(-(labels * np.log(c) + (1 - labels) * np.log(1 - c)).mean())
462
+
463
+ nll_pre = _binary_nll(p, y)
464
+ nll_post = _binary_nll(apply(p), y)
465
+ assert nll_post <= nll_pre + 1e-9
466
+
467
+
468
+ @pytest.mark.unit
469
+ def test_fit_temperature_binary_validates() -> None:
470
+ """Error paths inherit from _validate_calibrator_inputs."""
471
+ with pytest.raises(ValueError, match="shape mismatch"):
472
+ fit_temperature_binary(np.zeros(5, dtype=int), np.zeros(7))
473
+ with pytest.raises(ValueError, match="empty"):
474
+ fit_temperature_binary(np.array([], dtype=int), np.array([]))
475
+ with pytest.raises(ValueError, match="NaN or inf"):
476
+ fit_temperature_binary(np.array([0, 1, 0, 1]), np.array([0.1, np.nan, 0.3, 0.7]))
477
+ with pytest.raises(ValueError, match="both classes"):
478
+ fit_temperature_binary(np.ones(50, dtype=int), np.linspace(0.1, 0.9, 50))
479
+
480
+
481
+ @pytest.mark.unit
482
+ def test_fit_temperature_binary_apply_rejects_nonfinite() -> None:
483
+ """Apply rejects non-finite test-time scores (does not silently mask)."""
484
+ rng = np.random.default_rng(0)
485
+ y = rng.binomial(1, 0.3, size=200).astype(int)
486
+ s = np.clip(y * 0.6 + rng.normal(0, 0.2, 200), 0.01, 0.99)
487
+ _, apply = fit_temperature_binary(y, s)
488
+ with pytest.raises(ValueError, match="NaN or inf"):
489
+ apply(np.array([0.5, np.nan, 0.7]))
File without changes
File without changes
File without changes
File without changes