eval-toolkit 0.34.0__tar.gz → 0.35.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/CHANGELOG.md +33 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/PKG-INFO +1 -1
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/__init__.py +1 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/_version.py +1 -1
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/calibration.py +97 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/protocols.py +10 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/golden/public_api/snapshot.json +7 -1
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_calibration_unit.py +126 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/.gitignore +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/LICENSE +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/README.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/STYLE.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/docs/archive/README.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/docs/research/README.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/pyproject.toml +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/conftest.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/strategies.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_analysis.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_claims.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_cli.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_config.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_embeddings.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_loaders.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_logging.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_manifest.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_parallel.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_paths.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_public_api.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_schemas.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_splits.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_thresholds.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/test_v09_contracts.py +0 -0
|
@@ -7,6 +7,39 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.35.0] — 2026-05-18 — `fit_temperature_binary` + Scorer picklability ADR
|
|
11
|
+
|
|
12
|
+
Small, additive release. Adds a binary-classification calibration helper
|
|
13
|
+
that lets consumers drop the ~50 LOC scalar-proba adapter many were
|
|
14
|
+
carrying, plus a design ADR that unblocks the v0.36 harness / operating-
|
|
15
|
+
point parallelization work (#29, #30) without re-litigating picklability.
|
|
16
|
+
|
|
17
|
+
### Added
|
|
18
|
+
|
|
19
|
+
- `eval_toolkit.fit_temperature_binary(y_true, y_score)` — scalar-proba
|
|
20
|
+
adapter for the multi-class `fit_temperature` fitter. Converts `(n,)`
|
|
21
|
+
probabilities of class 1 to a 2-column logit array via clipped logit
|
|
22
|
+
(`[0, logit(p)]` so softmax row 1 reproduces `p`), delegates to the
|
|
23
|
+
deployment-quality fitter, and returns `(T_opt, apply)` where
|
|
24
|
+
`apply: (n,) -> (n,)` does scalar-in / scalar-out T-scaling. Unlike
|
|
25
|
+
`fit_temperature_oracle`, no warning — the contract assumes val / test
|
|
26
|
+
separation (deployment-quality calibration, not fit-on-test). Closes
|
|
27
|
+
#28.
|
|
28
|
+
|
|
29
|
+
### Documentation
|
|
30
|
+
|
|
31
|
+
- `docs/source/methodology/parallelism.md` — new `## Scorer picklability`
|
|
32
|
+
sub-section documenting the Scorer protocol's picklability contract
|
|
33
|
+
for `n_jobs > 1` usage. Includes worked picklable / broken-closure /
|
|
34
|
+
fix examples plus a list of common non-picklable patterns to watch for
|
|
35
|
+
in user-supplied Scorers (closures, lambdas on instances, local-scope
|
|
36
|
+
classes, attributes holding live sockets / file handles). Anchors on
|
|
37
|
+
the existing v0.34.0 `parallel_map` pickle sniff + `TypeError`
|
|
38
|
+
channel — no new exception class. Unblocks v0.36 implementation of
|
|
39
|
+
#29 and #30.
|
|
40
|
+
- `eval_toolkit.protocols.Scorer` docstring — Notes block pointing at
|
|
41
|
+
the new methodology section.
|
|
42
|
+
|
|
10
43
|
## [0.34.0] — 2026-05-17 — Phase 4 stats unblockers + unified parallelism + cookbook (BREAKING)
|
|
11
44
|
|
|
12
45
|
Closes all 7 open backlog issues in one consumer-closing release. Also
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 0.34.0
|
|
3
|
+
Version: 0.35.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -87,6 +87,7 @@ _EXPORTS: dict[str, str] = {
|
|
|
87
87
|
"fit_isotonic_calibrator": "eval_toolkit.calibration",
|
|
88
88
|
"fit_platt_calibrator": "eval_toolkit.calibration",
|
|
89
89
|
"fit_temperature": "eval_toolkit.calibration",
|
|
90
|
+
"fit_temperature_binary": "eval_toolkit.calibration",
|
|
90
91
|
"fit_temperature_oracle": "eval_toolkit.calibration",
|
|
91
92
|
"reliability_curve": "eval_toolkit.calibration",
|
|
92
93
|
"reliability_diagram_data": "eval_toolkit.calibration",
|
|
@@ -57,6 +57,7 @@ __all__ = [
|
|
|
57
57
|
"fit_isotonic_calibrator",
|
|
58
58
|
"fit_platt_calibrator",
|
|
59
59
|
"fit_temperature",
|
|
60
|
+
"fit_temperature_binary",
|
|
60
61
|
"fit_temperature_oracle",
|
|
61
62
|
"maximum_calibration_error",
|
|
62
63
|
"reliability_curve",
|
|
@@ -1038,6 +1039,102 @@ def _negative_log_likelihood(t: float, logits: np.ndarray, labels: np.ndarray) -
|
|
|
1038
1039
|
return float(-log_probs[np.arange(len(labels)), labels].mean())
|
|
1039
1040
|
|
|
1040
1041
|
|
|
1042
|
+
def fit_temperature_binary(
|
|
1043
|
+
y_true: np.ndarray,
|
|
1044
|
+
y_score: np.ndarray,
|
|
1045
|
+
*,
|
|
1046
|
+
bounds: tuple[float, float] = (0.05, 20.0),
|
|
1047
|
+
) -> tuple[float, Callable[[np.ndarray], np.ndarray]]:
|
|
1048
|
+
r"""Binary-probability adapter for :func:`fit_temperature` (Guo et al. 2017 [#guo]_).
|
|
1049
|
+
|
|
1050
|
+
Fits a scalar T > 0 on *validation* probabilities of class 1 and returns
|
|
1051
|
+
both T and a callable that applies the same T-scaling to test
|
|
1052
|
+
probabilities. Internally:
|
|
1053
|
+
|
|
1054
|
+
1. Clips ``y_score`` to ``[1e-7, 1-1e-7]`` for finite logit inversion.
|
|
1055
|
+
2. Builds a 2-column logit array ``[0, logit(p)]`` so softmax row 1
|
|
1056
|
+
reproduces ``p`` exactly.
|
|
1057
|
+
3. Delegates to :func:`fit_temperature` for the bounded NLL minimization.
|
|
1058
|
+
4. Returns ``(T, apply)`` where ``apply(p_test) = sigmoid(logit(p_test)/T)``.
|
|
1059
|
+
|
|
1060
|
+
Unlike :func:`fit_temperature_oracle`, this does NOT emit a warning — the
|
|
1061
|
+
contract is that ``y_true`` / ``y_score`` come from a held-out validation
|
|
1062
|
+
set and ``apply`` is invoked on a separate test set (deployment-quality
|
|
1063
|
+
calibration, not fit-on-test).
|
|
1064
|
+
|
|
1065
|
+
Parameters
|
|
1066
|
+
----------
|
|
1067
|
+
y_true : np.ndarray, shape (n,)
|
|
1068
|
+
Binary validation labels in {0, 1}.
|
|
1069
|
+
y_score : np.ndarray, shape (n,)
|
|
1070
|
+
Validation predicted probabilities of class 1, in [0, 1]. Values at
|
|
1071
|
+
the extremes are clipped to ``[1e-7, 1 - 1e-7]``.
|
|
1072
|
+
bounds : tuple of float, optional
|
|
1073
|
+
``(lo, hi)`` bracket for T. Default ``(0.05, 20.0)``, matches
|
|
1074
|
+
:func:`fit_temperature`.
|
|
1075
|
+
|
|
1076
|
+
Returns
|
|
1077
|
+
-------
|
|
1078
|
+
tuple
|
|
1079
|
+
``(T_optimal, apply)`` where ``apply: (n,) -> (n,)`` maps any input
|
|
1080
|
+
probability array through :math:`\sigma(\mathrm{logit}(p) / T)`.
|
|
1081
|
+
|
|
1082
|
+
Raises
|
|
1083
|
+
------
|
|
1084
|
+
ValueError
|
|
1085
|
+
On shape mismatch, empty input, non-finite scores, or single-class
|
|
1086
|
+
``y_true``.
|
|
1087
|
+
RuntimeError
|
|
1088
|
+
If the bounded scalar optimizer fails to converge.
|
|
1089
|
+
|
|
1090
|
+
Examples
|
|
1091
|
+
--------
|
|
1092
|
+
>>> import numpy as np
|
|
1093
|
+
>>> rng = np.random.default_rng(0)
|
|
1094
|
+
>>> n = 500
|
|
1095
|
+
>>> y_val = rng.binomial(1, 0.3, size=n).astype(int)
|
|
1096
|
+
>>> p_val = np.clip(y_val * 0.6 + rng.normal(0, 0.2, n), 0.01, 0.99)
|
|
1097
|
+
>>> T, apply = fit_temperature_binary(y_val, p_val)
|
|
1098
|
+
>>> T > 0
|
|
1099
|
+
True
|
|
1100
|
+
>>> p_test = np.array([0.1, 0.5, 0.9])
|
|
1101
|
+
>>> apply(p_test).shape == (3,)
|
|
1102
|
+
True
|
|
1103
|
+
|
|
1104
|
+
See Also
|
|
1105
|
+
--------
|
|
1106
|
+
fit_temperature : underlying multi-class fitter (operates on 2-col logits)
|
|
1107
|
+
fit_temperature_oracle : diagnostic-only variant that fits T on the same
|
|
1108
|
+
probabilities it scores
|
|
1109
|
+
|
|
1110
|
+
References
|
|
1111
|
+
----------
|
|
1112
|
+
.. [#guo] Guo, C., Pleiss, G., Sun, Y., & Weinberger, K. Q. "On
|
|
1113
|
+
calibration of modern neural networks." ICML 2017. arXiv:1706.04599.
|
|
1114
|
+
"""
|
|
1115
|
+
y_true_arr, y_score_arr = _validate_calibrator_inputs(y_true, y_score)
|
|
1116
|
+
|
|
1117
|
+
# Build 2-col logits [0, logit(p)] so softmax([0, logit(p)])[1] == p exactly.
|
|
1118
|
+
s_clipped = np.clip(y_score_arr, _SCORE_CLIP_LO, _SCORE_CLIP_HI)
|
|
1119
|
+
logit_pos = np.log(s_clipped / (1.0 - s_clipped))
|
|
1120
|
+
val_logits_2col = np.column_stack([np.zeros_like(logit_pos), logit_pos])
|
|
1121
|
+
|
|
1122
|
+
result = fit_temperature(val_logits_2col, y_true_arr, bounds=bounds)
|
|
1123
|
+
t_optimal = float(result["temperature"])
|
|
1124
|
+
|
|
1125
|
+
def apply(scores: np.ndarray) -> np.ndarray:
|
|
1126
|
+
arr = np.asarray(scores, dtype=float).ravel()
|
|
1127
|
+
if not np.isfinite(arr).all():
|
|
1128
|
+
raise ValueError("scores contains NaN or inf")
|
|
1129
|
+
clipped = np.clip(arr, _SCORE_CLIP_LO, _SCORE_CLIP_HI)
|
|
1130
|
+
logit = np.log(clipped / (1.0 - clipped))
|
|
1131
|
+
scaled = logit / t_optimal
|
|
1132
|
+
out: np.ndarray = (1.0 / (1.0 + np.exp(-scaled))).astype(float)
|
|
1133
|
+
return out
|
|
1134
|
+
|
|
1135
|
+
return t_optimal, apply
|
|
1136
|
+
|
|
1137
|
+
|
|
1041
1138
|
def fit_temperature_oracle(
|
|
1042
1139
|
y_true: np.ndarray, y_score: np.ndarray
|
|
1043
1140
|
) -> tuple[float, Callable[[np.ndarray], np.ndarray]]:
|
|
@@ -31,6 +31,16 @@ class Scorer(Protocol):
|
|
|
31
31
|
Accepts ``list[str]``, ``np.ndarray``, or ``pd.Series`` of features.
|
|
32
32
|
Pandas is imported under ``TYPE_CHECKING`` only, so this Protocol
|
|
33
33
|
has no runtime pandas dependency.
|
|
34
|
+
|
|
35
|
+
Notes
|
|
36
|
+
-----
|
|
37
|
+
When passed to a parallel-capable harness call (``n_jobs > 1``), Scorer
|
|
38
|
+
instances MUST be picklable — joblib's loky backend serializes the entire
|
|
39
|
+
delayed call (function plus bound arguments) before worker dispatch.
|
|
40
|
+
Closures, lambdas, local-scope classes, and attributes holding live
|
|
41
|
+
sockets / file handles break pickling. See
|
|
42
|
+
``docs/source/methodology/parallelism.md#scorer-picklability`` for the
|
|
43
|
+
full contract and worked examples.
|
|
34
44
|
"""
|
|
35
45
|
|
|
36
46
|
def predict_proba( # pragma: no cover
|
|
@@ -137,6 +137,7 @@
|
|
|
137
137
|
"fit_operating_points",
|
|
138
138
|
"fit_platt_calibrator",
|
|
139
139
|
"fit_temperature",
|
|
140
|
+
"fit_temperature_binary",
|
|
140
141
|
"fit_temperature_oracle",
|
|
141
142
|
"from_yaml",
|
|
142
143
|
"frozen_config",
|
|
@@ -1016,7 +1017,7 @@
|
|
|
1016
1017
|
"doc_first_line": "str(object='') -> str",
|
|
1017
1018
|
"kind": "value",
|
|
1018
1019
|
"type": "str",
|
|
1019
|
-
"value": "'0.34.0'"
|
|
1020
|
+
"value": "'0.35.0'"
|
|
1020
1021
|
},
|
|
1021
1022
|
"apply_operating_points": {
|
|
1022
1023
|
"doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
|
|
@@ -1203,6 +1204,11 @@
|
|
|
1203
1204
|
"kind": "function",
|
|
1204
1205
|
"signature": "(val_logits: 'np.ndarray', val_labels: 'np.ndarray', bounds: 'tuple[float, float]' = (0.05, 20.0)) -> 'dict[str, float]'"
|
|
1205
1206
|
},
|
|
1207
|
+
"fit_temperature_binary": {
|
|
1208
|
+
"doc_first_line": "Binary-probability adapter for :func:`fit_temperature` (Guo et al. 2017 [#guo]_).",
|
|
1209
|
+
"kind": "function",
|
|
1210
|
+
"signature": "(y_true: 'np.ndarray', y_score: 'np.ndarray', *, bounds: 'tuple[float, float]' = (0.05, 20.0)) -> 'tuple[float, Callable[[np.ndarray], np.ndarray]]'"
|
|
1211
|
+
},
|
|
1206
1212
|
"fit_temperature_oracle": {
|
|
1207
1213
|
"doc_first_line": "**DIAGNOSTIC ONLY** \u2014 fit-on-test oracle T-scaling per Guo et al. 2017 [#guo]_.",
|
|
1208
1214
|
"kind": "function",
|
|
@@ -16,6 +16,7 @@ from eval_toolkit.calibration import (
|
|
|
16
16
|
fit_isotonic_calibrator,
|
|
17
17
|
fit_platt_calibrator,
|
|
18
18
|
fit_temperature,
|
|
19
|
+
fit_temperature_binary,
|
|
19
20
|
fit_temperature_oracle,
|
|
20
21
|
maximum_calibration_error,
|
|
21
22
|
reliability_curve,
|
|
@@ -361,3 +362,128 @@ def test_fit_platt_matches_sklearn_canonical() -> None:
|
|
|
361
362
|
ours_out = ours(grid)
|
|
362
363
|
sk_out = sk_cal.predict(grid)
|
|
363
364
|
np.testing.assert_allclose(ours_out, sk_out, atol=1e-6, rtol=1e-6)
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
# --- fit_temperature_binary (#28) -------------------------------------------------
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
@pytest.mark.unit
|
|
371
|
+
def test_fit_temperature_binary_runs(well_separated: tuple[np.ndarray, np.ndarray]) -> None:
|
|
372
|
+
"""Smoke test: returns positive T + callable; calibrated outputs in (0, 1)."""
|
|
373
|
+
y, s = well_separated
|
|
374
|
+
s_clipped = np.clip(s, 0.01, 0.99)
|
|
375
|
+
T, apply = fit_temperature_binary(y, s_clipped)
|
|
376
|
+
assert T > 0
|
|
377
|
+
out = apply(s_clipped)
|
|
378
|
+
assert out.shape == s_clipped.shape # scalar (n,) in/out contract
|
|
379
|
+
assert (out > 0.0).all() and (out < 1.0).all()
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
@pytest.mark.unit
|
|
383
|
+
def test_fit_temperature_binary_shape_contract() -> None:
|
|
384
|
+
"""Apply returns shape (n,), never (n, 2). Guards against 2-col regressions."""
|
|
385
|
+
rng = np.random.default_rng(0)
|
|
386
|
+
y = rng.binomial(1, 0.3, size=200).astype(int)
|
|
387
|
+
s = np.clip(y * 0.6 + rng.normal(0, 0.2, 200), 0.01, 0.99)
|
|
388
|
+
_, apply = fit_temperature_binary(y, s)
|
|
389
|
+
for shape in [(1,), (3,), (50,)]:
|
|
390
|
+
p_test = rng.uniform(0.05, 0.95, size=shape)
|
|
391
|
+
assert apply(p_test).shape == shape
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
@pytest.mark.unit
|
|
395
|
+
def test_fit_temperature_binary_handles_extremes() -> None:
|
|
396
|
+
"""Probas at exactly 0 and 1 produce finite outputs (clipping covers the logit pole).
|
|
397
|
+
|
|
398
|
+
Contract: ``logit(0)`` and ``logit(1)`` are infinite, but the internal
|
|
399
|
+
clipping to ``[1e-7, 1-1e-7]`` keeps the math finite. Outputs may hit the
|
|
400
|
+
float64 boundary (0.0 or 1.0) at extreme inputs with small T — that is
|
|
401
|
+
correct behavior, not a violation. The real failure mode this test guards
|
|
402
|
+
against is ``inf`` / ``nan`` in either fit or apply.
|
|
403
|
+
"""
|
|
404
|
+
rng = np.random.default_rng(0)
|
|
405
|
+
n = 200
|
|
406
|
+
y = rng.binomial(1, 0.5, size=n).astype(int)
|
|
407
|
+
s = y.astype(float) # exact 0s and 1s in val data
|
|
408
|
+
T, apply = fit_temperature_binary(y, s)
|
|
409
|
+
assert np.isfinite(T)
|
|
410
|
+
# Apply to extremes — must be finite + in [0, 1] (boundary-inclusive)
|
|
411
|
+
p_test = np.array([0.0, 0.5, 1.0])
|
|
412
|
+
out = apply(p_test)
|
|
413
|
+
assert np.isfinite(out).all()
|
|
414
|
+
assert (out >= 0.0).all() and (out <= 1.0).all()
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
@pytest.mark.unit
|
|
418
|
+
def test_fit_temperature_binary_parity_with_multiclass() -> None:
|
|
419
|
+
"""fit_temperature_binary(y, p) matches manual fit_temperature(2-col-logits, y).
|
|
420
|
+
|
|
421
|
+
Establishes the contract that the binary adapter is a thin wrapper, not a
|
|
422
|
+
re-implementation: identical T, identical applied probabilities.
|
|
423
|
+
"""
|
|
424
|
+
rng = np.random.default_rng(7)
|
|
425
|
+
n = 400
|
|
426
|
+
y = rng.binomial(1, 0.4, size=n).astype(int)
|
|
427
|
+
p_val = np.clip(y * 0.5 + rng.normal(0, 0.25, n), 0.01, 0.99)
|
|
428
|
+
p_test = rng.uniform(0.05, 0.95, size=50)
|
|
429
|
+
|
|
430
|
+
T_binary, apply_binary = fit_temperature_binary(y, p_val)
|
|
431
|
+
|
|
432
|
+
# Manual multi-class path: build 2-col logits, fit T, apply via softmax row 1.
|
|
433
|
+
logit_val = np.log(p_val / (1.0 - p_val))
|
|
434
|
+
val_logits_2col = np.column_stack([np.zeros_like(logit_val), logit_val])
|
|
435
|
+
result_mc = fit_temperature(val_logits_2col, y)
|
|
436
|
+
T_mc = result_mc["temperature"]
|
|
437
|
+
|
|
438
|
+
logit_test = np.log(p_test / (1.0 - p_test))
|
|
439
|
+
test_logits_2col = np.column_stack([np.zeros_like(logit_test), logit_test]) / T_mc
|
|
440
|
+
# softmax row 1 = exp(z1) / (exp(0) + exp(z1)) = sigmoid(z1)
|
|
441
|
+
expected = 1.0 / (1.0 + np.exp(-test_logits_2col[:, 1]))
|
|
442
|
+
|
|
443
|
+
assert T_binary == pytest.approx(T_mc, rel=1e-9)
|
|
444
|
+
np.testing.assert_allclose(apply_binary(p_test), expected, rtol=1e-9, atol=1e-12)
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
@pytest.mark.unit
|
|
448
|
+
def test_fit_temperature_binary_improves_nll() -> None:
|
|
449
|
+
"""T_post NLL ≤ T_pre NLL (T=1 is always a feasible point in the bracket)."""
|
|
450
|
+
rng = np.random.default_rng(0)
|
|
451
|
+
n = 500
|
|
452
|
+
y = rng.binomial(1, 0.4, size=n).astype(int)
|
|
453
|
+
# Overconfident probabilities: push away from 0.5
|
|
454
|
+
raw = y * 0.7 + rng.normal(0, 0.15, n)
|
|
455
|
+
p = np.clip(0.5 + 2.5 * (raw - 0.5), 0.01, 0.99)
|
|
456
|
+
T, apply = fit_temperature_binary(y, p)
|
|
457
|
+
eps = 1e-12
|
|
458
|
+
|
|
459
|
+
def _binary_nll(probs: np.ndarray, labels: np.ndarray) -> float:
|
|
460
|
+
c = np.clip(probs, eps, 1 - eps)
|
|
461
|
+
return float(-(labels * np.log(c) + (1 - labels) * np.log(1 - c)).mean())
|
|
462
|
+
|
|
463
|
+
nll_pre = _binary_nll(p, y)
|
|
464
|
+
nll_post = _binary_nll(apply(p), y)
|
|
465
|
+
assert nll_post <= nll_pre + 1e-9
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
@pytest.mark.unit
|
|
469
|
+
def test_fit_temperature_binary_validates() -> None:
|
|
470
|
+
"""Error paths inherit from _validate_calibrator_inputs."""
|
|
471
|
+
with pytest.raises(ValueError, match="shape mismatch"):
|
|
472
|
+
fit_temperature_binary(np.zeros(5, dtype=int), np.zeros(7))
|
|
473
|
+
with pytest.raises(ValueError, match="empty"):
|
|
474
|
+
fit_temperature_binary(np.array([], dtype=int), np.array([]))
|
|
475
|
+
with pytest.raises(ValueError, match="NaN or inf"):
|
|
476
|
+
fit_temperature_binary(np.array([0, 1, 0, 1]), np.array([0.1, np.nan, 0.3, 0.7]))
|
|
477
|
+
with pytest.raises(ValueError, match="both classes"):
|
|
478
|
+
fit_temperature_binary(np.ones(50, dtype=int), np.linspace(0.1, 0.9, 50))
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
@pytest.mark.unit
|
|
482
|
+
def test_fit_temperature_binary_apply_rejects_nonfinite() -> None:
|
|
483
|
+
"""Apply rejects non-finite test-time scores (does not silently mask)."""
|
|
484
|
+
rng = np.random.default_rng(0)
|
|
485
|
+
y = rng.binomial(1, 0.3, size=200).astype(int)
|
|
486
|
+
s = np.clip(y * 0.6 + rng.normal(0, 0.2, 200), 0.01, 0.99)
|
|
487
|
+
_, apply = fit_temperature_binary(y, s)
|
|
488
|
+
with pytest.raises(ValueError, match="NaN or inf"):
|
|
489
|
+
apply(np.array([0.5, np.nan, 0.7]))
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png
RENAMED
|
File without changes
|
{eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png
RENAMED
|
File without changes
|
|
File without changes
|
{eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png
RENAMED
|
File without changes
|
|
File without changes
|
{eval_toolkit-0.34.0 → eval_toolkit-0.35.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|