eval-toolkit 0.46.0__tar.gz → 0.46.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/.gitignore +10 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/CHANGELOG.md +65 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/PKG-INFO +1 -1
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/docs/source/adr/README.md +3 -1
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/__init__.py +100 -19
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/_version.py +1 -1
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/metric_specs.py +35 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/golden/public_api/snapshot.json +1 -1
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_deprecated_scalars_shim.py +157 -7
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_scorecard.py +58 -13
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/LICENSE +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/README.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/STYLE.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/docs/archive/README.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/docs/research/README.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/pyproject.toml +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/_scorecard.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/adversarial.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/preprocessing.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/stacking.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/conftest.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/strategies.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_adversarial.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_analysis.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_artifacts.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_claims.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_claims_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_cli.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_config.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_deprecations.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_docs_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_embeddings.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_is_metric_defined_for_slice.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_leakage.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_loaders.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_logging.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_losses.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_manifest.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_operating_points.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_parallel.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_paths.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_preprocessing.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_probes.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_provenance.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_public_api.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_schemas.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_seeds.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_splits.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_splits_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_stacking.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_thresholds.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_tokenization_leakage_check.py +0 -0
- {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_v09_contracts.py +0 -0
|
@@ -45,6 +45,16 @@ coverage.json
|
|
|
45
45
|
# Mutation-testing output (mutmut / cargo-mutants — local run artifacts)
|
|
46
46
|
mutants/
|
|
47
47
|
|
|
48
|
+
# Local audit artifacts (Round 5+ Gate 3 LLM cross-review packets + reports).
|
|
49
|
+
# The canonical prompt lives at ~/.claude/plans/gate3-audit-prompt.md and the
|
|
50
|
+
# canonical findings ledger lives at docs/source/audit_findings.md; per-run
|
|
51
|
+
# raw model outputs are author-local working copies.
|
|
52
|
+
# Tracked: per-round briefing files (`gate3-audit-round-<N>.md`).
|
|
53
|
+
# Untracked: prompt template, generic report, per-round report files.
|
|
54
|
+
gate3-audit-prompt.md
|
|
55
|
+
gate3-audit-report.md
|
|
56
|
+
gate3-audit-round-*-report.md
|
|
57
|
+
|
|
48
58
|
# Claude Code project settings (machine-local)
|
|
49
59
|
.claude/
|
|
50
60
|
|
|
@@ -5,6 +5,71 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.46.1] — 2026-05-21 — Round 6 hotfix: ECE strategy validation + deprecation warning content
|
|
9
|
+
|
|
10
|
+
Hotfix release per **Decision Q** (data correctness regression + time-sensitive
|
|
11
|
+
warning content) + **Decision R6-E** (scope: R6-F1 + R6-F2 only; R6-A docstring
|
|
12
|
+
rolls forward to v0.47). All other Round 6 findings dispositioned to v0.47.0.
|
|
13
|
+
|
|
14
|
+
See [`docs/source/audit_findings.md`](docs/source/audit_findings.md) Round 6 for
|
|
15
|
+
the full disposition ledger.
|
|
16
|
+
|
|
17
|
+
### Fixed
|
|
18
|
+
|
|
19
|
+
- **`metric_specs.ece(strategy=<value>)` strategy validation** (Round 6 Codex
|
|
20
|
+
R6-F1). Prior to v0.46.1, an invalid strategy string (e.g.
|
|
21
|
+
`metric_specs.ece(strategy="typo")`) silently dispatched to quantile ECE and
|
|
22
|
+
returned a `scorecard()` cell with `status="ok"` under an invalid encoded key
|
|
23
|
+
(`"ece_n_bins_15_strategy_typo"`) — wrong-by-design data correctness path.
|
|
24
|
+
Verified by Codex via runtime probe. Now both the `ece()` factory and
|
|
25
|
+
`_EceSpec.compute()` raise:
|
|
26
|
+
```
|
|
27
|
+
ValueError: ECE strategy must be 'uniform' or 'quantile'; got 'typo'
|
|
28
|
+
```
|
|
29
|
+
Defence-in-depth: the factory validates eagerly (before LRU cache hit) AND
|
|
30
|
+
`compute()` validates at the compute boundary so direct construction of
|
|
31
|
+
`_EceSpec(strategy="typo")` (bypassing the factory) also raises.
|
|
32
|
+
|
|
33
|
+
- **Deprecation warning content for all 5 ECE variants** (Round 6 Codex R6-F2 +
|
|
34
|
+
Gemini R6-F2, with Decisions R6-F + R6-G). The v0.46.0 `__getattr__`
|
|
35
|
+
deprecation shim's warning messages produced broken migration snippets:
|
|
36
|
+
- For `expected_calibration_error` + `expected_calibration_error_equal_mass`:
|
|
37
|
+
the suggested `Scorecard` lookup key was the factory-call expression
|
|
38
|
+
(`"ece(n_bins=10)"`) instead of the encoded spec name
|
|
39
|
+
(`"ece_n_bins_10_strategy_uniform"`). Now uses the correct encoded key.
|
|
40
|
+
- For `expected_calibration_error_debiased` / `_l2` / `_l2_debiased`: these
|
|
41
|
+
variants are not in the v0.46 `metric_specs` namespace (Decision R6-G;
|
|
42
|
+
research-completeness primitives, deferred to v1.x if user demand
|
|
43
|
+
surfaces). Their warnings now point at the submodule path
|
|
44
|
+
(`from eval_toolkit.metrics import expected_calibration_error_debiased`)
|
|
45
|
+
instead of an unconstructable scorecard snippet.
|
|
46
|
+
- Pre-v0.46 default verification: Gemini's report claimed
|
|
47
|
+
`expected_calibration_error` defaulted to `n_bins=15`; verified against
|
|
48
|
+
`metrics.py:730-734` that the actual default is `n_bins=10`. Per Decision
|
|
49
|
+
R6-F, warning snippets use `n_bins=10` to preserve bit-identical pre-v0.46
|
|
50
|
+
math + add a migration note explaining the new `metric_specs.ece()` factory
|
|
51
|
+
default of `n_bins=15` (matching Hines et al.).
|
|
52
|
+
|
|
53
|
+
### Tests
|
|
54
|
+
|
|
55
|
+
- `tests/test_scorecard.py`: 4 new tests for ECE strategy validation
|
|
56
|
+
(parametrized factory-rejection + compute-defence-in-depth).
|
|
57
|
+
- `tests/test_deprecated_scalars_shim.py`: 4 new test classes — verify each
|
|
58
|
+
warning contains correct factory expression + encoded scorecard key, ECE
|
|
59
|
+
warnings carry the n_bins=10/15 migration note, submodule-only warnings cite
|
|
60
|
+
`eval_toolkit.metrics` path, and the snippet in each first-party warning is
|
|
61
|
+
EXECUTABLE (parses + runs against synthetic data + produces ok-status cell).
|
|
62
|
+
|
|
63
|
+
### Rolled forward to v0.47 (Decision R6-E)
|
|
64
|
+
|
|
65
|
+
- R6-A `seed=None` docstring fix (non-blocker per Decision Q).
|
|
66
|
+
- R6-F3 duplicate `MetricSpec.name` rejection.
|
|
67
|
+
- R6-F5 (Codex) Protocol method-shape drift guard.
|
|
68
|
+
- R6-F3 (Gemini) `Scorecard.to_pandas()` schema expansion.
|
|
69
|
+
- R6-F4 (Gemini) `make_spec_name()` helper.
|
|
70
|
+
- R6-F5 (Gemini) narrow `_evaluate_spec()` exception catch.
|
|
71
|
+
- R6-F6 (Codex) plan + roadmap state-drift refresh.
|
|
72
|
+
|
|
8
73
|
## [0.46.0] — 2026-05-21 — Scorecard: primary v1.0 metric surface (closes #36)
|
|
9
74
|
|
|
10
75
|
Second minor of the staggered v0.45 → v0.46 → v0.47 → v0.48 → v1.0 sequence.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 0.46.0
|
|
3
|
+
Version: 0.46.1
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -73,4 +73,6 @@ What would have to change for this decision to be reopened?
|
|
|
73
73
|
|
|
74
74
|
| # | Title | Status | Date |
|
|
75
75
|
|---|---|---|---|
|
|
76
|
-
|
|
|
76
|
+
| [0001](0001-flat-module-layout.md) | Flat single-file modules through v1.x | Accepted | 2026-05-21 |
|
|
77
|
+
| [0002](0002-scorecard-as-primary-metric-surface.md) | `scorecard()` as the primary v1.0 metric surface | Accepted | 2026-05-21 |
|
|
78
|
+
| [0003](0003-stability-contract-and-gate3-methodology.md) | v1.0 stability contract + Gate 3 methodology | Accepted | 2026-05-21 |
|
|
@@ -342,10 +342,7 @@ def __getattr__(name: str) -> Any:
|
|
|
342
342
|
import warnings
|
|
343
343
|
|
|
344
344
|
warnings.warn(
|
|
345
|
-
|
|
346
|
-
f"Use `scorecard(y, s, metrics=[metric_specs.{_scorecard_spec_for(name)}])"
|
|
347
|
-
f'["{_scorecard_spec_for(name)}"].value` instead, or "import from the'
|
|
348
|
-
f" `eval_toolkit.metrics` submodule directly (internal API).",
|
|
345
|
+
_deprecation_warning_for(name),
|
|
349
346
|
DeprecationWarning,
|
|
350
347
|
stacklevel=2,
|
|
351
348
|
)
|
|
@@ -366,23 +363,107 @@ def __getattr__(name: str) -> Any:
|
|
|
366
363
|
|
|
367
364
|
|
|
368
365
|
# ── BEGIN TRANSITIONAL DEPRECATION HELPER (Decision L; REMOVE AT v0.47) ──
|
|
369
|
-
|
|
370
|
-
|
|
366
|
+
#
|
|
367
|
+
# Per Round 6 audit (Codex R6-F2 + Gemini R6-F2; Decisions R6-F + R6-G):
|
|
368
|
+
# - For deprecated scalars with a first-party `metric_specs` equivalent, the
|
|
369
|
+
# warning emits an EXECUTABLE scorecard snippet (factory expression + the
|
|
370
|
+
# correct encoded scorecard key, not the factory call string).
|
|
371
|
+
# - For the 3 ECE variants without a `metric_specs` equivalent
|
|
372
|
+
# (expected_calibration_error_debiased / _l2 / _l2_debiased), the warning
|
|
373
|
+
# instead points at the submodule path per Decision R6-G — no first-party
|
|
374
|
+
# replacement is shipped at v0.47.
|
|
375
|
+
# - ECE `n_bins=10` preserves the pre-v0.46 default (verified at
|
|
376
|
+
# `metrics.py:730-734`) — Decision R6-F. A migration note explains that
|
|
377
|
+
# the v0.46+ `metric_specs.ece()` factory defaults to `n_bins=15` (matching
|
|
378
|
+
# Hines et al.) and how to opt in.
|
|
379
|
+
_FirstParty = tuple[str, str] # (factory_expression, scorecard_key)
|
|
380
|
+
"""Type alias for a deprecated-scalar that has a metric_specs replacement.
|
|
381
|
+
|
|
382
|
+
The factory expression is what the user types after ``metric_specs.``; the
|
|
383
|
+
scorecard key is the literal string that indexes ``Scorecard``.
|
|
384
|
+
"""
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
_FIRST_PARTY_REPLACEMENTS: dict[str, _FirstParty] = {
|
|
388
|
+
"pr_auc": ("pr_auc", "pr_auc"),
|
|
389
|
+
"roc_auc": ("roc_auc", "roc_auc"),
|
|
390
|
+
"brier_score": ("brier", "brier"),
|
|
391
|
+
# ECE variants: use n_bins=10 (pre-v0.46 default per Decision R6-F).
|
|
392
|
+
# The migration note in the warning text explains how to switch to
|
|
393
|
+
# n_bins=15 if the user wants the v0.46+ metric_specs.ece() default.
|
|
394
|
+
"expected_calibration_error": (
|
|
395
|
+
"ece(n_bins=10)",
|
|
396
|
+
"ece_n_bins_10_strategy_uniform",
|
|
397
|
+
),
|
|
398
|
+
"expected_calibration_error_equal_mass": (
|
|
399
|
+
'ece(n_bins=10, strategy="quantile")',
|
|
400
|
+
"ece_n_bins_10_strategy_quantile",
|
|
401
|
+
),
|
|
402
|
+
}
|
|
403
|
+
"""Names that have a first-party metric_specs replacement at v0.46.
|
|
404
|
+
|
|
405
|
+
The 3 ECE variants NOT in this map (_debiased, _l2, _l2_debiased) get the
|
|
406
|
+
submodule-path warning template instead (Decision R6-G).
|
|
407
|
+
"""
|
|
371
408
|
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
409
|
+
|
|
410
|
+
def _deprecation_warning_for(name: str) -> str:
|
|
411
|
+
"""Render the DeprecationWarning message for a deprecated scalar name.
|
|
412
|
+
|
|
413
|
+
Branches on whether ``name`` has a first-party `metric_specs` replacement
|
|
414
|
+
(Decision R6-G):
|
|
415
|
+
|
|
416
|
+
- First-party (5 names): scorecard snippet with the correct encoded key
|
|
417
|
+
(Decision R6-F).
|
|
418
|
+
- Submodule-only (3 ECE variants): point at the submodule path per
|
|
419
|
+
Decision R6-G.
|
|
420
|
+
|
|
421
|
+
The first-party variants for ECE include a migration note explaining the
|
|
422
|
+
new ``metric_specs.ece()`` factory default of ``n_bins=15`` so users can
|
|
423
|
+
opt in to the new convention; the snippet itself uses ``n_bins=10`` for
|
|
424
|
+
bit-identical pre-v0.46 math (Decision R6-F).
|
|
425
|
+
|
|
426
|
+
Parameters
|
|
427
|
+
----------
|
|
428
|
+
name : str
|
|
429
|
+
A name in ``_DEPRECATED_SCALARS``.
|
|
430
|
+
|
|
431
|
+
Returns
|
|
432
|
+
-------
|
|
433
|
+
str
|
|
434
|
+
The warning message, ready to pass to ``warnings.warn``.
|
|
378
435
|
"""
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
436
|
+
first_party = _FIRST_PARTY_REPLACEMENTS.get(name)
|
|
437
|
+
if first_party is not None:
|
|
438
|
+
factory_expr, scorecard_key = first_party
|
|
439
|
+
msg = (
|
|
440
|
+
f"eval_toolkit.{name} is deprecated and will be removed in v0.47. "
|
|
441
|
+
f"For the same math, use:\n"
|
|
442
|
+
f" scorecard(y, s, metrics=[metric_specs.{factory_expr}])"
|
|
443
|
+
f'["{scorecard_key}"].value\n'
|
|
444
|
+
f"Or import from the eval_toolkit.metrics submodule directly "
|
|
445
|
+
f"(internal API per ADR 0002 — stable across v1.x, subject to "
|
|
446
|
+
f"refactor in major versions)."
|
|
447
|
+
)
|
|
448
|
+
# ECE-specific migration note about the n_bins default change.
|
|
449
|
+
if name.startswith("expected_calibration_error"):
|
|
450
|
+
msg += (
|
|
451
|
+
"\nNote: the v0.46+ metric_specs.ece() factory defaults to "
|
|
452
|
+
"n_bins=15 (matching Hines et al.); the n_bins=10 in this "
|
|
453
|
+
"snippet preserves the pre-v0.46 math. Pass n_bins=15 to use "
|
|
454
|
+
"the new convention."
|
|
455
|
+
)
|
|
456
|
+
return msg
|
|
457
|
+
# Decision R6-G: 3 ECE variants without first-party replacements →
|
|
458
|
+
# submodule path only.
|
|
459
|
+
return (
|
|
460
|
+
f"eval_toolkit.{name} is deprecated and will be removed in v0.47. "
|
|
461
|
+
f"This variant is NOT in v0.46+ metric_specs. Use:\n"
|
|
462
|
+
f" from eval_toolkit.metrics import {name}\n"
|
|
463
|
+
f"(internal API per ADR 0002 — stable across v1.x, subject to "
|
|
464
|
+
f"refactor in major versions). Or contribute the variant to "
|
|
465
|
+
f"metric_specs if you use it regularly."
|
|
466
|
+
)
|
|
386
467
|
|
|
387
468
|
|
|
388
469
|
# ── END TRANSITIONAL DEPRECATION HELPER (Decision L; REMOVE AT v0.47) ──
|
|
@@ -118,6 +118,23 @@ brier: MetricSpec = _BrierSpec()
|
|
|
118
118
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
119
119
|
|
|
120
120
|
|
|
121
|
+
# Valid strategy values for ECE specs. Locked at v0.46.1 to prevent the
|
|
122
|
+
# Round 6 R6-F1 footgun where `ece(strategy="typo")` silently dispatched to
|
|
123
|
+
# quantile ECE and returned a scorecard cell with status="ok" under an
|
|
124
|
+
# invalid key. See `docs/source/audit_findings.md` Round 6.
|
|
125
|
+
_ECE_VALID_STRATEGIES: frozenset[str] = frozenset({"uniform", "quantile"})
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _validate_ece_strategy(strategy: str) -> None:
|
|
129
|
+
"""Validate ECE strategy value; raise ValueError with context if invalid.
|
|
130
|
+
|
|
131
|
+
Shared between the factory (eager validation) and ``_EceSpec.compute`` (defence in
|
|
132
|
+
depth for direct construction paths that bypass the factory).
|
|
133
|
+
"""
|
|
134
|
+
if strategy not in _ECE_VALID_STRATEGIES:
|
|
135
|
+
raise ValueError(f"ECE strategy must be 'uniform' or 'quantile'; got {strategy!r}")
|
|
136
|
+
|
|
137
|
+
|
|
121
138
|
@dataclass(frozen=True, slots=True)
|
|
122
139
|
class _EceSpec:
|
|
123
140
|
"""Internal :class:`MetricSpec` for expected calibration error.
|
|
@@ -135,6 +152,10 @@ class _EceSpec:
|
|
|
135
152
|
return f"ece_n_bins_{self.n_bins}_strategy_{self.strategy}"
|
|
136
153
|
|
|
137
154
|
def compute(self, y_true: np.ndarray, y_score: np.ndarray) -> float:
|
|
155
|
+
# Defence-in-depth strategy validation — the factory validates first,
|
|
156
|
+
# but a caller bypassing the factory and constructing `_EceSpec` directly
|
|
157
|
+
# would otherwise produce a wrong-metric scorecard cell silently.
|
|
158
|
+
_validate_ece_strategy(self.strategy)
|
|
138
159
|
if self.strategy == "uniform":
|
|
139
160
|
return float(_ece_uniform(y_true, y_score, n_bins=self.n_bins))
|
|
140
161
|
return float(_ece_equal_mass(y_true, y_score, n_bins=self.n_bins))
|
|
@@ -178,5 +199,19 @@ def ece(*, n_bins: int = 15, strategy: ECEStrategy = "uniform") -> MetricSpec:
|
|
|
178
199
|
'ece_n_bins_15_strategy_uniform'
|
|
179
200
|
>>> ece(n_bins=10, strategy="quantile").name
|
|
180
201
|
'ece_n_bins_10_strategy_quantile'
|
|
202
|
+
|
|
203
|
+
Invalid strategies raise ``ValueError`` eagerly (v0.46.1+; Round 6 R6-F1
|
|
204
|
+
fix — prior to v0.46.1 this silently dispatched to quantile ECE):
|
|
205
|
+
|
|
206
|
+
>>> ece(strategy="typo")
|
|
207
|
+
Traceback (most recent call last):
|
|
208
|
+
...
|
|
209
|
+
ValueError: ECE strategy must be 'uniform' or 'quantile'; got 'typo'
|
|
210
|
+
|
|
211
|
+
Raises
|
|
212
|
+
------
|
|
213
|
+
ValueError
|
|
214
|
+
If ``strategy`` is not in ``{"uniform", "quantile"}``.
|
|
181
215
|
"""
|
|
216
|
+
_validate_ece_strategy(strategy)
|
|
182
217
|
return _EceSpec(n_bins=n_bins, strategy=strategy)
|
|
@@ -1192,7 +1192,7 @@
|
|
|
1192
1192
|
"doc_first_line": "str(object='') -> str",
|
|
1193
1193
|
"kind": "value",
|
|
1194
1194
|
"type": "str",
|
|
1195
|
-
"value": "'0.46.0'"
|
|
1195
|
+
"value": "'0.46.1'"
|
|
1196
1196
|
},
|
|
1197
1197
|
"apply_operating_points": {
|
|
1198
1198
|
"doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
|
|
@@ -58,7 +58,13 @@ def test_deprecated_names_not_in_exports(name: str) -> None:
|
|
|
58
58
|
@pytest.mark.unit
|
|
59
59
|
@pytest.mark.parametrize("name", sorted(DEPRECATED_SCALARS))
|
|
60
60
|
def test_deprecated_name_emits_warning(name: str) -> None:
|
|
61
|
-
"""Looking up a deprecated name at the top level emits DeprecationWarning."""
|
|
61
|
+
"""Looking up a deprecated name at the top level emits DeprecationWarning.
|
|
62
|
+
|
|
63
|
+
Updated v0.46.1 per Decision R6-G: the 3 ECE variants without first-party
|
|
64
|
+
`metric_specs` equivalents point at the submodule path
|
|
65
|
+
(`from eval_toolkit.metrics import ...`) rather than a scorecard snippet.
|
|
66
|
+
The other 5 first-party-replaceable names use the scorecard snippet.
|
|
67
|
+
"""
|
|
62
68
|
with warnings.catch_warnings(record=True) as caught:
|
|
63
69
|
warnings.simplefilter("always")
|
|
64
70
|
_ = getattr(eval_toolkit, name)
|
|
@@ -66,9 +72,16 @@ def test_deprecated_name_emits_warning(name: str) -> None:
|
|
|
66
72
|
assert (
|
|
67
73
|
len(deprecations) >= 1
|
|
68
74
|
), f"expected DeprecationWarning for {name}; got {[w.category.__name__ for w in caught]}"
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
assert
|
|
75
|
+
msg = str(deprecations[0].message)
|
|
76
|
+
# Universal assertions for ALL deprecated names:
|
|
77
|
+
assert name in msg
|
|
78
|
+
assert "v0.47" in msg
|
|
79
|
+
# Per-name-class assertions: scorecard for first-party, submodule for the rest.
|
|
80
|
+
if name in _EXPECTED_SUBMODULE_ONLY:
|
|
81
|
+
assert "eval_toolkit.metrics" in msg
|
|
82
|
+
assert "NOT in v0.46+ metric_specs" in msg
|
|
83
|
+
else:
|
|
84
|
+
assert "scorecard" in msg
|
|
72
85
|
|
|
73
86
|
|
|
74
87
|
@pytest.mark.unit
|
|
@@ -76,7 +89,7 @@ def test_deprecated_pr_auc_still_functional() -> None:
|
|
|
76
89
|
"""The returned function still works — only the WAY it's imported is deprecated."""
|
|
77
90
|
with warnings.catch_warnings():
|
|
78
91
|
warnings.simplefilter("ignore", DeprecationWarning)
|
|
79
|
-
pr_auc = eval_toolkit.pr_auc
|
|
92
|
+
pr_auc = eval_toolkit.pr_auc
|
|
80
93
|
y = np.array([0, 1, 0, 1, 1, 0, 1, 0])
|
|
81
94
|
s = np.array([0.2, 0.8, 0.3, 0.7, 0.9, 0.1, 0.6, 0.4])
|
|
82
95
|
assert 0.0 <= pr_auc(y, s) <= 1.0
|
|
@@ -86,7 +99,7 @@ def test_deprecated_pr_auc_still_functional() -> None:
|
|
|
86
99
|
def test_deprecated_brier_score_still_functional() -> None:
|
|
87
100
|
with warnings.catch_warnings():
|
|
88
101
|
warnings.simplefilter("ignore", DeprecationWarning)
|
|
89
|
-
brier_score = eval_toolkit.brier_score
|
|
102
|
+
brier_score = eval_toolkit.brier_score
|
|
90
103
|
y = np.array([0, 1, 0, 1])
|
|
91
104
|
s = np.array([0.1, 0.9, 0.2, 0.8])
|
|
92
105
|
assert 0.0 <= brier_score(y, s) <= 1.0
|
|
@@ -170,7 +183,7 @@ def test_full_all_resolves_without_attribute_error() -> None:
|
|
|
170
183
|
def test_unknown_name_still_raises_attribute_error() -> None:
|
|
171
184
|
"""The deprecation branch must not swallow unknown-name errors."""
|
|
172
185
|
with pytest.raises(AttributeError, match="no attribute"):
|
|
173
|
-
_ = eval_toolkit.nonexistent_symbol_xyz
|
|
186
|
+
_ = eval_toolkit.nonexistent_symbol_xyz
|
|
174
187
|
|
|
175
188
|
|
|
176
189
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -182,3 +195,140 @@ def test_unknown_name_still_raises_attribute_error() -> None:
|
|
|
182
195
|
def test_deprecated_scalars_set_matches() -> None:
|
|
183
196
|
"""The internal `_DEPRECATED_SCALARS` set lines up with this test's expectations."""
|
|
184
197
|
assert eval_toolkit._DEPRECATED_SCALARS == DEPRECATED_SCALARS
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
201
|
+
# v0.46.1 — Round 6 R6-F2 + R6-F + R6-G: warning snippet content & executability
|
|
202
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
# First-party replacements that should appear in warning snippets verbatim.
|
|
206
|
+
# (factory_expression, scorecard_key) per deprecated name. Matches
|
|
207
|
+
# `eval_toolkit._FIRST_PARTY_REPLACEMENTS`.
|
|
208
|
+
_EXPECTED_FIRST_PARTY: dict[str, tuple[str, str]] = {
|
|
209
|
+
"pr_auc": ("pr_auc", "pr_auc"),
|
|
210
|
+
"roc_auc": ("roc_auc", "roc_auc"),
|
|
211
|
+
"brier_score": ("brier", "brier"),
|
|
212
|
+
"expected_calibration_error": ("ece(n_bins=10)", "ece_n_bins_10_strategy_uniform"),
|
|
213
|
+
"expected_calibration_error_equal_mass": (
|
|
214
|
+
'ece(n_bins=10, strategy="quantile")',
|
|
215
|
+
"ece_n_bins_10_strategy_quantile",
|
|
216
|
+
),
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
# ECE variants without first-party metric_specs equivalents (Decision R6-G).
|
|
220
|
+
_EXPECTED_SUBMODULE_ONLY: frozenset[str] = frozenset(
|
|
221
|
+
{
|
|
222
|
+
"expected_calibration_error_debiased",
|
|
223
|
+
"expected_calibration_error_l2",
|
|
224
|
+
"expected_calibration_error_l2_debiased",
|
|
225
|
+
}
|
|
226
|
+
)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _capture_warning_message(name: str) -> str:
|
|
230
|
+
"""Trigger the deprecation shim for `name` and return the rendered message."""
|
|
231
|
+
with warnings.catch_warnings(record=True) as caught:
|
|
232
|
+
warnings.simplefilter("always")
|
|
233
|
+
getattr(eval_toolkit, name)
|
|
234
|
+
deprecations = [w for w in caught if issubclass(w.category, DeprecationWarning)]
|
|
235
|
+
assert deprecations, f"no DeprecationWarning emitted for {name}"
|
|
236
|
+
return str(deprecations[0].message)
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
@pytest.mark.unit
|
|
240
|
+
@pytest.mark.parametrize("name", sorted(_EXPECTED_FIRST_PARTY))
|
|
241
|
+
def test_first_party_warning_contains_correct_snippet(name: str) -> None:
|
|
242
|
+
"""First-party replacements emit scorecard snippet with the encoded key.
|
|
243
|
+
|
|
244
|
+
Round 6 R6-F2: prior warnings used the factory-call expression
|
|
245
|
+
(e.g. ``"ece(n_bins=10)"``) as the scorecard lookup key. The shipped
|
|
246
|
+
Scorecard is a Mapping keyed by the encoded spec name
|
|
247
|
+
(e.g. ``"ece_n_bins_10_strategy_uniform"``). The v0.46.1 fix uses the
|
|
248
|
+
correct encoded key inline so blindly-copied snippets actually work.
|
|
249
|
+
"""
|
|
250
|
+
factory_expr, scorecard_key = _EXPECTED_FIRST_PARTY[name]
|
|
251
|
+
msg = _capture_warning_message(name)
|
|
252
|
+
assert f"metric_specs.{factory_expr}" in msg
|
|
253
|
+
assert f'["{scorecard_key}"]' in msg
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
@pytest.mark.unit
|
|
257
|
+
def test_ece_first_party_warnings_carry_n_bins_10_migration_note() -> None:
|
|
258
|
+
"""ECE first-party warnings preserve pre-v0.46 default (n_bins=10) + nudge.
|
|
259
|
+
|
|
260
|
+
Per Decision R6-F: pre-v0.46 `expected_calibration_error` defaulted to
|
|
261
|
+
n_bins=10; v0.46+ `metric_specs.ece()` defaults to n_bins=15. The
|
|
262
|
+
warning snippet uses n_bins=10 for bit-identical math; an appended note
|
|
263
|
+
explains the new convention.
|
|
264
|
+
"""
|
|
265
|
+
for name in ("expected_calibration_error", "expected_calibration_error_equal_mass"):
|
|
266
|
+
msg = _capture_warning_message(name)
|
|
267
|
+
assert "n_bins=10" in msg
|
|
268
|
+
# Migration note about the new default:
|
|
269
|
+
assert "n_bins=15" in msg
|
|
270
|
+
assert "Hines" in msg or "new convention" in msg
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
@pytest.mark.unit
|
|
274
|
+
@pytest.mark.parametrize("name", sorted(_EXPECTED_SUBMODULE_ONLY))
|
|
275
|
+
def test_submodule_only_warning_points_at_submodule_path(name: str) -> None:
|
|
276
|
+
"""The 3 ECE variants without first-party specs route users to the submodule.
|
|
277
|
+
|
|
278
|
+
Per Decision R6-G: `expected_calibration_error_debiased` / `_l2` /
|
|
279
|
+
`_l2_debiased` are research-completeness primitives without
|
|
280
|
+
`metric_specs` equivalents at v0.46. Their warnings cite
|
|
281
|
+
`eval_toolkit.metrics.<name>` rather than a scorecard snippet.
|
|
282
|
+
"""
|
|
283
|
+
msg = _capture_warning_message(name)
|
|
284
|
+
assert f"from eval_toolkit.metrics import {name}" in msg
|
|
285
|
+
assert "NOT in v0.46+ metric_specs" in msg
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
@pytest.mark.unit
|
|
289
|
+
@pytest.mark.parametrize("name", sorted(_EXPECTED_FIRST_PARTY))
|
|
290
|
+
def test_first_party_warning_snippet_is_executable(name: str) -> None:
|
|
291
|
+
"""The scorecard snippet in the warning produces a usable MetricResult.
|
|
292
|
+
|
|
293
|
+
Parses the snippet, executes it against a synthetic balanced slice, and
|
|
294
|
+
asserts that the resulting `MetricResult` has `status="ok"` and finite
|
|
295
|
+
`value`. This is the user-facing migration contract: copy the snippet,
|
|
296
|
+
run it, get a number.
|
|
297
|
+
"""
|
|
298
|
+
|
|
299
|
+
from eval_toolkit import metric_specs as ms
|
|
300
|
+
from eval_toolkit import scorecard # noqa: F401
|
|
301
|
+
|
|
302
|
+
msg = _capture_warning_message(name)
|
|
303
|
+
factory_expr, scorecard_key = _EXPECTED_FIRST_PARTY[name]
|
|
304
|
+
|
|
305
|
+
# Build the snippet that the warning instructs the user to use:
|
|
306
|
+
# scorecard(y, s, metrics=[metric_specs.<factory_expr>])["<key>"].value
|
|
307
|
+
rng = np.random.default_rng(0)
|
|
308
|
+
y = rng.integers(0, 2, 200)
|
|
309
|
+
s = rng.random(200)
|
|
310
|
+
|
|
311
|
+
snippet = (
|
|
312
|
+
f"scorecard(y, s, metrics=[ms.{factory_expr}], bootstrap=False)" f'["{scorecard_key}"]'
|
|
313
|
+
)
|
|
314
|
+
# Confirm the warning actually contains the snippet shape it promises:
|
|
315
|
+
assert f"metric_specs.{factory_expr}" in msg
|
|
316
|
+
# Evaluate (safe — we constructed factory_expr from the known mapping):
|
|
317
|
+
cell = eval(snippet, {"scorecard": scorecard, "ms": ms, "y": y, "s": s}) # noqa: S307
|
|
318
|
+
assert cell.status == "ok", f"snippet for {name}: {cell.status} (reason: {cell.reason})"
|
|
319
|
+
assert cell.value is not None
|
|
320
|
+
assert isinstance(cell.value, float)
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
@pytest.mark.unit
|
|
324
|
+
@pytest.mark.parametrize("name", sorted(_EXPECTED_SUBMODULE_ONLY))
|
|
325
|
+
def test_submodule_only_snippet_is_importable(name: str) -> None:
|
|
326
|
+
"""The submodule-import snippet in the warning actually imports something callable."""
|
|
327
|
+
import importlib
|
|
328
|
+
|
|
329
|
+
metrics_mod = importlib.import_module("eval_toolkit.metrics")
|
|
330
|
+
assert hasattr(metrics_mod, name), (
|
|
331
|
+
f"warning for {name} promises `from eval_toolkit.metrics import {name}` "
|
|
332
|
+
f"but the symbol isn't present in the submodule"
|
|
333
|
+
)
|
|
334
|
+
assert callable(getattr(metrics_mod, name))
|