eval-toolkit 0.45.0__tar.gz → 0.46.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/CHANGELOG.md +75 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/PKG-INFO +1 -1
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/__init__.py +82 -8
- eval_toolkit-0.46.0/src/eval_toolkit/_scorecard.py +509 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/_version.py +1 -1
- eval_toolkit-0.46.0/src/eval_toolkit/metric_specs.py +182 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/metrics.py +31 -2
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/golden/public_api/snapshot.json +36 -51
- eval_toolkit-0.46.0/tests/test_deprecated_scalars_shim.py +184 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_is_metric_defined_for_slice.py +25 -2
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_misc_coverage.py +13 -2
- eval_toolkit-0.46.0/tests/test_scorecard.py +408 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/.gitignore +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/LICENSE +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/README.md +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/STYLE.md +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/docs/archive/README.md +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/docs/research/README.md +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/docs/source/adr/README.md +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/pyproject.toml +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/adversarial.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/preprocessing.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/stacking.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/conftest.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/strategies.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_adversarial.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_analysis.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_claims.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_cli.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_config.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_embeddings.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_loaders.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_logging.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_losses.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_manifest.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_parallel.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_paths.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_preprocessing.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_probes.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_public_api.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_schemas.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_splits.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_stacking.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_thresholds.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_tokenization_leakage_check.py +0 -0
- {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_v09_contracts.py +0 -0
|
@@ -5,6 +5,81 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.46.0] — 2026-05-21 — Scorecard: primary v1.0 metric surface (closes #36)
|
|
9
|
+
|
|
10
|
+
Second minor of the staggered v0.45 → v0.46 → v0.47 → v0.48 → v1.0 sequence.
|
|
11
|
+
**Soft-breaking** — existing top-level scalar metric imports still work but
|
|
12
|
+
emit `DeprecationWarning` (hard-removed at v0.47).
|
|
13
|
+
|
|
14
|
+
See `docs/source/migration/v0.46.md` for the full consumer migration guide and
|
|
15
|
+
`docs/source/adr/0002-scorecard-as-primary-metric-surface.md` for the
|
|
16
|
+
decision record.
|
|
17
|
+
|
|
18
|
+
### Added
|
|
19
|
+
|
|
20
|
+
- **`eval_toolkit.scorecard(y_true, y_score, metrics=[...], bootstrap=True)`**
|
|
21
|
+
— primary v1.0 metric surface. Single call computes multiple threshold-free
|
|
22
|
+
metrics + bootstrap CIs on one slice; returns a `Scorecard` (read-only
|
|
23
|
+
`Mapping[str, MetricResult]`). Type-safe dict-subscript access; status-aware
|
|
24
|
+
cells; per-cell error isolation.
|
|
25
|
+
- **`MetricSpec` Protocol** — v1.0 Tier-2 contract; `name: str` +
|
|
26
|
+
`compute(y_true, y_score) -> float`. Custom user specs satisfy structurally.
|
|
27
|
+
- **`MetricResult`** frozen dataclass — `value: float | None`, `status:
|
|
28
|
+
Literal["ok", "skipped", "error"]`, `reason: str`, `ci: BootstrapCI | None`.
|
|
29
|
+
Reuses the existing `MetricState` vocabulary from `artifacts.py:30-61`.
|
|
30
|
+
- **`Scorecard`** read-only `Mapping[str, MetricResult]` — `to_dict()`
|
|
31
|
+
JSON-friendly, `to_pandas()` one-row DataFrame (lazy pandas import).
|
|
32
|
+
- **`eval_toolkit.metric_specs`** namespace submodule with threshold-free
|
|
33
|
+
first-party specs:
|
|
34
|
+
- `pr_auc`, `roc_auc`, `brier` — module-level singletons (identity stable).
|
|
35
|
+
- `ece(n_bins, strategy)` — LRU-cached factory (identity stable per kwargs).
|
|
36
|
+
- **`SINGLE_CLASS_INCOMPATIBLE_METRICS`** extended with `pr_auc` / `roc_auc`
|
|
37
|
+
aliases (alongside existing `auroc` / `auprc`) so the v0.46 scorecard
|
|
38
|
+
surface and the v0.39 harness paths both produce correct skipped-status
|
|
39
|
+
behavior. Non-breaking; doctest + unit tests added.
|
|
40
|
+
- **`docs/source/adr/0002-scorecard-as-primary-metric-surface.md`** —
|
|
41
|
+
decision record covering single-surface rationale, threshold-free scope,
|
|
42
|
+
Tier-2 Protocol commitment, and v2.0 trigger conditions.
|
|
43
|
+
- **`docs/source/migration/v0.46.md`** — consumer migration guide with
|
|
44
|
+
side-by-side recipes for every common pattern.
|
|
45
|
+
|
|
46
|
+
### Deprecated
|
|
47
|
+
|
|
48
|
+
The following 8 top-level scalar imports emit `DeprecationWarning` and will
|
|
49
|
+
be hard-removed at v0.47.0. Use `scorecard()` + `metric_specs` or the
|
|
50
|
+
`eval_toolkit.metrics` submodule path (internal API, no warning).
|
|
51
|
+
|
|
52
|
+
- `pr_auc`, `roc_auc`, `brier_score`
|
|
53
|
+
- `expected_calibration_error`
|
|
54
|
+
- `expected_calibration_error_debiased`
|
|
55
|
+
- `expected_calibration_error_equal_mass`
|
|
56
|
+
- `expected_calibration_error_l2`
|
|
57
|
+
- `expected_calibration_error_l2_debiased`
|
|
58
|
+
|
|
59
|
+
### Audit findings integrated (Round 5)
|
|
60
|
+
|
|
61
|
+
Per `docs/source/audit_findings.md`:
|
|
62
|
+
|
|
63
|
+
- **F1** (scorecard threshold semantics) — addressed by Decision R: ship
|
|
64
|
+
threshold-free first-party specs only at v0.46. Threshold-dependent
|
|
65
|
+
metrics (F1, accuracy, precision, recall) deferred to v1.x with explicit
|
|
66
|
+
operating-point provenance.
|
|
67
|
+
- **F2** (scorecard cell-state semantics) — addressed by Decision S: reuse
|
|
68
|
+
existing `MetricState` (`ok`/`skipped`/`error`) vocabulary.
|
|
69
|
+
- **F4** (deprecation shim must extend the lazy resolver, not replace it) —
|
|
70
|
+
addressed: `__getattr__` deprecation branch sits between `__version__`
|
|
71
|
+
short-circuit and the base `_EXPORTS` lookup; tagged with BEGIN/END
|
|
72
|
+
TRANSITIONAL markers for clean v0.47 removal. Tests guard that every
|
|
73
|
+
remaining `_EXPORTS` symbol still resolves.
|
|
74
|
+
- **X.2 precondition** — `is_metric_defined_for_slice` aliases shipped
|
|
75
|
+
ahead of v0.46 (PR #62).
|
|
76
|
+
|
|
77
|
+
### Protocol stability
|
|
78
|
+
|
|
79
|
+
Tier-2 streak continues: 7 of 7 consecutive minors (v0.40–v0.46) without
|
|
80
|
+
method-shape edits to any existing Tier-2 Protocol. `MetricSpec` is a NEW
|
|
81
|
+
Tier-2 Protocol added at v0.46; freezes at v1.0.
|
|
82
|
+
|
|
8
83
|
## [0.45.0] — 2026-05-21 — Stacking: MetaLearner Protocol + LogisticStacker (closes #52)
|
|
9
84
|
|
|
10
85
|
First minor of the staggered v0.45 → v0.46 → v0.47 → v0.48 → v1.0 sequence
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 0.45.0
|
|
3
|
+
Version: 0.46.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -193,20 +193,18 @@ _EXPORTS: dict[str, str] = {
|
|
|
193
193
|
"SINGLE_CLASS_INCOMPATIBLE_METRICS": "eval_toolkit.metrics",
|
|
194
194
|
"ThresholdResult": "eval_toolkit.metrics",
|
|
195
195
|
"brier_decomposition": "eval_toolkit.metrics",
|
|
196
|
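-
"brier_score": "eval_toolkit.metrics",
|
|
197
|
-
"expected_calibration_error": "eval_toolkit.metrics",
|
|
198
|
-
"expected_calibration_error_debiased": "eval_toolkit.metrics",
|
|
199
|
-
"expected_calibration_error_equal_mass": "eval_toolkit.metrics",
|
|
200
|
-
"expected_calibration_error_l2": "eval_toolkit.metrics",
|
|
201
|
-
"expected_calibration_error_l2_debiased": "eval_toolkit.metrics",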
|
|
196
|
+
# `brier_score`, `pr_auc`, `roc_auc`, and the 5 ECE variants removed from
|
|
197
|
+
# `_EXPORTS` at v0.46 (Decision L). They remain reachable at the top
|
|
198
|
+
# level via the `__getattr__` deprecation branch (emits
|
|
199
|
+
# `DeprecationWarning`; branch removed at v0.47) and via the metrics
|
|
200
|
+
# submodule (`from eval_toolkit.metrics import pr_auc` — internal API
|
|
201
|
+
# per ADR 0002, not part of the v1.0 stability contract).
|
|
202
202
|
"headline_metrics": "eval_toolkit.metrics",
|
|
203
203
|
"is_metric_defined_for_slice": "eval_toolkit.metrics",
|
|
204
204
|
"metrics_at_threshold": "eval_toolkit.metrics",
|
|
205
|
-
"pr_auc": "eval_toolkit.metrics",
|
|
206
205
|
"precision_at_prior": "eval_toolkit.metrics",
|
|
207
206
|
"quantile_stratified_pr_auc": "eval_toolkit.metrics",
|
|
208
207
|
"quantile_stratified_report": "eval_toolkit.metrics",
|
|
209
|
-
"roc_auc": "eval_toolkit.metrics",
|
|
210
208
|
"score_distribution_summary": "eval_toolkit.metrics",
|
|
211
209
|
"single_class_threshold_metrics": "eval_toolkit.metrics",
|
|
212
210
|
"stratified_recall": "eval_toolkit.metrics",
|
|
@@ -296,15 +294,68 @@ _EXPORTS: dict[str, str] = {
|
|
|
296
294
|
"wilson_interval": "eval_toolkit.thresholds",
|
|
297
295
|
"LogisticStacker": "eval_toolkit.stacking",
|
|
298
296
|
"MetaLearner": "eval_toolkit.stacking",
|
|
297
|
+
"MetricResult": "eval_toolkit._scorecard",
|
|
298
|
+
"MetricSpec": "eval_toolkit._scorecard",
|
|
299
|
+
"Scorecard": "eval_toolkit._scorecard",
|
|
300
|
+
"scorecard": "eval_toolkit._scorecard",
|
|
299
301
|
}
|
|
300
302
|
|
|
301
303
|
__all__ = ["__version__", *_EXPORTS.keys()]
|
|
302
304
|
|
|
303
305
|
|
|
306
|
+
# ── BEGIN TRANSITIONAL DEPRECATION BRANCH (Decision L; REMOVE AT v0.47) ──
|
|
307
|
+
# At v0.46 the scalar metric functions left the top-level `_EXPORTS` map (above)
|
|
308
|
+
# in favor of the `scorecard()` surface (Decision A). To give the consumer one
|
|
309
|
+
# release of overlap before the hard removal at v0.47, the names below remain
|
|
310
|
+
# reachable via the package-level `__getattr__` (which delegates to the
|
|
311
|
+
# `eval_toolkit.metrics` submodule) but emit a `DeprecationWarning` on first
|
|
312
|
+
# lookup pointing at the new API.
|
|
313
|
+
#
|
|
314
|
+
# WHY THIS IS A BRANCH, NOT A REPLACEMENT (Audit F4 — Round 5):
|
|
315
|
+
# `__getattr__` below is the load-bearing lazy export resolver for every name
|
|
316
|
+
# in `_EXPORTS`. The deprecation branch is a discrete `if name in
|
|
317
|
+
# _DEPRECATED_SCALARS` check ABOVE the resolver — the resolver's existing
|
|
318
|
+
# behavior for non-deprecated names is unchanged. At v0.47 we delete this
|
|
319
|
+
# transitional block and the resolver continues to work for every remaining
|
|
320
|
+
# `_EXPORTS` entry.
|
|
321
|
+
_DEPRECATED_SCALARS: frozenset[str] = frozenset(
|
|
322
|
+
{
|
|
323
|
+
"pr_auc",
|
|
324
|
+
"roc_auc",
|
|
325
|
+
"brier_score",
|
|
326
|
+
"expected_calibration_error",
|
|
327
|
+
"expected_calibration_error_debiased",
|
|
328
|
+
"expected_calibration_error_equal_mass",
|
|
329
|
+
"expected_calibration_error_l2",
|
|
330
|
+
"expected_calibration_error_l2_debiased",
|
|
331
|
+
}
|
|
332
|
+
)
|
|
333
|
+
# ── END TRANSITIONAL DEPRECATION (Decision L; REMOVE AT v0.47) ──
|
|
334
|
+
|
|
335
|
+
|
|
304
336
|
def __getattr__(name: str) -> Any:
|
|
305
337
|
"""Resolve public symbols lazily."""
|
|
306
338
|
if name == "__version__":
|
|
307
339
|
return __version__
|
|
340
|
+
# ── BEGIN TRANSITIONAL DEPRECATION BRANCH (Decision L; REMOVE AT v0.47) ──
|
|
341
|
+
if name in _DEPRECATED_SCALARS:
|
|
342
|
+
import warnings
|
|
343
|
+
|
|
344
|
+
warnings.warn(
|
|
345
|
+
f"eval_toolkit.{name} is deprecated and will be removed in v0.47. "
|
|
346
|
+
f"Use `scorecard(y, s, metrics=[metric_specs.{_scorecard_spec_for(name)}])"
|
|
347
|
+
f'["{_scorecard_spec_for(name)}"].value` instead, or "import from the'
|
|
348
|
+
f" `eval_toolkit.metrics` submodule directly (internal API).",
|
|
349
|
+
DeprecationWarning,
|
|
350
|
+
stacklevel=2,
|
|
351
|
+
)
|
|
352
|
+
module = import_module("eval_toolkit.metrics")
|
|
353
|
+
value = getattr(module, name)
|
|
354
|
+
# Do NOT cache in globals() — repeated lookups should keep re-warning
|
|
355
|
+
# (one warning per call site, modulo Python's default
|
|
356
|
+
# DeprecationWarning de-duplication).
|
|
357
|
+
return value
|
|
358
|
+
# ── END TRANSITIONAL DEPRECATION (Decision L; REMOVE AT v0.47) ──
|
|
308
359
|
module_name = _EXPORTS.get(name)
|
|
309
360
|
if module_name is None:
|
|
310
361
|
raise AttributeError(f"module 'eval_toolkit' has no attribute {name!r}")
|
|
@@ -314,6 +365,29 @@ def __getattr__(name: str) -> Any:
|
|
|
314
365
|
return value
|
|
315
366
|
|
|
316
367
|
|
|
368
|
+
# ── BEGIN TRANSITIONAL DEPRECATION HELPER (Decision L; REMOVE AT v0.47) ──
|
|
369
|
+
def _scorecard_spec_for(deprecated_name: str) -> str:
|
|
370
|
+
"""Map a deprecated-scalar name to its `metric_specs` replacement name.
|
|
371
|
+
|
|
372
|
+
Used only inside the v0.46 deprecation warning message. Returns the
|
|
373
|
+
closest equivalent first-party spec name where one exists; falls back
|
|
374
|
+
to the original name for ECE variants whose exact-match spec isn't in
|
|
375
|
+
the v0.46 first-party namespace (e.g., the L2 / debiased variants —
|
|
376
|
+
callers either implement a custom `MetricSpec` or stay on the
|
|
377
|
+
submodule path).
|
|
378
|
+
"""
|
|
379
|
+
return {
|
|
380
|
+
"pr_auc": "pr_auc",
|
|
381
|
+
"roc_auc": "roc_auc",
|
|
382
|
+
"brier_score": "brier",
|
|
383
|
+
"expected_calibration_error": "ece(n_bins=10)",
|
|
384
|
+
"expected_calibration_error_equal_mass": 'ece(n_bins=10, strategy="quantile")',
|
|
385
|
+
}.get(deprecated_name, deprecated_name)
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
# ── END TRANSITIONAL DEPRECATION HELPER (Decision L; REMOVE AT v0.47) ──
|
|
389
|
+
|
|
390
|
+
|
|
317
391
|
def __dir__() -> list[str]:
|
|
318
392
|
"""Expose lazy public symbols to introspection."""
|
|
319
393
|
return sorted(__all__)
|