eval-toolkit 0.44.0__tar.gz → 0.46.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/.gitignore +6 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/CHANGELOG.md +122 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/PKG-INFO +1 -1
- eval_toolkit-0.46.0/docs/source/adr/README.md +76 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/__init__.py +84 -8
- eval_toolkit-0.46.0/src/eval_toolkit/_scorecard.py +509 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/_version.py +1 -1
- eval_toolkit-0.46.0/src/eval_toolkit/metric_specs.py +182 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/metrics.py +31 -2
- eval_toolkit-0.46.0/src/eval_toolkit/stacking.py +412 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/golden/public_api/snapshot.json +54 -51
- eval_toolkit-0.46.0/tests/test_deprecated_scalars_shim.py +184 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_is_metric_defined_for_slice.py +25 -2
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_misc_coverage.py +13 -2
- eval_toolkit-0.46.0/tests/test_scorecard.py +408 -0
- eval_toolkit-0.46.0/tests/test_stacking.py +369 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/LICENSE +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/README.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/STYLE.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/docs/archive/README.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/docs/research/README.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/pyproject.toml +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/adversarial.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/losses.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/preprocessing.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/probes.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/conftest.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/strategies.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_adversarial.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_analysis.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_calibration_binary_adapters.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_claims.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_cli.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_config.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_croissant_e2e.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_embeddings.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_harness_parallelism.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_loaders.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_logging.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_losses.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_manifest.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_ood_loader.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_parallel.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_paths.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_preprocessing.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_probes.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_public_api.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_schemas.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_splits.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_thresholds.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_tokenization_leakage_check.py +0 -0
- {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_v09_contracts.py +0 -0
|
@@ -39,6 +39,12 @@ coverage.json
|
|
|
39
39
|
# Logs
|
|
40
40
|
*.log
|
|
41
41
|
|
|
42
|
+
# Local environment overrides (machine-local credentials / config)
|
|
43
|
+
.env.local
|
|
44
|
+
|
|
45
|
+
# Mutation-testing output (mutmut / cargo-mutants — local run artifacts)
|
|
46
|
+
mutants/
|
|
47
|
+
|
|
42
48
|
# Claude Code project settings (machine-local)
|
|
43
49
|
.claude/
|
|
44
50
|
|
|
@@ -5,6 +5,128 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.46.0] — 2026-05-21 — Scorecard: primary v1.0 metric surface (closes #36)
|
|
9
|
+
|
|
10
|
+
Second minor of the staggered v0.45 → v0.46 → v0.47 → v0.48 → v1.0 sequence.
|
|
11
|
+
**Soft-breaking** — existing top-level scalar metric imports still work but
|
|
12
|
+
emit `DeprecationWarning` (hard-removed at v0.47).
|
|
13
|
+
|
|
14
|
+
See `docs/source/migration/v0.46.md` for the full consumer migration guide and
|
|
15
|
+
`docs/source/adr/0002-scorecard-as-primary-metric-surface.md` for the
|
|
16
|
+
decision record.
|
|
17
|
+
|
|
18
|
+
### Added
|
|
19
|
+
|
|
20
|
+
- **`eval_toolkit.scorecard(y_true, y_score, metrics=[...], bootstrap=True)`**
|
|
21
|
+
— primary v1.0 metric surface. Single call computes multiple threshold-free
|
|
22
|
+
metrics + bootstrap CIs on one slice; returns a `Scorecard` (read-only
|
|
23
|
+
`Mapping[str, MetricResult]`). Type-safe dict-subscript access; status-aware
|
|
24
|
+
cells; per-cell error isolation.
|
|
25
|
+
- **`MetricSpec` Protocol** — v1.0 Tier-2 contract; `name: str` +
|
|
26
|
+
`compute(y_true, y_score) -> float`. Custom user specs satisfy structurally.
|
|
27
|
+
- **`MetricResult`** frozen dataclass — `value: float | None`, `status:
|
|
28
|
+
Literal["ok", "skipped", "error"]`, `reason: str`, `ci: BootstrapCI | None`.
|
|
29
|
+
Reuses the existing `MetricState` vocabulary from `artifacts.py:30-61`.
|
|
30
|
+
- **`Scorecard`** read-only `Mapping[str, MetricResult]` — `to_dict()`
|
|
31
|
+
JSON-friendly, `to_pandas()` one-row DataFrame (lazy pandas import).
|
|
32
|
+
- **`eval_toolkit.metric_specs`** namespace submodule with threshold-free
|
|
33
|
+
first-party specs:
|
|
34
|
+
- `pr_auc`, `roc_auc`, `brier` — module-level singletons (identity stable).
|
|
35
|
+
- `ece(n_bins, strategy)` — LRU-cached factory (identity stable per kwargs).
|
|
36
|
+
- **`SINGLE_CLASS_INCOMPATIBLE_METRICS`** extended with `pr_auc` / `roc_auc`
|
|
37
|
+
aliases (alongside existing `auroc` / `auprc`) so the v0.46 scorecard
|
|
38
|
+
surface and the v0.39 harness paths both produce correct skipped-status
|
|
39
|
+
behavior. Non-breaking; doctest + unit tests added.
|
|
40
|
+
- **`docs/source/adr/0002-scorecard-as-primary-metric-surface.md`** —
|
|
41
|
+
decision record covering single-surface rationale, threshold-free scope,
|
|
42
|
+
Tier-2 Protocol commitment, and v2.0 trigger conditions.
|
|
43
|
+
- **`docs/source/migration/v0.46.md`** — consumer migration guide with
|
|
44
|
+
side-by-side recipes for every common pattern.
|
|
45
|
+
|
|
46
|
+
### Deprecated
|
|
47
|
+
|
|
48
|
+
The following 8 top-level scalar imports emit `DeprecationWarning` and will
|
|
49
|
+
be hard-removed at v0.47.0. Use `scorecard()` + `metric_specs` or the
|
|
50
|
+
`eval_toolkit.metrics` submodule path (internal API, no warning).
|
|
51
|
+
|
|
52
|
+
- `pr_auc`, `roc_auc`, `brier_score`
|
|
53
|
+
- `expected_calibration_error`
|
|
54
|
+
- `expected_calibration_error_debiased`
|
|
55
|
+
- `expected_calibration_error_equal_mass`
|
|
56
|
+
- `expected_calibration_error_l2`
|
|
57
|
+
- `expected_calibration_error_l2_debiased`
|
|
58
|
+
|
|
59
|
+
### Audit findings integrated (Round 5)
|
|
60
|
+
|
|
61
|
+
Per `docs/source/audit_findings.md`:
|
|
62
|
+
|
|
63
|
+
- **F1** (scorecard threshold semantics) — addressed by Decision R: ship
|
|
64
|
+
threshold-free first-party specs only at v0.46. Threshold-dependent
|
|
65
|
+
metrics (F1 score, accuracy, precision, recall) deferred to v1.x with explicit
|
|
66
|
+
operating-point provenance.
|
|
67
|
+
- **F2** (scorecard cell-state semantics) — addressed by Decision S: reuse
|
|
68
|
+
existing `MetricState` (`ok`/`skipped`/`error`) vocabulary.
|
|
69
|
+
- **F4** (deprecation shim must extend the lazy resolver, not replace it) —
|
|
70
|
+
addressed: `__getattr__` deprecation branch sits between `__version__`
|
|
71
|
+
short-circuit and the base `_EXPORTS` lookup; tagged with BEGIN/END
|
|
72
|
+
TRANSITIONAL markers for clean v0.47 removal. Tests guard that every
|
|
73
|
+
remaining `_EXPORTS` symbol still resolves.
|
|
74
|
+
- **X.2 precondition** — `is_metric_defined_for_slice` aliases shipped
|
|
75
|
+
ahead of v0.46 (PR #62).
|
|
76
|
+
|
|
77
|
+
### Protocol stability
|
|
78
|
+
|
|
79
|
+
Tier-2 streak continues: 7 of 7 consecutive minors (v0.40–v0.46) without
|
|
80
|
+
method-shape edits to any existing Tier-2 Protocol. `MetricSpec` is a NEW
|
|
81
|
+
Tier-2 Protocol added at v0.46; freezes at v1.0.
|
|
82
|
+
|
|
83
|
+
## [0.45.0] — 2026-05-21 — Stacking: MetaLearner Protocol + LogisticStacker (closes #52)
|
|
84
|
+
|
|
85
|
+
First minor of the staggered v0.45 → v0.46 → v0.47 → v0.48 → v1.0 sequence
|
|
86
|
+
(per the v1.0 plan at `~/.claude/plans/evaluate-all-the-work-twinkly-kite.md`).
|
|
87
|
+
Non-breaking — purely additive. No Protocol shape edits to the existing 6
|
|
88
|
+
Tier-2 contracts (Gate 2 streak continues: 6 of 6 consecutive minors without
|
|
89
|
+
Protocol-shape changes).
|
|
90
|
+
|
|
91
|
+
### Added
|
|
92
|
+
|
|
93
|
+
- `eval_toolkit.stacking` — new module providing the `MetaLearner` Protocol
|
|
94
|
+
and one reference impl, `LogisticStacker`, for combining outputs from
|
|
95
|
+
multiple binary detectors into a calibrated ensemble. Wraps
|
|
96
|
+
`sklearn.linear_model.LogisticRegression` with a stacker-shaped public API
|
|
97
|
+
(sklearn-style `fit(score_matrix, y)`, `predict(score_matrix)`,
|
|
98
|
+
`predict_proba(score_matrix)`, plus `coef_` / `classes_` / `intercept_`
|
|
99
|
+
attributes). No new dependencies — `scikit-learn` is already core since
|
|
100
|
+
v0.27. Closes #52.
|
|
101
|
+
- `MetaLearner` Protocol — `@runtime_checkable`; sklearn-shape contract
|
|
102
|
+
taking a `(n_samples, n_detectors)` score matrix. Sized as a v1.0 Tier-2
|
|
103
|
+
contract per the v1.0 plan Decision M (tiered stability — strict freeze at
|
|
104
|
+
v1.0; additive subprotocols permitted in minor releases). Mirrors the
|
|
105
|
+
`Probe` Protocol pattern from v0.43.
|
|
106
|
+
- `LogisticStacker` reference impl — configurable C, fit_intercept,
|
|
107
|
+
class_weight, penalty, solver, max_iter, random_state. Class-weight default
|
|
108
|
+
`"balanced"` for the common imbalanced-detection setting. Composes with the
|
|
109
|
+
4-binary-calibrator family (v0.40 + v0.42) via `fit_platt_binary` /
|
|
110
|
+
`fit_isotonic_binary` chaining on stacked output.
|
|
111
|
+
- 24-test coverage in `tests/test_stacking.py`: Protocol satisfaction (both
|
|
112
|
+
structural and duck-typed), shape contracts (3-detector × 500-sample
|
|
113
|
+
fixtures), regularization behavior (C, L1 penalty), signal ordering,
|
|
114
|
+
calibration chaining (Platt + Isotonic), bootstrap CI on stacker output
|
|
115
|
+
(Audit F6a-aware — uses correct `BootstrapCI.ci_low/ci_high` attribute
|
|
116
|
+
names), determinism under fixed `random_state`, hypothesis property on
|
|
117
|
+
signal monotonicity, input validation (shape mismatch, single-class,
|
|
118
|
+
non-finite, unfit, wrong-n-detectors).
|
|
119
|
+
- `docs/source/examples/stacking.md` — myst-nb worked example: 3 synthetic
|
|
120
|
+
detectors with descending signal-to-noise, stacker fit, post-stacking
|
|
121
|
+
Platt calibration. Cites Wolpert 1992 + Breiman 1996.
|
|
122
|
+
|
|
123
|
+
### Notes
|
|
124
|
+
|
|
125
|
+
- Sklearn 1.8+ deprecates `LogisticRegression(penalty=...)` in favor of
|
|
126
|
+
`l1_ratio`. The public `LogisticStacker(penalty=...)` API is preserved;
|
|
127
|
+
internal sklearn-side migration to `l1_ratio` will land when sklearn 1.10
|
|
128
|
+
is released and the warning becomes more visible. No user-facing impact.
|
|
129
|
+
|
|
8
130
|
## [0.44.0] — 2026-05-19 — Defenses + losses: Spotlighting variants + RecallAtLowFPR (closes #50, #51)
|
|
9
131
|
|
|
10
132
|
### Added
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 0.44.0
|
|
3
|
+
Version: 0.46.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# Architecture Decision Records
|
|
2
|
+
|
|
3
|
+
This directory captures architecturally-significant decisions that shape
|
|
4
|
+
`eval-toolkit`'s long-term design. ADRs are immutable historical records —
|
|
5
|
+
once accepted, a decision is not edited in place; if it changes, a new ADR
|
|
6
|
+
supersedes it.
|
|
7
|
+
|
|
8
|
+
## When to file an ADR
|
|
9
|
+
|
|
10
|
+
File a new ADR when a decision:
|
|
11
|
+
|
|
12
|
+
- **Locks in an interface or shape** that future code is expected to
|
|
13
|
+
conform to (e.g., "metrics return type", "Protocol vs ABC").
|
|
14
|
+
- **Closes off alternatives** that were seriously considered, so the
|
|
15
|
+
reasoning isn't lost.
|
|
16
|
+
- **Carries cost** to reverse (e.g., a public-API contract that promises
|
|
17
|
+
stability across a release line).
|
|
18
|
+
|
|
19
|
+
Routine refactors, bug fixes, and internal-only patterns do not need ADRs —
|
|
20
|
+
the commit message + CHANGELOG entry are enough.
|
|
21
|
+
|
|
22
|
+
## Numbering
|
|
23
|
+
|
|
24
|
+
Sequential, zero-padded: `0001-flat-module-layout.md`,
|
|
25
|
+
`0002-scorecard-as-primary-metric-surface.md`, etc. Number is assigned
|
|
26
|
+
at the time of writing; if two ADRs are drafted in parallel, the second
|
|
27
|
+
to merge takes the next number.
|
|
28
|
+
|
|
29
|
+
## Format
|
|
30
|
+
|
|
31
|
+
Each ADR uses this skeleton (loosely based on MADR — Markdown ADR — without
|
|
32
|
+
the heavyweight template):
|
|
33
|
+
|
|
34
|
+
```markdown
|
|
35
|
+
# ADR NNNN: Title
|
|
36
|
+
|
|
37
|
+
**Status:** Proposed | Accepted | Superseded by ADR-MMMM
|
|
38
|
+
**Date:** YYYY-MM-DD
|
|
39
|
+
**Deciders:** (names or roles)
|
|
40
|
+
|
|
41
|
+
## Context
|
|
42
|
+
|
|
43
|
+
What's the situation that requires a decision? What constraints are at play?
|
|
44
|
+
|
|
45
|
+
## Decision
|
|
46
|
+
|
|
47
|
+
What did we decide?
|
|
48
|
+
|
|
49
|
+
## Consequences
|
|
50
|
+
|
|
51
|
+
What follows from this decision? (Both positive and negative.)
|
|
52
|
+
|
|
53
|
+
## Alternatives considered
|
|
54
|
+
|
|
55
|
+
What else was on the table, and why wasn't it chosen?
|
|
56
|
+
|
|
57
|
+
## Trigger to revisit
|
|
58
|
+
|
|
59
|
+
What would have to change for this decision to be reopened?
|
|
60
|
+
(Optional but useful — keeps the ADR self-documenting.)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Cross-references
|
|
64
|
+
|
|
65
|
+
- [`docs/RELEASING.md`](../../RELEASING.md) — release-flow process; ADRs
|
|
66
|
+
are typically drafted as part of release prep.
|
|
67
|
+
- [`docs/source/roadmap.md`](../roadmap.md) — long-term direction;
|
|
68
|
+
ADRs explain how individual roadmap decisions were made.
|
|
69
|
+
|
|
70
|
+
## Index
|
|
71
|
+
|
|
72
|
+
(Updated as ADRs are added.)
|
|
73
|
+
|
|
74
|
+
| # | Title | Status | Date |
|
|
75
|
+
|---|---|---|---|
|
|
76
|
+
| _none yet_ | | | |
|
|
@@ -193,20 +193,18 @@ _EXPORTS: dict[str, str] = {
|
|
|
193
193
|
"SINGLE_CLASS_INCOMPATIBLE_METRICS": "eval_toolkit.metrics",
|
|
194
194
|
"ThresholdResult": "eval_toolkit.metrics",
|
|
195
195
|
"brier_decomposition": "eval_toolkit.metrics",
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
196
|
+
# `brier_score`, `pr_auc`, `roc_auc`, and the 5 ECE variants removed from
|
|
197
|
+
# `_EXPORTS` at v0.46 (Decision L). They remain reachable at the top
|
|
198
|
+
# level via the `__getattr__` deprecation branch (emits
|
|
199
|
+
# `DeprecationWarning`; branch removed at v0.47) and via the metrics
|
|
200
|
+
# submodule (`from eval_toolkit.metrics import pr_auc` — internal API
|
|
201
|
+
# per ADR 0002, not part of the v1.0 stability contract).
|
|
202
202
|
"headline_metrics": "eval_toolkit.metrics",
|
|
203
203
|
"is_metric_defined_for_slice": "eval_toolkit.metrics",
|
|
204
204
|
"metrics_at_threshold": "eval_toolkit.metrics",
|
|
205
|
-
"pr_auc": "eval_toolkit.metrics",
|
|
206
205
|
"precision_at_prior": "eval_toolkit.metrics",
|
|
207
206
|
"quantile_stratified_pr_auc": "eval_toolkit.metrics",
|
|
208
207
|
"quantile_stratified_report": "eval_toolkit.metrics",
|
|
209
|
-
"roc_auc": "eval_toolkit.metrics",
|
|
210
208
|
"score_distribution_summary": "eval_toolkit.metrics",
|
|
211
209
|
"single_class_threshold_metrics": "eval_toolkit.metrics",
|
|
212
210
|
"stratified_recall": "eval_toolkit.metrics",
|
|
@@ -294,15 +292,70 @@ _EXPORTS: dict[str, str] = {
|
|
|
294
292
|
"recall_at_fpr": "eval_toolkit.thresholds",
|
|
295
293
|
"select_threshold": "eval_toolkit.thresholds",
|
|
296
294
|
"wilson_interval": "eval_toolkit.thresholds",
|
|
295
|
+
"LogisticStacker": "eval_toolkit.stacking",
|
|
296
|
+
"MetaLearner": "eval_toolkit.stacking",
|
|
297
|
+
"MetricResult": "eval_toolkit._scorecard",
|
|
298
|
+
"MetricSpec": "eval_toolkit._scorecard",
|
|
299
|
+
"Scorecard": "eval_toolkit._scorecard",
|
|
300
|
+
"scorecard": "eval_toolkit._scorecard",
|
|
297
301
|
}
|
|
298
302
|
|
|
299
303
|
__all__ = ["__version__", *_EXPORTS.keys()]
|
|
300
304
|
|
|
301
305
|
|
|
306
|
+
# ── BEGIN TRANSITIONAL DEPRECATION BRANCH (Decision L; REMOVE AT v0.47) ──
|
|
307
|
+
# At v0.46 the scalar metric functions left the top-level `_EXPORTS` map (above)
|
|
308
|
+
# in favor of the `scorecard()` surface (Decision A). To give the consumer one
|
|
309
|
+
# release of overlap before the hard removal at v0.47, the names below remain
|
|
310
|
+
# reachable via the package-level `__getattr__` (which delegates to the
|
|
311
|
+
# `eval_toolkit.metrics` submodule) but emit a `DeprecationWarning` on each
|
|
312
|
+
# lookup pointing at the new API.
|
|
313
|
+
#
|
|
314
|
+
# WHY THIS IS A BRANCH, NOT A REPLACEMENT (Audit F4 — Round 5):
|
|
315
|
+
# `__getattr__` below is the load-bearing lazy export resolver for every name
|
|
316
|
+
# in `_EXPORTS`. The deprecation branch is a discrete `if name in
|
|
317
|
+
# _DEPRECATED_SCALARS` check ABOVE the resolver — the resolver's existing
|
|
318
|
+
# behavior for non-deprecated names is unchanged. At v0.47 we delete this
|
|
319
|
+
# transitional block and the resolver continues to work for every remaining
|
|
320
|
+
# `_EXPORTS` entry.
|
|
321
|
+
_DEPRECATED_SCALARS: frozenset[str] = frozenset(
|
|
322
|
+
{
|
|
323
|
+
"pr_auc",
|
|
324
|
+
"roc_auc",
|
|
325
|
+
"brier_score",
|
|
326
|
+
"expected_calibration_error",
|
|
327
|
+
"expected_calibration_error_debiased",
|
|
328
|
+
"expected_calibration_error_equal_mass",
|
|
329
|
+
"expected_calibration_error_l2",
|
|
330
|
+
"expected_calibration_error_l2_debiased",
|
|
331
|
+
}
|
|
332
|
+
)
|
|
333
|
+
# ── END TRANSITIONAL DEPRECATION BRANCH (Decision L; REMOVE AT v0.47) ──
|
|
334
|
+
|
|
335
|
+
|
|
302
336
|
def __getattr__(name: str) -> Any:
|
|
303
337
|
"""Resolve public symbols lazily."""
|
|
304
338
|
if name == "__version__":
|
|
305
339
|
return __version__
|
|
340
|
+
# ── BEGIN TRANSITIONAL DEPRECATION BRANCH (Decision L; REMOVE AT v0.47) ──
|
|
341
|
+
if name in _DEPRECATED_SCALARS:
|
|
342
|
+
import warnings
|
|
343
|
+
|
|
344
|
+
warnings.warn(
|
|
345
|
+
f"eval_toolkit.{name} is deprecated and will be removed in v0.47. "
|
|
346
|
+
f"Use `scorecard(y, s, metrics=[metric_specs.{_scorecard_spec_for(name)}])"
|
|
347
|
+
f'["{_scorecard_spec_for(name)}"].value` instead, or import from the'
|
|
348
|
+
f" `eval_toolkit.metrics` submodule directly (internal API).",
|
|
349
|
+
DeprecationWarning,
|
|
350
|
+
stacklevel=2,
|
|
351
|
+
)
|
|
352
|
+
module = import_module("eval_toolkit.metrics")
|
|
353
|
+
value = getattr(module, name)
|
|
354
|
+
# Do NOT cache in globals() — repeated lookups should keep re-warning
|
|
355
|
+
# (one warning per call site, modulo Python's default
|
|
356
|
+
# DeprecationWarning de-duplication).
|
|
357
|
+
return value
|
|
358
|
+
# ── END TRANSITIONAL DEPRECATION BRANCH (Decision L; REMOVE AT v0.47) ──
|
|
306
359
|
module_name = _EXPORTS.get(name)
|
|
307
360
|
if module_name is None:
|
|
308
361
|
raise AttributeError(f"module 'eval_toolkit' has no attribute {name!r}")
|
|
@@ -312,6 +365,29 @@ def __getattr__(name: str) -> Any:
|
|
|
312
365
|
return value
|
|
313
366
|
|
|
314
367
|
|
|
368
|
+
# ── BEGIN TRANSITIONAL DEPRECATION HELPER (Decision L; REMOVE AT v0.47) ──
|
|
369
|
+
def _scorecard_spec_for(deprecated_name: str) -> str:
|
|
370
|
+
"""Map a deprecated-scalar name to its `metric_specs` replacement name.
|
|
371
|
+
|
|
372
|
+
Used only inside the v0.46 deprecation warning message. Returns the
|
|
373
|
+
closest equivalent first-party spec name where one exists; falls back
|
|
374
|
+
to the original name for ECE variants whose exact-match spec isn't in
|
|
375
|
+
the v0.46 first-party namespace (e.g., the L2 / debiased variants —
|
|
376
|
+
callers either implement a custom `MetricSpec` or stay on the
|
|
377
|
+
submodule path).
|
|
378
|
+
"""
|
|
379
|
+
return {
|
|
380
|
+
"pr_auc": "pr_auc",
|
|
381
|
+
"roc_auc": "roc_auc",
|
|
382
|
+
"brier_score": "brier",
|
|
383
|
+
"expected_calibration_error": "ece(n_bins=10)",
|
|
384
|
+
"expected_calibration_error_equal_mass": 'ece(n_bins=10, strategy="quantile")',
|
|
385
|
+
}.get(deprecated_name, deprecated_name)
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
# ── END TRANSITIONAL DEPRECATION HELPER (Decision L; REMOVE AT v0.47) ──
|
|
389
|
+
|
|
390
|
+
|
|
315
391
|
def __dir__() -> list[str]:
|
|
316
392
|
"""Expose lazy public symbols to introspection."""
|
|
317
393
|
return sorted(__all__)
|