eval-toolkit 0.45.0__tar.gz → 0.46.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/CHANGELOG.md +75 -0
  2. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/PKG-INFO +1 -1
  3. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/__init__.py +82 -8
  4. eval_toolkit-0.46.0/src/eval_toolkit/_scorecard.py +509 -0
  5. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/_version.py +1 -1
  6. eval_toolkit-0.46.0/src/eval_toolkit/metric_specs.py +182 -0
  7. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/metrics.py +31 -2
  8. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/golden/public_api/snapshot.json +36 -51
  9. eval_toolkit-0.46.0/tests/test_deprecated_scalars_shim.py +184 -0
  10. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_is_metric_defined_for_slice.py +25 -2
  11. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_misc_coverage.py +13 -2
  12. eval_toolkit-0.46.0/tests/test_scorecard.py +408 -0
  13. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/.gitignore +0 -0
  14. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/LICENSE +0 -0
  15. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/README.md +0 -0
  16. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/STYLE.md +0 -0
  17. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/docs/archive/README.md +0 -0
  18. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/docs/research/README.md +0 -0
  19. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/docs/research/datasets/README.md +0 -0
  20. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/docs/research/papers/data-integrity/README.md +0 -0
  21. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  22. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/docs/research/papers/inference/README.md +0 -0
  23. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/docs/research/papers/prompt-injection/README.md +0 -0
  24. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/docs/source/adr/README.md +0 -0
  25. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/docs/source/methodology/README.md +0 -0
  26. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/pyproject.toml +0 -0
  27. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/__main__.py +0 -0
  28. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/_deprecated.py +0 -0
  29. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/_parallel.py +0 -0
  30. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/adversarial.py +0 -0
  31. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/analysis.py +0 -0
  32. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/artifacts.py +0 -0
  33. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/bootstrap.py +0 -0
  34. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/calibration.py +0 -0
  35. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/claims.py +0 -0
  36. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/config.py +0 -0
  37. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/docs.py +0 -0
  38. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/embeddings.py +0 -0
  39. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/evidence.py +0 -0
  40. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/harness.py +0 -0
  41. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/leakage.py +0 -0
  42. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/loaders.py +0 -0
  43. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/losses.py +0 -0
  44. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/manifest.py +0 -0
  45. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/operating_points.py +0 -0
  46. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/paths.py +0 -0
  47. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/plotting.py +0 -0
  48. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/preprocessing.py +0 -0
  49. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/probes.py +0 -0
  50. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/protocols.py +0 -0
  51. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/provenance.py +0 -0
  52. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/py.typed +0 -0
  53. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  54. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  55. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  56. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  57. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  58. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  59. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/seeds.py +0 -0
  60. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/splits.py +0 -0
  61. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/stacking.py +0 -0
  62. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/text_dedup.py +0 -0
  63. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/src/eval_toolkit/thresholds.py +0 -0
  64. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  65. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  66. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  67. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  68. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  69. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  70. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  71. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  72. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  73. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  74. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/benchmarks/__init__.py +0 -0
  75. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  76. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/conftest.py +0 -0
  77. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  78. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  79. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  80. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  81. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/golden/docs/expected.md +0 -0
  82. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/golden/docs/input.md +0 -0
  83. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/golden/docs/metrics.json +0 -0
  84. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  85. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/strategies.py +0 -0
  86. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_adversarial.py +0 -0
  87. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_analysis.py +0 -0
  88. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_artifacts.py +0 -0
  89. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  90. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  91. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_edge_cases.py +0 -0
  92. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_golden.py +0 -0
  93. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_njobs.py +0 -0
  94. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_props.py +0 -0
  95. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_research_grounded.py +0 -0
  96. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_unit.py +0 -0
  97. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_calibration_binary_adapters.py +0 -0
  98. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  99. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_calibration_determinism.py +0 -0
  100. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_calibration_optimization_failures.py +0 -0
  101. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_calibration_props.py +0 -0
  102. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_calibration_research_grounded.py +0 -0
  103. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_calibration_unit.py +0 -0
  104. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_claims.py +0 -0
  105. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_claims_coverage.py +0 -0
  106. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_claims_props.py +0 -0
  107. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_cli.py +0 -0
  108. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_config.py +0 -0
  109. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_coverage_bootstrap.py +0 -0
  110. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_coverage_calibration.py +0 -0
  111. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_coverage_harness.py +0 -0
  112. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_coverage_metrics.py +0 -0
  113. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_coverage_plotting.py +0 -0
  114. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_croissant_e2e.py +0 -0
  115. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  116. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_deprecations.py +0 -0
  117. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_docs_golden.py +0 -0
  118. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_docs_props.py +0 -0
  119. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_embeddings.py +0 -0
  120. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_evidence_validators.py +0 -0
  121. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_harness_edge_cases.py +0 -0
  122. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_harness_fault_injection.py +0 -0
  123. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_harness_folded.py +0 -0
  124. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_harness_internals.py +0 -0
  125. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_harness_metric_options.py +0 -0
  126. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_harness_parallelism.py +0 -0
  127. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_harness_smoke.py +0 -0
  128. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_import_boundaries.py +0 -0
  129. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_leakage.py +0 -0
  130. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_leakage_error_paths.py +0 -0
  131. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_leakage_props.py +0 -0
  132. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_loaders.py +0 -0
  133. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_loaders_coverage.py +0 -0
  134. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_loaders_props.py +0 -0
  135. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_logging.py +0 -0
  136. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_losses.py +0 -0
  137. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_manifest.py +0 -0
  138. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  139. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_manifest_props.py +0 -0
  140. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_manifest_validation.py +0 -0
  141. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_metrics_props.py +0 -0
  142. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_metrics_stratified_subsets.py +0 -0
  143. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_metrics_unit.py +0 -0
  144. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_numeric_edge_cases.py +0 -0
  145. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_ood_loader.py +0 -0
  146. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_operating_points.py +0 -0
  147. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_operating_points_props.py +0 -0
  148. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_parallel.py +0 -0
  149. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_paths.py +0 -0
  150. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_pipeline_e2e.py +0 -0
  151. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_plotting_edge.py +0 -0
  152. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_plotting_smoke.py +0 -0
  153. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_plotting_visual.py +0 -0
  154. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_preprocessing.py +0 -0
  155. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_probes.py +0 -0
  156. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_protocol_conformance.py +0 -0
  157. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_provenance.py +0 -0
  158. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_public_api.py +0 -0
  159. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_recall_at_fpr.py +0 -0
  160. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_reference_equivalence.py +0 -0
  161. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_reproducibility_integration.py +0 -0
  162. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_schemas.py +0 -0
  163. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_seeds.py +0 -0
  164. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_splits.py +0 -0
  165. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_splits_leakage_integration.py +0 -0
  166. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_splits_props.py +0 -0
  167. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_stacking.py +0 -0
  168. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_text_dedup.py +0 -0
  169. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_text_dedup_coverage.py +0 -0
  170. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_text_dedup_props.py +0 -0
  171. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_text_dedup_strategies.py +0 -0
  172. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_thresholds.py +0 -0
  173. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_thresholds_constant_score.py +0 -0
  174. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_thresholds_coverage.py +0 -0
  175. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_thresholds_props.py +0 -0
  176. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_thresholds_research_grounded.py +0 -0
  177. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_tokenization_leakage_check.py +0 -0
  178. {eval_toolkit-0.45.0 → eval_toolkit-0.46.0}/tests/test_v09_contracts.py +0 -0
@@ -5,6 +5,81 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.46.0] — 2026-05-21 — Scorecard: primary v1.0 metric surface (closes #36)
9
+
10
+ Second minor of the staggered v0.45 → v0.46 → v0.47 → v0.48 → v1.0 sequence.
11
+ **Soft-breaking** — existing top-level scalar metric imports still work but
12
+ emit `DeprecationWarning` (hard-removed at v0.47).
13
+
14
+ See `docs/source/migration/v0.46.md` for the full consumer migration guide and
15
+ `docs/source/adr/0002-scorecard-as-primary-metric-surface.md` for the
16
+ decision record.
17
+
18
+ ### Added
19
+
20
+ - **`eval_toolkit.scorecard(y_true, y_score, metrics=[...], bootstrap=True)`**
21
+ — primary v1.0 metric surface. Single call computes multiple threshold-free
22
+ metrics + bootstrap CIs on one slice; returns a `Scorecard` (read-only
23
+ `Mapping[str, MetricResult]`). Type-safe dict-subscript access; status-aware
24
+ cells; per-cell error isolation.
25
+ - **`MetricSpec` Protocol** — v1.0 Tier-2 contract; `name: str` +
26
+ `compute(y_true, y_score) -> float`. Custom user specs satisfy it structurally.
27
+ - **`MetricResult`** frozen dataclass — `value: float | None`, `status:
28
+ Literal["ok", "skipped", "error"]`, `reason: str`, `ci: BootstrapCI | None`.
29
+ Reuses the existing `MetricState` vocabulary from `artifacts.py:30-61`.
30
+ - **`Scorecard`** read-only `Mapping[str, MetricResult]` — `to_dict()`
31
+ JSON-friendly, `to_pandas()` one-row DataFrame (lazy pandas import).
32
+ - **`eval_toolkit.metric_specs`** namespace submodule with threshold-free
33
+ first-party specs:
34
+ - `pr_auc`, `roc_auc`, `brier` — module-level singletons (identity stable).
35
+ - `ece(n_bins, strategy)` — LRU-cached factory (identity stable per kwargs).
36
+ - **`SINGLE_CLASS_INCOMPATIBLE_METRICS`** extended with `pr_auc` / `roc_auc`
37
+ aliases (alongside existing `auroc` / `auprc`) so the v0.46 scorecard
38
+ surface and the v0.39 harness paths both produce correct skipped-status
39
+ behavior. Non-breaking; doctest + unit tests added.
40
+ - **`docs/source/adr/0002-scorecard-as-primary-metric-surface.md`** —
41
+ decision record covering single-surface rationale, threshold-free scope,
42
+ Tier-2 Protocol commitment, and v2.0 trigger conditions.
43
+ - **`docs/source/migration/v0.46.md`** — consumer migration guide with
44
+ side-by-side recipes for every common pattern.
45
+
46
+ ### Deprecated
47
+
48
+ The following 8 top-level scalar imports emit `DeprecationWarning` and will
49
+ be hard-removed at v0.47.0. Use `scorecard()` + `metric_specs` or the
50
+ `eval_toolkit.metrics` submodule path (internal API, no warning).
51
+
52
+ - `pr_auc`, `roc_auc`, `brier_score`
53
+ - `expected_calibration_error`
54
+ - `expected_calibration_error_debiased`
55
+ - `expected_calibration_error_equal_mass`
56
+ - `expected_calibration_error_l2`
57
+ - `expected_calibration_error_l2_debiased`
58
+
59
+ ### Audit findings integrated (Round 5)
60
+
61
+ Per `docs/source/audit_findings.md`:
62
+
63
+ - **F1** (scorecard threshold semantics) — addressed by Decision R: ship
64
+ threshold-free first-party specs only at v0.46. Threshold-dependent
65
+ metrics (F1 score, accuracy, precision, recall) deferred to v1.x with explicit
66
+ operating-point provenance.
67
+ - **F2** (scorecard cell-state semantics) — addressed by Decision S: reuse
68
+ existing `MetricState` (`ok`/`skipped`/`error`) vocabulary.
69
+ - **F4** (deprecation shim must extend the lazy resolver, not replace it) —
70
+ addressed: `__getattr__` deprecation branch sits between `__version__`
71
+ short-circuit and the base `_EXPORTS` lookup; tagged with BEGIN/END
72
+ TRANSITIONAL markers for clean v0.47 removal. Tests guard that every
73
+ remaining `_EXPORTS` symbol still resolves.
74
+ - **X.2 precondition** — `is_metric_defined_for_slice` aliases shipped
75
+ ahead of v0.46 (PR #62).
76
+
77
+ ### Protocol stability
78
+
79
+ Tier-2 streak continues: 7 of 7 consecutive minors (v0.40–v0.46) without
80
+ method-shape edits to any existing Tier-2 Protocol. `MetricSpec` is a NEW
81
+ Tier-2 Protocol added at v0.46; freezes at v1.0.
82
+
8
83
  ## [0.45.0] — 2026-05-21 — Stacking: MetaLearner Protocol + LogisticStacker (closes #52)
9
84
 
10
85
  First minor of the staggered v0.45 → v0.46 → v0.47 → v0.48 → v1.0 sequence
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.45.0
3
+ Version: 0.46.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -193,20 +193,18 @@ _EXPORTS: dict[str, str] = {
193
193
  "SINGLE_CLASS_INCOMPATIBLE_METRICS": "eval_toolkit.metrics",
194
194
  "ThresholdResult": "eval_toolkit.metrics",
195
195
  "brier_decomposition": "eval_toolkit.metrics",
196
- "brier_score": "eval_toolkit.metrics",
197
- "expected_calibration_error": "eval_toolkit.metrics",
198
- "expected_calibration_error_debiased": "eval_toolkit.metrics",
199
- "expected_calibration_error_equal_mass": "eval_toolkit.metrics",
200
- "expected_calibration_error_l2": "eval_toolkit.metrics",
201
- "expected_calibration_error_l2_debiased": "eval_toolkit.metrics",
196
+ # `brier_score`, `pr_auc`, `roc_auc`, and the 5 ECE variants removed from
197
+ # `_EXPORTS` at v0.46 (Decision L). They remain reachable at the top
198
+ # level via the `__getattr__` deprecation branch (emits
199
+ # `DeprecationWarning`; branch removed at v0.47) and via the metrics
200
+ # submodule (`from eval_toolkit.metrics import pr_auc` — internal API
201
+ # per ADR 0002, not part of the v1.0 stability contract).
202
202
  "headline_metrics": "eval_toolkit.metrics",
203
203
  "is_metric_defined_for_slice": "eval_toolkit.metrics",
204
204
  "metrics_at_threshold": "eval_toolkit.metrics",
205
- "pr_auc": "eval_toolkit.metrics",
206
205
  "precision_at_prior": "eval_toolkit.metrics",
207
206
  "quantile_stratified_pr_auc": "eval_toolkit.metrics",
208
207
  "quantile_stratified_report": "eval_toolkit.metrics",
209
- "roc_auc": "eval_toolkit.metrics",
210
208
  "score_distribution_summary": "eval_toolkit.metrics",
211
209
  "single_class_threshold_metrics": "eval_toolkit.metrics",
212
210
  "stratified_recall": "eval_toolkit.metrics",
@@ -296,15 +294,68 @@ _EXPORTS: dict[str, str] = {
296
294
  "wilson_interval": "eval_toolkit.thresholds",
297
295
  "LogisticStacker": "eval_toolkit.stacking",
298
296
  "MetaLearner": "eval_toolkit.stacking",
297
+ "MetricResult": "eval_toolkit._scorecard",
298
+ "MetricSpec": "eval_toolkit._scorecard",
299
+ "Scorecard": "eval_toolkit._scorecard",
300
+ "scorecard": "eval_toolkit._scorecard",
299
301
  }
300
302
 
301
303
  __all__ = ["__version__", *_EXPORTS.keys()]
302
304
 
303
305
 
306
+ # ── BEGIN TRANSITIONAL DEPRECATION BRANCH (Decision L; REMOVE AT v0.47) ──
307
+ # At v0.46 the scalar metric functions left the top-level `_EXPORTS` map (above)
308
+ # in favor of the `scorecard()` surface (Decision A). To give the consumer one
309
+ # release of overlap before the hard removal at v0.47, the names below remain
310
+ # reachable via the package-level `__getattr__` (which delegates to the
311
+ # `eval_toolkit.metrics` submodule) but emit a `DeprecationWarning` on every
312
+ # lookup pointing at the new API.
313
+ #
314
+ # WHY THIS IS A BRANCH, NOT A REPLACEMENT (Audit F4 — Round 5):
315
+ # `__getattr__` below is the load-bearing lazy export resolver for every name
316
+ # in `_EXPORTS`. The deprecation branch is a discrete `if name in
317
+ # _DEPRECATED_SCALARS` check ABOVE the resolver — the resolver's existing
318
+ # behavior for non-deprecated names is unchanged. At v0.47 we delete this
319
+ # transitional block and the resolver continues to work for every remaining
320
+ # `_EXPORTS` entry.
321
+ _DEPRECATED_SCALARS: frozenset[str] = frozenset(
322
+ {
323
+ "pr_auc",
324
+ "roc_auc",
325
+ "brier_score",
326
+ "expected_calibration_error",
327
+ "expected_calibration_error_debiased",
328
+ "expected_calibration_error_equal_mass",
329
+ "expected_calibration_error_l2",
330
+ "expected_calibration_error_l2_debiased",
331
+ }
332
+ )
333
+ # ── END TRANSITIONAL DEPRECATION BRANCH (Decision L; REMOVE AT v0.47) ──
334
+
335
+
304
336
  def __getattr__(name: str) -> Any:
305
337
  """Resolve public symbols lazily."""
306
338
  if name == "__version__":
307
339
  return __version__
340
+ # ── BEGIN TRANSITIONAL DEPRECATION BRANCH (Decision L; REMOVE AT v0.47) ──
341
+ if name in _DEPRECATED_SCALARS:
342
+ import warnings
343
+
344
+ warnings.warn(
345
+ f"eval_toolkit.{name} is deprecated and will be removed in v0.47. "
346
+ f"Use `scorecard(y, s, metrics=[metric_specs.{_scorecard_spec_for(name)}])"
347
+ f'["{_scorecard_spec_for(name)}"].value` instead, or import from the'
348
+ f" `eval_toolkit.metrics` submodule directly (internal API).",
349
+ DeprecationWarning,
350
+ stacklevel=2,
351
+ )
352
+ module = import_module("eval_toolkit.metrics")
353
+ value = getattr(module, name)
354
+ # Do NOT cache in globals() — repeated lookups should keep re-warning
355
+ # (one warning per call site, modulo Python's default
356
+ # DeprecationWarning de-duplication).
357
+ return value
358
+ # ── END TRANSITIONAL DEPRECATION BRANCH (Decision L; REMOVE AT v0.47) ──
308
359
  module_name = _EXPORTS.get(name)
309
360
  if module_name is None:
310
361
  raise AttributeError(f"module 'eval_toolkit' has no attribute {name!r}")
@@ -314,6 +365,29 @@ def __getattr__(name: str) -> Any:
314
365
  return value
315
366
 
316
367
 
368
+ # ── BEGIN TRANSITIONAL DEPRECATION HELPER (Decision L; REMOVE AT v0.47) ──
369
+ def _scorecard_spec_for(deprecated_name: str) -> str:
370
+ """Map a deprecated-scalar name to its `metric_specs` replacement name.
371
+
372
+ Used only inside the v0.46 deprecation warning message. Returns the
373
+ closest equivalent first-party spec name where one exists; falls back
374
+ to the original name for ECE variants whose exact-match spec isn't in
375
+ the v0.46 first-party namespace (e.g., the L2 / debiased variants —
376
+ callers either implement a custom `MetricSpec` or stay on the
377
+ submodule path).
378
+ """
379
+ return {
380
+ "pr_auc": "pr_auc",
381
+ "roc_auc": "roc_auc",
382
+ "brier_score": "brier",
383
+ "expected_calibration_error": "ece(n_bins=10)",
384
+ "expected_calibration_error_equal_mass": 'ece(n_bins=10, strategy="quantile")',
385
+ }.get(deprecated_name, deprecated_name)
386
+
387
+
388
+ # ── END TRANSITIONAL DEPRECATION HELPER (Decision L; REMOVE AT v0.47) ──
389
+
390
+
317
391
  def __dir__() -> list[str]:
318
392
  """Expose lazy public symbols to introspection."""
319
393
  return sorted(__all__)