eval-toolkit 0.44.0__tar.gz → 0.46.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/.gitignore +6 -0
  2. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/CHANGELOG.md +122 -0
  3. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/PKG-INFO +1 -1
  4. eval_toolkit-0.46.0/docs/source/adr/README.md +76 -0
  5. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/__init__.py +84 -8
  6. eval_toolkit-0.46.0/src/eval_toolkit/_scorecard.py +509 -0
  7. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/_version.py +1 -1
  8. eval_toolkit-0.46.0/src/eval_toolkit/metric_specs.py +182 -0
  9. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/metrics.py +31 -2
  10. eval_toolkit-0.46.0/src/eval_toolkit/stacking.py +412 -0
  11. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/golden/public_api/snapshot.json +54 -51
  12. eval_toolkit-0.46.0/tests/test_deprecated_scalars_shim.py +184 -0
  13. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_is_metric_defined_for_slice.py +25 -2
  14. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_misc_coverage.py +13 -2
  15. eval_toolkit-0.46.0/tests/test_scorecard.py +408 -0
  16. eval_toolkit-0.46.0/tests/test_stacking.py +369 -0
  17. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/LICENSE +0 -0
  18. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/README.md +0 -0
  19. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/STYLE.md +0 -0
  20. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/docs/archive/README.md +0 -0
  21. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/docs/research/README.md +0 -0
  22. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/docs/research/datasets/README.md +0 -0
  23. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/docs/research/papers/data-integrity/README.md +0 -0
  24. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  25. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/docs/research/papers/inference/README.md +0 -0
  26. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/docs/research/papers/prompt-injection/README.md +0 -0
  27. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/docs/source/methodology/README.md +0 -0
  28. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/pyproject.toml +0 -0
  29. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/__main__.py +0 -0
  30. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/_deprecated.py +0 -0
  31. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/_parallel.py +0 -0
  32. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/adversarial.py +0 -0
  33. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/analysis.py +0 -0
  34. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/artifacts.py +0 -0
  35. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/bootstrap.py +0 -0
  36. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/calibration.py +0 -0
  37. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/claims.py +0 -0
  38. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/config.py +0 -0
  39. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/docs.py +0 -0
  40. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/embeddings.py +0 -0
  41. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/evidence.py +0 -0
  42. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/harness.py +0 -0
  43. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/leakage.py +0 -0
  44. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/loaders.py +0 -0
  45. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/losses.py +0 -0
  46. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/manifest.py +0 -0
  47. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/operating_points.py +0 -0
  48. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/paths.py +0 -0
  49. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/plotting.py +0 -0
  50. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/preprocessing.py +0 -0
  51. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/probes.py +0 -0
  52. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/protocols.py +0 -0
  53. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/provenance.py +0 -0
  54. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/py.typed +0 -0
  55. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  56. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  57. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  58. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  59. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  60. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  61. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/seeds.py +0 -0
  62. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/splits.py +0 -0
  63. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/text_dedup.py +0 -0
  64. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/src/eval_toolkit/thresholds.py +0 -0
  65. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  66. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  67. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  68. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  69. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  70. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  71. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  72. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  73. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  74. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  75. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/benchmarks/__init__.py +0 -0
  76. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  77. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/conftest.py +0 -0
  78. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  79. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  80. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  81. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  82. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/golden/docs/expected.md +0 -0
  83. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/golden/docs/input.md +0 -0
  84. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/golden/docs/metrics.json +0 -0
  85. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  86. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/strategies.py +0 -0
  87. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_adversarial.py +0 -0
  88. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_analysis.py +0 -0
  89. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_artifacts.py +0 -0
  90. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  91. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  92. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_edge_cases.py +0 -0
  93. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_golden.py +0 -0
  94. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_njobs.py +0 -0
  95. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_props.py +0 -0
  96. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_research_grounded.py +0 -0
  97. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_bootstrap_unit.py +0 -0
  98. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_calibration_binary_adapters.py +0 -0
  99. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  100. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_calibration_determinism.py +0 -0
  101. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_calibration_optimization_failures.py +0 -0
  102. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_calibration_props.py +0 -0
  103. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_calibration_research_grounded.py +0 -0
  104. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_calibration_unit.py +0 -0
  105. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_claims.py +0 -0
  106. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_claims_coverage.py +0 -0
  107. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_claims_props.py +0 -0
  108. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_cli.py +0 -0
  109. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_config.py +0 -0
  110. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_coverage_bootstrap.py +0 -0
  111. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_coverage_calibration.py +0 -0
  112. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_coverage_harness.py +0 -0
  113. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_coverage_metrics.py +0 -0
  114. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_coverage_plotting.py +0 -0
  115. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_croissant_e2e.py +0 -0
  116. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  117. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_deprecations.py +0 -0
  118. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_docs_golden.py +0 -0
  119. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_docs_props.py +0 -0
  120. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_embeddings.py +0 -0
  121. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_evidence_validators.py +0 -0
  122. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_harness_edge_cases.py +0 -0
  123. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_harness_fault_injection.py +0 -0
  124. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_harness_folded.py +0 -0
  125. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_harness_internals.py +0 -0
  126. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_harness_metric_options.py +0 -0
  127. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_harness_parallelism.py +0 -0
  128. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_harness_smoke.py +0 -0
  129. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_import_boundaries.py +0 -0
  130. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_leakage.py +0 -0
  131. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_leakage_error_paths.py +0 -0
  132. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_leakage_props.py +0 -0
  133. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_loaders.py +0 -0
  134. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_loaders_coverage.py +0 -0
  135. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_loaders_props.py +0 -0
  136. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_logging.py +0 -0
  137. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_losses.py +0 -0
  138. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_manifest.py +0 -0
  139. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  140. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_manifest_props.py +0 -0
  141. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_manifest_validation.py +0 -0
  142. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_metrics_props.py +0 -0
  143. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_metrics_stratified_subsets.py +0 -0
  144. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_metrics_unit.py +0 -0
  145. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_numeric_edge_cases.py +0 -0
  146. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_ood_loader.py +0 -0
  147. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_operating_points.py +0 -0
  148. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_operating_points_props.py +0 -0
  149. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_parallel.py +0 -0
  150. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_paths.py +0 -0
  151. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_pipeline_e2e.py +0 -0
  152. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_plotting_edge.py +0 -0
  153. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_plotting_smoke.py +0 -0
  154. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_plotting_visual.py +0 -0
  155. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_preprocessing.py +0 -0
  156. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_probes.py +0 -0
  157. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_protocol_conformance.py +0 -0
  158. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_provenance.py +0 -0
  159. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_public_api.py +0 -0
  160. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_recall_at_fpr.py +0 -0
  161. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_reference_equivalence.py +0 -0
  162. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_reproducibility_integration.py +0 -0
  163. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_schemas.py +0 -0
  164. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_seeds.py +0 -0
  165. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_splits.py +0 -0
  166. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_splits_leakage_integration.py +0 -0
  167. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_splits_props.py +0 -0
  168. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_text_dedup.py +0 -0
  169. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_text_dedup_coverage.py +0 -0
  170. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_text_dedup_props.py +0 -0
  171. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_text_dedup_strategies.py +0 -0
  172. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_thresholds.py +0 -0
  173. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_thresholds_constant_score.py +0 -0
  174. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_thresholds_coverage.py +0 -0
  175. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_thresholds_props.py +0 -0
  176. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_thresholds_research_grounded.py +0 -0
  177. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_tokenization_leakage_check.py +0 -0
  178. {eval_toolkit-0.44.0 → eval_toolkit-0.46.0}/tests/test_v09_contracts.py +0 -0
@@ -39,6 +39,12 @@ coverage.json
39
39
  # Logs
40
40
  *.log
41
41
 
42
+ # Local environment overrides (machine-local credentials / config)
43
+ .env.local
44
+
45
+ # Mutation-testing output (mutmut / cargo-mutants — local run artifacts)
46
+ mutants/
47
+
42
48
  # Claude Code project settings (machine-local)
43
49
  .claude/
44
50
 
@@ -5,6 +5,128 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.46.0] — 2026-05-21 — Scorecard: primary v1.0 metric surface (closes #36)
9
+
10
+ Second minor of the staggered v0.45 → v0.46 → v0.47 → v0.48 → v1.0 sequence.
11
+ **Soft-breaking** — existing top-level scalar metric imports still work but
12
+ emit `DeprecationWarning` (hard-removed at v0.47).
13
+
14
+ See `docs/source/migration/v0.46.md` for the full consumer migration guide and
15
+ `docs/source/adr/0002-scorecard-as-primary-metric-surface.md` for the
16
+ decision record.
17
+
18
+ ### Added
19
+
20
+ - **`eval_toolkit.scorecard(y_true, y_score, metrics=[...], bootstrap=True)`**
21
+ — primary v1.0 metric surface. Single call computes multiple threshold-free
22
+ metrics + bootstrap CIs on one slice; returns a `Scorecard` (read-only
23
+ `Mapping[str, MetricResult]`). Type-safe dict-subscript access; status-aware
24
+ cells; per-cell error isolation.
25
+ - **`MetricSpec` Protocol** — v1.0 Tier-2 contract; `name: str` +
26
+ `compute(y_true, y_score) -> float`. Custom user specs satisfy structurally.
27
+ - **`MetricResult`** frozen dataclass — `value: float | None`, `status:
28
+ Literal["ok", "skipped", "error"]`, `reason: str`, `ci: BootstrapCI | None`.
29
+ Reuses the existing `MetricState` vocabulary from `artifacts.py:30-61`.
30
+ - **`Scorecard`** read-only `Mapping[str, MetricResult]` — `to_dict()`
31
+ JSON-friendly, `to_pandas()` one-row DataFrame (lazy pandas import).
32
+ - **`eval_toolkit.metric_specs`** namespace submodule with threshold-free
33
+ first-party specs:
34
+ - `pr_auc`, `roc_auc`, `brier` — module-level singletons (identity stable).
35
+ - `ece(n_bins, strategy)` — LRU-cached factory (identity stable per kwargs).
36
+ - **`SINGLE_CLASS_INCOMPATIBLE_METRICS`** extended with `pr_auc` / `roc_auc`
37
+ aliases (alongside existing `auroc` / `auprc`) so the v0.46 scorecard
38
+ surface and the v0.39 harness paths both produce correct skipped-status
39
+ behavior. Non-breaking; doctest + unit tests added.
40
+ - **`docs/source/adr/0002-scorecard-as-primary-metric-surface.md`** —
41
+ decision record covering single-surface rationale, threshold-free scope,
42
+ Tier-2 Protocol commitment, and v2.0 trigger conditions.
43
+ - **`docs/source/migration/v0.46.md`** — consumer migration guide with
44
+ side-by-side recipes for every common pattern.
45
+
46
+ ### Deprecated
47
+
48
+ The following 8 top-level scalar imports emit `DeprecationWarning` and will
49
+ be hard-removed at v0.47.0. Use `scorecard()` + `metric_specs` or the
50
+ `eval_toolkit.metrics` submodule path (internal API, no warning).
51
+
52
+ - `pr_auc`, `roc_auc`, `brier_score`
53
+ - `expected_calibration_error`
54
+ - `expected_calibration_error_debiased`
55
+ - `expected_calibration_error_equal_mass`
56
+ - `expected_calibration_error_l2`
57
+ - `expected_calibration_error_l2_debiased`
58
+
59
+ ### Audit findings integrated (Round 5)
60
+
61
+ Per `docs/source/audit_findings.md`:
62
+
63
+ - **F1** (scorecard threshold semantics) — addressed by Decision R: ship
64
+ threshold-free first-party specs only at v0.46. Threshold-dependent
65
+ metrics (F1, accuracy, precision, recall) deferred to v1.x with explicit
66
+ operating-point provenance.
67
+ - **F2** (scorecard cell-state semantics) — addressed by Decision S: reuse
68
+ existing `MetricState` (`ok`/`skipped`/`error`) vocabulary.
69
+ - **F4** (deprecation shim must extend the lazy resolver, not replace it) —
70
+ addressed: `__getattr__` deprecation branch sits between `__version__`
71
+ short-circuit and the base `_EXPORTS` lookup; tagged with BEGIN/END
72
+ TRANSITIONAL markers for clean v0.47 removal. Tests guard that every
73
+ remaining `_EXPORTS` symbol still resolves.
74
+ - **X.2 precondition** — `is_metric_defined_for_slice` aliases shipped
75
+ ahead of v0.46 (PR #62).
76
+
77
+ ### Protocol stability
78
+
79
+ Tier-2 streak continues: 7 of 7 consecutive minors (v0.40–v0.46) without
80
+ method-shape edits to any existing Tier-2 Protocol. `MetricSpec` is a NEW
81
+ Tier-2 Protocol added at v0.46; freezes at v1.0.
82
+
83
+ ## [0.45.0] — 2026-05-21 — Stacking: MetaLearner Protocol + LogisticStacker (closes #52)
84
+
85
+ First minor of the staggered v0.45 → v0.46 → v0.47 → v0.48 → v1.0 sequence
86
+ (per the v1.0 plan at `~/.claude/plans/evaluate-all-the-work-twinkly-kite.md`).
87
+ Non-breaking — purely additive. No Protocol shape edits to the existing 6
88
+ Tier-2 contracts (Gate 2 streak continues: 6 of 6 consecutive minors without
89
+ Protocol-shape changes).
90
+
91
+ ### Added
92
+
93
+ - `eval_toolkit.stacking` — new module providing the `MetaLearner` Protocol
94
+ and one reference impl, `LogisticStacker`, for combining outputs from
95
+ multiple binary detectors into a calibrated ensemble. Wraps
96
+ `sklearn.linear_model.LogisticRegression` with a stacker-shaped public API
97
+ (sklearn-style `fit(score_matrix, y)`, `predict(score_matrix)`,
98
+ `predict_proba(score_matrix)`, plus `coef_` / `classes_` / `intercept_`
99
+ attributes). No new dependencies — `scikit-learn` is already core since
100
+ v0.27. Closes #52.
101
+ - `MetaLearner` Protocol — `@runtime_checkable`; sklearn-shape contract
102
+ taking a `(n_samples, n_detectors)` score matrix. Sized as a v1.0 Tier-2
103
+ contract per the v1.0 plan Decision M (tiered stability — strict freeze at
104
+ v1.0; additive subprotocols permitted in minor releases). Mirrors the
105
+ `Probe` Protocol pattern from v0.43.
106
+ - `LogisticStacker` reference impl — configurable C, fit_intercept,
107
+ class_weight, penalty, solver, max_iter, random_state. Class-weight default
108
+ `"balanced"` for the common imbalanced-detection setting. Composes with the
109
+ 4-binary-calibrator family (v0.40 + v0.42) via `fit_platt_binary` /
110
+ `fit_isotonic_binary` chaining on stacked output.
111
+ - 24-test coverage in `tests/test_stacking.py`: Protocol satisfaction (both
112
+ structural and duck-typed), shape contracts (3-detector × 500-sample
113
+ fixtures), regularization behavior (C, L1 penalty), signal ordering,
114
+ calibration chaining (Platt + Isotonic), bootstrap CI on stacker output
115
+ (Audit F6a-aware — uses correct `BootstrapCI.ci_low/ci_high` attribute
116
+ names), determinism under fixed `random_state`, hypothesis property on
117
+ signal monotonicity, input validation (shape mismatch, single-class,
118
+ non-finite, unfit, wrong-n-detectors).
119
+ - `docs/source/examples/stacking.md` — myst-nb worked example: 3 synthetic
120
+ detectors with descending signal-to-noise, stacker fit, post-stacking
121
+ Platt calibration. Cites Wolpert 1992 + Breiman 1996.
122
+
123
+ ### Notes
124
+
125
+ - Sklearn 1.8+ deprecates `LogisticRegression(penalty=...)` in favor of
126
+ `l1_ratio`. The public `LogisticStacker(penalty=...)` API is preserved;
127
+ internal sklearn-side migration to `l1_ratio` will follow once sklearn 1.10
128
+ is released and the warning becomes more visible. No user-facing impact.
129
+
8
130
  ## [0.44.0] — 2026-05-19 — Defenses + losses: Spotlighting variants + RecallAtLowFPR (closes #50, #51)
9
131
 
10
132
  ### Added
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.44.0
3
+ Version: 0.46.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -0,0 +1,76 @@
1
+ # Architecture Decision Records
2
+
3
+ This directory captures architecturally-significant decisions that shape
4
+ `eval-toolkit`'s long-term design. ADRs are immutable historical records —
5
+ once accepted, a decision is not edited in place; if it changes, a new ADR
6
+ supersedes it.
7
+
8
+ ## When to file an ADR
9
+
10
+ File a new ADR when a decision:
11
+
12
+ - **Locks in an interface or shape** that future code is expected to
13
+ conform to (e.g., "metrics return type", "Protocol vs ABC").
14
+ - **Closes off alternatives** that were seriously considered, so the
15
+ reasoning isn't lost.
16
+ - **Carries cost** to reverse (e.g., a public-API contract that promises
17
+ stability across a release line).
18
+
19
+ Routine refactors, bug fixes, and internal-only patterns do not need ADRs —
20
+ the commit message + CHANGELOG entry are enough.
21
+
22
+ ## Numbering
23
+
24
+ Sequential, zero-padded: `0001-flat-module-layout.md`,
25
+ `0002-scorecard-as-primary-metric-surface.md`, etc. Number is assigned
26
+ at the time of writing; if two ADRs are drafted in parallel, the second
27
+ to merge takes the next number.
28
+
29
+ ## Format
30
+
31
+ Each ADR uses this skeleton (loosely based on MADR — Markdown ADR — without
32
+ the heavyweight template):
33
+
34
+ ```markdown
35
+ # ADR NNNN: Title
36
+
37
+ **Status:** Proposed | Accepted | Superseded by ADR-MMMM
38
+ **Date:** YYYY-MM-DD
39
+ **Deciders:** (names or roles)
40
+
41
+ ## Context
42
+
43
+ What's the situation that requires a decision? What constraints are at play?
44
+
45
+ ## Decision
46
+
47
+ What did we decide?
48
+
49
+ ## Consequences
50
+
51
+ What follows from this decision? (Both positive and negative.)
52
+
53
+ ## Alternatives considered
54
+
55
+ What else was on the table, and why wasn't it chosen?
56
+
57
+ ## Trigger to revisit
58
+
59
+ What would have to change for this decision to be reopened?
60
+ (Optional but useful — keeps the ADR self-documenting.)
61
+ ```
62
+
63
+ ## Cross-references
64
+
65
+ - [`docs/RELEASING.md`](../../RELEASING.md) — release-flow process; ADRs
66
+ are typically drafted as part of release prep.
67
+ - [`docs/source/roadmap.md`](../roadmap.md) — long-term direction;
68
+ ADRs explain how individual roadmap decisions were made.
69
+
70
+ ## Index
71
+
72
+ (Updated as ADRs are added.)
73
+
74
+ | # | Title | Status | Date |
75
+ |---|---|---|---|
76
+ | _none yet_ | | | |
@@ -193,20 +193,18 @@ _EXPORTS: dict[str, str] = {
193
193
  "SINGLE_CLASS_INCOMPATIBLE_METRICS": "eval_toolkit.metrics",
194
194
  "ThresholdResult": "eval_toolkit.metrics",
195
195
  "brier_decomposition": "eval_toolkit.metrics",
196
- "brier_score": "eval_toolkit.metrics",
197
- "expected_calibration_error": "eval_toolkit.metrics",
198
- "expected_calibration_error_debiased": "eval_toolkit.metrics",
199
- "expected_calibration_error_equal_mass": "eval_toolkit.metrics",
200
- "expected_calibration_error_l2": "eval_toolkit.metrics",
201
- "expected_calibration_error_l2_debiased": "eval_toolkit.metrics",
196
+ # `brier_score`, `pr_auc`, `roc_auc`, and the 5 ECE variants removed from
197
+ # `_EXPORTS` at v0.46 (Decision L). They remain reachable at the top
198
+ # level via the `__getattr__` deprecation branch (emits
199
+ # `DeprecationWarning`; branch removed at v0.47) and via the metrics
200
+ # submodule (`from eval_toolkit.metrics import pr_auc` — internal API
201
+ # per ADR 0002, not part of the v1.0 stability contract).
202
202
  "headline_metrics": "eval_toolkit.metrics",
203
203
  "is_metric_defined_for_slice": "eval_toolkit.metrics",
204
204
  "metrics_at_threshold": "eval_toolkit.metrics",
205
- "pr_auc": "eval_toolkit.metrics",
206
205
  "precision_at_prior": "eval_toolkit.metrics",
207
206
  "quantile_stratified_pr_auc": "eval_toolkit.metrics",
208
207
  "quantile_stratified_report": "eval_toolkit.metrics",
209
- "roc_auc": "eval_toolkit.metrics",
210
208
  "score_distribution_summary": "eval_toolkit.metrics",
211
209
  "single_class_threshold_metrics": "eval_toolkit.metrics",
212
210
  "stratified_recall": "eval_toolkit.metrics",
@@ -294,15 +292,70 @@ _EXPORTS: dict[str, str] = {
294
292
  "recall_at_fpr": "eval_toolkit.thresholds",
295
293
  "select_threshold": "eval_toolkit.thresholds",
296
294
  "wilson_interval": "eval_toolkit.thresholds",
295
+ "LogisticStacker": "eval_toolkit.stacking",
296
+ "MetaLearner": "eval_toolkit.stacking",
297
+ "MetricResult": "eval_toolkit._scorecard",
298
+ "MetricSpec": "eval_toolkit._scorecard",
299
+ "Scorecard": "eval_toolkit._scorecard",
300
+ "scorecard": "eval_toolkit._scorecard",
297
301
  }
298
302
 
299
303
  __all__ = ["__version__", *_EXPORTS.keys()]
300
304
 
301
305
 
306
+ # ── BEGIN TRANSITIONAL DEPRECATION BRANCH (Decision L; REMOVE AT v0.47) ──
307
+ # At v0.46 the scalar metric functions left the top-level `_EXPORTS` map (above)
308
+ # in favor of the `scorecard()` surface (Decision A). To give the consumer one
309
+ # release of overlap before the hard removal at v0.47, the names below remain
310
+ # reachable via the package-level `__getattr__` (which delegates to the
311
+ # `eval_toolkit.metrics` submodule) but emit a `DeprecationWarning` on every
312
+ # lookup (results are deliberately not cached) pointing at the new API.
313
+ #
314
+ # WHY THIS IS A BRANCH, NOT A REPLACEMENT (Audit F4 — Round 5):
315
+ # `__getattr__` below is the load-bearing lazy export resolver for every name
316
+ # in `_EXPORTS`. The deprecation branch is a discrete `if name in
317
+ # _DEPRECATED_SCALARS` check ABOVE the resolver — the resolver's existing
318
+ # behavior for non-deprecated names is unchanged. At v0.47 we delete this
319
+ # transitional block and the resolver continues to work for every remaining
320
+ # `_EXPORTS` entry.
321
+ _DEPRECATED_SCALARS: frozenset[str] = frozenset(
322
+ {
323
+ "pr_auc",
324
+ "roc_auc",
325
+ "brier_score",
326
+ "expected_calibration_error",
327
+ "expected_calibration_error_debiased",
328
+ "expected_calibration_error_equal_mass",
329
+ "expected_calibration_error_l2",
330
+ "expected_calibration_error_l2_debiased",
331
+ }
332
+ )
333
+ # ── END TRANSITIONAL DEPRECATION (Decision L; REMOVE AT v0.47) ──
334
+
335
+
302
336
  def __getattr__(name: str) -> Any:
303
337
  """Resolve public symbols lazily."""
304
338
  if name == "__version__":
305
339
  return __version__
340
+ # ── BEGIN TRANSITIONAL DEPRECATION BRANCH (Decision L; REMOVE AT v0.47) ──
341
+ if name in _DEPRECATED_SCALARS:
342
+ import warnings
343
+
344
+ warnings.warn(
345
+ f"eval_toolkit.{name} is deprecated and will be removed in v0.47. "
346
+ f"Use `scorecard(y, s, metrics=[metric_specs.{_scorecard_spec_for(name)}])"
347
+ f'["{_scorecard_spec_for(name)}"].value` instead, or import from the'
348
+ f" `eval_toolkit.metrics` submodule directly (internal API).",
349
+ DeprecationWarning,
350
+ stacklevel=2,
351
+ )
352
+ module = import_module("eval_toolkit.metrics")
353
+ value = getattr(module, name)
354
+ # Do NOT cache in globals() — repeated lookups should keep re-warning
355
+ # (one warning per call site, modulo Python's default
356
+ # DeprecationWarning de-duplication).
357
+ return value
358
+ # ── END TRANSITIONAL DEPRECATION (Decision L; REMOVE AT v0.47) ──
306
359
  module_name = _EXPORTS.get(name)
307
360
  if module_name is None:
308
361
  raise AttributeError(f"module 'eval_toolkit' has no attribute {name!r}")
@@ -312,6 +365,29 @@ def __getattr__(name: str) -> Any:
312
365
  return value
313
366
 
314
367
 
368
+ # ── BEGIN TRANSITIONAL DEPRECATION HELPER (Decision L; REMOVE AT v0.47) ──
369
# Deprecated top-level scalar name -> `metric_specs` expression quoted in the
# v0.46 deprecation warning. Built once at import time rather than on every
# call. Names absent from this table have no exact first-party spec listed
# here; the helper below returns such names unchanged.
_SCORECARD_SPEC_BY_DEPRECATED_NAME: dict[str, str] = {
    "pr_auc": "pr_auc",
    "roc_auc": "roc_auc",
    "brier_score": "brier",
    "expected_calibration_error": "ece(n_bins=10)",
    "expected_calibration_error_equal_mass": 'ece(n_bins=10, strategy="quantile")',
}


def _scorecard_spec_for(deprecated_name: str) -> str:
    """Map a deprecated-scalar name to its `metric_specs` replacement name.

    Used only to build the text of the v0.46 deprecation warning.

    Parameters
    ----------
    deprecated_name:
        Name of a deprecated top-level scalar metric (e.g. ``"brier_score"``).

    Returns
    -------
    str
        The replacement spec expression where one is listed (e.g.
        ``"brier"`` or ``"ece(n_bins=10)"``). Any other name — including the
        L2 / debiased ECE variants — is returned unchanged; callers of those
        either implement a custom `MetricSpec` or stay on the submodule path.
    """
    return _SCORECARD_SPEC_BY_DEPRECATED_NAME.get(deprecated_name, deprecated_name)
386
+
387
+
388
+ # ── END TRANSITIONAL DEPRECATION HELPER (Decision L; REMOVE AT v0.47) ──
389
+
390
+
315
391
def __dir__() -> list[str]:
    """Return the names listed in ``__all__`` in sorted order.

    Lets introspection see the lazily resolved public symbols, which are
    not present in the module namespace until first accessed.
    """
    public_names = list(__all__)
    public_names.sort()
    return public_names