eval-toolkit 0.45.0__tar.gz → 0.46.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/.gitignore +10 -0
  2. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/CHANGELOG.md +140 -0
  3. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/PKG-INFO +1 -1
  4. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/docs/source/adr/README.md +3 -1
  5. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/__init__.py +163 -8
  6. eval_toolkit-0.46.1/src/eval_toolkit/_scorecard.py +509 -0
  7. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/_version.py +1 -1
  8. eval_toolkit-0.46.1/src/eval_toolkit/metric_specs.py +217 -0
  9. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/metrics.py +31 -2
  10. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/golden/public_api/snapshot.json +36 -51
  11. eval_toolkit-0.46.1/tests/test_deprecated_scalars_shim.py +334 -0
  12. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_is_metric_defined_for_slice.py +25 -2
  13. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_misc_coverage.py +13 -2
  14. eval_toolkit-0.46.1/tests/test_scorecard.py +453 -0
  15. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/LICENSE +0 -0
  16. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/README.md +0 -0
  17. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/STYLE.md +0 -0
  18. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/docs/archive/README.md +0 -0
  19. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/docs/research/README.md +0 -0
  20. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/docs/research/datasets/README.md +0 -0
  21. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/docs/research/papers/data-integrity/README.md +0 -0
  22. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/docs/research/papers/eval-ecosystem/README.md +0 -0
  23. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/docs/research/papers/inference/README.md +0 -0
  24. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/docs/research/papers/prompt-injection/README.md +0 -0
  25. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/docs/source/methodology/README.md +0 -0
  26. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/pyproject.toml +0 -0
  27. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/__main__.py +0 -0
  28. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/_deprecated.py +0 -0
  29. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/_parallel.py +0 -0
  30. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/adversarial.py +0 -0
  31. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/analysis.py +0 -0
  32. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/artifacts.py +0 -0
  33. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/bootstrap.py +0 -0
  34. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/calibration.py +0 -0
  35. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/claims.py +0 -0
  36. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/config.py +0 -0
  37. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/docs.py +0 -0
  38. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/embeddings.py +0 -0
  39. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/evidence.py +0 -0
  40. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/harness.py +0 -0
  41. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/leakage.py +0 -0
  42. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/loaders.py +0 -0
  43. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/losses.py +0 -0
  44. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/manifest.py +0 -0
  45. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/operating_points.py +0 -0
  46. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/paths.py +0 -0
  47. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/plotting.py +0 -0
  48. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/preprocessing.py +0 -0
  49. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/probes.py +0 -0
  50. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/protocols.py +0 -0
  51. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/provenance.py +0 -0
  52. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/py.typed +0 -0
  53. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  54. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  55. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  56. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  57. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/schemas/results.v1.json +0 -0
  58. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  59. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/seeds.py +0 -0
  60. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/splits.py +0 -0
  61. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/stacking.py +0 -0
  62. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/text_dedup.py +0 -0
  63. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/src/eval_toolkit/thresholds.py +0 -0
  64. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  65. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  66. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  67. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  68. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  69. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  70. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  71. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  72. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  73. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  74. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/benchmarks/__init__.py +0 -0
  75. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  76. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/conftest.py +0 -0
  77. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/golden/bootstrap_ci/cases.json +0 -0
  78. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/golden/data/dedup_holdout.jsonl +0 -0
  79. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/golden/data/dedup_holdout_expected.json +0 -0
  80. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  81. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/golden/docs/expected.md +0 -0
  82. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/golden/docs/input.md +0 -0
  83. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/golden/docs/metrics.json +0 -0
  84. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  85. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/strategies.py +0 -0
  86. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_adversarial.py +0 -0
  87. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_analysis.py +0 -0
  88. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_artifacts.py +0 -0
  89. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_block_bootstrap_on_folds.py +0 -0
  90. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_bootstrap_calibration_mc.py +0 -0
  91. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_bootstrap_edge_cases.py +0 -0
  92. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_bootstrap_golden.py +0 -0
  93. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_bootstrap_njobs.py +0 -0
  94. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_bootstrap_props.py +0 -0
  95. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_bootstrap_research_grounded.py +0 -0
  96. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_bootstrap_unit.py +0 -0
  97. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_calibration_binary_adapters.py +0 -0
  98. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_calibration_bootstrap_chain.py +0 -0
  99. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_calibration_determinism.py +0 -0
  100. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_calibration_optimization_failures.py +0 -0
  101. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_calibration_props.py +0 -0
  102. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_calibration_research_grounded.py +0 -0
  103. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_calibration_unit.py +0 -0
  104. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_claims.py +0 -0
  105. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_claims_coverage.py +0 -0
  106. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_claims_props.py +0 -0
  107. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_cli.py +0 -0
  108. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_config.py +0 -0
  109. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_coverage_bootstrap.py +0 -0
  110. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_coverage_calibration.py +0 -0
  111. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_coverage_harness.py +0 -0
  112. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_coverage_metrics.py +0 -0
  113. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_coverage_plotting.py +0 -0
  114. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_croissant_e2e.py +0 -0
  115. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_dedup_split_leakage_chain.py +0 -0
  116. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_deprecations.py +0 -0
  117. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_docs_golden.py +0 -0
  118. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_docs_props.py +0 -0
  119. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_embeddings.py +0 -0
  120. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_evidence_validators.py +0 -0
  121. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_harness_edge_cases.py +0 -0
  122. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_harness_fault_injection.py +0 -0
  123. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_harness_folded.py +0 -0
  124. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_harness_internals.py +0 -0
  125. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_harness_metric_options.py +0 -0
  126. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_harness_parallelism.py +0 -0
  127. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_harness_smoke.py +0 -0
  128. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_import_boundaries.py +0 -0
  129. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_leakage.py +0 -0
  130. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_leakage_error_paths.py +0 -0
  131. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_leakage_props.py +0 -0
  132. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_loaders.py +0 -0
  133. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_loaders_coverage.py +0 -0
  134. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_loaders_props.py +0 -0
  135. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_logging.py +0 -0
  136. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_losses.py +0 -0
  137. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_manifest.py +0 -0
  138. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_manifest_contamination_round_trip.py +0 -0
  139. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_manifest_props.py +0 -0
  140. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_manifest_validation.py +0 -0
  141. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_metrics_props.py +0 -0
  142. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_metrics_stratified_subsets.py +0 -0
  143. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_metrics_unit.py +0 -0
  144. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_numeric_edge_cases.py +0 -0
  145. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_ood_loader.py +0 -0
  146. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_operating_points.py +0 -0
  147. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_operating_points_props.py +0 -0
  148. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_parallel.py +0 -0
  149. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_paths.py +0 -0
  150. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_pipeline_e2e.py +0 -0
  151. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_plotting_edge.py +0 -0
  152. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_plotting_smoke.py +0 -0
  153. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_plotting_visual.py +0 -0
  154. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_preprocessing.py +0 -0
  155. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_probes.py +0 -0
  156. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_protocol_conformance.py +0 -0
  157. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_provenance.py +0 -0
  158. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_public_api.py +0 -0
  159. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_recall_at_fpr.py +0 -0
  160. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_reference_equivalence.py +0 -0
  161. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_reproducibility_integration.py +0 -0
  162. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_schemas.py +0 -0
  163. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_seeds.py +0 -0
  164. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_splits.py +0 -0
  165. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_splits_leakage_integration.py +0 -0
  166. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_splits_props.py +0 -0
  167. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_stacking.py +0 -0
  168. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_text_dedup.py +0 -0
  169. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_text_dedup_coverage.py +0 -0
  170. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_text_dedup_props.py +0 -0
  171. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_text_dedup_strategies.py +0 -0
  172. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_thresholds.py +0 -0
  173. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_thresholds_constant_score.py +0 -0
  174. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_thresholds_coverage.py +0 -0
  175. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_thresholds_props.py +0 -0
  176. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_thresholds_research_grounded.py +0 -0
  177. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_tokenization_leakage_check.py +0 -0
  178. {eval_toolkit-0.45.0 → eval_toolkit-0.46.1}/tests/test_v09_contracts.py +0 -0
@@ -45,6 +45,16 @@ coverage.json
45
45
  # Mutation-testing output (mutmut / cargo-mutants — local run artifacts)
46
46
  mutants/
47
47
 
48
+ # Local audit artifacts (Round 5+ Gate 3 LLM cross-review packets + reports).
49
+ # The canonical prompt lives at ~/.claude/plans/gate3-audit-prompt.md and the
50
+ # canonical findings ledger lives at docs/source/audit_findings.md; per-run
51
+ # raw model outputs are author-local working copies.
52
+ # Tracked: per-round briefing files (`gate3-audit-round-<N>.md`).
53
+ # Untracked: prompt template, generic report, per-round report files.
54
+ gate3-audit-prompt.md
55
+ gate3-audit-report.md
56
+ gate3-audit-round-*-report.md
57
+
48
58
  # Claude Code project settings (machine-local)
49
59
  .claude/
50
60
 
@@ -5,6 +5,146 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.46.1] — 2026-05-21 — Round 6 hotfix: ECE strategy validation + deprecation warning content
9
+
10
+ Hotfix release per **Decision Q** (data correctness regression + time-sensitive
11
+ warning content) + **Decision R6-E** (scope: R6-F1 + R6-F2 only; R6-A docstring
12
+ rolls forward to v0.47). All other Round 6 findings dispositioned to v0.47.0.
13
+
14
+ See [`docs/source/audit_findings.md`](docs/source/audit_findings.md) Round 6 for
15
+ the full disposition ledger.
16
+
17
+ ### Fixed
18
+
19
+ - **`metric_specs.ece(strategy=<value>)` strategy validation** (Round 6 Codex
20
+ R6-F1). Prior to v0.46.1, an invalid strategy string (e.g.
21
+ `metric_specs.ece(strategy="typo")`) silently dispatched to quantile ECE and
22
+ returned a `scorecard()` cell with `status="ok"` under an invalid encoded key
23
+ (`"ece_n_bins_15_strategy_typo"`) — a silent data-correctness bug.
24
+ Verified by Codex via runtime probe. Now both the `ece()` factory and
25
+ `_EceSpec.compute()` raise:
26
+ ```
27
+ ValueError: ECE strategy must be 'uniform' or 'quantile'; got 'typo'
28
+ ```
29
+ Defence-in-depth: the factory validates eagerly (before LRU cache hit) AND
30
+ `compute()` validates at the compute boundary so direct construction of
31
+ `_EceSpec(strategy="typo")` (bypassing the factory) also raises.
32
+
33
+ - **Deprecation warning content for all 5 ECE variants** (Round 6 Codex R6-F2 +
34
+ Gemini R6-F2, with Decisions R6-F + R6-G). The v0.46.0 `__getattr__`
35
+ deprecation shim's warning messages produced broken migration snippets:
36
+ - For `expected_calibration_error` + `expected_calibration_error_equal_mass`:
37
+ the suggested `Scorecard` lookup key was the factory-call expression
38
+ (`"ece(n_bins=10)"`) instead of the encoded spec name
39
+ (`"ece_n_bins_10_strategy_uniform"`). Now uses the correct encoded key.
40
+ - For `expected_calibration_error_debiased` / `_l2` / `_l2_debiased`: these
41
+ variants are not in the v0.46 `metric_specs` namespace (Decision R6-G;
42
+ research-completeness primitives, deferred to v1.x if user demand
43
+ surfaces). Their warnings now point at the submodule path
44
+ (`from eval_toolkit.metrics import expected_calibration_error_debiased`)
45
+ instead of an unconstructable scorecard snippet.
46
+ - Pre-v0.46 default verification: Gemini's report claimed
47
+ `expected_calibration_error` defaulted to `n_bins=15`; verified against
48
+ `metrics.py:730-734` that the actual default is `n_bins=10`. Per Decision
49
+ R6-F, warning snippets use `n_bins=10` to preserve bit-identical pre-v0.46
50
+ math + add a migration note explaining the new `metric_specs.ece()` factory
51
+ default of `n_bins=15` (matching Hines et al.).
52
+
53
+ ### Tests
54
+
55
+ - `tests/test_scorecard.py`: 4 new tests for ECE strategy validation
56
+ (parametrized factory-rejection + compute-defence-in-depth).
57
+ - `tests/test_deprecated_scalars_shim.py`: 4 new test classes — verify each
58
+ warning contains correct factory expression + encoded scorecard key, ECE
59
+ warnings carry the n_bins=10/15 migration note, submodule-only warnings cite
60
+ `eval_toolkit.metrics` path, and the snippet in each first-party warning is
61
+ EXECUTABLE (parses + runs against synthetic data + produces ok-status cell).
62
+
63
+ ### Rolled forward to v0.47 (Decision R6-E)
64
+
65
+ - R6-A `seed=None` docstring fix (non-blocker per Decision Q).
66
+ - R6-F3 duplicate `MetricSpec.name` rejection.
67
+ - R6-F5 (Codex) Protocol method-shape drift guard.
68
+ - R6-F3 (Gemini) `Scorecard.to_pandas()` schema expansion.
69
+ - R6-F4 (Gemini) `make_spec_name()` helper.
70
+ - R6-F5 (Gemini) narrow `_evaluate_spec()` exception catch.
71
+ - R6-F6 (Codex) plan + roadmap state-drift refresh.
72
+
73
+ ## [0.46.0] — 2026-05-21 — Scorecard: primary v1.0 metric surface (closes #36)
74
+
75
+ Second minor of the staggered v0.45 → v0.46 → v0.47 → v0.48 → v1.0 sequence.
76
+ **Soft-breaking** — existing top-level scalar metric imports still work but
77
+ emit `DeprecationWarning` (hard-removed at v0.47).
78
+
79
+ See `docs/source/migration/v0.46.md` for the full consumer migration guide and
80
+ `docs/source/adr/0002-scorecard-as-primary-metric-surface.md` for the
81
+ decision record.
82
+
83
+ ### Added
84
+
85
+ - **`eval_toolkit.scorecard(y_true, y_score, metrics=[...], bootstrap=True)`**
86
+ — primary v1.0 metric surface. Single call computes multiple threshold-free
87
+ metrics + bootstrap CIs on one slice; returns a `Scorecard` (read-only
88
+ `Mapping[str, MetricResult]`). Type-safe dict-subscript access; status-aware
89
+ cells; per-cell error isolation.
90
+ - **`MetricSpec` Protocol** — v1.0 Tier-2 contract; `name: str` +
91
+ `compute(y_true, y_score) -> float`. Custom user specs satisfy structurally.
92
+ - **`MetricResult`** frozen dataclass — `value: float | None`, `status:
93
+ Literal["ok", "skipped", "error"]`, `reason: str`, `ci: BootstrapCI | None`.
94
+ Reuses the existing `MetricState` vocabulary from `artifacts.py:30-61`.
95
+ - **`Scorecard`** read-only `Mapping[str, MetricResult]` — `to_dict()`
96
+ JSON-friendly, `to_pandas()` one-row DataFrame (lazy pandas import).
97
+ - **`eval_toolkit.metric_specs`** namespace submodule with threshold-free
98
+ first-party specs:
99
+ - `pr_auc`, `roc_auc`, `brier` — module-level singletons (identity stable).
100
+ - `ece(n_bins, strategy)` — LRU-cached factory (identity stable per kwargs).
101
+ - **`SINGLE_CLASS_INCOMPATIBLE_METRICS`** extended with `pr_auc` / `roc_auc`
102
+ aliases (alongside existing `auroc` / `auprc`) so the v0.46 scorecard
103
+ surface and the v0.39 harness paths both produce correct skipped-status
104
+ behavior. Non-breaking; doctest + unit tests added.
105
+ - **`docs/source/adr/0002-scorecard-as-primary-metric-surface.md`** —
106
+ decision record covering single-surface rationale, threshold-free scope,
107
+ Tier-2 Protocol commitment, and v2.0 trigger conditions.
108
+ - **`docs/source/migration/v0.46.md`** — consumer migration guide with
109
+ side-by-side recipes for every common pattern.
110
+
111
+ ### Deprecated
112
+
113
+ The following 8 top-level scalar imports emit `DeprecationWarning` and will
114
+ be hard-removed at v0.47.0. Use `scorecard()` + `metric_specs` or the
115
+ `eval_toolkit.metrics` submodule path (internal API, no warning).
116
+
117
+ - `pr_auc`, `roc_auc`, `brier_score`
118
+ - `expected_calibration_error`
119
+ - `expected_calibration_error_debiased`
120
+ - `expected_calibration_error_equal_mass`
121
+ - `expected_calibration_error_l2`
122
+ - `expected_calibration_error_l2_debiased`
123
+
124
+ ### Audit findings integrated (Round 5)
125
+
126
+ Per `docs/source/audit_findings.md`:
127
+
128
+ - **F1** (scorecard threshold semantics) — addressed by Decision R: ship
129
+ threshold-free first-party specs only at v0.46. Threshold-dependent
130
+ metrics (F1, accuracy, precision, recall) deferred to v1.x with explicit
131
+ operating-point provenance.
132
+ - **F2** (scorecard cell-state semantics) — addressed by Decision S: reuse
133
+ existing `MetricState` (`ok`/`skipped`/`error`) vocabulary.
134
+ - **F4** (deprecation shim must extend the lazy resolver, not replace it) —
135
+ addressed: `__getattr__` deprecation branch sits between `__version__`
136
+ short-circuit and the base `_EXPORTS` lookup; tagged with BEGIN/END
137
+ TRANSITIONAL markers for clean v0.47 removal. Tests guard that every
138
+ remaining `_EXPORTS` symbol still resolves.
139
+ - **X.2 precondition** — `is_metric_defined_for_slice` aliases shipped
140
+ ahead of v0.46 (PR #62).
141
+
142
+ ### Protocol stability
143
+
144
+ Tier-2 streak continues: 7 of 7 consecutive minors (v0.40–v0.46) without
145
+ method-shape edits to any existing Tier-2 Protocol. `MetricSpec` is a NEW
146
+ Tier-2 Protocol added at v0.46; freezes at v1.0.
147
+
8
148
  ## [0.45.0] — 2026-05-21 — Stacking: MetaLearner Protocol + LogisticStacker (closes #52)
9
149
 
10
150
  First minor of the staggered v0.45 → v0.46 → v0.47 → v0.48 → v1.0 sequence
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.45.0
3
+ Version: 0.46.1
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -73,4 +73,6 @@ What would have to change for this decision to be reopened?
73
73
 
74
74
  | # | Title | Status | Date |
75
75
  |---|---|---|---|
76
- | _none yet_ | | | |
76
+ | [0001](0001-flat-module-layout.md) | Flat single-file modules through v1.x | Accepted | 2026-05-21 |
77
+ | [0002](0002-scorecard-as-primary-metric-surface.md) | `scorecard()` as the primary v1.0 metric surface | Accepted | 2026-05-21 |
78
+ | [0003](0003-stability-contract-and-gate3-methodology.md) | v1.0 stability contract + Gate 3 methodology | Accepted | 2026-05-21 |
@@ -193,20 +193,18 @@ _EXPORTS: dict[str, str] = {
193
193
  "SINGLE_CLASS_INCOMPATIBLE_METRICS": "eval_toolkit.metrics",
194
194
  "ThresholdResult": "eval_toolkit.metrics",
195
195
  "brier_decomposition": "eval_toolkit.metrics",
196
- "brier_score": "eval_toolkit.metrics",
197
- "expected_calibration_error": "eval_toolkit.metrics",
198
- "expected_calibration_error_debiased": "eval_toolkit.metrics",
199
- "expected_calibration_error_equal_mass": "eval_toolkit.metrics",
200
- "expected_calibration_error_l2": "eval_toolkit.metrics",
201
- "expected_calibration_error_l2_debiased": "eval_toolkit.metrics",
196
+ # `brier_score`, `pr_auc`, `roc_auc`, and the 5 ECE variants removed from
197
+ # `_EXPORTS` at v0.46 (Decision L). They remain reachable at the top
198
+ # level via the `__getattr__` deprecation branch (emits
199
+ # `DeprecationWarning`; branch removed at v0.47) and via the metrics
200
+ # submodule (`from eval_toolkit.metrics import pr_auc` — internal API
201
+ # per ADR 0002, not part of the v1.0 stability contract).
202
202
  "headline_metrics": "eval_toolkit.metrics",
203
203
  "is_metric_defined_for_slice": "eval_toolkit.metrics",
204
204
  "metrics_at_threshold": "eval_toolkit.metrics",
205
- "pr_auc": "eval_toolkit.metrics",
206
205
  "precision_at_prior": "eval_toolkit.metrics",
207
206
  "quantile_stratified_pr_auc": "eval_toolkit.metrics",
208
207
  "quantile_stratified_report": "eval_toolkit.metrics",
209
- "roc_auc": "eval_toolkit.metrics",
210
208
  "score_distribution_summary": "eval_toolkit.metrics",
211
209
  "single_class_threshold_metrics": "eval_toolkit.metrics",
212
210
  "stratified_recall": "eval_toolkit.metrics",
@@ -296,15 +294,65 @@ _EXPORTS: dict[str, str] = {
296
294
  "wilson_interval": "eval_toolkit.thresholds",
297
295
  "LogisticStacker": "eval_toolkit.stacking",
298
296
  "MetaLearner": "eval_toolkit.stacking",
297
+ "MetricResult": "eval_toolkit._scorecard",
298
+ "MetricSpec": "eval_toolkit._scorecard",
299
+ "Scorecard": "eval_toolkit._scorecard",
300
+ "scorecard": "eval_toolkit._scorecard",
299
301
  }
300
302
 
301
303
  __all__ = ["__version__", *_EXPORTS.keys()]
302
304
 
303
305
 
306
+ # ── BEGIN TRANSITIONAL DEPRECATION BRANCH (Decision L; REMOVE AT v0.47) ──
307
+ # At v0.46 the scalar metric functions left the top-level `_EXPORTS` map (above)
308
+ # in favor of the `scorecard()` surface (Decision A). To give the consumer one
309
+ # release of overlap before the hard removal at v0.47, the names below remain
310
+ # reachable via the package-level `__getattr__` (which delegates to the
311
+ # `eval_toolkit.metrics` submodule) but emit a `DeprecationWarning` on every
312
+ # lookup (the value is never cached) pointing at the new API.
313
+ #
314
+ # WHY THIS IS A BRANCH, NOT A REPLACEMENT (Audit F4 — Round 5):
315
+ # `__getattr__` below is the load-bearing lazy export resolver for every name
316
+ # in `_EXPORTS`. The deprecation branch is a discrete `if name in
317
+ # _DEPRECATED_SCALARS` check ABOVE the resolver — the resolver's existing
318
+ # behavior for non-deprecated names is unchanged. At v0.47 we delete this
319
+ # transitional block and the resolver continues to work for every remaining
320
+ # `_EXPORTS` entry.
321
+ _DEPRECATED_SCALARS: frozenset[str] = frozenset(
322
+ {
323
+ "pr_auc",
324
+ "roc_auc",
325
+ "brier_score",
326
+ "expected_calibration_error",
327
+ "expected_calibration_error_debiased",
328
+ "expected_calibration_error_equal_mass",
329
+ "expected_calibration_error_l2",
330
+ "expected_calibration_error_l2_debiased",
331
+ }
332
+ )
333
+ # ── END TRANSITIONAL DEPRECATION BRANCH (Decision L; REMOVE AT v0.47) ──
334
+
335
+
304
336
  def __getattr__(name: str) -> Any:
305
337
  """Resolve public symbols lazily."""
306
338
  if name == "__version__":
307
339
  return __version__
340
+ # ── BEGIN TRANSITIONAL DEPRECATION BRANCH (Decision L; REMOVE AT v0.47) ──
341
+ if name in _DEPRECATED_SCALARS:
342
+ import warnings
343
+
344
+ warnings.warn(
345
+ _deprecation_warning_for(name),
346
+ DeprecationWarning,
347
+ stacklevel=2,
348
+ )
349
+ module = import_module("eval_toolkit.metrics")
350
+ value = getattr(module, name)
351
+ # Do NOT cache in globals() — repeated lookups should keep re-warning
352
+ # (one warning per call site, modulo Python's default
353
+ # DeprecationWarning de-duplication).
354
+ return value
355
+ # ── END TRANSITIONAL DEPRECATION BRANCH (Decision L; REMOVE AT v0.47) ──
308
356
  module_name = _EXPORTS.get(name)
309
357
  if module_name is None:
310
358
  raise AttributeError(f"module 'eval_toolkit' has no attribute {name!r}")
@@ -314,6 +362,113 @@ def __getattr__(name: str) -> Any:
314
362
  return value
315
363
 
316
364
 
365
+ # ── BEGIN TRANSITIONAL DEPRECATION HELPER (Decision L; REMOVE AT v0.47) ──
366
+ #
367
+ # Per Round 6 audit (Codex R6-F2 + Gemini R6-F2; Decisions R6-F + R6-G):
368
+ # - For deprecated scalars with a first-party `metric_specs` equivalent, the
369
+ # warning emits an EXECUTABLE scorecard snippet (factory expression + the
370
+ # correct encoded scorecard key, not the factory call string).
371
+ # - For the 3 ECE variants without a `metric_specs` equivalent
372
+ # (expected_calibration_error_debiased / _l2 / _l2_debiased), the warning
373
+ # instead points at the submodule path per Decision R6-G — no first-party
374
+ # replacement ships in v0.46 (deferred to v1.x if user demand surfaces).
375
+ # - ECE `n_bins=10` preserves the pre-v0.46 default (verified at
376
+ # `metrics.py:730-734`) — Decision R6-F. A migration note explains that
377
+ # the v0.46+ `metric_specs.ece()` factory defaults to `n_bins=15` (matching
378
+ # Hines et al.) and how to opt in.
379
+ _FirstParty = tuple[str, str] # (factory_expression, scorecard_key)
380
+ """Type alias for a deprecated-scalar that has a metric_specs replacement.
381
+
382
+ The factory expression is what the user types after ``metric_specs.``; the
383
+ scorecard key is the literal string that indexes ``Scorecard``.
384
+ """
385
+
386
+
387
+ _FIRST_PARTY_REPLACEMENTS: dict[str, _FirstParty] = {
388
+ "pr_auc": ("pr_auc", "pr_auc"),
389
+ "roc_auc": ("roc_auc", "roc_auc"),
390
+ "brier_score": ("brier", "brier"),
391
+ # ECE variants: use n_bins=10 (pre-v0.46 default per Decision R6-F).
392
+ # The migration note in the warning text explains how to switch to
393
+ # n_bins=15 if the user wants the v0.46+ metric_specs.ece() default.
394
+ "expected_calibration_error": (
395
+ "ece(n_bins=10)",
396
+ "ece_n_bins_10_strategy_uniform",
397
+ ),
398
+ "expected_calibration_error_equal_mass": (
399
+ 'ece(n_bins=10, strategy="quantile")',
400
+ "ece_n_bins_10_strategy_quantile",
401
+ ),
402
+ }
403
+ """Names that have a first-party metric_specs replacement at v0.46.
404
+
405
+ The 3 ECE variants NOT in this map (_debiased, _l2, _l2_debiased) get the
406
+ submodule-path warning template instead (Decision R6-G).
407
+ """
408
+
409
+
410
+ def _deprecation_warning_for(name: str) -> str:
411
+ """Render the DeprecationWarning message for a deprecated scalar name.
412
+
413
+ Branches on whether ``name`` has a first-party `metric_specs` replacement
414
+ (Decision R6-G):
415
+
416
+ - First-party (5 names): scorecard snippet with the correct encoded key
417
+ (Decision R6-F).
418
+ - Submodule-only (3 ECE variants): point at the submodule path per
419
+ Decision R6-G.
420
+
421
+ The first-party variants for ECE include a migration note explaining the
422
+ new ``metric_specs.ece()`` factory default of ``n_bins=15`` so users can
423
+ opt in to the new convention; the snippet itself uses ``n_bins=10`` for
424
+ bit-identical pre-v0.46 math (Decision R6-F).
425
+
426
+ Parameters
427
+ ----------
428
+ name : str
429
+ A name in ``_DEPRECATED_SCALARS``.
430
+
431
+ Returns
432
+ -------
433
+ str
434
+ The warning message, ready to pass to ``warnings.warn``.
435
+ """
436
+ first_party = _FIRST_PARTY_REPLACEMENTS.get(name)
437
+ if first_party is not None:
438
+ factory_expr, scorecard_key = first_party
439
+ msg = (
440
+ f"eval_toolkit.{name} is deprecated and will be removed in v0.47. "
441
+ f"For the same math, use:\n"
442
+ f" scorecard(y, s, metrics=[metric_specs.{factory_expr}])"
443
+ f'["{scorecard_key}"].value\n'
444
+ f"Or import from the eval_toolkit.metrics submodule directly "
445
+ f"(internal API per ADR 0002 — stable across v1.x, subject to "
446
+ f"refactor in major versions)."
447
+ )
448
+ # ECE-specific migration note about the n_bins default change.
449
+ if name.startswith("expected_calibration_error"):
450
+ msg += (
451
+ "\nNote: the v0.46+ metric_specs.ece() factory defaults to "
452
+ "n_bins=15 (matching Hines et al.); the n_bins=10 in this "
453
+ "snippet preserves the pre-v0.46 math. Pass n_bins=15 to use "
454
+ "the new convention."
455
+ )
456
+ return msg
457
+ # Decision R6-G: 3 ECE variants without first-party replacements →
458
+ # submodule path only.
459
+ return (
460
+ f"eval_toolkit.{name} is deprecated and will be removed in v0.47. "
461
+ f"This variant is NOT in v0.46+ metric_specs. Use:\n"
462
+ f" from eval_toolkit.metrics import {name}\n"
463
+ f"(internal API per ADR 0002 — stable across v1.x, subject to "
464
+ f"refactor in major versions). Or contribute the variant to "
465
+ f"metric_specs if you use it regularly."
466
+ )
467
+
468
+
469
+ # ── END TRANSITIONAL DEPRECATION HELPER (Decision L; REMOVE AT v0.47) ──
470
+
471
+
317
472
  def __dir__() -> list[str]:
318
473
  """Expose lazy public symbols to introspection."""
319
474
  return sorted(__all__)