eval-toolkit 0.46.0__tar.gz → 0.46.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178)
  1. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/.gitignore +10 -0
  2. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/CHANGELOG.md +65 -0
  3. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/PKG-INFO +1 -1
  4. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/docs/source/adr/README.md +3 -1
  5. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/__init__.py +100 -19
  6. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/_version.py +1 -1
  7. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/metric_specs.py +35 -0
  8. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/golden/public_api/snapshot.json +1 -1
  9. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_deprecated_scalars_shim.py +157 -7
  10. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_scorecard.py +58 -13
  11. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/LICENSE +0 -0
  12. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/README.md +0 -0
  13. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/STYLE.md +0 -0
  14. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/docs/archive/README.md +0 -0
  15. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/docs/research/README.md +0 -0
  16. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/docs/research/datasets/README.md +0 -0
  17. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/docs/research/papers/data-integrity/README.md +0 -0
  18. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/docs/research/papers/eval-ecosystem/README.md +0 -0
  19. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/docs/research/papers/inference/README.md +0 -0
  20. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/docs/research/papers/prompt-injection/README.md +0 -0
  21. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/docs/source/methodology/README.md +0 -0
  22. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/pyproject.toml +0 -0
  23. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/__main__.py +0 -0
  24. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/_deprecated.py +0 -0
  25. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/_parallel.py +0 -0
  26. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/_scorecard.py +0 -0
  27. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/adversarial.py +0 -0
  28. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/analysis.py +0 -0
  29. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/artifacts.py +0 -0
  30. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/bootstrap.py +0 -0
  31. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/calibration.py +0 -0
  32. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/claims.py +0 -0
  33. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/config.py +0 -0
  34. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/docs.py +0 -0
  35. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/embeddings.py +0 -0
  36. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/evidence.py +0 -0
  37. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/harness.py +0 -0
  38. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/leakage.py +0 -0
  39. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/loaders.py +0 -0
  40. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/losses.py +0 -0
  41. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/manifest.py +0 -0
  42. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/metrics.py +0 -0
  43. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/operating_points.py +0 -0
  44. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/paths.py +0 -0
  45. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/plotting.py +0 -0
  46. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/preprocessing.py +0 -0
  47. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/probes.py +0 -0
  48. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/protocols.py +0 -0
  49. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/provenance.py +0 -0
  50. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/py.typed +0 -0
  51. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  52. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  53. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  54. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  55. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/schemas/results.v1.json +0 -0
  56. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  57. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/seeds.py +0 -0
  58. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/splits.py +0 -0
  59. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/stacking.py +0 -0
  60. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/text_dedup.py +0 -0
  61. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/src/eval_toolkit/thresholds.py +0 -0
  62. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  63. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  64. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  65. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  66. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  67. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  68. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  69. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  70. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  71. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  72. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/benchmarks/__init__.py +0 -0
  73. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  74. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/conftest.py +0 -0
  75. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/golden/bootstrap_ci/cases.json +0 -0
  76. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/golden/data/dedup_holdout.jsonl +0 -0
  77. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/golden/data/dedup_holdout_expected.json +0 -0
  78. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  79. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/golden/docs/expected.md +0 -0
  80. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/golden/docs/input.md +0 -0
  81. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/golden/docs/metrics.json +0 -0
  82. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  83. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/strategies.py +0 -0
  84. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_adversarial.py +0 -0
  85. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_analysis.py +0 -0
  86. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_artifacts.py +0 -0
  87. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_block_bootstrap_on_folds.py +0 -0
  88. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_bootstrap_calibration_mc.py +0 -0
  89. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_bootstrap_edge_cases.py +0 -0
  90. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_bootstrap_golden.py +0 -0
  91. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_bootstrap_njobs.py +0 -0
  92. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_bootstrap_props.py +0 -0
  93. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_bootstrap_research_grounded.py +0 -0
  94. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_bootstrap_unit.py +0 -0
  95. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_calibration_binary_adapters.py +0 -0
  96. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_calibration_bootstrap_chain.py +0 -0
  97. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_calibration_determinism.py +0 -0
  98. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_calibration_optimization_failures.py +0 -0
  99. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_calibration_props.py +0 -0
  100. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_calibration_research_grounded.py +0 -0
  101. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_calibration_unit.py +0 -0
  102. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_claims.py +0 -0
  103. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_claims_coverage.py +0 -0
  104. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_claims_props.py +0 -0
  105. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_cli.py +0 -0
  106. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_config.py +0 -0
  107. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_coverage_bootstrap.py +0 -0
  108. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_coverage_calibration.py +0 -0
  109. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_coverage_harness.py +0 -0
  110. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_coverage_metrics.py +0 -0
  111. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_coverage_plotting.py +0 -0
  112. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_croissant_e2e.py +0 -0
  113. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_dedup_split_leakage_chain.py +0 -0
  114. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_deprecations.py +0 -0
  115. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_docs_golden.py +0 -0
  116. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_docs_props.py +0 -0
  117. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_embeddings.py +0 -0
  118. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_evidence_validators.py +0 -0
  119. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_harness_edge_cases.py +0 -0
  120. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_harness_fault_injection.py +0 -0
  121. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_harness_folded.py +0 -0
  122. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_harness_internals.py +0 -0
  123. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_harness_metric_options.py +0 -0
  124. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_harness_parallelism.py +0 -0
  125. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_harness_smoke.py +0 -0
  126. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_import_boundaries.py +0 -0
  127. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_is_metric_defined_for_slice.py +0 -0
  128. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_leakage.py +0 -0
  129. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_leakage_error_paths.py +0 -0
  130. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_leakage_props.py +0 -0
  131. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_loaders.py +0 -0
  132. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_loaders_coverage.py +0 -0
  133. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_loaders_props.py +0 -0
  134. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_logging.py +0 -0
  135. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_losses.py +0 -0
  136. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_manifest.py +0 -0
  137. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_manifest_contamination_round_trip.py +0 -0
  138. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_manifest_props.py +0 -0
  139. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_manifest_validation.py +0 -0
  140. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_metrics_props.py +0 -0
  141. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_metrics_stratified_subsets.py +0 -0
  142. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_metrics_unit.py +0 -0
  143. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_misc_coverage.py +0 -0
  144. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_numeric_edge_cases.py +0 -0
  145. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_ood_loader.py +0 -0
  146. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_operating_points.py +0 -0
  147. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_operating_points_props.py +0 -0
  148. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_parallel.py +0 -0
  149. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_paths.py +0 -0
  150. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_pipeline_e2e.py +0 -0
  151. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_plotting_edge.py +0 -0
  152. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_plotting_smoke.py +0 -0
  153. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_plotting_visual.py +0 -0
  154. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_preprocessing.py +0 -0
  155. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_probes.py +0 -0
  156. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_protocol_conformance.py +0 -0
  157. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_provenance.py +0 -0
  158. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_public_api.py +0 -0
  159. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_recall_at_fpr.py +0 -0
  160. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_reference_equivalence.py +0 -0
  161. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_reproducibility_integration.py +0 -0
  162. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_schemas.py +0 -0
  163. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_seeds.py +0 -0
  164. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_splits.py +0 -0
  165. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_splits_leakage_integration.py +0 -0
  166. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_splits_props.py +0 -0
  167. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_stacking.py +0 -0
  168. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_text_dedup.py +0 -0
  169. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_text_dedup_coverage.py +0 -0
  170. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_text_dedup_props.py +0 -0
  171. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_text_dedup_strategies.py +0 -0
  172. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_thresholds.py +0 -0
  173. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_thresholds_constant_score.py +0 -0
  174. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_thresholds_coverage.py +0 -0
  175. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_thresholds_props.py +0 -0
  176. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_thresholds_research_grounded.py +0 -0
  177. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_tokenization_leakage_check.py +0 -0
  178. {eval_toolkit-0.46.0 → eval_toolkit-0.46.1}/tests/test_v09_contracts.py +0 -0
@@ -45,6 +45,16 @@ coverage.json
45
45
  # Mutation-testing output (mutmut / cargo-mutants — local run artifacts)
46
46
  mutants/
47
47
 
48
+ # Local audit artifacts (Round 5+ Gate 3 LLM cross-review packets + reports).
49
+ # The canonical prompt lives at ~/.claude/plans/gate3-audit-prompt.md and the
50
+ # canonical findings ledger lives at docs/source/audit_findings.md; per-run
51
+ # raw model outputs are author-local working copies.
52
+ # Tracked: per-round briefing files (`gate3-audit-round-<N>.md`).
53
+ # Untracked: prompt template, generic report, per-round report files.
54
+ gate3-audit-prompt.md
55
+ gate3-audit-report.md
56
+ gate3-audit-round-*-report.md
57
+
48
58
  # Claude Code project settings (machine-local)
49
59
  .claude/
50
60
 
@@ -5,6 +5,71 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.46.1] — 2026-05-21 — Round 6 hotfix: ECE strategy validation + deprecation warning content
9
+
10
+ Hotfix release per **Decision Q** (data correctness regression + time-sensitive
11
+ warning content) + **Decision R6-E** (scope: R6-F1 + R6-F2 only; R6-A docstring
12
+ rolls forward to v0.47). All other Round 6 findings dispositioned to v0.47.0.
13
+
14
+ See [`docs/source/audit_findings.md`](docs/source/audit_findings.md) Round 6 for
15
+ the full disposition ledger.
16
+
17
+ ### Fixed
18
+
19
+ - **`metric_specs.ece(strategy=<value>)` strategy validation** (Round 6 Codex
20
+ R6-F1). Prior to v0.46.1, an invalid strategy string (e.g.
21
+ `metric_specs.ece(strategy="typo")`) silently dispatched to quantile ECE and
22
+ returned a `scorecard()` cell with `status="ok"` under an invalid encoded key
23
+ (`"ece_n_bins_15_strategy_typo"`) — a silent data-correctness bug.
24
+ Verified by Codex via runtime probe. Now both the `ece()` factory and
25
+ `_EceSpec.compute()` raise:
26
+ ```
27
+ ValueError: ECE strategy must be 'uniform' or 'quantile'; got 'typo'
28
+ ```
29
+ Defence-in-depth: the factory validates eagerly (before LRU cache hit) AND
30
+ `compute()` validates at the compute boundary so direct construction of
31
+ `_EceSpec(strategy="typo")` (bypassing the factory) also raises.
32
+
33
+ - **Deprecation warning content for all 5 ECE variants** (Round 6 Codex R6-F2 +
34
+ Gemini R6-F2, with Decisions R6-F + R6-G). The v0.46.0 `__getattr__`
35
+ deprecation shim's warning messages produced broken migration snippets:
36
+ - For `expected_calibration_error` + `expected_calibration_error_equal_mass`:
37
+ the suggested `Scorecard` lookup key was the factory-call expression
38
+ (`"ece(n_bins=10)"`) instead of the encoded spec name
39
+ (`"ece_n_bins_10_strategy_uniform"`). Now uses the correct encoded key.
40
+ - For `expected_calibration_error_debiased` / `_l2` / `_l2_debiased`: these
41
+ variants are not in the v0.46 `metric_specs` namespace (Decision R6-G;
42
+ research-completeness primitives, deferred to v1.x if user demand
43
+ surfaces). Their warnings now point at the submodule path
44
+ (`from eval_toolkit.metrics import expected_calibration_error_debiased`)
45
+ instead of an unconstructable scorecard snippet.
46
+ - Pre-v0.46 default verification: Gemini's report claimed
47
+ `expected_calibration_error` defaulted to `n_bins=15`; verified against
48
+ `metrics.py:730-734` that the actual default is `n_bins=10`. Per Decision
49
+ R6-F, warning snippets use `n_bins=10` to preserve bit-identical pre-v0.46
50
+ math + add a migration note explaining the new `metric_specs.ece()` factory
51
+ default of `n_bins=15` (matching Hines et al.).
52
+
53
+ ### Tests
54
+
55
+ - `tests/test_scorecard.py`: 4 new tests for ECE strategy validation
56
+ (parametrized factory-rejection + compute-defence-in-depth).
57
+ - `tests/test_deprecated_scalars_shim.py`: 4 new test classes — verify each
58
+ warning contains correct factory expression + encoded scorecard key, ECE
59
+ warnings carry the n_bins=10/15 migration note, submodule-only warnings cite
60
+ `eval_toolkit.metrics` path, and the snippet in each first-party warning is
61
+ EXECUTABLE (parses + runs against synthetic data + produces ok-status cell).
62
+
63
+ ### Rolled forward to v0.47 (Decision R6-E)
64
+
65
+ - R6-A `seed=None` docstring fix (non-blocker per Decision Q).
66
+ - R6-F3 duplicate `MetricSpec.name` rejection.
67
+ - R6-F5 (Codex) Protocol method-shape drift guard.
68
+ - R6-F3 (Gemini) `Scorecard.to_pandas()` schema expansion.
69
+ - R6-F4 (Gemini) `make_spec_name()` helper.
70
+ - R6-F5 (Gemini) narrow `_evaluate_spec()` exception catch.
71
+ - R6-F6 (Codex) plan + roadmap state-drift refresh.
72
+
8
73
  ## [0.46.0] — 2026-05-21 — Scorecard: primary v1.0 metric surface (closes #36)
9
74
 
10
75
  Second minor of the staggered v0.45 → v0.46 → v0.47 → v0.48 → v1.0 sequence.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.46.0
3
+ Version: 0.46.1
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -73,4 +73,6 @@ What would have to change for this decision to be reopened?
73
73
 
74
74
  | # | Title | Status | Date |
75
75
  |---|---|---|---|
76
- | _none yet_ | | | |
76
+ | [0001](0001-flat-module-layout.md) | Flat single-file modules through v1.x | Accepted | 2026-05-21 |
77
+ | [0002](0002-scorecard-as-primary-metric-surface.md) | `scorecard()` as the primary v1.0 metric surface | Accepted | 2026-05-21 |
78
+ | [0003](0003-stability-contract-and-gate3-methodology.md) | v1.0 stability contract + Gate 3 methodology | Accepted | 2026-05-21 |
@@ -342,10 +342,7 @@ def __getattr__(name: str) -> Any:
342
342
  import warnings
343
343
 
344
344
  warnings.warn(
345
- f"eval_toolkit.{name} is deprecated and will be removed in v0.47. "
346
- f"Use `scorecard(y, s, metrics=[metric_specs.{_scorecard_spec_for(name)}])"
347
- f'["{_scorecard_spec_for(name)}"].value` instead, or "import from the'
348
- f" `eval_toolkit.metrics` submodule directly (internal API).",
345
+ _deprecation_warning_for(name),
349
346
  DeprecationWarning,
350
347
  stacklevel=2,
351
348
  )
@@ -366,23 +363,107 @@ def __getattr__(name: str) -> Any:
366
363
 
367
364
 
368
365
  # ── BEGIN TRANSITIONAL DEPRECATION HELPER (Decision L; REMOVE AT v0.47) ──
369
- def _scorecard_spec_for(deprecated_name: str) -> str:
370
- """Map a deprecated-scalar name to its `metric_specs` replacement name.
366
+ #
367
+ # Per Round 6 audit (Codex R6-F2 + Gemini R6-F2; Decisions R6-F + R6-G):
368
+ # - For deprecated scalars with a first-party `metric_specs` equivalent, the
369
+ # warning emits an EXECUTABLE scorecard snippet (factory expression + the
370
+ # correct encoded scorecard key, not the factory call string).
371
+ # - For the 3 ECE variants without a `metric_specs` equivalent
372
+ # (expected_calibration_error_debiased / _l2 / _l2_debiased), the warning
373
+ # instead points at the submodule path per Decision R6-G — no first-party
374
+ # replacement is shipped at v0.47.
375
+ # - ECE `n_bins=10` preserves the pre-v0.46 default (verified at
376
+ # `metrics.py:730-734`) — Decision R6-F. A migration note explains that
377
+ # the v0.46+ `metric_specs.ece()` factory defaults to `n_bins=15` (matching
378
+ # Hines et al.) and how to opt in.
379
+ _FirstParty = tuple[str, str] # (factory_expression, scorecard_key)
380
+ """Type alias for a deprecated-scalar that has a metric_specs replacement.
381
+
382
+ The factory expression is what the user types after ``metric_specs.``; the
383
+ scorecard key is the literal string that indexes ``Scorecard``.
384
+ """
385
+
386
+
387
+ _FIRST_PARTY_REPLACEMENTS: dict[str, _FirstParty] = {
388
+ "pr_auc": ("pr_auc", "pr_auc"),
389
+ "roc_auc": ("roc_auc", "roc_auc"),
390
+ "brier_score": ("brier", "brier"),
391
+ # ECE variants: use n_bins=10 (pre-v0.46 default per Decision R6-F).
392
+ # The migration note in the warning text explains how to switch to
393
+ # n_bins=15 if the user wants the v0.46+ metric_specs.ece() default.
394
+ "expected_calibration_error": (
395
+ "ece(n_bins=10)",
396
+ "ece_n_bins_10_strategy_uniform",
397
+ ),
398
+ "expected_calibration_error_equal_mass": (
399
+ 'ece(n_bins=10, strategy="quantile")',
400
+ "ece_n_bins_10_strategy_quantile",
401
+ ),
402
+ }
403
+ """Names that have a first-party metric_specs replacement at v0.46.
404
+
405
+ The 3 ECE variants NOT in this map (_debiased, _l2, _l2_debiased) get the
406
+ submodule-path warning template instead (Decision R6-G).
407
+ """
371
408
 
372
- Used only inside the v0.46 deprecation warning message. Returns the
373
- closest equivalent first-party spec name where one exists; falls back
374
- to the original name for ECE variants whose exact-match spec isn't in
375
- the v0.46 first-party namespace (e.g., the L2 / debiased variants —
376
- callers either implement a custom `MetricSpec` or stay on the
377
- submodule path).
409
+
410
+ def _deprecation_warning_for(name: str) -> str:
411
+ """Render the DeprecationWarning message for a deprecated scalar name.
412
+
413
+ Branches on whether ``name`` has a first-party `metric_specs` replacement
414
+ (Decision R6-G):
415
+
416
+ - First-party (5 names): scorecard snippet with the correct encoded key
417
+ (Decision R6-F).
418
+ - Submodule-only (3 ECE variants): point at the submodule path per
419
+ Decision R6-G.
420
+
421
+ The first-party variants for ECE include a migration note explaining the
422
+ new ``metric_specs.ece()`` factory default of ``n_bins=15`` so users can
423
+ opt in to the new convention; the snippet itself uses ``n_bins=10`` for
424
+ bit-identical pre-v0.46 math (Decision R6-F).
425
+
426
+ Parameters
427
+ ----------
428
+ name : str
429
+ A name in ``_DEPRECATED_SCALARS``.
430
+
431
+ Returns
432
+ -------
433
+ str
434
+ The warning message, ready to pass to ``warnings.warn``.
378
435
  """
379
- return {
380
- "pr_auc": "pr_auc",
381
- "roc_auc": "roc_auc",
382
- "brier_score": "brier",
383
- "expected_calibration_error": "ece(n_bins=10)",
384
- "expected_calibration_error_equal_mass": 'ece(n_bins=10, strategy="quantile")',
385
- }.get(deprecated_name, deprecated_name)
436
+ first_party = _FIRST_PARTY_REPLACEMENTS.get(name)
437
+ if first_party is not None:
438
+ factory_expr, scorecard_key = first_party
439
+ msg = (
440
+ f"eval_toolkit.{name} is deprecated and will be removed in v0.47. "
441
+ f"For the same math, use:\n"
442
+ f" scorecard(y, s, metrics=[metric_specs.{factory_expr}])"
443
+ f'["{scorecard_key}"].value\n'
444
+ f"Or import from the eval_toolkit.metrics submodule directly "
445
+ f"(internal API per ADR 0002 — stable across v1.x, subject to "
446
+ f"refactor in major versions)."
447
+ )
448
+ # ECE-specific migration note about the n_bins default change.
449
+ if name.startswith("expected_calibration_error"):
450
+ msg += (
451
+ "\nNote: the v0.46+ metric_specs.ece() factory defaults to "
452
+ "n_bins=15 (matching Hines et al.); the n_bins=10 in this "
453
+ "snippet preserves the pre-v0.46 math. Pass n_bins=15 to use "
454
+ "the new convention."
455
+ )
456
+ return msg
457
+ # Decision R6-G: 3 ECE variants without first-party replacements →
458
+ # submodule path only.
459
+ return (
460
+ f"eval_toolkit.{name} is deprecated and will be removed in v0.47. "
461
+ f"This variant is NOT in v0.46+ metric_specs. Use:\n"
462
+ f" from eval_toolkit.metrics import {name}\n"
463
+ f"(internal API per ADR 0002 — stable across v1.x, subject to "
464
+ f"refactor in major versions). Or contribute the variant to "
465
+ f"metric_specs if you use it regularly."
466
+ )
386
467
 
387
468
 
388
469
  # ── END TRANSITIONAL DEPRECATION HELPER (Decision L; REMOVE AT v0.47) ──
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "0.46.0"
5
+ __version__ = "0.46.1"
@@ -118,6 +118,23 @@ brier: MetricSpec = _BrierSpec()
118
118
  # ─────────────────────────────────────────────────────────────────────────────
119
119
 
120
120
 
121
+ # Valid strategy values for ECE specs. Locked at v0.46.1 to prevent the
122
+ # Round 6 R6-F1 footgun where `ece(strategy="typo")` silently dispatched to
123
+ # quantile ECE and returned a scorecard cell with status="ok" under an
124
+ # invalid key. See `docs/source/audit_findings.md` Round 6.
125
+ _ECE_VALID_STRATEGIES: frozenset[str] = frozenset({"uniform", "quantile"})
126
+
127
+
128
+ def _validate_ece_strategy(strategy: str) -> None:
129
+ """Validate ECE strategy value; raise ValueError with context if invalid.
130
+
131
+ Shared between the factory (eager validation) and ``_EceSpec.compute`` (defence in
132
+ depth for direct construction paths that bypass the factory).
133
+ """
134
+ if strategy not in _ECE_VALID_STRATEGIES:
135
+ raise ValueError(f"ECE strategy must be 'uniform' or 'quantile'; got {strategy!r}")
136
+
137
+
121
138
  @dataclass(frozen=True, slots=True)
122
139
  class _EceSpec:
123
140
  """Internal :class:`MetricSpec` for expected calibration error.
@@ -135,6 +152,10 @@ class _EceSpec:
135
152
  return f"ece_n_bins_{self.n_bins}_strategy_{self.strategy}"
136
153
 
137
154
  def compute(self, y_true: np.ndarray, y_score: np.ndarray) -> float:
155
+ # Defence-in-depth strategy validation — the factory validates first,
156
+ # but a caller bypassing the factory and constructing `_EceSpec` directly
157
+ # would otherwise produce a wrong-metric scorecard cell silently.
158
+ _validate_ece_strategy(self.strategy)
138
159
  if self.strategy == "uniform":
139
160
  return float(_ece_uniform(y_true, y_score, n_bins=self.n_bins))
140
161
  return float(_ece_equal_mass(y_true, y_score, n_bins=self.n_bins))
@@ -178,5 +199,19 @@ def ece(*, n_bins: int = 15, strategy: ECEStrategy = "uniform") -> MetricSpec:
178
199
  'ece_n_bins_15_strategy_uniform'
179
200
  >>> ece(n_bins=10, strategy="quantile").name
180
201
  'ece_n_bins_10_strategy_quantile'
202
+
203
+ Invalid strategies raise ``ValueError`` eagerly (v0.46.1+; Round 6 R6-F1
204
+ fix — prior to v0.46.1 this silently dispatched to quantile ECE):
205
+
206
+ >>> ece(strategy="typo")
207
+ Traceback (most recent call last):
208
+ ...
209
+ ValueError: ECE strategy must be 'uniform' or 'quantile'; got 'typo'
210
+
211
+ Raises
212
+ ------
213
+ ValueError
214
+ If ``strategy`` is not in ``{"uniform", "quantile"}``.
181
215
  """
216
+ _validate_ece_strategy(strategy)
182
217
  return _EceSpec(n_bins=n_bins, strategy=strategy)
@@ -1192,7 +1192,7 @@
1192
1192
  "doc_first_line": "str(object='') -> str",
1193
1193
  "kind": "value",
1194
1194
  "type": "str",
1195
- "value": "'0.46.0'"
1195
+ "value": "'0.46.1'"
1196
1196
  },
1197
1197
  "apply_operating_points": {
1198
1198
  "doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
@@ -58,7 +58,13 @@ def test_deprecated_names_not_in_exports(name: str) -> None:
58
58
  @pytest.mark.unit
59
59
  @pytest.mark.parametrize("name", sorted(DEPRECATED_SCALARS))
60
60
  def test_deprecated_name_emits_warning(name: str) -> None:
61
- """Looking up a deprecated name at the top level emits DeprecationWarning."""
61
+ """Looking up a deprecated name at the top level emits DeprecationWarning.
62
+
63
+ Updated v0.46.1 per Decision R6-G: the 3 ECE variants without first-party
64
+ `metric_specs` equivalents point at the submodule path
65
+ (`from eval_toolkit.metrics import ...`) rather than a scorecard snippet.
66
+ The other 5 first-party-replaceable names use the scorecard snippet.
67
+ """
62
68
  with warnings.catch_warnings(record=True) as caught:
63
69
  warnings.simplefilter("always")
64
70
  _ = getattr(eval_toolkit, name)
@@ -66,9 +72,16 @@ def test_deprecated_name_emits_warning(name: str) -> None:
66
72
  assert (
67
73
  len(deprecations) >= 1
68
74
  ), f"expected DeprecationWarning for {name}; got {[w.category.__name__ for w in caught]}"
69
- assert name in str(deprecations[0].message)
70
- assert "v0.47" in str(deprecations[0].message)
71
- assert "scorecard" in str(deprecations[0].message)
75
+ msg = str(deprecations[0].message)
76
+ # Universal assertions for ALL deprecated names:
77
+ assert name in msg
78
+ assert "v0.47" in msg
79
+ # Per-name-class assertions: scorecard for first-party, submodule for the rest.
80
+ if name in _EXPECTED_SUBMODULE_ONLY:
81
+ assert "eval_toolkit.metrics" in msg
82
+ assert "NOT in v0.46+ metric_specs" in msg
83
+ else:
84
+ assert "scorecard" in msg
72
85
 
73
86
 
74
87
  @pytest.mark.unit
@@ -76,7 +89,7 @@ def test_deprecated_pr_auc_still_functional() -> None:
76
89
  """The returned function still works — only the WAY it's imported is deprecated."""
77
90
  with warnings.catch_warnings():
78
91
  warnings.simplefilter("ignore", DeprecationWarning)
79
- pr_auc = eval_toolkit.pr_auc # type: ignore[attr-defined]
92
+ pr_auc = eval_toolkit.pr_auc
80
93
  y = np.array([0, 1, 0, 1, 1, 0, 1, 0])
81
94
  s = np.array([0.2, 0.8, 0.3, 0.7, 0.9, 0.1, 0.6, 0.4])
82
95
  assert 0.0 <= pr_auc(y, s) <= 1.0
@@ -86,7 +99,7 @@ def test_deprecated_pr_auc_still_functional() -> None:
86
99
  def test_deprecated_brier_score_still_functional() -> None:
87
100
  with warnings.catch_warnings():
88
101
  warnings.simplefilter("ignore", DeprecationWarning)
89
- brier_score = eval_toolkit.brier_score # type: ignore[attr-defined]
102
+ brier_score = eval_toolkit.brier_score
90
103
  y = np.array([0, 1, 0, 1])
91
104
  s = np.array([0.1, 0.9, 0.2, 0.8])
92
105
  assert 0.0 <= brier_score(y, s) <= 1.0
@@ -170,7 +183,7 @@ def test_full_all_resolves_without_attribute_error() -> None:
170
183
  def test_unknown_name_still_raises_attribute_error() -> None:
171
184
  """The deprecation branch must not swallow unknown-name errors."""
172
185
  with pytest.raises(AttributeError, match="no attribute"):
173
- _ = eval_toolkit.nonexistent_symbol_xyz # type: ignore[attr-defined]
186
+ _ = eval_toolkit.nonexistent_symbol_xyz
174
187
 
175
188
 
176
189
  # ─────────────────────────────────────────────────────────────────────────────
@@ -182,3 +195,140 @@ def test_unknown_name_still_raises_attribute_error() -> None:
182
195
  def test_deprecated_scalars_set_matches() -> None:
183
196
  """The internal `_DEPRECATED_SCALARS` set lines up with this test's expectations."""
184
197
  assert eval_toolkit._DEPRECATED_SCALARS == DEPRECATED_SCALARS
198
+
199
+
200
+ # ─────────────────────────────────────────────────────────────────────────────
201
+ # v0.46.1 — Round 6 R6-F2 + R6-F + R6-G: warning snippet content & executability
202
+ # ─────────────────────────────────────────────────────────────────────────────
203
+
204
+
205
+ # First-party replacements that should appear in warning snippets verbatim.
206
+ # (factory_expression, scorecard_key) per deprecated name. Matches
207
+ # `eval_toolkit._FIRST_PARTY_REPLACEMENTS`.
208
+ _EXPECTED_FIRST_PARTY: dict[str, tuple[str, str]] = {
209
+ "pr_auc": ("pr_auc", "pr_auc"),
210
+ "roc_auc": ("roc_auc", "roc_auc"),
211
+ "brier_score": ("brier", "brier"),
212
+ "expected_calibration_error": ("ece(n_bins=10)", "ece_n_bins_10_strategy_uniform"),
213
+ "expected_calibration_error_equal_mass": (
214
+ 'ece(n_bins=10, strategy="quantile")',
215
+ "ece_n_bins_10_strategy_quantile",
216
+ ),
217
+ }
218
+
219
+ # ECE variants without first-party metric_specs equivalents (Decision R6-G).
220
+ _EXPECTED_SUBMODULE_ONLY: frozenset[str] = frozenset(
221
+ {
222
+ "expected_calibration_error_debiased",
223
+ "expected_calibration_error_l2",
224
+ "expected_calibration_error_l2_debiased",
225
+ }
226
+ )
227
+
228
+
229
+ def _capture_warning_message(name: str) -> str:
230
+ """Trigger the deprecation shim for `name` and return the rendered message."""
231
+ with warnings.catch_warnings(record=True) as caught:
232
+ warnings.simplefilter("always")
233
+ getattr(eval_toolkit, name)
234
+ deprecations = [w for w in caught if issubclass(w.category, DeprecationWarning)]
235
+ assert deprecations, f"no DeprecationWarning emitted for {name}"
236
+ return str(deprecations[0].message)
237
+
238
+
239
+ @pytest.mark.unit
240
+ @pytest.mark.parametrize("name", sorted(_EXPECTED_FIRST_PARTY))
241
+ def test_first_party_warning_contains_correct_snippet(name: str) -> None:
242
+ """First-party replacements emit scorecard snippet with the encoded key.
243
+
244
+ Round 6 R6-F2: prior warnings used the factory-call expression
245
+ (e.g. ``"ece(n_bins=10)"``) as the scorecard lookup key. The shipped
246
+ Scorecard is a Mapping keyed by the encoded spec name
247
+ (e.g. ``"ece_n_bins_10_strategy_uniform"``). The v0.46.1 fix uses the
248
+ correct encoded key inline so blindly-copied snippets actually work.
249
+ """
250
+ factory_expr, scorecard_key = _EXPECTED_FIRST_PARTY[name]
251
+ msg = _capture_warning_message(name)
252
+ assert f"metric_specs.{factory_expr}" in msg
253
+ assert f'["{scorecard_key}"]' in msg
254
+
255
+
256
+ @pytest.mark.unit
257
+ def test_ece_first_party_warnings_carry_n_bins_10_migration_note() -> None:
258
+ """ECE first-party warnings preserve pre-v0.46 default (n_bins=10) + nudge.
259
+
260
+ Per Decision R6-F: pre-v0.46 `expected_calibration_error` defaulted to
261
+ n_bins=10; v0.46+ `metric_specs.ece()` defaults to n_bins=15. The
262
+ warning snippet uses n_bins=10 for bit-identical math; an appended note
263
+ explains the new convention.
264
+ """
265
+ for name in ("expected_calibration_error", "expected_calibration_error_equal_mass"):
266
+ msg = _capture_warning_message(name)
267
+ assert "n_bins=10" in msg
268
+ # Migration note about the new default:
269
+ assert "n_bins=15" in msg
270
+ assert "Hines" in msg or "new convention" in msg
271
+
272
+
273
+ @pytest.mark.unit
274
+ @pytest.mark.parametrize("name", sorted(_EXPECTED_SUBMODULE_ONLY))
275
+ def test_submodule_only_warning_points_at_submodule_path(name: str) -> None:
276
+ """The 3 ECE variants without first-party specs route users to the submodule.
277
+
278
+ Per Decision R6-G: `expected_calibration_error_debiased` / `_l2` /
279
+ `_l2_debiased` are research-completeness primitives without
280
+ `metric_specs` equivalents at v0.46. Their warnings cite
281
+ `eval_toolkit.metrics.<name>` rather than a scorecard snippet.
282
+ """
283
+ msg = _capture_warning_message(name)
284
+ assert f"from eval_toolkit.metrics import {name}" in msg
285
+ assert "NOT in v0.46+ metric_specs" in msg
286
+
287
+
288
+ @pytest.mark.unit
289
+ @pytest.mark.parametrize("name", sorted(_EXPECTED_FIRST_PARTY))
290
+ def test_first_party_warning_snippet_is_executable(name: str) -> None:
291
+ """The scorecard snippet in the warning produces a usable MetricResult.
292
+
293
+ Parses the snippet, executes it against a synthetic balanced slice, and
294
+ asserts that the resulting `MetricResult` has `status="ok"` and finite
295
+ `value`. This is the user-facing migration contract: copy the snippet,
296
+ run it, get a number.
297
+ """
298
+
299
+ from eval_toolkit import metric_specs as ms
300
+ from eval_toolkit import scorecard # noqa: F401
301
+
302
+ msg = _capture_warning_message(name)
303
+ factory_expr, scorecard_key = _EXPECTED_FIRST_PARTY[name]
304
+
305
+ # Build the snippet that the warning instructs the user to use:
306
+ # scorecard(y, s, metrics=[metric_specs.<factory_expr>])["<key>"].value
307
+ rng = np.random.default_rng(0)
308
+ y = rng.integers(0, 2, 200)
309
+ s = rng.random(200)
310
+
311
+ snippet = (
312
+ f"scorecard(y, s, metrics=[ms.{factory_expr}], bootstrap=False)" f'["{scorecard_key}"]'
313
+ )
314
+ # Confirm the warning actually contains the snippet shape it promises:
315
+ assert f"metric_specs.{factory_expr}" in msg
316
+ # Evaluate (safe — we constructed factory_expr from the known mapping):
317
+ cell = eval(snippet, {"scorecard": scorecard, "ms": ms, "y": y, "s": s}) # noqa: S307
318
+ assert cell.status == "ok", f"snippet for {name}: {cell.status} (reason: {cell.reason})"
319
+ assert cell.value is not None
320
+ assert isinstance(cell.value, float)
321
+
322
+
323
+ @pytest.mark.unit
324
+ @pytest.mark.parametrize("name", sorted(_EXPECTED_SUBMODULE_ONLY))
325
+ def test_submodule_only_snippet_is_importable(name: str) -> None:
326
+ """The submodule-import snippet in the warning actually imports something callable."""
327
+ import importlib
328
+
329
+ metrics_mod = importlib.import_module("eval_toolkit.metrics")
330
+ assert hasattr(metrics_mod, name), (
331
+ f"warning for {name} promises `from eval_toolkit.metrics import {name}` "
332
+ f"but the symbol isn't present in the submodule"
333
+ )
334
+ assert callable(getattr(metrics_mod, name))