eval-toolkit 0.46.1__tar.gz → 0.48.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183)
  1. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/CHANGELOG.md +224 -0
  2. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/PKG-INFO +6 -3
  3. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/README.md +5 -2
  4. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/__init__.py +47 -166
  5. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/_scorecard.py +81 -5
  6. eval_toolkit-0.48.0/src/eval_toolkit/_sweep.py +302 -0
  7. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/_version.py +1 -1
  8. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/adversarial.py +333 -191
  9. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/bootstrap.py +69 -16
  10. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/calibration.py +41 -3
  11. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/config.py +1 -1
  12. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/embeddings.py +1 -1
  13. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/loaders.py +2 -3
  14. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/metric_specs.py +57 -0
  15. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/metrics.py +15 -0
  16. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/preprocessing.py +75 -99
  17. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/protocols.py +35 -0
  18. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/benchmarks/test_kernel_benchmarks.py +3 -1
  19. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/golden/public_api/snapshot.json +178 -22
  20. eval_toolkit-0.48.0/tests/test_adversarial.py +420 -0
  21. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_analysis.py +2 -3
  22. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_bootstrap_edge_cases.py +57 -0
  23. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_bootstrap_unit.py +39 -6
  24. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_coverage_bootstrap.py +4 -2
  25. eval_toolkit-0.48.0/tests/test_deprecated_scalars_shim.py +211 -0
  26. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_harness_metric_options.py +4 -2
  27. eval_toolkit-0.48.0/tests/test_lazy_extras_messages.py +283 -0
  28. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_loaders.py +8 -2
  29. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_logging.py +2 -1
  30. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_metrics_unit.py +153 -0
  31. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_preprocessing.py +91 -52
  32. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_public_api.py +125 -1
  33. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_scorecard.py +258 -1
  34. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_stacking.py +1 -1
  35. eval_toolkit-0.48.0/tests/test_sweep.py +426 -0
  36. eval_toolkit-0.46.1/tests/test_adversarial.py +0 -351
  37. eval_toolkit-0.46.1/tests/test_deprecated_scalars_shim.py +0 -334
  38. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/.gitignore +0 -0
  39. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/LICENSE +0 -0
  40. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/STYLE.md +0 -0
  41. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/docs/archive/README.md +0 -0
  42. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/docs/research/README.md +0 -0
  43. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/docs/research/datasets/README.md +0 -0
  44. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/docs/research/papers/data-integrity/README.md +0 -0
  45. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  46. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/docs/research/papers/inference/README.md +0 -0
  47. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/docs/research/papers/prompt-injection/README.md +0 -0
  48. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/docs/source/adr/README.md +0 -0
  49. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/docs/source/methodology/README.md +0 -0
  50. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/pyproject.toml +0 -0
  51. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/__main__.py +0 -0
  52. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/_deprecated.py +0 -0
  53. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/_parallel.py +0 -0
  54. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/analysis.py +0 -0
  55. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/artifacts.py +0 -0
  56. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/claims.py +0 -0
  57. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/docs.py +0 -0
  58. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/evidence.py +0 -0
  59. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/harness.py +0 -0
  60. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/leakage.py +0 -0
  61. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/losses.py +0 -0
  62. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/manifest.py +0 -0
  63. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/operating_points.py +0 -0
  64. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/paths.py +0 -0
  65. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/plotting.py +0 -0
  66. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/probes.py +0 -0
  67. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/provenance.py +0 -0
  68. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/py.typed +0 -0
  69. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  70. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  71. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  72. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  73. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  74. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  75. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/seeds.py +0 -0
  76. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/splits.py +0 -0
  77. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/stacking.py +0 -0
  78. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/text_dedup.py +0 -0
  79. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/src/eval_toolkit/thresholds.py +0 -0
  80. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  81. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  82. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  83. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  84. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  85. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  86. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  87. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  88. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  89. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  90. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/benchmarks/__init__.py +0 -0
  91. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/conftest.py +0 -0
  92. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  93. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  94. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  95. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  96. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/golden/docs/expected.md +0 -0
  97. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/golden/docs/input.md +0 -0
  98. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/golden/docs/metrics.json +0 -0
  99. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  100. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/strategies.py +0 -0
  101. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_artifacts.py +0 -0
  102. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  103. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  104. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_bootstrap_golden.py +0 -0
  105. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_bootstrap_njobs.py +0 -0
  106. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_bootstrap_props.py +0 -0
  107. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_bootstrap_research_grounded.py +0 -0
  108. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_calibration_binary_adapters.py +0 -0
  109. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  110. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_calibration_determinism.py +0 -0
  111. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_calibration_optimization_failures.py +0 -0
  112. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_calibration_props.py +0 -0
  113. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_calibration_research_grounded.py +0 -0
  114. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_calibration_unit.py +0 -0
  115. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_claims.py +0 -0
  116. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_claims_coverage.py +0 -0
  117. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_claims_props.py +0 -0
  118. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_cli.py +0 -0
  119. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_config.py +0 -0
  120. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_coverage_calibration.py +0 -0
  121. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_coverage_harness.py +0 -0
  122. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_coverage_metrics.py +0 -0
  123. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_coverage_plotting.py +0 -0
  124. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_croissant_e2e.py +0 -0
  125. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  126. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_deprecations.py +0 -0
  127. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_docs_golden.py +0 -0
  128. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_docs_props.py +0 -0
  129. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_embeddings.py +0 -0
  130. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_evidence_validators.py +0 -0
  131. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_harness_edge_cases.py +0 -0
  132. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_harness_fault_injection.py +0 -0
  133. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_harness_folded.py +0 -0
  134. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_harness_internals.py +0 -0
  135. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_harness_parallelism.py +0 -0
  136. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_harness_smoke.py +0 -0
  137. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_import_boundaries.py +0 -0
  138. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_is_metric_defined_for_slice.py +0 -0
  139. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_leakage.py +0 -0
  140. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_leakage_error_paths.py +0 -0
  141. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_leakage_props.py +0 -0
  142. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_loaders_coverage.py +0 -0
  143. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_loaders_props.py +0 -0
  144. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_losses.py +0 -0
  145. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_manifest.py +0 -0
  146. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  147. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_manifest_props.py +0 -0
  148. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_manifest_validation.py +0 -0
  149. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_metrics_props.py +0 -0
  150. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_metrics_stratified_subsets.py +0 -0
  151. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_misc_coverage.py +0 -0
  152. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_numeric_edge_cases.py +0 -0
  153. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_ood_loader.py +0 -0
  154. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_operating_points.py +0 -0
  155. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_operating_points_props.py +0 -0
  156. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_parallel.py +0 -0
  157. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_paths.py +0 -0
  158. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_pipeline_e2e.py +0 -0
  159. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_plotting_edge.py +0 -0
  160. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_plotting_smoke.py +0 -0
  161. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_plotting_visual.py +0 -0
  162. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_probes.py +0 -0
  163. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_protocol_conformance.py +0 -0
  164. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_provenance.py +0 -0
  165. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_recall_at_fpr.py +0 -0
  166. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_reference_equivalence.py +0 -0
  167. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_reproducibility_integration.py +0 -0
  168. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_schemas.py +0 -0
  169. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_seeds.py +0 -0
  170. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_splits.py +0 -0
  171. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_splits_leakage_integration.py +0 -0
  172. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_splits_props.py +0 -0
  173. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_text_dedup.py +0 -0
  174. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_text_dedup_coverage.py +0 -0
  175. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_text_dedup_props.py +0 -0
  176. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_text_dedup_strategies.py +0 -0
  177. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_thresholds.py +0 -0
  178. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_thresholds_constant_score.py +0 -0
  179. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_thresholds_coverage.py +0 -0
  180. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_thresholds_props.py +0 -0
  181. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_thresholds_research_grounded.py +0 -0
  182. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_tokenization_leakage_check.py +0 -0
  183. {eval_toolkit-0.46.1 → eval_toolkit-0.48.0}/tests/test_v09_contracts.py +0 -0
@@ -5,6 +5,230 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.48.0] — 2026-05-22 — Polish + audit-driven tightening before v1.0 (Round 7 follow-on + cross-API consistency + doc-execution gates)
9
+
10
+ Third + final BREAKING minor of the staggered v0.45 → v0.46 → v0.46.1 → v0.47
11
+ → v0.48 → v1.0 release sequence (plan
12
+ ``~/.claude/plans/evaluate-all-the-work-twinkly-kite.md``, Step 4). Migration
13
+ guide: ``docs/source/migration/v0.48.md``.
14
+
15
+ Closes:
16
+
17
+ - Round 7 audit STOP-GATE per Decision Y.2 (Codex R7-F1/F2/F3 + 6 Gemini
18
+ observations; see ``docs/source/audit_findings.md`` for the per-finding
19
+ ledger).
20
+ - Audit-as-seed extensions surfaced during plan refinement: full
21
+ module-docstring sweep across ``src/eval_toolkit/``; expanded
22
+ ``.doctest-modules`` from 11 → 21 modules; comprehensive cross-API
23
+ shape-validation consistency sweep.
24
+ - Round 5 §5E-prep packet-drift fixes (7 methodology documentation
25
+ corrections).
26
+
27
+ After v0.48 observes ≥1 consumer cycle, the Round 8 audit STOP-GATE
28
+ opens before ``v1.0.0`` tag.
29
+
30
+ ### BREAKING
31
+
32
+ - **``BootstrapCI.to_dict()`` + ``PairedBootstrapCI.to_dict()`` schema
33
+ rewrite** (§5B). Pre-v0.48 hard-coded a ``"ci_95"`` key regardless of
34
+ the actual ``confidence`` field — the key contradicted the data.
35
+ v0.48 schema is self-describing:
36
+
37
+ Before: ``{"point_estimate": p, "ci_95": [l, h], "confidence": 0.95, ...}``
38
+ After: ``{"point": p, "low": l, "high": h, "confidence": 0.95, ...}``
39
+
40
+ Migration: ``d["point_estimate"]`` → ``d["point"]``; ``d["ci_95"]``
41
+ → ``(d["low"], d["high"])``. Same rewrite for ``PairedBootstrapCI``.
42
+ - **``sweep()`` schema grows by 1 column** (§5I, Decision R7-B option C).
43
+ New ``strategy_id`` column inserted between ``text_id`` and ``variant``
44
+ carries the canonical per-row identifier built from configured
45
+ kwargs. Callers indexing by column position must re-check offsets.
46
+ - **``sweep()`` rejects duplicate ``strategy_id``** (§5I). Mirrors
47
+ R6-B's duplicate ``MetricSpec.name`` rejection in ``scorecard()``.
48
+ - **``sweep()`` validates scorer output shape** (§5J, Decision R7-C).
49
+ Wrong-shape arrays from ``Scorer.predict_proba`` raise contextual
50
+ ``ValueError`` at the boundary. Pre-v0.48: silent truncation
51
+ (overlong), ``IndexError`` (short), or ``TypeError`` (matrix-shaped).
52
+ - **``paired_bootstrap_op_point_diff()`` rejects ``val_y is test_y``**
53
+ (§5E-prep). The two-level bootstrap assumes disjoint val + test
54
+ partitions; passing the same array causes ~63.2% silent overlap.
55
+
56
+ ### Added
57
+
58
+ - **``make pre-push``** Makefile target (§5L) running all 3 doc-
59
+ execution surfaces — Sybil-collected ``.md`` fences, MyST-NB example
60
+ notebooks, and in-source ``>>>`` docstring examples. Closes the
61
+ v0.47 Sub-PR 7 incident class.
62
+ - **``nb_execution_raise_on_error = True``** in ``docs/source/conf.py``
63
+ (§5H, Decision R7-A). Docs CI now fails on notebook execution errors.
64
+ - **``.doctest-modules`` expanded** from 11 → 21 modules (§5M).
65
+
66
+ ### Changed
67
+
68
+ - **Cross-API shape-validation consistency** (§5N). Every public-API
69
+ surface with array inputs now validates shape + raises ``ValueError``
70
+ with context (rather than leaking low-level numpy/sklearn errors).
71
+ - **Standardized ``ImportError`` messages** across lazy-extras (§5C).
72
+ Canonical template: ``"<feature> requires <pkg>. Install with: pip
73
+ install eval-toolkit[<extra>]"``.
74
+ - **Pin-exact-key-set regression-guards** (§5A) for every dict-returning
75
+ metrics function. Audit revealed no drift; the tests pin existing
76
+ key sets so future drift fails CI loudly.
77
+ - **Docs polish** (§5K + §5E-prep): ``SynonymSubstitution`` whitelist
78
+ ``Notes``; ``Scorecard.to_pandas()`` dtype coercion ``Notes``;
79
+ ``CostSensitiveSelector`` calibrated-prior ``Warning``; ``cv_clt_ci``
80
+ docstring per Bayle et al. (2020) Theorem 3.1; ``methodology/parallelism.md``
81
+ post-v0.36 state; ``methodology/testing.md`` reference-equivalence-gap
82
+ framing; ``methodology/calibration.md`` 4-binary-adapter family;
83
+ ``methodology/bootstrap.md`` disjoint-split example; DeLong docs
84
+ aligned to shipped state (Decision U).
85
+
86
+ ### Fixed
87
+
88
+ - **R7-F1**: 6 MyST-NB example notebooks (``docs/source/examples/*.md``)
89
+ migrated to v0.47 API; 4 module-level docstrings rewritten; 5
90
+ drifted ``docs/source/api/*.md`` autosummary lists corrected;
91
+ 8 missing ``api/*.md`` pages created; roadmap "Sybil-validated
92
+ examples" wording corrected (§5G).
93
+ - **ADR 0001** (flat-module layout) + **ADR 0003** (stability contract
94
+ + Gate 3 methodology) finalized for v1.0 (§5E + §5F).
95
+ - **schemas.md** + **methodology/claims.md** + **getting-started.md**:
96
+ ``BootstrapCI`` schema references updated for the §5B rewrite.
97
+
98
+ ## [0.47.0] — 2026-05-21 — Sweep unification + TextTransform + advanced-6 + cleanup + Round 6 follow-on
99
+
100
+ Second BREAKING minor of the staggered v0.45 → v0.46 → v0.46.1 → v0.47 →
101
+ v0.48 → v1.0 release sequence (plan
102
+ ``~/.claude/plans/evaluate-all-the-work-twinkly-kite.md``, Step 3).
103
+
104
+ Closes:
105
+
106
+ - The v0.43 CHANGELOG forward-look re: advanced-6 character-injection
107
+ techniques (Decision Q11→11.3 — "12-technique suite + new sweep API
108
+ in one migration step").
109
+ - Round 6 audit follow-on items per Decision R6-E (R6-A docstring,
110
+ R6-B duplicate name guard, R6-C to_pandas schema, R6-D Protocol
111
+ method-shape drift guard, R6-F5 narrow except, R6-F6 plan/roadmap
112
+ refresh, R6-H make_spec_name helper) — see ``docs/source/audit_findings.md``
113
+ for the per-finding ledger.
114
+
115
+ ### Removed (BREAKING)
116
+
117
+ - **Top-level scalar metric names** (``eval_toolkit.pr_auc``,
118
+ ``eval_toolkit.roc_auc``, ``eval_toolkit.brier_score``, all 5
119
+ ``expected_calibration_*`` variants) — the v0.46 ``__getattr__``
120
+ deprecation shim has been deleted. These names now raise
121
+ ``AttributeError`` at the top-level. Migration: use ``scorecard(...)``
122
+ with ``metric_specs`` (primary) OR import from the
123
+ ``eval_toolkit.metrics`` submodule (internal API per ADR 0002).
124
+ (Decision L; plan §4D.)
125
+ - **Module-level ``adversarial.sweep`` + ``preprocessing.sweep``** —
126
+ consolidated into the top-level :func:`sweep` (Decision D + plan §4C).
127
+ Parity tests in Sub-PR 4 of this release proved 1:1 output equivalence
128
+ on the neutral subset.
129
+ - **``adversarial.character_injection`` + ``preprocessing.spotlighting``
130
+ ``SimpleNamespace`` shortcuts** — removed (Decision N + plan §4E).
131
+ The 12 adversarial dataclasses + the 3 preprocessing variants + the
132
+ underlying functional API are the only public paths.
133
+ - **``adversarial.CharacterInjectionStrategy``** per-module Protocol —
134
+ removed. The top-level :class:`TextTransform` Protocol (Decision K)
135
+ is the single canonical contract; all 12 character-injection
136
+ dataclasses + 3 preprocessing variants satisfy it structurally.
137
+
138
+ ### Added
139
+
140
+ - **``TextTransform`` Protocol** (top-level; ``eval_toolkit.protocols`` module).
141
+ Decision K + Audit R5-F3 (Codex Round 5): unifies the "name + transform(text)"
142
+ shape across preprocessing (defence) and adversarial (attack) strategies so
143
+ the v0.47 top-level :func:`sweep` (next sub-PR) can mix them in one call. The
144
+ 9th strict Tier-2 Protocol per ADR 0003.
145
+ - **Advanced 6 character-injection techniques** (plan §4F, Decision Q11→11.3) —
146
+ closes the v0.43.0 CHANGELOG forward-look that referenced these as "scheduled
147
+ for v0.43.1" (a version that never shipped). Each satisfies the top-level
148
+ :class:`TextTransform` Protocol structurally; all are frozen + ``slots=True``
149
+ dataclasses with deterministic behaviour under their ``seed`` kwarg where
150
+ applicable:
151
+
152
+ - :class:`BidiRTLInjection` — wrap input in ``U+202E … U+202C``
153
+ RIGHT-TO-LEFT OVERRIDE block.
154
+ - :class:`TagStrippingInjection` — strip HTML/XML-like ``<…>`` tags
155
+ (idempotent).
156
+ - :class:`SynonymSubstitution` — replace whitelisted prompt-injection-
157
+ relevant function words / verbs with semantic-preserving synonyms.
158
+ - :class:`TokenSplitting` — insert a single space inside long enough
159
+ words; forces subword tokenizers to re-segment.
160
+ - :class:`UnicodeNormalization` — NFC / NFD / NFKC / NFKD; default NFKC
161
+ folds compatibility chars (e.g., fullwidth ``ＡＢＣ`` → ``ABC``).
162
+ - :class:`InvisibleCharsInjection` — sample from the 5-element invisible-
163
+ code-point set (ZWSP, ZWNJ, ZWJ, word joiner, BOM) — distinct from the
164
+ single-codepoint :class:`ZeroWidthSpaceInjection`.
165
+
166
+ Also exported: ``ADVANCED_TECHNIQUES`` (6-tuple) and ``ALL_TECHNIQUES``
167
+ (12-tuple = core 6 + advanced 6).
168
+ - **Top-level :func:`sweep`** — single ``TextTransform`` enumeration entry
169
+ point (Decision K + Decision D + Audit R5-F3). Replaces the per-module
170
+ ``adversarial.sweep`` + ``preprocessing.sweep`` (those are removed in a
171
+ subsequent sub-PR of this release). New contract:
172
+
173
+ - ``sweep(strategies, texts)`` → neutral DataFrame with ``text_id`` /
174
+ ``variant`` / ``transformed_text`` columns. Pure text-transform
175
+ enumeration; defence + attack strategies compose freely.
176
+ - ``sweep(..., scorer=...)`` → also emits ``original_score`` /
177
+ ``transformed_score`` columns (single batched scorer call per
178
+ strategy, not per-row).
179
+ - ``sweep(..., scorer=..., attack_threshold=t)`` → also emits ``asr``
180
+ (per-row attack-success flag). Explicit threshold REQUIRED to
181
+ materialize ``asr``; no magic ``threshold=0.5`` default.
182
+ ``attack_threshold`` without ``scorer`` raises ``ValueError``.
183
+
184
+ Parity tests against the existing module-level sweeps ship in this
185
+ sub-PR (``tests/test_sweep.py``) and prove the v0.47 consolidation
186
+ produces identical transformed-text rows for the 6 core character-
187
+ injection techniques + the 3 spotlighting variants.
188
+ - **3 preprocessing dataclasses** (``DelimitVariant``, ``DatamarkVariant``,
189
+ ``EncodeVariant``) in :mod:`eval_toolkit.preprocessing`. Frozen +
190
+ ``slots=True`` thin wrappers over the existing :func:`delimit` /
191
+ :func:`datamark` / :func:`encode` functions. Closes Audit R5-F3
192
+ (Codex Round 5) — prior to this commit, ``preprocessing.__all__`` exported
193
+ only functions, so the "concrete classes satisfy ``TextTransform``
194
+ structurally" claim only held on the adversarial side. Now both sides
195
+ share the dataclass-strategy shape.
196
+ - ``metric_specs.make_spec_name(prefix, **kwargs)`` canonicalization helper
197
+ for custom parameterized :class:`MetricSpec` implementations. Alphabetized
198
+ kwargs joined by underscore — same convention the v0.46 ECE factory uses.
199
+ Lands in ``metric_specs.__all__`` only; **not** top-level ``__all__`` per
200
+ Decision R6-H. (Closes Round 6 Gemini R6-F4.)
201
+
202
+ ### Changed (Round 6 follow-on)
203
+
204
+ - ``scorecard()`` now raises ``ValueError`` when two :class:`MetricSpec`
205
+ instances in the ``metrics`` list share a ``name``. Forces caller
206
+ disambiguation; the ``Mapping[str, MetricResult]`` contract never silently
207
+ drops a cell. Error message reports both indices. (Decision R6-B; closes
208
+ Round 6 Codex R6-F3.)
209
+ - ``scorecard(seed=None)`` docstring rewritten to document the deterministic-
210
+ by-default contract (``None`` is treated as ``seed=0``). No behavior
211
+ change; v0.46 documented the wrong contract. (Decision R6-A; closes Round 6
212
+ Codex R6-F4 + Gemini R6-F1.)
213
+ - ``_evaluate_spec()`` exception catches narrowed: ``MemoryError``,
214
+ ``RecursionError``, ``KeyboardInterrupt``, and ``SystemExit`` now propagate
215
+ out of ``scorecard()`` instead of being captured as a ``status="error"``
216
+ cell. Per-cell isolation remains for ordinary application errors.
217
+ (Decision R6-F5; closes Round 6 Gemini R6-F5.)
218
+ - ``Scorecard.to_pandas()`` MultiIndex schema extended with two new inner-
219
+ field columns: ``n_resamples`` (int / NaN sentinel) and ``method``
220
+ (string / ``""`` sentinel). The DataFrame view is now lossless against
221
+ :meth:`BootstrapCI.to_dict` — trace provenance (resample count + CI
222
+ method) no longer drops at the DataFrame boundary. Callers indexing the
223
+ MultiIndex by name keep working; callers indexing by position must
224
+ re-check column offsets. (Decision R6-C; closes Round 6 Gemini R6-F3.)
225
+ - ``tests/test_public_api.py`` drift guard now captures method signatures
226
+ for ``typing.Protocol`` classes in ``__all__`` (a ``protocol_methods``
227
+ sub-entry in the snapshot). Together with a Tier-2 coverage test, this
228
+ actually enforces the strict method-shape stability ADR 0003 promises
229
+ for the 9 Tier-2 Protocols. (Decision R6-D; closes Round 6 Codex R6-F5.)
230
+ Public-API golden regenerated alongside this change.
231
+
8
232
  ## [0.46.1] — 2026-05-21 — Round 6 hotfix: ECE strategy validation + deprecation warning content
9
233
 
10
234
  Hotfix release per **Decision Q** (data correctness regression + time-sensitive
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.46.1
3
+ Version: 0.48.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -215,7 +215,7 @@ pip install "eval-toolkit[all]" # everything
215
215
 
216
216
  ```python
217
217
  import numpy as np
218
- from eval_toolkit import pr_auc, roc_auc, expected_calibration_error
218
+ from eval_toolkit.metrics import pr_auc, roc_auc, expected_calibration_error
219
219
 
220
220
  rng = np.random.default_rng(42)
221
221
  y = rng.integers(0, 2, size=200)
@@ -230,7 +230,8 @@ print(f"ECE (10 bins): {expected_calibration_error(y, s, n_bins=10):.3f}")
230
230
  ### Bootstrap confidence intervals
231
231
 
232
232
  ```python
233
- from eval_toolkit import bootstrap_ci, paired_bootstrap_diff, pr_auc
233
+ from eval_toolkit import bootstrap_ci, paired_bootstrap_diff
234
+ from eval_toolkit.metrics import pr_auc
234
235
 
235
236
  ci = bootstrap_ci(y, s, pr_auc, n_resamples=1000, seed=42)
236
237
  print(f"PR-AUC: {ci.point_estimate:.3f} 95% CI: [{ci.ci_low:.3f}, {ci.ci_high:.3f}]")
@@ -244,8 +245,10 @@ print(f"Δ PR-AUC: {diff.delta:.3f} overlaps zero: {diff.overlaps_zero}")
244
245
  ### Temperature scaling (Guo et al. 2017)
245
246
 
246
247
  ```python
248
+ import numpy as np
247
249
  from eval_toolkit import fit_temperature
248
250
 
251
+ rng = np.random.default_rng(42)
249
252
  logits = rng.normal(size=(500, 2))
250
253
  labels = (logits[:, 1] > logits[:, 0]).astype(int)
251
254
  result = fit_temperature(logits, labels)
@@ -132,7 +132,7 @@ pip install "eval-toolkit[all]" # everything
132
132
 
133
133
  ```python
134
134
  import numpy as np
135
- from eval_toolkit import pr_auc, roc_auc, expected_calibration_error
135
+ from eval_toolkit.metrics import pr_auc, roc_auc, expected_calibration_error
136
136
 
137
137
  rng = np.random.default_rng(42)
138
138
  y = rng.integers(0, 2, size=200)
@@ -147,7 +147,8 @@ print(f"ECE (10 bins): {expected_calibration_error(y, s, n_bins=10):.3f}")
147
147
  ### Bootstrap confidence intervals
148
148
 
149
149
  ```python
150
- from eval_toolkit import bootstrap_ci, paired_bootstrap_diff, pr_auc
150
+ from eval_toolkit import bootstrap_ci, paired_bootstrap_diff
151
+ from eval_toolkit.metrics import pr_auc
151
152
 
152
153
  ci = bootstrap_ci(y, s, pr_auc, n_resamples=1000, seed=42)
153
154
  print(f"PR-AUC: {ci.point_estimate:.3f} 95% CI: [{ci.ci_low:.3f}, {ci.ci_high:.3f}]")
@@ -161,8 +162,10 @@ print(f"Δ PR-AUC: {diff.delta:.3f} overlaps zero: {diff.overlaps_zero}")
161
162
  ### Temperature scaling (Guo et al. 2017)
162
163
 
163
164
  ```python
165
+ import numpy as np
164
166
  from eval_toolkit import fit_temperature
165
167
 
168
+ rng = np.random.default_rng(42)
166
169
  logits = rng.normal(size=(500, 2))
167
170
  labels = (logits[:, 1] > logits[:, 0]).astype(int)
168
171
  result = fit_temperature(logits, labels)
@@ -1,9 +1,12 @@
1
1
  """eval-toolkit — reusable evaluation contracts for binary classification.
2
2
 
3
- Public API remains available from ``eval_toolkit`` and from submodules:
3
+ The v1.0 primary metric surface is :func:`~eval_toolkit.scorecard` plus the
4
+ :mod:`~eval_toolkit.metric_specs` namespace (ADR 0002). Submodule paths
5
+ remain available for scalar primitives and adapter authors:
4
6
 
5
- from eval_toolkit import pr_auc, bootstrap_ci, BootstrapCI
6
- from eval_toolkit.metrics import pr_auc
7
+ from eval_toolkit import scorecard, metric_specs as ms
8
+ from eval_toolkit import bootstrap_ci, BootstrapCI
9
+ from eval_toolkit.metrics import pr_auc # internal API, ADR 0002
7
10
 
8
11
  The package root uses lazy exports so importing ``eval_toolkit`` does not
9
12
  eagerly import optional-heavy modules such as plotting, loaders, or harnesses.
@@ -31,22 +34,36 @@ _logging.getLogger("eval_toolkit").addHandler(_logging.NullHandler())
31
34
  # tests/golden/public_api/ reads dict keys + values, not comments.
32
35
  _EXPORTS: dict[str, str] = {
33
36
  # --- adversarial ---
37
+ "ADVANCED_TECHNIQUES": "eval_toolkit.adversarial",
38
+ "ALL_TECHNIQUES": "eval_toolkit.adversarial",
39
+ "BidiRTLInjection": "eval_toolkit.adversarial",
34
40
  "CORE_TECHNIQUES": "eval_toolkit.adversarial",
35
41
  "CaseRandomization": "eval_toolkit.adversarial",
36
- "CharacterInjectionStrategy": "eval_toolkit.adversarial",
37
42
  "DiacriticInjection": "eval_toolkit.adversarial",
38
43
  "HomoglyphSubstitution": "eval_toolkit.adversarial",
44
+ "InvisibleCharsInjection": "eval_toolkit.adversarial",
39
45
  "PunctuationInjection": "eval_toolkit.adversarial",
46
+ "SynonymSubstitution": "eval_toolkit.adversarial",
47
+ "TagStrippingInjection": "eval_toolkit.adversarial",
48
+ "TokenSplitting": "eval_toolkit.adversarial",
49
+ "UnicodeNormalization": "eval_toolkit.adversarial",
40
50
  "WhitespaceInjection": "eval_toolkit.adversarial",
41
51
  "ZeroWidthSpaceInjection": "eval_toolkit.adversarial",
42
- "character_injection": "eval_toolkit.adversarial",
52
+ # CharacterInjectionStrategy + character_injection SimpleNamespace
53
+ # removed at v0.47 (Decision N + plan §4E). TextTransform Protocol +
54
+ # the 12 concrete dataclasses are now the only public path.
43
55
  # --- losses ---
44
56
  "RecallAtLowFPR": "eval_toolkit.losses",
45
57
  # --- preprocessing ---
58
+ # `spotlighting` SimpleNamespace removed at v0.47 (Decision N + plan §4E).
59
+ # The 3 Variant dataclasses + the underlying functional API are the
60
+ # only public path.
61
+ "DatamarkVariant": "eval_toolkit.preprocessing",
62
+ "DelimitVariant": "eval_toolkit.preprocessing",
63
+ "EncodeVariant": "eval_toolkit.preprocessing",
46
64
  "datamark": "eval_toolkit.preprocessing",
47
65
  "delimit": "eval_toolkit.preprocessing",
48
66
  "encode": "eval_toolkit.preprocessing",
49
- "spotlighting": "eval_toolkit.preprocessing",
50
67
  # --- probes ---
51
68
  "ActivationDeltaProbe": "eval_toolkit.probes",
52
69
  "ActivationExtractor": "eval_toolkit.probes",
@@ -193,12 +210,15 @@ _EXPORTS: dict[str, str] = {
193
210
  "SINGLE_CLASS_INCOMPATIBLE_METRICS": "eval_toolkit.metrics",
194
211
  "ThresholdResult": "eval_toolkit.metrics",
195
212
  "brier_decomposition": "eval_toolkit.metrics",
196
- # `brier_score`, `pr_auc`, `roc_auc`, and the 5 ECE variants removed from
197
- # `_EXPORTS` at v0.46 (Decision L). They remain reachable at the top
198
- # level via the `__getattr__` deprecation branch (emits
199
- # `DeprecationWarning`; branch removed at v0.47) and via the metrics
200
- # submodule (`from eval_toolkit.metrics import pr_auc` — internal API
201
- # per ADR 0002, not part of the v1.0 stability contract).
213
+ # `brier_score`, `pr_auc`, `roc_auc`, and the 5 ECE variants were removed
214
+ # from `_EXPORTS` at v0.46 (Decision L); the v0.46 `__getattr__`
215
+ # deprecation branch that kept them reachable with `DeprecationWarning`
216
+ # was removed at v0.47. They now raise `AttributeError` at the top level.
217
+ # The metrics submodule (`from eval_toolkit.metrics import pr_auc`)
218
+ # remains the only remaining import path for scalar primitives — internal
219
+ # API per ADR 0002, not part of the v1.0 stability contract. The
220
+ # `scorecard()` + `metric_specs` surface is the primary path going
221
+ # forward (`metric_specs.pr_auc`, `metric_specs.roc_auc`, etc.).
202
222
  "headline_metrics": "eval_toolkit.metrics",
203
223
  "is_metric_defined_for_slice": "eval_toolkit.metrics",
204
224
  "metrics_at_threshold": "eval_toolkit.metrics",
@@ -247,6 +267,7 @@ _EXPORTS: dict[str, str] = {
247
267
  "PredictionReader": "eval_toolkit.protocols",
248
268
  "Scorer": "eval_toolkit.protocols",
249
269
  "SliceAwareScorer": "eval_toolkit.protocols",
270
+ "TextTransform": "eval_toolkit.protocols",
250
271
  "Versioned": "eval_toolkit.protocols",
251
272
  # --- seeds ---
252
273
  "set_global_seeds": "eval_toolkit.seeds",
@@ -298,61 +319,28 @@ _EXPORTS: dict[str, str] = {
298
319
  "MetricSpec": "eval_toolkit._scorecard",
299
320
  "Scorecard": "eval_toolkit._scorecard",
300
321
  "scorecard": "eval_toolkit._scorecard",
322
+ # --- sweep (top-level v0.47 unification — Decision K + Decision D) ---
323
+ "sweep": "eval_toolkit._sweep",
301
324
  }
302
325
 
303
326
  __all__ = ["__version__", *_EXPORTS.keys()]
304
327
 
305
328
 
306
- # ── BEGIN TRANSITIONAL DEPRECATION BRANCH (Decision L; REMOVE AT v0.47) ──
307
- # At v0.46 the scalar metric functions left the top-level `_EXPORTS` map (above)
308
- # in favor of the `scorecard()` surface (Decision A). To give the consumer one
309
- # release of overlap before the hard removal at v0.47, the names below remain
310
- # reachable via the package-level `__getattr__` (which delegates to the
311
- # `eval_toolkit.metrics` submodule) but emit a `DeprecationWarning` on first
312
- # lookup pointing at the new API.
313
- #
314
- # WHY THIS IS A BRANCH, NOT A REPLACEMENT (Audit F4 — Round 5):
315
- # `__getattr__` below is the load-bearing lazy export resolver for every name
316
- # in `_EXPORTS`. The deprecation branch is a discrete `if name in
317
- # _DEPRECATED_SCALARS` check ABOVE the resolver — the resolver's existing
318
- # behavior for non-deprecated names is unchanged. At v0.47 we delete this
319
- # transitional block and the resolver continues to work for every remaining
320
- # `_EXPORTS` entry.
321
- _DEPRECATED_SCALARS: frozenset[str] = frozenset(
322
- {
323
- "pr_auc",
324
- "roc_auc",
325
- "brier_score",
326
- "expected_calibration_error",
327
- "expected_calibration_error_debiased",
328
- "expected_calibration_error_equal_mass",
329
- "expected_calibration_error_l2",
330
- "expected_calibration_error_l2_debiased",
331
- }
332
- )
333
- # ── END TRANSITIONAL DEPRECATION (Decision L; REMOVE AT v0.47) ──
334
-
335
-
336
329
  def __getattr__(name: str) -> Any:
337
- """Resolve public symbols lazily."""
330
+ """Resolve public symbols lazily.
331
+
332
+ v0.47 cleanup (Decision L): the BEGIN/END TRANSITIONAL DEPRECATION
333
+ BRANCH that v0.46 inserted in front of the resolver — together with the
334
+ ``_DEPRECATED_SCALARS`` frozenset and the ``_deprecation_warning_for``
335
+ helper — has been removed. The lazy resolver below is the v0.46 base
336
+ behavior; with the transitional block gone, deprecated v0.45 scalar names
337
+ (``pr_auc``, ``roc_auc``, ``brier_score``, the 5 ``expected_calibration_*``
338
+ variants) now raise :class:`AttributeError` cleanly. Submodule-level
339
+ access (e.g., ``from eval_toolkit.metrics import pr_auc``) is unaffected
340
+ per Decision C / ADR 0002.
341
+ """
338
342
  if name == "__version__":
339
343
  return __version__
340
- # ── BEGIN TRANSITIONAL DEPRECATION BRANCH (Decision L; REMOVE AT v0.47) ──
341
- if name in _DEPRECATED_SCALARS:
342
- import warnings
343
-
344
- warnings.warn(
345
- _deprecation_warning_for(name),
346
- DeprecationWarning,
347
- stacklevel=2,
348
- )
349
- module = import_module("eval_toolkit.metrics")
350
- value = getattr(module, name)
351
- # Do NOT cache in globals() — repeated lookups should keep re-warning
352
- # (one warning per call site, modulo Python's default
353
- # DeprecationWarning de-duplication).
354
- return value
355
- # ── END TRANSITIONAL DEPRECATION (Decision L; REMOVE AT v0.47) ──
356
344
  module_name = _EXPORTS.get(name)
357
345
  if module_name is None:
358
346
  raise AttributeError(f"module 'eval_toolkit' has no attribute {name!r}")
@@ -362,113 +350,6 @@ def __getattr__(name: str) -> Any:
362
350
  return value
363
351
 
364
352
 
365
- # ── BEGIN TRANSITIONAL DEPRECATION HELPER (Decision L; REMOVE AT v0.47) ──
366
- #
367
- # Per Round 6 audit (Codex R6-F2 + Gemini R6-F2; Decisions R6-F + R6-G):
368
- # - For deprecated scalars with a first-party `metric_specs` equivalent, the
369
- # warning emits an EXECUTABLE scorecard snippet (factory expression + the
370
- # correct encoded scorecard key, not the factory call string).
371
- # - For the 3 ECE variants without a `metric_specs` equivalent
372
- # (expected_calibration_error_debiased / _l2 / _l2_debiased), the warning
373
- # instead points at the submodule path per Decision R6-G — no first-party
374
- # replacement is shipped at v0.47.
375
- # - ECE `n_bins=10` preserves the pre-v0.46 default (verified at
376
- # `metrics.py:730-734`) — Decision R6-F. A migration note explains that
377
- # the v0.46+ `metric_specs.ece()` factory defaults to `n_bins=15` (matching
378
- # Hines et al.) and how to opt in.
379
- _FirstParty = tuple[str, str] # (factory_expression, scorecard_key)
380
- """Type alias for a deprecated-scalar that has a metric_specs replacement.
381
-
382
- The factory expression is what the user types after ``metric_specs.``; the
383
- scorecard key is the literal string that indexes ``Scorecard``.
384
- """
385
-
386
-
387
- _FIRST_PARTY_REPLACEMENTS: dict[str, _FirstParty] = {
388
- "pr_auc": ("pr_auc", "pr_auc"),
389
- "roc_auc": ("roc_auc", "roc_auc"),
390
- "brier_score": ("brier", "brier"),
391
- # ECE variants: use n_bins=10 (pre-v0.46 default per Decision R6-F).
392
- # The migration note in the warning text explains how to switch to
393
- # n_bins=15 if the user wants the v0.46+ metric_specs.ece() default.
394
- "expected_calibration_error": (
395
- "ece(n_bins=10)",
396
- "ece_n_bins_10_strategy_uniform",
397
- ),
398
- "expected_calibration_error_equal_mass": (
399
- 'ece(n_bins=10, strategy="quantile")',
400
- "ece_n_bins_10_strategy_quantile",
401
- ),
402
- }
403
- """Names that have a first-party metric_specs replacement at v0.46.
404
-
405
- The 3 ECE variants NOT in this map (_debiased, _l2, _l2_debiased) get the
406
- submodule-path warning template instead (Decision R6-G).
407
- """
408
-
409
-
410
- def _deprecation_warning_for(name: str) -> str:
411
- """Render the DeprecationWarning message for a deprecated scalar name.
412
-
413
- Branches on whether ``name`` has a first-party `metric_specs` replacement
414
- (Decision R6-G):
415
-
416
- - First-party (5 names): scorecard snippet with the correct encoded key
417
- (Decision R6-F).
418
- - Submodule-only (3 ECE variants): point at the submodule path per
419
- Decision R6-G.
420
-
421
- The first-party variants for ECE include a migration note explaining the
422
- new ``metric_specs.ece()`` factory default of ``n_bins=15`` so users can
423
- opt in to the new convention; the snippet itself uses ``n_bins=10`` for
424
- bit-identical pre-v0.46 math (Decision R6-F).
425
-
426
- Parameters
427
- ----------
428
- name : str
429
- A name in ``_DEPRECATED_SCALARS``.
430
-
431
- Returns
432
- -------
433
- str
434
- The warning message, ready to pass to ``warnings.warn``.
435
- """
436
- first_party = _FIRST_PARTY_REPLACEMENTS.get(name)
437
- if first_party is not None:
438
- factory_expr, scorecard_key = first_party
439
- msg = (
440
- f"eval_toolkit.{name} is deprecated and will be removed in v0.47. "
441
- f"For the same math, use:\n"
442
- f" scorecard(y, s, metrics=[metric_specs.{factory_expr}])"
443
- f'["{scorecard_key}"].value\n'
444
- f"Or import from the eval_toolkit.metrics submodule directly "
445
- f"(internal API per ADR 0002 — stable across v1.x, subject to "
446
- f"refactor in major versions)."
447
- )
448
- # ECE-specific migration note about the n_bins default change.
449
- if name.startswith("expected_calibration_error"):
450
- msg += (
451
- "\nNote: the v0.46+ metric_specs.ece() factory defaults to "
452
- "n_bins=15 (matching Hines et al.); the n_bins=10 in this "
453
- "snippet preserves the pre-v0.46 math. Pass n_bins=15 to use "
454
- "the new convention."
455
- )
456
- return msg
457
- # Decision R6-G: 3 ECE variants without first-party replacements →
458
- # submodule path only.
459
- return (
460
- f"eval_toolkit.{name} is deprecated and will be removed in v0.47. "
461
- f"This variant is NOT in v0.46+ metric_specs. Use:\n"
462
- f" from eval_toolkit.metrics import {name}\n"
463
- f"(internal API per ADR 0002 — stable across v1.x, subject to "
464
- f"refactor in major versions). Or contribute the variant to "
465
- f"metric_specs if you use it regularly."
466
- )
467
-
468
-
469
- # ── END TRANSITIONAL DEPRECATION HELPER (Decision L; REMOVE AT v0.47) ──
470
-
471
-
472
353
  def __dir__() -> list[str]:
473
354
  """Expose lazy public symbols to introspection."""
474
355
  return sorted(__all__)