eval-toolkit 0.46.0__tar.gz → 0.47.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182) hide show
  1. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/.gitignore +10 -0
  2. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/CHANGELOG.md +199 -0
  3. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/PKG-INFO +6 -3
  4. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/README.md +5 -2
  5. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/docs/source/adr/README.md +3 -1
  6. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/__init__.py +32 -76
  7. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/_scorecard.py +49 -5
  8. eval_toolkit-0.47.0/src/eval_toolkit/_sweep.py +184 -0
  9. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/_version.py +1 -1
  10. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/adversarial.py +293 -173
  11. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/metric_specs.py +92 -0
  12. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/preprocessing.py +75 -99
  13. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/protocols.py +35 -0
  14. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/benchmarks/test_kernel_benchmarks.py +3 -1
  15. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/golden/public_api/snapshot.json +178 -22
  16. eval_toolkit-0.47.0/tests/test_adversarial.py +420 -0
  17. eval_toolkit-0.47.0/tests/test_deprecated_scalars_shim.py +211 -0
  18. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_logging.py +2 -1
  19. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_preprocessing.py +91 -52
  20. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_public_api.py +125 -1
  21. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_scorecard.py +316 -14
  22. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_stacking.py +1 -1
  23. eval_toolkit-0.47.0/tests/test_sweep.py +180 -0
  24. eval_toolkit-0.46.0/tests/test_adversarial.py +0 -351
  25. eval_toolkit-0.46.0/tests/test_deprecated_scalars_shim.py +0 -184
  26. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/LICENSE +0 -0
  27. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/STYLE.md +0 -0
  28. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/docs/archive/README.md +0 -0
  29. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/docs/research/README.md +0 -0
  30. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/docs/research/datasets/README.md +0 -0
  31. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/docs/research/papers/data-integrity/README.md +0 -0
  32. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  33. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/docs/research/papers/inference/README.md +0 -0
  34. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/docs/research/papers/prompt-injection/README.md +0 -0
  35. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/docs/source/methodology/README.md +0 -0
  36. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/pyproject.toml +0 -0
  37. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/__main__.py +0 -0
  38. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/_deprecated.py +0 -0
  39. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/_parallel.py +0 -0
  40. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/analysis.py +0 -0
  41. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/artifacts.py +0 -0
  42. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/bootstrap.py +0 -0
  43. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/calibration.py +0 -0
  44. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/claims.py +0 -0
  45. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/config.py +0 -0
  46. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/docs.py +0 -0
  47. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/embeddings.py +0 -0
  48. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/evidence.py +0 -0
  49. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/harness.py +0 -0
  50. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/leakage.py +0 -0
  51. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/loaders.py +0 -0
  52. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/losses.py +0 -0
  53. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/manifest.py +0 -0
  54. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/metrics.py +0 -0
  55. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/operating_points.py +0 -0
  56. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/paths.py +0 -0
  57. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/plotting.py +0 -0
  58. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/probes.py +0 -0
  59. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/provenance.py +0 -0
  60. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/py.typed +0 -0
  61. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  62. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  63. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  64. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  65. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  66. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  67. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/seeds.py +0 -0
  68. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/splits.py +0 -0
  69. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/stacking.py +0 -0
  70. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/text_dedup.py +0 -0
  71. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/src/eval_toolkit/thresholds.py +0 -0
  72. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  73. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  74. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  75. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  76. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  77. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  78. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  79. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  80. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  81. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  82. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/benchmarks/__init__.py +0 -0
  83. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/conftest.py +0 -0
  84. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  85. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  86. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  87. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  88. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/golden/docs/expected.md +0 -0
  89. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/golden/docs/input.md +0 -0
  90. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/golden/docs/metrics.json +0 -0
  91. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  92. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/strategies.py +0 -0
  93. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_analysis.py +0 -0
  94. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_artifacts.py +0 -0
  95. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  96. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  97. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_bootstrap_edge_cases.py +0 -0
  98. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_bootstrap_golden.py +0 -0
  99. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_bootstrap_njobs.py +0 -0
  100. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_bootstrap_props.py +0 -0
  101. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_bootstrap_research_grounded.py +0 -0
  102. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_bootstrap_unit.py +0 -0
  103. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_calibration_binary_adapters.py +0 -0
  104. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  105. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_calibration_determinism.py +0 -0
  106. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_calibration_optimization_failures.py +0 -0
  107. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_calibration_props.py +0 -0
  108. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_calibration_research_grounded.py +0 -0
  109. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_calibration_unit.py +0 -0
  110. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_claims.py +0 -0
  111. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_claims_coverage.py +0 -0
  112. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_claims_props.py +0 -0
  113. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_cli.py +0 -0
  114. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_config.py +0 -0
  115. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_coverage_bootstrap.py +0 -0
  116. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_coverage_calibration.py +0 -0
  117. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_coverage_harness.py +0 -0
  118. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_coverage_metrics.py +0 -0
  119. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_coverage_plotting.py +0 -0
  120. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_croissant_e2e.py +0 -0
  121. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  122. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_deprecations.py +0 -0
  123. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_docs_golden.py +0 -0
  124. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_docs_props.py +0 -0
  125. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_embeddings.py +0 -0
  126. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_evidence_validators.py +0 -0
  127. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_harness_edge_cases.py +0 -0
  128. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_harness_fault_injection.py +0 -0
  129. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_harness_folded.py +0 -0
  130. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_harness_internals.py +0 -0
  131. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_harness_metric_options.py +0 -0
  132. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_harness_parallelism.py +0 -0
  133. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_harness_smoke.py +0 -0
  134. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_import_boundaries.py +0 -0
  135. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_is_metric_defined_for_slice.py +0 -0
  136. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_leakage.py +0 -0
  137. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_leakage_error_paths.py +0 -0
  138. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_leakage_props.py +0 -0
  139. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_loaders.py +0 -0
  140. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_loaders_coverage.py +0 -0
  141. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_loaders_props.py +0 -0
  142. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_losses.py +0 -0
  143. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_manifest.py +0 -0
  144. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  145. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_manifest_props.py +0 -0
  146. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_manifest_validation.py +0 -0
  147. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_metrics_props.py +0 -0
  148. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_metrics_stratified_subsets.py +0 -0
  149. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_metrics_unit.py +0 -0
  150. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_misc_coverage.py +0 -0
  151. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_numeric_edge_cases.py +0 -0
  152. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_ood_loader.py +0 -0
  153. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_operating_points.py +0 -0
  154. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_operating_points_props.py +0 -0
  155. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_parallel.py +0 -0
  156. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_paths.py +0 -0
  157. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_pipeline_e2e.py +0 -0
  158. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_plotting_edge.py +0 -0
  159. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_plotting_smoke.py +0 -0
  160. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_plotting_visual.py +0 -0
  161. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_probes.py +0 -0
  162. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_protocol_conformance.py +0 -0
  163. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_provenance.py +0 -0
  164. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_recall_at_fpr.py +0 -0
  165. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_reference_equivalence.py +0 -0
  166. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_reproducibility_integration.py +0 -0
  167. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_schemas.py +0 -0
  168. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_seeds.py +0 -0
  169. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_splits.py +0 -0
  170. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_splits_leakage_integration.py +0 -0
  171. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_splits_props.py +0 -0
  172. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_text_dedup.py +0 -0
  173. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_text_dedup_coverage.py +0 -0
  174. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_text_dedup_props.py +0 -0
  175. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_text_dedup_strategies.py +0 -0
  176. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_thresholds.py +0 -0
  177. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_thresholds_constant_score.py +0 -0
  178. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_thresholds_coverage.py +0 -0
  179. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_thresholds_props.py +0 -0
  180. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_thresholds_research_grounded.py +0 -0
  181. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_tokenization_leakage_check.py +0 -0
  182. {eval_toolkit-0.46.0 → eval_toolkit-0.47.0}/tests/test_v09_contracts.py +0 -0
@@ -45,6 +45,16 @@ coverage.json
45
45
  # Mutation-testing output (mutmut / cargo-mutants — local run artifacts)
46
46
  mutants/
47
47
 
48
+ # Local audit artifacts (Round 5+ Gate 3 LLM cross-review packets + reports).
49
+ # The canonical prompt lives at ~/.claude/plans/gate3-audit-prompt.md and the
50
+ # canonical findings ledger lives at docs/source/audit_findings.md; per-run
51
+ # raw model outputs are author-local working copies.
52
+ # Tracked: per-round briefing files (`gate3-audit-round-<N>.md`).
53
+ # Untracked: prompt template, generic report, per-round report files.
54
+ gate3-audit-prompt.md
55
+ gate3-audit-report.md
56
+ gate3-audit-round-*-report.md
57
+
48
58
  # Claude Code project settings (machine-local)
49
59
  .claude/
50
60
 
@@ -5,6 +5,205 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.47.0] — 2026-05-21 — Sweep unification + TextTransform + advanced-6 + cleanup + Round 6 follow-on
9
+
10
+ Second BREAKING minor of the staggered v0.45 → v0.46 → v0.46.1 → v0.47 →
11
+ v0.48 → v1.0 release sequence (plan
12
+ ``~/.claude/plans/evaluate-all-the-work-twinkly-kite.md``, Step 3).
13
+
14
+ Closes:
15
+
16
+ - The v0.43 CHANGELOG forward-look re: advanced-6 character-injection
17
+ techniques (Decision Q11→11.3 — "12-technique suite + new sweep API
18
+ in one migration step").
19
+ - Round 6 audit follow-on items per Decision R6-E (R6-A docstring,
20
+ R6-B duplicate name guard, R6-C to_pandas schema, R6-D Protocol
21
+ method-shape drift guard, R6-F5 narrow except, R6-F6 plan/roadmap
22
+ refresh, R6-H make_spec_name helper) — see ``docs/source/audit_findings.md``
23
+ for the per-finding ledger.
24
+
25
+ ### Removed (BREAKING)
26
+
27
+ - **Top-level scalar metric names** (``eval_toolkit.pr_auc``,
28
+ ``eval_toolkit.roc_auc``, ``eval_toolkit.brier_score``, all 5
29
+ ``expected_calibration_*`` variants) — the v0.46 ``__getattr__``
30
+ deprecation shim has been deleted. These names now raise
31
+ ``AttributeError`` at the top-level. Migration: use ``scorecard(...)``
32
+ with ``metric_specs`` (primary) OR import from the
33
+ ``eval_toolkit.metrics`` submodule (internal API per ADR 0002).
34
+ (Decision L; plan §4D.)
35
+ - **Module-level ``adversarial.sweep`` + ``preprocessing.sweep``** —
36
+ consolidated into the top-level :func:`sweep` (Decision D + plan §4C).
37
+ Parity tests in Sub-PR 4 of this release proved 1:1 output equivalence
38
+ on the neutral subset.
39
+ - **``adversarial.character_injection`` + ``preprocessing.spotlighting``
40
+ ``SimpleNamespace`` shortcuts** — removed (Decision N + plan §4E).
41
+ The 12 adversarial dataclasses + the 3 preprocessing variants + the
42
+ underlying functional API are the only public paths.
43
+ - **``adversarial.CharacterInjectionStrategy``** per-module Protocol —
44
+ removed. The top-level :class:`TextTransform` Protocol (Decision K)
45
+ is the single canonical contract; all 12 character-injection
46
+ dataclasses + 3 preprocessing variants satisfy it structurally.
47
+
48
+ ### Added
49
+
50
+ - **``TextTransform`` Protocol** (top-level; ``eval_toolkit.protocols`` module).
51
+ Decision K + Audit R5-F3 (Codex Round 5): unifies the "name + transform(text)"
52
+ shape across preprocessing (defence) and adversarial (attack) strategies so
53
+ the v0.47 top-level :func:`sweep` (added in this release) can mix them in one call. The
54
+ 9th strict Tier-2 Protocol per ADR 0003.
55
+ - **Advanced 6 character-injection techniques** (plan §4F, Decision Q11→11.3) —
56
+ closes the v0.43.0 CHANGELOG forward-look that referenced these as "scheduled
57
+ for v0.43.1" (a version that never shipped). Each satisfies the top-level
58
+ :class:`TextTransform` Protocol structurally; all are frozen + ``slots=True``
59
+ dataclasses with deterministic behaviour under their ``seed`` kwarg where
60
+ applicable:
61
+
62
+ - :class:`BidiRTLInjection` — wrap input in ``U+202E … U+202C``
63
+ RIGHT-TO-LEFT OVERRIDE block.
64
+ - :class:`TagStrippingInjection` — strip HTML/XML-like ``<…>`` tags
65
+ (idempotent).
66
+ - :class:`SynonymSubstitution` — replace whitelisted prompt-injection-
67
+ relevant function words / verbs with semantic-preserving synonyms.
68
+ - :class:`TokenSplitting` — insert a single space inside sufficiently long
69
+ words; forces subword tokenizers to re-segment.
70
+ - :class:`UnicodeNormalization` — NFC / NFD / NFKC / NFKD; default NFKC
71
+ folds compatibility chars (e.g., fullwidth ``ＡＢＣ`` → ``ABC``).
72
+ - :class:`InvisibleCharsInjection` — sample from the 5-element invisible-
73
+ code-point set (ZWSP, ZWNJ, ZWJ, word joiner, BOM) — distinct from the
74
+ single-codepoint :class:`ZeroWidthSpaceInjection`.
75
+
76
+ Also exported: ``ADVANCED_TECHNIQUES`` (6-tuple) and ``ALL_TECHNIQUES``
77
+ (12-tuple = core 6 + advanced 6).
78
+ - **Top-level :func:`sweep`** — single ``TextTransform`` enumeration entry
79
+ point (Decision K + Decision D + Audit R5-F3). Replaces the per-module
80
+ ``adversarial.sweep`` + ``preprocessing.sweep`` (those are removed in a
81
+ subsequent sub-PR of this release). New contract:
82
+
83
+ - ``sweep(strategies, texts)`` → neutral DataFrame with ``text_id`` /
84
+ ``variant`` / ``transformed_text`` columns. Pure text-transform
85
+ enumeration; defence + attack strategies compose freely.
86
+ - ``sweep(..., scorer=...)`` → also emits ``original_score`` /
87
+ ``transformed_score`` columns (single batched scorer call per
88
+ strategy, not per-row).
89
+ - ``sweep(..., scorer=..., attack_threshold=t)`` → also emits ``asr``
90
+ (per-row attack-success flag). Explicit threshold REQUIRED to
91
+ materialize ``asr``; no magic ``threshold=0.5`` default.
92
+ ``attack_threshold`` without ``scorer`` raises ``ValueError``.
93
+
94
+ Parity tests against the existing module-level sweeps ship in this
95
+ sub-PR (``tests/test_sweep.py``) and prove the v0.47 consolidation
96
+ produces identical transformed-text rows for the 6 core character-
97
+ injection techniques + the 3 spotlighting variants.
98
+ - **3 preprocessing dataclasses** (``DelimitVariant``, ``DatamarkVariant``,
99
+ ``EncodeVariant``) in :mod:`eval_toolkit.preprocessing`. Frozen +
100
+ ``slots=True`` thin wrappers over the existing :func:`delimit` /
101
+ :func:`datamark` / :func:`encode` functions. Closes Audit R5-F3
102
+ (Codex Round 5) — prior to this commit, ``preprocessing.__all__`` exported
103
+ only functions, so the "concrete classes satisfy ``TextTransform``
104
+ structurally" claim only held on the adversarial side. Now both sides
105
+ share the dataclass-strategy shape.
106
+ - ``metric_specs.make_spec_name(prefix, **kwargs)`` canonicalization helper
107
+ for custom parameterized :class:`MetricSpec` implementations. Alphabetized
108
+ kwargs joined by underscore — same convention the v0.46 ECE factory uses.
109
+ Lands in ``metric_specs.__all__`` only; **not** top-level ``__all__`` per
110
+ Decision R6-H. (Closes Round 6 Gemini R6-F4.)
111
+
112
+ ### Changed (Round 6 follow-on)
113
+
114
+ - ``scorecard()`` now raises ``ValueError`` when two :class:`MetricSpec`
115
+ instances in the ``metrics`` list share a ``name``. Forces caller
116
+ disambiguation; the ``Mapping[str, MetricResult]`` contract never silently
117
+ drops a cell. Error message reports both indices. (Decision R6-B; closes
118
+ Round 6 Codex R6-F3.)
119
+ - ``scorecard(seed=None)`` docstring rewritten to document the deterministic-
120
+ by-default contract (``None`` is treated as ``seed=0``). No behavior
121
+ change; v0.46 documented the wrong contract. (Decision R6-A; closes Round 6
122
+ Codex R6-F4 + Gemini R6-F1.)
123
+ - ``_evaluate_spec()`` exception catches narrowed: ``MemoryError``,
124
+ ``RecursionError``, ``KeyboardInterrupt``, and ``SystemExit`` now propagate
125
+ out of ``scorecard()`` instead of being captured as a ``status="error"``
126
+ cell. Per-cell isolation remains for ordinary application errors.
127
+ (Decision R6-F5; closes Round 6 Gemini R6-F5.)
128
+ - ``Scorecard.to_pandas()`` MultiIndex schema extended with two new inner-
129
+ field columns: ``n_resamples`` (int / NaN sentinel) and ``method``
130
+ (string / ``""`` sentinel). The DataFrame view is now lossless against
131
+ :meth:`BootstrapCI.to_dict` — trace provenance (resample count + CI
132
+ method) no longer drops at the DataFrame boundary. Callers indexing the
133
+ MultiIndex by name keep working; callers indexing by position must
134
+ re-check column offsets. (Decision R6-C; closes Round 6 Gemini R6-F3.)
135
+ - ``tests/test_public_api.py`` drift guard now captures method signatures
136
+ for ``typing.Protocol`` classes in ``__all__`` (a ``protocol_methods``
137
+ sub-entry in the snapshot). Together with a Tier-2 coverage test, this
138
+ actually enforces the strict method-shape stability ADR 0003 promises
139
+ for the 9 Tier-2 Protocols. (Decision R6-D; closes Round 6 Codex R6-F5.)
140
+ Public-API golden regenerated alongside this change.
141
+
142
+ ## [0.46.1] — 2026-05-21 — Round 6 hotfix: ECE strategy validation + deprecation warning content
143
+
144
+ Hotfix release per **Decision Q** (data correctness regression + time-sensitive
145
+ warning content) + **Decision R6-E** (scope: R6-F1 + R6-F2 only; R6-A docstring
146
+ rolls forward to v0.47). All other Round 6 findings dispositioned to v0.47.0.
147
+
148
+ See [`docs/source/audit_findings.md`](docs/source/audit_findings.md) Round 6 for
149
+ the full disposition ledger.
150
+
151
+ ### Fixed
152
+
153
+ - **`metric_specs.ece(strategy=<value>)` strategy validation** (Round 6 Codex
154
+ R6-F1). Prior to v0.46.1, an invalid strategy string (e.g.
155
+ `metric_specs.ece(strategy="typo")`) silently dispatched to quantile ECE and
156
+ returned a `scorecard()` cell with `status="ok"` under an invalid encoded key
157
+ (`"ece_n_bins_15_strategy_typo"`) — wrong-by-design data correctness path.
158
+ Verified by Codex via runtime probe. Now both the `ece()` factory and
159
+ `_EceSpec.compute()` raise:
160
+ ```
161
+ ValueError: ECE strategy must be 'uniform' or 'quantile'; got 'typo'
162
+ ```
163
+ Defence-in-depth: the factory validates eagerly (before LRU cache hit) AND
164
+ `compute()` validates at the compute boundary so direct construction of
165
+ `_EceSpec(strategy="typo")` (bypassing the factory) also raises.
166
+
167
+ - **Deprecation warning content for all 5 ECE variants** (Round 6 Codex R6-F2 +
168
+ Gemini R6-F2, with Decisions R6-F + R6-G). The v0.46.0 `__getattr__`
169
+ deprecation shim's warning messages produced broken migration snippets:
170
+ - For `expected_calibration_error` + `expected_calibration_error_equal_mass`:
171
+ the suggested `Scorecard` lookup key was the factory-call expression
172
+ (`"ece(n_bins=10)"`) instead of the encoded spec name
173
+ (`"ece_n_bins_10_strategy_uniform"`). Now uses the correct encoded key.
174
+ - For `expected_calibration_error_debiased` / `_l2` / `_l2_debiased`: these
175
+ variants are not in the v0.46 `metric_specs` namespace (Decision R6-G;
176
+ research-completeness primitives, deferred to v1.x if user demand
177
+ surfaces). Their warnings now point at the submodule path
178
+ (`from eval_toolkit.metrics import expected_calibration_error_debiased`)
179
+ instead of an unconstructable scorecard snippet.
180
+ - Pre-v0.46 default verification: Gemini's report claimed
181
+ `expected_calibration_error` defaulted to `n_bins=15`; verified against
182
+ `metrics.py:730-734` that the actual default is `n_bins=10`. Per Decision
183
+ R6-F, warning snippets use `n_bins=10` to preserve bit-identical pre-v0.46
184
+ math + add a migration note explaining the new `metric_specs.ece()` factory
185
+ default of `n_bins=15` (matching Hines et al.).
186
+
187
+ ### Tests
188
+
189
+ - `tests/test_scorecard.py`: 4 new tests for ECE strategy validation
190
+ (parametrized factory-rejection + compute-defence-in-depth).
191
+ - `tests/test_deprecated_scalars_shim.py`: 4 new test classes — verify each
192
+ warning contains correct factory expression + encoded scorecard key, ECE
193
+ warnings carry the n_bins=10/15 migration note, submodule-only warnings cite
194
+ `eval_toolkit.metrics` path, and the snippet in each first-party warning is
195
+ EXECUTABLE (parses + runs against synthetic data + produces ok-status cell).
196
+
197
+ ### Rolled forward to v0.47 (Decision R6-E)
198
+
199
+ - R6-A `seed=None` docstring fix (non-blocker per Decision Q).
200
+ - R6-F3 (Codex) duplicate `MetricSpec.name` rejection.
201
+ - R6-F5 (Codex) Protocol method-shape drift guard.
202
+ - R6-F3 (Gemini) `Scorecard.to_pandas()` schema expansion.
203
+ - R6-F4 (Gemini) `make_spec_name()` helper.
204
+ - R6-F5 (Gemini) narrow `_evaluate_spec()` exception catch.
205
+ - R6-F6 (Codex) plan + roadmap state-drift refresh.
206
+
8
207
  ## [0.46.0] — 2026-05-21 — Scorecard: primary v1.0 metric surface (closes #36)
9
208
 
10
209
  Second minor of the staggered v0.45 → v0.46 → v0.47 → v0.48 → v1.0 sequence.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.46.0
3
+ Version: 0.47.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -215,7 +215,7 @@ pip install "eval-toolkit[all]" # everything
215
215
 
216
216
  ```python
217
217
  import numpy as np
218
- from eval_toolkit import pr_auc, roc_auc, expected_calibration_error
218
+ from eval_toolkit.metrics import pr_auc, roc_auc, expected_calibration_error
219
219
 
220
220
  rng = np.random.default_rng(42)
221
221
  y = rng.integers(0, 2, size=200)
@@ -230,7 +230,8 @@ print(f"ECE (10 bins): {expected_calibration_error(y, s, n_bins=10):.3f}")
230
230
  ### Bootstrap confidence intervals
231
231
 
232
232
  ```python
233
- from eval_toolkit import bootstrap_ci, paired_bootstrap_diff, pr_auc
233
+ from eval_toolkit import bootstrap_ci, paired_bootstrap_diff
234
+ from eval_toolkit.metrics import pr_auc
234
235
 
235
236
  ci = bootstrap_ci(y, s, pr_auc, n_resamples=1000, seed=42)
236
237
  print(f"PR-AUC: {ci.point_estimate:.3f} 95% CI: [{ci.ci_low:.3f}, {ci.ci_high:.3f}]")
@@ -244,8 +245,10 @@ print(f"Δ PR-AUC: {diff.delta:.3f} overlaps zero: {diff.overlaps_zero}")
244
245
  ### Temperature scaling (Guo et al. 2017)
245
246
 
246
247
  ```python
248
+ import numpy as np
247
249
  from eval_toolkit import fit_temperature
248
250
 
251
+ rng = np.random.default_rng(42)
249
252
  logits = rng.normal(size=(500, 2))
250
253
  labels = (logits[:, 1] > logits[:, 0]).astype(int)
251
254
  result = fit_temperature(logits, labels)
@@ -132,7 +132,7 @@ pip install "eval-toolkit[all]" # everything
132
132
 
133
133
  ```python
134
134
  import numpy as np
135
- from eval_toolkit import pr_auc, roc_auc, expected_calibration_error
135
+ from eval_toolkit.metrics import pr_auc, roc_auc, expected_calibration_error
136
136
 
137
137
  rng = np.random.default_rng(42)
138
138
  y = rng.integers(0, 2, size=200)
@@ -147,7 +147,8 @@ print(f"ECE (10 bins): {expected_calibration_error(y, s, n_bins=10):.3f}")
147
147
  ### Bootstrap confidence intervals
148
148
 
149
149
  ```python
150
- from eval_toolkit import bootstrap_ci, paired_bootstrap_diff, pr_auc
150
+ from eval_toolkit import bootstrap_ci, paired_bootstrap_diff
151
+ from eval_toolkit.metrics import pr_auc
151
152
 
152
153
  ci = bootstrap_ci(y, s, pr_auc, n_resamples=1000, seed=42)
153
154
  print(f"PR-AUC: {ci.point_estimate:.3f} 95% CI: [{ci.ci_low:.3f}, {ci.ci_high:.3f}]")
@@ -161,8 +162,10 @@ print(f"Δ PR-AUC: {diff.delta:.3f} overlaps zero: {diff.overlaps_zero}")
161
162
  ### Temperature scaling (Guo et al. 2017)
162
163
 
163
164
  ```python
165
+ import numpy as np
164
166
  from eval_toolkit import fit_temperature
165
167
 
168
+ rng = np.random.default_rng(42)
166
169
  logits = rng.normal(size=(500, 2))
167
170
  labels = (logits[:, 1] > logits[:, 0]).astype(int)
168
171
  result = fit_temperature(logits, labels)
@@ -73,4 +73,6 @@ What would have to change for this decision to be reopened?
73
73
 
74
74
  | # | Title | Status | Date |
75
75
  |---|---|---|---|
76
- | _none yet_ | | | |
76
+ | [0001](0001-flat-module-layout.md) | Flat single-file modules through v1.x | Accepted | 2026-05-21 |
77
+ | [0002](0002-scorecard-as-primary-metric-surface.md) | `scorecard()` as the primary v1.0 metric surface | Accepted | 2026-05-21 |
78
+ | [0003](0003-stability-contract-and-gate3-methodology.md) | v1.0 stability contract + Gate 3 methodology | Accepted | 2026-05-21 |
@@ -31,22 +31,36 @@ _logging.getLogger("eval_toolkit").addHandler(_logging.NullHandler())
31
31
  # tests/golden/public_api/ reads dict keys + values, not comments.
32
32
  _EXPORTS: dict[str, str] = {
33
33
  # --- adversarial ---
34
+ "ADVANCED_TECHNIQUES": "eval_toolkit.adversarial",
35
+ "ALL_TECHNIQUES": "eval_toolkit.adversarial",
36
+ "BidiRTLInjection": "eval_toolkit.adversarial",
34
37
  "CORE_TECHNIQUES": "eval_toolkit.adversarial",
35
38
  "CaseRandomization": "eval_toolkit.adversarial",
36
- "CharacterInjectionStrategy": "eval_toolkit.adversarial",
37
39
  "DiacriticInjection": "eval_toolkit.adversarial",
38
40
  "HomoglyphSubstitution": "eval_toolkit.adversarial",
41
+ "InvisibleCharsInjection": "eval_toolkit.adversarial",
39
42
  "PunctuationInjection": "eval_toolkit.adversarial",
43
+ "SynonymSubstitution": "eval_toolkit.adversarial",
44
+ "TagStrippingInjection": "eval_toolkit.adversarial",
45
+ "TokenSplitting": "eval_toolkit.adversarial",
46
+ "UnicodeNormalization": "eval_toolkit.adversarial",
40
47
  "WhitespaceInjection": "eval_toolkit.adversarial",
41
48
  "ZeroWidthSpaceInjection": "eval_toolkit.adversarial",
42
- "character_injection": "eval_toolkit.adversarial",
49
+ # CharacterInjectionStrategy + character_injection SimpleNamespace
50
+ # removed at v0.47 (Decision N + plan §4E). TextTransform Protocol +
51
+ # the 12 concrete dataclasses are now the only public path.
43
52
  # --- losses ---
44
53
  "RecallAtLowFPR": "eval_toolkit.losses",
45
54
  # --- preprocessing ---
55
+ # `spotlighting` SimpleNamespace removed at v0.47 (Decision N + plan §4E).
56
+ # The 3 Variant dataclasses + the underlying functional API are the
57
+ # only public path.
58
+ "DatamarkVariant": "eval_toolkit.preprocessing",
59
+ "DelimitVariant": "eval_toolkit.preprocessing",
60
+ "EncodeVariant": "eval_toolkit.preprocessing",
46
61
  "datamark": "eval_toolkit.preprocessing",
47
62
  "delimit": "eval_toolkit.preprocessing",
48
63
  "encode": "eval_toolkit.preprocessing",
49
- "spotlighting": "eval_toolkit.preprocessing",
50
64
  # --- probes ---
51
65
  "ActivationDeltaProbe": "eval_toolkit.probes",
52
66
  "ActivationExtractor": "eval_toolkit.probes",
@@ -247,6 +261,7 @@ _EXPORTS: dict[str, str] = {
247
261
  "PredictionReader": "eval_toolkit.protocols",
248
262
  "Scorer": "eval_toolkit.protocols",
249
263
  "SliceAwareScorer": "eval_toolkit.protocols",
264
+ "TextTransform": "eval_toolkit.protocols",
250
265
  "Versioned": "eval_toolkit.protocols",
251
266
  # --- seeds ---
252
267
  "set_global_seeds": "eval_toolkit.seeds",
@@ -298,64 +313,28 @@ _EXPORTS: dict[str, str] = {
298
313
  "MetricSpec": "eval_toolkit._scorecard",
299
314
  "Scorecard": "eval_toolkit._scorecard",
300
315
  "scorecard": "eval_toolkit._scorecard",
316
+ # --- sweep (top-level v0.47 unification — Decision K + Decision D) ---
317
+ "sweep": "eval_toolkit._sweep",
301
318
  }
302
319
 
303
320
  __all__ = ["__version__", *_EXPORTS.keys()]
304
321
 
305
322
 
306
- # ── BEGIN TRANSITIONAL DEPRECATION BRANCH (Decision L; REMOVE AT v0.47) ──
307
- # At v0.46 the scalar metric functions left the top-level `_EXPORTS` map (above)
308
- # in favor of the `scorecard()` surface (Decision A). To give the consumer one
309
- # release of overlap before the hard removal at v0.47, the names below remain
310
- # reachable via the package-level `__getattr__` (which delegates to the
311
- # `eval_toolkit.metrics` submodule) but emit a `DeprecationWarning` on first
312
- # lookup pointing at the new API.
313
- #
314
- # WHY THIS IS A BRANCH, NOT A REPLACEMENT (Audit F4 — Round 5):
315
- # `__getattr__` below is the load-bearing lazy export resolver for every name
316
- # in `_EXPORTS`. The deprecation branch is a discrete `if name in
317
- # _DEPRECATED_SCALARS` check ABOVE the resolver — the resolver's existing
318
- # behavior for non-deprecated names is unchanged. At v0.47 we delete this
319
- # transitional block and the resolver continues to work for every remaining
320
- # `_EXPORTS` entry.
321
- _DEPRECATED_SCALARS: frozenset[str] = frozenset(
322
- {
323
- "pr_auc",
324
- "roc_auc",
325
- "brier_score",
326
- "expected_calibration_error",
327
- "expected_calibration_error_debiased",
328
- "expected_calibration_error_equal_mass",
329
- "expected_calibration_error_l2",
330
- "expected_calibration_error_l2_debiased",
331
- }
332
- )
333
- # ── END TRANSITIONAL DEPRECATION (Decision L; REMOVE AT v0.47) ──
334
-
335
-
336
323
  def __getattr__(name: str) -> Any:
337
- """Resolve public symbols lazily."""
324
+ """Resolve public symbols lazily.
325
+
326
+ v0.47 cleanup (Decision L): the BEGIN/END TRANSITIONAL DEPRECATION
327
+ BRANCH that v0.46 inserted in front of the resolver — together with the
328
+ ``_DEPRECATED_SCALARS`` frozenset and the ``_scorecard_spec_for``
329
+ helper — has been removed. The lazy resolver below is the v0.46 base
330
+ behavior; with the transitional block gone, deprecated v0.45 scalar names
331
+ (``pr_auc``, ``roc_auc``, ``brier_score``, the 5 ``expected_calibration_*``
332
+ variants) now raise :class:`AttributeError` cleanly. Submodule-level
333
+ access (e.g., ``from eval_toolkit.metrics import pr_auc``) is unaffected
334
+ per Decision C / ADR 0002.
335
+ """
338
336
  if name == "__version__":
339
337
  return __version__
340
- # ── BEGIN TRANSITIONAL DEPRECATION BRANCH (Decision L; REMOVE AT v0.47) ──
341
- if name in _DEPRECATED_SCALARS:
342
- import warnings
343
-
344
- warnings.warn(
345
- f"eval_toolkit.{name} is deprecated and will be removed in v0.47. "
346
- f"Use `scorecard(y, s, metrics=[metric_specs.{_scorecard_spec_for(name)}])"
347
- f'["{_scorecard_spec_for(name)}"].value` instead, or "import from the'
348
- f" `eval_toolkit.metrics` submodule directly (internal API).",
349
- DeprecationWarning,
350
- stacklevel=2,
351
- )
352
- module = import_module("eval_toolkit.metrics")
353
- value = getattr(module, name)
354
- # Do NOT cache in globals() — repeated lookups should keep re-warning
355
- # (one warning per call site, modulo Python's default
356
- # DeprecationWarning de-duplication).
357
- return value
358
- # ── END TRANSITIONAL DEPRECATION (Decision L; REMOVE AT v0.47) ──
359
338
  module_name = _EXPORTS.get(name)
360
339
  if module_name is None:
361
340
  raise AttributeError(f"module 'eval_toolkit' has no attribute {name!r}")
@@ -365,29 +344,6 @@ def __getattr__(name: str) -> Any:
365
344
  return value
366
345
 
367
346
 
368
- # ── BEGIN TRANSITIONAL DEPRECATION HELPER (Decision L; REMOVE AT v0.47) ──
369
- def _scorecard_spec_for(deprecated_name: str) -> str:
370
- """Map a deprecated-scalar name to its `metric_specs` replacement name.
371
-
372
- Used only inside the v0.46 deprecation warning message. Returns the
373
- closest equivalent first-party spec name where one exists; falls back
374
- to the original name for ECE variants whose exact-match spec isn't in
375
- the v0.46 first-party namespace (e.g., the L2 / debiased variants —
376
- callers either implement a custom `MetricSpec` or stay on the
377
- submodule path).
378
- """
379
- return {
380
- "pr_auc": "pr_auc",
381
- "roc_auc": "roc_auc",
382
- "brier_score": "brier",
383
- "expected_calibration_error": "ece(n_bins=10)",
384
- "expected_calibration_error_equal_mass": 'ece(n_bins=10, strategy="quantile")',
385
- }.get(deprecated_name, deprecated_name)
386
-
387
-
388
- # ── END TRANSITIONAL DEPRECATION HELPER (Decision L; REMOVE AT v0.47) ──
389
-
390
-
391
347
  def __dir__() -> list[str]:
392
348
  """Expose lazy public symbols to introspection."""
393
349
  return sorted(__all__)
@@ -261,10 +261,17 @@ class Scorecard(Mapping[str, MetricResult]):
261
261
  ``ImportError`` with an install hint when pandas is missing.
262
262
 
263
263
  The DataFrame has 1 row (one slice) and a 2-level column index:
264
- outer = metric name, inner = field name in
265
- ``{"value", "status", "reason", "ci_low", "ci_high", "confidence"}``.
266
- ``ci_low`` / ``ci_high`` / ``confidence`` are ``NaN`` / ``""`` when
267
- no CI is present.
264
+ outer = metric name, inner = field name in ``{"value", "status",
265
+ "reason", "ci_low", "ci_high", "confidence", "n_resamples",
266
+ "method"}``. CI-related columns (``ci_low``, ``ci_high``,
267
+ ``confidence``, ``n_resamples``, ``method``) are sentinel-valued
268
+ (``NaN`` for numeric, ``""`` for string) when no CI is present
269
+ (status="skipped" / "error", or bootstrap=False).
270
+
271
+ Decision R6-C (Round 6 audit, Gemini F3): the v0.47 expansion adds
272
+ ``n_resamples`` + ``method`` so the schema is lossless against
273
+ :meth:`BootstrapCI.to_dict` — trace provenance no longer drops in
274
+ the DataFrame view.
268
275
  """
269
276
  try:
270
277
  import pandas as pd
@@ -285,6 +292,8 @@ class Scorecard(Mapping[str, MetricResult]):
285
292
  (name, "ci_low"),
286
293
  (name, "ci_high"),
287
294
  (name, "confidence"),
295
+ (name, "n_resamples"),
296
+ (name, "method"),
288
297
  ]
289
298
  )
290
299
  values.extend(
@@ -295,6 +304,8 @@ class Scorecard(Mapping[str, MetricResult]):
295
304
  result.ci.ci_low if result.ci is not None else float("nan"),
296
305
  result.ci.ci_high if result.ci is not None else float("nan"),
297
306
  result.ci.confidence if result.ci is not None else float("nan"),
307
+ result.ci.n_resamples if result.ci is not None else float("nan"),
308
+ result.ci.method if result.ci is not None else "",
298
309
  ]
299
310
  )
300
311
 
@@ -341,7 +352,13 @@ def scorecard(
341
352
  confidence : float, optional
342
353
  Two-sided CI level ∈ ``(0, 1)``. Default ``0.95``.
343
354
  seed : int or None, optional
344
- Bootstrap RNG seed. Default ``None`` (non-deterministic).
355
+ Bootstrap RNG seed. Default ``None``, which is treated as ``seed=0``
356
+ for reproducibility — eval-toolkit's evaluation pipelines are
357
+ deterministic by default. Pass an explicit integer to control the
358
+ bootstrap RNG; pass a value derived from
359
+ ``np.random.SeedSequence().entropy`` for non-deterministic sampling.
360
+ Decision R6-A (Round 6 audit) locked the deterministic-by-default
361
+ contract; the prior docstring framing was incorrect.
345
362
 
346
363
  Returns
347
364
  -------
@@ -410,6 +427,7 @@ def scorecard(
410
427
  confidence=confidence,
411
428
  bootstrap=bootstrap,
412
429
  )
430
+ _validate_unique_spec_names(metrics)
413
431
 
414
432
  is_single_class = bool(np.unique(y_true_arr).size < 2)
415
433
 
@@ -450,6 +468,11 @@ def _evaluate_spec(
450
468
 
451
469
  try:
452
470
  point = float(spec.compute(y_true, y_score))
471
+ except (MemoryError, RecursionError, KeyboardInterrupt, SystemExit):
472
+ # Process-exhaustion / user-interrupt signals must propagate;
473
+ # per-cell isolation is for application-level errors only.
474
+ # Decision R6-F5 (Round 6 audit, Gemini).
475
+ raise
453
476
  except Exception as exc: # noqa: BLE001 — broad catch is intentional (per-cell isolation)
454
477
  return MetricResult(
455
478
  value=None,
@@ -469,6 +492,9 @@ def _evaluate_spec(
469
492
  confidence=confidence,
470
493
  seed=seed if seed is not None else 0,
471
494
  )
495
+ except (MemoryError, RecursionError, KeyboardInterrupt, SystemExit):
496
+ # Same R6-F5 invariant for the bootstrap path.
497
+ raise
472
498
  except Exception as exc: # noqa: BLE001
473
499
  # Point estimate succeeded; the bootstrap couldn't (e.g., n < 10
474
500
  # floor from bootstrap.py:198, BCa degeneracy, etc.). Record the
@@ -507,3 +533,21 @@ def _validate_scorecard_inputs(
507
533
  raise ValueError(f"n_resamples must be >= 1 when bootstrap=True; got {n_resamples}")
508
534
  if not 0.0 < confidence < 1.0:
509
535
  raise ValueError(f"confidence must be in (0, 1); got {confidence}")
536
+
537
+
538
+ def _validate_unique_spec_names(metrics: Sequence[MetricSpec]) -> None:
539
+ """Reject duplicate MetricSpec.name values in a single scorecard() call.
540
+
541
+ Locked by Decision R6-B (Round 6 audit, Codex R6-F3): silent last-wins
542
+ overwrite is not a documented Mapping[str, MetricResult] contract. Force
543
+ the caller to disambiguate so we never lose data on user error.
544
+ """
545
+ seen: dict[str, int] = {}
546
+ for i, spec in enumerate(metrics):
547
+ if spec.name in seen:
548
+ raise ValueError(
549
+ f"Duplicate MetricSpec name {spec.name!r} at index {i} "
550
+ f"(previously at index {seen[spec.name]}); each spec must have a "
551
+ f"unique name for the Scorecard Mapping[str, MetricResult] contract."
552
+ )
553
+ seen[spec.name] = i