eval-toolkit 0.47.0__tar.gz → 0.48.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182) hide show
  1. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/CHANGELOG.md +90 -0
  2. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/PKG-INFO +1 -1
  3. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/__init__.py +15 -9
  4. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/_scorecard.py +32 -0
  5. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/_sweep.py +120 -2
  6. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/_version.py +1 -1
  7. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/adversarial.py +40 -18
  8. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/bootstrap.py +69 -16
  9. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/calibration.py +41 -3
  10. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/config.py +1 -1
  11. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/embeddings.py +1 -1
  12. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/loaders.py +2 -3
  13. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/metrics.py +15 -0
  14. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/golden/public_api/snapshot.json +1 -1
  15. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_analysis.py +2 -3
  16. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_bootstrap_edge_cases.py +57 -0
  17. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_bootstrap_unit.py +39 -6
  18. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_coverage_bootstrap.py +4 -2
  19. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_harness_metric_options.py +4 -2
  20. eval_toolkit-0.48.0/tests/test_lazy_extras_messages.py +283 -0
  21. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_loaders.py +8 -2
  22. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_metrics_unit.py +153 -0
  23. eval_toolkit-0.48.0/tests/test_sweep.py +426 -0
  24. eval_toolkit-0.47.0/tests/test_sweep.py +0 -180
  25. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/.gitignore +0 -0
  26. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/LICENSE +0 -0
  27. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/README.md +0 -0
  28. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/STYLE.md +0 -0
  29. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/docs/archive/README.md +0 -0
  30. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/docs/research/README.md +0 -0
  31. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/docs/research/datasets/README.md +0 -0
  32. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/docs/research/papers/data-integrity/README.md +0 -0
  33. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  34. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/docs/research/papers/inference/README.md +0 -0
  35. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/docs/research/papers/prompt-injection/README.md +0 -0
  36. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/docs/source/adr/README.md +0 -0
  37. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/docs/source/methodology/README.md +0 -0
  38. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/pyproject.toml +0 -0
  39. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/__main__.py +0 -0
  40. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/_deprecated.py +0 -0
  41. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/_parallel.py +0 -0
  42. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/analysis.py +0 -0
  43. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/artifacts.py +0 -0
  44. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/claims.py +0 -0
  45. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/docs.py +0 -0
  46. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/evidence.py +0 -0
  47. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/harness.py +0 -0
  48. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/leakage.py +0 -0
  49. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/losses.py +0 -0
  50. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/manifest.py +0 -0
  51. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/metric_specs.py +0 -0
  52. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/operating_points.py +0 -0
  53. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/paths.py +0 -0
  54. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/plotting.py +0 -0
  55. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/preprocessing.py +0 -0
  56. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/probes.py +0 -0
  57. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/protocols.py +0 -0
  58. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/provenance.py +0 -0
  59. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/py.typed +0 -0
  60. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  61. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  62. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  63. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  64. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  65. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  66. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/seeds.py +0 -0
  67. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/splits.py +0 -0
  68. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/stacking.py +0 -0
  69. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/text_dedup.py +0 -0
  70. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/src/eval_toolkit/thresholds.py +0 -0
  71. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  72. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  73. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  74. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  75. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  76. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  77. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  78. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  79. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  80. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  81. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/benchmarks/__init__.py +0 -0
  82. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  83. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/conftest.py +0 -0
  84. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  85. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  86. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  87. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  88. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/golden/docs/expected.md +0 -0
  89. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/golden/docs/input.md +0 -0
  90. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/golden/docs/metrics.json +0 -0
  91. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  92. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/strategies.py +0 -0
  93. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_adversarial.py +0 -0
  94. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_artifacts.py +0 -0
  95. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  96. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  97. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_bootstrap_golden.py +0 -0
  98. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_bootstrap_njobs.py +0 -0
  99. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_bootstrap_props.py +0 -0
  100. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_bootstrap_research_grounded.py +0 -0
  101. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_calibration_binary_adapters.py +0 -0
  102. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  103. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_calibration_determinism.py +0 -0
  104. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_calibration_optimization_failures.py +0 -0
  105. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_calibration_props.py +0 -0
  106. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_calibration_research_grounded.py +0 -0
  107. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_calibration_unit.py +0 -0
  108. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_claims.py +0 -0
  109. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_claims_coverage.py +0 -0
  110. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_claims_props.py +0 -0
  111. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_cli.py +0 -0
  112. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_config.py +0 -0
  113. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_coverage_calibration.py +0 -0
  114. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_coverage_harness.py +0 -0
  115. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_coverage_metrics.py +0 -0
  116. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_coverage_plotting.py +0 -0
  117. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_croissant_e2e.py +0 -0
  118. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  119. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_deprecated_scalars_shim.py +0 -0
  120. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_deprecations.py +0 -0
  121. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_docs_golden.py +0 -0
  122. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_docs_props.py +0 -0
  123. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_embeddings.py +0 -0
  124. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_evidence_validators.py +0 -0
  125. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_harness_edge_cases.py +0 -0
  126. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_harness_fault_injection.py +0 -0
  127. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_harness_folded.py +0 -0
  128. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_harness_internals.py +0 -0
  129. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_harness_parallelism.py +0 -0
  130. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_harness_smoke.py +0 -0
  131. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_import_boundaries.py +0 -0
  132. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_is_metric_defined_for_slice.py +0 -0
  133. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_leakage.py +0 -0
  134. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_leakage_error_paths.py +0 -0
  135. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_leakage_props.py +0 -0
  136. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_loaders_coverage.py +0 -0
  137. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_loaders_props.py +0 -0
  138. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_logging.py +0 -0
  139. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_losses.py +0 -0
  140. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_manifest.py +0 -0
  141. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  142. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_manifest_props.py +0 -0
  143. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_manifest_validation.py +0 -0
  144. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_metrics_props.py +0 -0
  145. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_metrics_stratified_subsets.py +0 -0
  146. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_misc_coverage.py +0 -0
  147. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_numeric_edge_cases.py +0 -0
  148. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_ood_loader.py +0 -0
  149. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_operating_points.py +0 -0
  150. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_operating_points_props.py +0 -0
  151. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_parallel.py +0 -0
  152. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_paths.py +0 -0
  153. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_pipeline_e2e.py +0 -0
  154. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_plotting_edge.py +0 -0
  155. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_plotting_smoke.py +0 -0
  156. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_plotting_visual.py +0 -0
  157. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_preprocessing.py +0 -0
  158. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_probes.py +0 -0
  159. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_protocol_conformance.py +0 -0
  160. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_provenance.py +0 -0
  161. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_public_api.py +0 -0
  162. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_recall_at_fpr.py +0 -0
  163. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_reference_equivalence.py +0 -0
  164. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_reproducibility_integration.py +0 -0
  165. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_schemas.py +0 -0
  166. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_scorecard.py +0 -0
  167. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_seeds.py +0 -0
  168. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_splits.py +0 -0
  169. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_splits_leakage_integration.py +0 -0
  170. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_splits_props.py +0 -0
  171. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_stacking.py +0 -0
  172. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_text_dedup.py +0 -0
  173. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_text_dedup_coverage.py +0 -0
  174. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_text_dedup_props.py +0 -0
  175. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_text_dedup_strategies.py +0 -0
  176. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_thresholds.py +0 -0
  177. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_thresholds_constant_score.py +0 -0
  178. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_thresholds_coverage.py +0 -0
  179. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_thresholds_props.py +0 -0
  180. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_thresholds_research_grounded.py +0 -0
  181. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_tokenization_leakage_check.py +0 -0
  182. {eval_toolkit-0.47.0 → eval_toolkit-0.48.0}/tests/test_v09_contracts.py +0 -0
@@ -5,6 +5,96 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.48.0] — 2026-05-22 — Polish + audit-driven tightening before v1.0 (Round 7 follow-on + cross-API consistency + doc-execution gates)
9
+
10
+ Third + final BREAKING minor of the staggered v0.45 → v0.46 → v0.46.1 → v0.47
11
+ → v0.48 → v1.0 release sequence (plan
12
+ ``~/.claude/plans/evaluate-all-the-work-twinkly-kite.md``, Step 4). Migration
13
+ guide: ``docs/source/migration/v0.48.md``.
14
+
15
+ Closes:
16
+
17
+ - Round 7 audit STOP-GATE per Decision Y.2 (Codex R7-F1/F2/F3 + 6 Gemini
18
+ observations; see ``docs/source/audit_findings.md`` for the per-finding
19
+ ledger).
20
+ - Audit-as-seed extensions surfaced during plan refinement: full
21
+ module-docstring sweep across ``src/eval_toolkit/``; expanded
22
+ ``.doctest-modules`` from 11 → 21 modules; comprehensive cross-API
23
+ shape-validation consistency sweep.
24
+ - Round 5 §5E-prep packet-drift fixes (7 methodology documentation
25
+ corrections).
26
+
27
+ After v0.48 observes ≥1 consumer cycle, the Round 8 audit STOP-GATE
28
+ opens before ``v1.0.0`` tag.
29
+
30
+ ### BREAKING
31
+
32
+ - **``BootstrapCI.to_dict()`` + ``PairedBootstrapCI.to_dict()`` schema
33
+ rewrite** (§5B). Pre-v0.48 hard-coded a ``"ci_95"`` key regardless of
34
+ the actual ``confidence`` field — the key contradicted the data.
35
+ v0.48 schema is self-describing:
36
+
37
+ Before: ``{"point_estimate": p, "ci_95": [l, h], "confidence": 0.95, ...}``
38
+ After: ``{"point": p, "low": l, "high": h, "confidence": 0.95, ...}``
39
+
40
+ Migration: ``d["point_estimate"]`` → ``d["point"]``; ``d["ci_95"]``
41
+ → ``(d["low"], d["high"])``. Same rewrite for ``PairedBootstrapCI``.
42
+ - **``sweep()`` schema grows by 1 column** (§5I, Decision R7-B option C).
43
+ New ``strategy_id`` column inserted between ``text_id`` and ``variant``
44
+ carries the canonical per-row identifier built from configured
45
+ kwargs. Callers indexing by column position must re-check offsets.
46
+ - **``sweep()`` rejects duplicate ``strategy_id``** (§5I). Mirrors
47
+ R6-B's duplicate ``MetricSpec.name`` rejection in ``scorecard()``.
48
+ - **``sweep()`` validates scorer output shape** (§5J, Decision R7-C).
49
+ Wrong-shape arrays from ``Scorer.predict_proba`` raise contextual
50
+ ``ValueError`` at the boundary. Pre-v0.48: silent truncation
51
+ (overlong), ``IndexError`` (short), or ``TypeError`` (matrix-shaped).
52
+ - **``paired_bootstrap_op_point_diff()`` rejects ``val_y is test_y``**
53
+ (§5E-prep). The two-level bootstrap assumes disjoint val + test
54
+ partitions; passing the same array causes ~63.2% silent overlap.
55
+
56
+ ### Added
57
+
58
+ - **``make pre-push``** Makefile target (§5L) running all 3 doc-
59
+ execution surfaces — Sybil-collected ``.md`` fences, MyST-NB example
60
+ notebooks, and in-source ``>>>`` docstring examples. Closes the
61
+ v0.47 Sub-PR 7 incident class.
62
+ - **``nb_execution_raise_on_error = True``** in ``docs/source/conf.py``
63
+ (§5H, Decision R7-A). Docs CI now fails on notebook execution errors.
64
+ - **``.doctest-modules`` expanded** from 11 → 21 modules (§5M).
65
+
66
+ ### Changed
67
+
68
+ - **Cross-API shape-validation consistency** (§5N). Every public-API
69
+ surface with array inputs now validates shape + raises ``ValueError``
70
+ with context (rather than leaking low-level numpy/sklearn errors).
71
+ - **Standardized ``ImportError`` messages** across lazy-extras (§5C).
72
+ Canonical template: ``"<feature> requires <pkg>. Install with: pip
73
+ install eval-toolkit[<extra>]"``.
74
+ - **Pin-exact-key-set regression-guards** (§5A) for every dict-returning
75
+ metrics function. Audit revealed no drift; the tests pin existing
76
+ key sets so future drift fails CI loud.
77
+ - **Docs polish** (§5K + §5E-prep): ``SynonymSubstitution`` whitelist
78
+ ``Notes``; ``Scorecard.to_pandas()`` dtype coercion ``Notes``;
79
+ ``CostSensitiveSelector`` calibrated-prior ``Warning``; ``cv_clt_ci``
80
+ docstring per Bayle et al. (2020) Theorem 3.1; ``methodology/parallelism.md``
81
+ post-v0.36 state; ``methodology/testing.md`` reference-equivalence-gap
82
+ framing; ``methodology/calibration.md`` 4-binary-adapter family;
83
+ ``methodology/bootstrap.md`` disjoint-split example; DeLong docs
84
+ aligned to shipped state (Decision U).
85
+
86
+ ### Fixed
87
+
88
+ - **R7-F1**: 6 MyST-NB example notebooks (``docs/source/examples/*.md``)
89
+ migrated to v0.47 API; 4 module-level docstrings rewritten; 5
90
+ drifted ``docs/source/api/*.md`` autosummary lists corrected;
91
+ 8 missing ``api/*.md`` pages created; roadmap "Sybil-validated
92
+ examples" wording corrected (§5G).
93
+ - **ADR 0001** (flat-module layout) + **ADR 0003** (stability contract
94
+ + Gate 3 methodology) finalized for v1.0 (§5E + §5F).
95
+ - **schemas.md** + **methodology/claims.md** + **getting-started.md**:
96
+ ``BootstrapCI`` schema references updated for the §5B rewrite.
97
+
8
98
  ## [0.47.0] — 2026-05-21 — Sweep unification + TextTransform + advanced-6 + cleanup + Round 6 follow-on
9
99
 
10
100
  Second BREAKING minor of the staggered v0.45 → v0.46 → v0.46.1 → v0.47 →
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.47.0
3
+ Version: 0.48.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -1,9 +1,12 @@
1
1
  """eval-toolkit — reusable evaluation contracts for binary classification.
2
2
 
3
- Public API remains available from ``eval_toolkit`` and from submodules:
3
+ The v1.0 primary metric surface is :func:`~eval_toolkit.scorecard` plus the
4
+ :mod:`~eval_toolkit.metric_specs` namespace (ADR 0002). Submodule paths
5
+ remain available for scalar primitives and adapter authors:
4
6
 
5
- from eval_toolkit import pr_auc, bootstrap_ci, BootstrapCI
6
- from eval_toolkit.metrics import pr_auc
7
+ from eval_toolkit import scorecard, metric_specs as ms
8
+ from eval_toolkit import bootstrap_ci, BootstrapCI
9
+ from eval_toolkit.metrics import pr_auc # internal API, ADR 0002
7
10
 
8
11
  The package root uses lazy exports so importing ``eval_toolkit`` does not
9
12
  eagerly import optional-heavy modules such as plotting, loaders, or harnesses.
@@ -207,12 +210,15 @@ _EXPORTS: dict[str, str] = {
207
210
  "SINGLE_CLASS_INCOMPATIBLE_METRICS": "eval_toolkit.metrics",
208
211
  "ThresholdResult": "eval_toolkit.metrics",
209
212
  "brier_decomposition": "eval_toolkit.metrics",
210
- # `brier_score`, `pr_auc`, `roc_auc`, and the 5 ECE variants removed from
211
- # `_EXPORTS` at v0.46 (Decision L). They remain reachable at the top
212
- # level via the `__getattr__` deprecation branch (emits
213
- # `DeprecationWarning`; branch removed at v0.47) and via the metrics
214
- # submodule (`from eval_toolkit.metrics import pr_auc` — internal API
215
- # per ADR 0002, not part of the v1.0 stability contract).
213
+ # `brier_score`, `pr_auc`, `roc_auc`, and the 5 ECE variants were removed
214
+ # from `_EXPORTS` at v0.46 (Decision L); the v0.46 `__getattr__`
215
+ # deprecation branch that kept them reachable with `DeprecationWarning`
216
+ # was removed at v0.47. They now raise `AttributeError` at the top level.
217
+ # The metrics submodule (`from eval_toolkit.metrics import pr_auc`)
218
+ # remains the only working import path for scalar primitives — internal
219
+ # API per ADR 0002, not part of the v1.0 stability contract. The
220
+ # `scorecard()` + `metric_specs` surface is the primary path going
221
+ # forward (`metric_specs.pr_auc`, `metric_specs.roc_auc`, etc.).
216
222
  "headline_metrics": "eval_toolkit.metrics",
217
223
  "is_metric_defined_for_slice": "eval_toolkit.metrics",
218
224
  "metrics_at_threshold": "eval_toolkit.metrics",
@@ -272,6 +272,38 @@ class Scorecard(Mapping[str, MetricResult]):
272
272
  ``n_resamples`` + ``method`` so the schema is lossless against
273
273
  :meth:`BootstrapCI.to_dict` — trace provenance no longer drops in
274
274
  the DataFrame view.
275
+
276
+ Notes
277
+ -----
278
+ **Dtype coercion: ``n_resamples`` is ``float64``, not ``Int64``.**
279
+ ``BootstrapCI.n_resamples`` is an ``int`` at the Python level, but
280
+ pandas treats a mixed ``int`` + ``NaN`` column as ``float64`` —
281
+ any row with ``status != "ok"`` or ``bootstrap=False`` carries
282
+ ``NaN`` in the CI columns, and NaN forces the whole column to
283
+ floating-point. So ``df["pr_auc"]["n_resamples"].dtype`` is
284
+ ``float64``, and individual values read back as e.g. ``1000.0``
285
+ rather than ``1000`` (the trade-off Decision R6-C accepted to
286
+ keep the schema lossless).
287
+
288
+ Consumers expecting strict ``Int64`` semantics (e.g., for joins
289
+ against an integer-typed table, or for SQL emission where
290
+ ``float64`` would round-trip as ``DOUBLE``) need to cast
291
+ explicitly *after* dropping NaN rows:
292
+
293
+ ::
294
+
295
+ df["pr_auc"]["n_resamples"].dropna().astype("Int64")
296
+
297
+ or cast the column in place to pandas' nullable integer extension
298
+ dtype::
299
+
300
+ df["pr_auc"]["n_resamples"] = df["pr_auc"]["n_resamples"].astype("Int64")
301
+
302
+ which preserves NaN as ``pd.NA`` and the rest as integer.
303
+ ``Scorecard.to_pandas()`` does not perform this coercion by
304
+ default because it would force a pandas-nullable-dtype dependency
305
+ on every consumer; the float64 default works under any pandas
306
+ version.
275
307
  """
276
308
  try:
277
309
  import pandas as pd
@@ -103,7 +103,7 @@ def sweep(
103
103
  >>> from eval_toolkit import DelimitVariant, DatamarkVariant, sweep
104
104
  >>> df = sweep([DelimitVariant(), DatamarkVariant()], ["hello world"])
105
105
  >>> sorted(df.columns.tolist())
106
- ['text_id', 'transformed_text', 'variant']
106
+ ['strategy_id', 'text_id', 'transformed_text', 'variant']
107
107
  >>> df[df["variant"] == "delimit"].iloc[0]["transformed_text"]
108
108
  '<<hello world>>'
109
109
 
@@ -144,6 +144,7 @@ def sweep(
144
144
  f"sweep(): strategy at index {i} ({type(strategy).__name__}) "
145
145
  f"does not satisfy TextTransform (missing 'name' or 'transform')."
146
146
  )
147
+ _validate_unique_strategy_ids(strategies)
147
148
 
148
149
  text_list = list(texts)
149
150
  rows: list[dict[str, object]] = []
@@ -153,15 +154,25 @@ def sweep(
153
154
  original_scores: np.ndarray | None = None
154
155
  if scorer is not None and text_list:
155
156
  original_scores = np.asarray(scorer.predict_proba(text_list))
157
+ _validate_scorer_output(
158
+ original_scores, expected_n=len(text_list), label="original-texts batch"
159
+ )
156
160
 
157
161
  for strategy in strategies:
162
+ sid = _strategy_id_for(strategy)
158
163
  transformed_list = [strategy.transform(t) for t in text_list]
159
164
  transformed_scores: np.ndarray | None = None
160
165
  if scorer is not None and transformed_list:
161
166
  transformed_scores = np.asarray(scorer.predict_proba(transformed_list))
167
+ _validate_scorer_output(
168
+ transformed_scores,
169
+ expected_n=len(text_list),
170
+ label=f"transformed-texts batch for strategy {strategy.name!r}",
171
+ )
162
172
  for text_id, (_, transformed) in enumerate(zip(text_list, transformed_list, strict=True)):
163
173
  row: dict[str, object] = {
164
174
  "text_id": text_id,
175
+ "strategy_id": sid,
165
176
  "variant": strategy.name,
166
177
  "transformed_text": transformed,
167
178
  }
@@ -176,9 +187,116 @@ def sweep(
176
187
  row["asr"] = bool(s_orig >= attack_threshold > s_adv)
177
188
  rows.append(row)
178
189
 
179
- base_cols = ["text_id", "variant", "transformed_text"]
190
+ base_cols = ["text_id", "strategy_id", "variant", "transformed_text"]
180
191
  if scorer is not None:
181
192
  base_cols += ["original_score", "transformed_score"]
182
193
  if attack_threshold is not None:
183
194
  base_cols += ["asr"]
184
195
  return pd.DataFrame(rows, columns=base_cols)
196
+
197
+
198
+ # ─────────────────────────────────────────────────────────────────────────────
199
+ # Helpers — strategy identity (Decision R7-B; v0.48 §5I)
200
+ # ─────────────────────────────────────────────────────────────────────────────
201
+
202
+
203
+ def _strategy_id_for(strategy: TextTransform) -> str:
204
+ """Build a stable, repr-stable identifier from a strategy's configured state.
205
+
206
+ Decision R7-B (Round 7 audit, Codex R7-F2): a strategy's ``name`` alone is
207
+ not enough to identify a configured instance. Two instances of the same
208
+ dataclass with different kwargs (e.g., ``DelimitVariant(delimiter="<<")``
209
+ and ``DelimitVariant(delimiter="[[")``) share ``name == "delimit"`` and
210
+ would silently merge under ``groupby("variant")``. The ``strategy_id``
211
+ column carries the canonical configured identity so downstream
212
+ analysis can disambiguate.
213
+
214
+ Format (pseudo-URI; chosen for groupby-friendliness + special-char
215
+ safety via ``repr()``):
216
+
217
+ - Dataclass strategies (frozen or not): ``"<name>/<k1>=<repr(v1)>,<k2>=<repr(v2)>,..."``
218
+ with kwargs alphabetized (excluding the ``name`` field itself). Mirrors
219
+ :func:`eval_toolkit.metric_specs.make_spec_name` but uses ``repr(value)``
220
+ instead of ``str(value)`` so string kwargs with special chars (``<<``,
221
+ ``[[``, ``^``, etc.) round-trip cleanly.
222
+ - Plain :class:`TextTransform`-Protocol-satisfying objects without
223
+ ``__dataclass_fields__``: falls back to ``strategy.name``.
224
+
225
+ Examples
226
+ --------
227
+ >>> from eval_toolkit.preprocessing import DelimitVariant
228
+ >>> _strategy_id_for(DelimitVariant(delimiter="<<", end=">>"))
229
+ "delimit/delimiter='<<',end='>>'"
230
+ >>> from eval_toolkit.adversarial import ZeroWidthSpaceInjection
231
+ >>> _strategy_id_for(ZeroWidthSpaceInjection(ratio=0.5, seed=42))
232
+ 'zero_width_space/ratio=0.5,seed=42'
233
+ """
234
+ fields = getattr(strategy, "__dataclass_fields__", None)
235
+ if fields is None:
236
+ return strategy.name
237
+ kw_pairs = sorted((f, getattr(strategy, f)) for f in fields if f != "name")
238
+ if not kw_pairs:
239
+ return strategy.name
240
+ return f"{strategy.name}/" + ",".join(f"{k}={v!r}" for k, v in kw_pairs)
241
+
242
+
243
+ def _validate_scorer_output(scores: np.ndarray, *, expected_n: int, label: str) -> None:
244
+ """Validate the shape of a batched ``Scorer.predict_proba`` result.
245
+
246
+ Decision R7-C (Round 7 audit, Codex R7-F3): three failure modes Codex
247
+ surfaced via runtime probe — too many 1-D scores (silent truncation,
248
+ worst class), too few (later ``IndexError``), and matrix-shaped
249
+ (later ``TypeError`` when ``float(...)`` is applied to a row). All
250
+ three become a single API-level ``ValueError`` with context.
251
+
252
+ Style invariants 1 (no silent failures) + 3 (API-level errors, never
253
+ low-level exceptions through the boundary). Drives Decision R7-C.
254
+
255
+ Parameters
256
+ ----------
257
+ scores : np.ndarray
258
+ The ``np.asarray()``-wrapped result of ``scorer.predict_proba(...)``.
259
+ expected_n : int
260
+ The expected length — ``len(texts)`` for the current sweep call.
261
+ label : str
262
+ Context for the error message naming the offending batch
263
+ (e.g., ``"original-texts batch"`` or
264
+ ``"transformed-texts batch for strategy 'zero_width_space'"``).
265
+
266
+ Raises
267
+ ------
268
+ ValueError
269
+ If ``scores.shape != (expected_n,)``.
270
+ """
271
+ if scores.shape != (expected_n,):
272
+ raise ValueError(
273
+ f"sweep(): scorer.predict_proba({label}) returned shape "
274
+ f"{scores.shape}; expected ({expected_n},). The Scorer Protocol "
275
+ f"requires one float P(positive) per input row (see "
276
+ f"`eval_toolkit.protocols.Scorer`); ensure your adapter returns "
277
+ f"a 1-D array of length len(texts)."
278
+ )
279
+
280
+
281
+ def _validate_unique_strategy_ids(strategies: Sequence[TextTransform]) -> None:
282
+ """Reject duplicate ``strategy_id`` values in a single ``sweep()`` call.
283
+
284
+ Decision R7-B (Round 7 audit, Codex R7-F2): mirrors R6-B's duplicate
285
+ ``MetricSpec.name`` rejection in ``scorecard()`` — same anti-silent-merge
286
+ invariant, applied to the sweep surface. No methodology-honest reason to
287
+ put the same configured strategy twice in one sweep; cache-warming +
288
+ reproducibility re-runs use ``strategy.transform()`` directly outside
289
+ ``sweep()``.
290
+ """
291
+ seen: dict[str, int] = {}
292
+ for i, strategy in enumerate(strategies):
293
+ sid = _strategy_id_for(strategy)
294
+ if sid in seen:
295
+ raise ValueError(
296
+ f"sweep(): duplicate strategy_id {sid!r} at index {i} "
297
+ f"(previously at index {seen[sid]}); each strategy must "
298
+ f"produce a unique strategy_id. If you want two configurations "
299
+ f"of the same dataclass in the same sweep, vary their kwargs "
300
+ f"so the canonical identifier differs."
301
+ )
302
+ seen[sid] = i
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "0.47.0"
5
+ __version__ = "0.48.0"
@@ -1,4 +1,4 @@
1
- """Adversarial robustness: character-injection bypass suite + Scorer-Protocol sweep.
1
+ """Adversarial robustness: 12-technique character-injection bypass suite.
2
2
 
3
3
  Implements the character-injection bypass techniques from Microsoft Research
4
4
  2024 ([1]_) for testing prompt-injection-detection scorers under adversarial
@@ -6,7 +6,7 @@ input perturbation. Each technique is deterministic given a ``seed`` and
6
6
  preserves the surface meaning of the text from a human reader's perspective
7
7
  while shifting the tokenizer / scorer's representation.
8
8
 
9
- Core techniques shipped in v0.43.0:
9
+ Core techniques (shipped in v0.43.0):
10
10
 
11
11
  - :class:`ZeroWidthSpaceInjection` — insert U+200B zero-width spaces
12
12
  - :class:`HomoglyphSubstitution` — Latin → Cyrillic/Greek lookalikes
@@ -15,25 +15,24 @@ Core techniques shipped in v0.43.0:
15
15
  - :class:`CaseRandomization` — random case-flipping per character
16
16
  - :class:`PunctuationInjection` — non-semantic punctuation insertion
17
17
 
18
- The :func:`sweep` function applies a set of techniques against a
19
- :class:`~eval_toolkit.protocols.Scorer`-Protocol-compliant scorer and
20
- returns a DataFrame of
21
- ``(text_id, technique, original_score, transformed_score, asr)``
22
- for adversarial robustness analysis. ASR (attack success rate) is the
23
- fraction of inputs where the scorer crossed the threshold from positive
24
- to negative under the transformation.
18
+ Advanced techniques (shipped in v0.47 per Decision Q11.3):
25
19
 
26
- The six advanced techniques (bidi RTL override, tag stripping, synonym
27
- substitution, token splitting, Unicode normalization, invisible
28
- characters) are scheduled for v0.43.1 as a follow-up patch; the sweep
29
- API stabilizes in v0.43.0 so the v0.43.1 additions are pure extensions.
20
+ - :class:`BidiRTLInjection` — U+202E…U+202C override block
21
+ - :class:`TagStrippingInjection` — ``<…>`` tag removal (idempotent)
22
+ - :class:`SynonymSubstitution` — whitelisted-word swap, seed-deterministic
23
+ - :class:`TokenSplitting` — mid-word single-space insertion
24
+ - :class:`UnicodeNormalization` — NFC / NFD / NFKC / NFKD form switch
25
+ - :class:`InvisibleCharsInjection` — 5 invisible code points
30
26
 
31
- A module-level :data:`character_injection` namespace exposes the
32
- function-style API from the upstream issue spec:
27
+ The convenience tuples :data:`CORE_TECHNIQUES` (6-tuple),
28
+ :data:`ADVANCED_TECHNIQUES` (6-tuple), and :data:`ALL_TECHNIQUES`
29
+ (12-tuple = core + advanced) enumerate the suite for sweep callers.
33
30
 
34
- >>> from eval_toolkit.adversarial import character_injection
35
- >>> character_injection.zero_width_space("hello") # doctest: +SKIP
36
- 'h​e​l​l​o'
31
+ Use the v0.47 top-level :func:`eval_toolkit.sweep` to apply any set of
32
+ :class:`~eval_toolkit.TextTransform` strategies against a corpus (and
33
+ optionally a :class:`~eval_toolkit.protocols.Scorer`); the v0.43–v0.46
34
+ module-level ``sweep()`` function and the ``character_injection``
35
+ ``SimpleNamespace`` were removed at v0.47 (Decisions D + K + N).
37
36
 
38
37
  References
39
38
  ----------
@@ -469,6 +468,29 @@ class SynonymSubstitution:
469
468
  Random seed for determinism. Default ``42``.
470
469
  name : str, optional
471
470
  Override technique name. Default ``"synonym"``.
471
+
472
+ Notes
473
+ -----
474
+ The eligible-word set is the module-level ``_SYNONYMS`` dict, a fixed
475
+ 6-entry whitelist hand-curated to preserve semantics:
476
+
477
+ - ``ignore`` → ``disregard``, ``overlook``
478
+ - ``instructions`` → ``directions``, ``guidance``
479
+ - ``system`` → ``framework``, ``platform``
480
+ - ``secret`` → ``private``, ``confidential``
481
+ - ``send`` → ``transmit``, ``forward``
482
+ - ``all`` → ``every``, ``all of``
483
+
484
+ Inputs containing none of those whitelist words are returned unchanged
485
+ — the transform is a no-op on such inputs. This is intentional: the
486
+ technique's invariant is "looks like the original," so the substitution
487
+ deliberately stays small. The trade-off is easy to be surprised by
488
+ when running ``SynonymSubstitution`` on a corpus that doesn't share
489
+ the prompt-injection vocabulary the whitelist was built from. If you
490
+ need broader substitution, the whitelist isn't extension-friendly
491
+ today — fork the dict at the module level, or treat
492
+ ``SynonymSubstitution`` as a reference implementation for your own
493
+ text-transform with a richer table.
472
494
  """
473
495
 
474
496
  ratio: float = 1.0
@@ -120,10 +120,29 @@ class BootstrapCI:
120
120
  method: str
121
121
 
122
122
  def to_dict(self) -> dict[str, object]:
123
- """Serialize to a stable dict schema for JSON output."""
123
+ """Serialize to a stable, self-describing dict schema for JSON output.
124
+
125
+ v0.48 BREAKING (§5B): schema rewritten to drop the hard-coded
126
+ ``"ci_95"`` key that lied when ``confidence != 0.95``. The new
127
+ schema names the bounds neutrally and carries the actual
128
+ confidence level in a dedicated field; consumers can read
129
+ ``confidence`` to interpret the bound semantics.
130
+
131
+ Before v0.48:
132
+ {"point_estimate": p, "ci_95": [l, h], "confidence": 0.95,
133
+ "n_resamples": N, "method": "BCa"}
134
+
135
+ v0.48+:
136
+ {"point": p, "low": l, "high": h, "confidence": 0.95,
137
+ "n_resamples": N, "method": "BCa"}
138
+
139
+ Migration: rename ``point_estimate`` → ``point``; replace the
140
+ ``ci_95`` list-of-two with separate ``low`` + ``high`` keys.
141
+ """
124
142
  return {
125
- "point_estimate": self.point_estimate,
126
- "ci_95": [self.ci_low, self.ci_high],
143
+ "point": self.point_estimate,
144
+ "low": self.ci_low,
145
+ "high": self.ci_high,
127
146
  "confidence": self.confidence,
128
147
  "n_resamples": self.n_resamples,
129
148
  "method": self.method,
@@ -185,10 +204,24 @@ class PairedBootstrapCI:
185
204
  n_resamples: int
186
205
 
187
206
  def to_dict(self) -> dict[str, object]:
188
- """Serialize to a stable dict schema for JSON output."""
207
+ """Serialize to a stable, self-describing dict schema for JSON output.
208
+
209
+ v0.48 BREAKING (§5B): same rewrite as :meth:`BootstrapCI.to_dict`.
210
+ ``"ci_95"`` is replaced by ``"low"`` + ``"high"``; ``"confidence"``
211
+ carries the actual level.
212
+
213
+ Before v0.48:
214
+ {"delta": d, "ci_95": [l, h], "overlaps_zero": b,
215
+ "confidence": 0.95, "n_resamples": N}
216
+
217
+ v0.48+:
218
+ {"delta": d, "low": l, "high": h, "overlaps_zero": b,
219
+ "confidence": 0.95, "n_resamples": N}
220
+ """
189
221
  return {
190
222
  "delta": self.delta,
191
- "ci_95": [self.ci_low, self.ci_high],
223
+ "low": self.ci_low,
224
+ "high": self.ci_high,
192
225
  "overlaps_zero": self.overlaps_zero,
193
226
  "confidence": self.confidence,
194
227
  "n_resamples": self.n_resamples,
@@ -843,6 +876,21 @@ def paired_bootstrap_op_point_diff(
843
876
  .. [2] Bouckaert, R. R. "Choosing between two learning algorithms
844
877
  based on calibrated tests." ICML 2003.
845
878
  """
879
+ # Defensive identity-guard: the two-level bootstrap resamples val + test
880
+ # indices INDEPENDENTLY (see _paired_bootstrap_op_point_diff_step). Passing
881
+ # the same Python object for val and test causes ~63.2% overlap on each
882
+ # resample, violating the val/test independence assumption that lets the
883
+ # CI absorb threshold-selection variance honestly. Partition the data
884
+ # before calling — see docs/source/methodology/thresholds.md.
885
+ if val_y is test_y:
886
+ raise ValueError(
887
+ "paired_bootstrap_op_point_diff: val_y and test_y are the same array. "
888
+ "The two-level bootstrap requires DISJOINT val + test slices; the "
889
+ "resampler draws val_idx and test_idx independently, so identical "
890
+ "arrays cause ~63.2% overlap and violate the independence assumption. "
891
+ "Partition your data first (e.g., val = arr[:n//2], test = arr[n//2:])."
892
+ )
893
+
846
894
  val_y_arr = np.asarray(val_y)
847
895
  val_a, val_b = np.asarray(val_score_a), np.asarray(val_score_b)
848
896
  test_y_arr = np.asarray(test_y)
@@ -1157,12 +1205,16 @@ def cv_clt_ci(
1157
1205
 
1158
1206
  Computes a confidence interval on the cross-validation mean metric
1159
1207
  that correctly accounts for fold-level dependence. The standard
1160
- "naive" CI (compute std-of-folds then divide by sqrt(K)) is anti-
1161
- conservative because the folds share training data; Bayle et al.
1162
- 2020 prove a CV-CLT with a correction factor that gives valid
1163
- coverage asymptotically.
1208
+ "naive" CI (compute std-of-folds then divide by sqrt(K)) had long
1209
+ been suspected to be anti-conservative because the folds share
1210
+ training data. Bayle et al. 2020 prove that the naive sample-variance
1211
+ estimator (with ``ddof=1``) gives valid asymptotic coverage under
1212
+ stability conditions, resolving the historical concern that fold
1213
+ correlation makes it anti-conservative. No additional correction
1214
+ factor is applied.
1164
1215
 
1165
- The corrected variance estimator (Bayle 2020 Theorem 3.1):
1216
+ The variance estimator (Bayle 2020 Theorem 3.1) is just the standard
1217
+ sample variance over per-fold metrics:
1166
1218
 
1167
1219
  .. math::
1168
1220
 
@@ -1233,9 +1285,9 @@ def cv_clt_ci(
1233
1285
  raise ValueError(f"confidence must be in (0, 1), got {confidence}")
1234
1286
 
1235
1287
  point = float(arr.mean())
1236
- # Bayle 2020 Theorem 3.1 variance: sample variance with (K-1) denom; the
1237
- # CV-CLT correction is captured in this estimator's asymptotic guarantee
1238
- # (no extra fold-correlation factor needed for a balanced K-fold CV).
1288
+ # Bayle 2020 Theorem 3.1: the naive sample-variance estimator (ddof=1)
1289
+ # gives valid asymptotic coverage under stability conditions; no extra
1290
+ # correction factor is applied for fold correlation.
1239
1291
  sigma_hat = float(np.std(arr, ddof=1))
1240
1292
  z = _normal_quantile(0.5 + confidence / 2.0)
1241
1293
  margin = z * sigma_hat / np.sqrt(K)
@@ -1258,9 +1310,10 @@ def block_bootstrap_on_folds(
1258
1310
  ) -> BootstrapCI:
1259
1311
  r"""Block bootstrap on folds: resample K folds with replacement; percentile CI on mean.
1260
1312
 
1261
- Sibling primitive to :func:`cv_clt_ci`. Where :func:`cv_clt_ci` applies
1262
- the Bayle et al. 2020 CV-CLT correction (correct asymptotically under
1263
- fold exchangeability), the block bootstrap is more *conservative* under
1313
+ Sibling primitive to :func:`cv_clt_ci`. Where :func:`cv_clt_ci` relies on
1314
+ Bayle et al. 2020's CV-CLT — the naive sample-variance estimator gives
1315
+ valid asymptotic coverage under stability + fold exchangeability — the
1316
+ block bootstrap is more *conservative* under
1264
1317
  fold-level **non-exchangeability** — situations where the K folds are
1265
1318
  not interchangeable (e.g., source-disjoint LODO folds where one source
1266
1319
  is intrinsically harder than the others). The sensitivity-check
@@ -356,6 +356,20 @@ def bayes_optimal_threshold(π: float, c_fp: float, c_fn: float) -> float:
356
356
 
357
357
  .. math:: t^* = \frac{c_{FP} \cdot (1 - π)}{c_{FP} \cdot (1 - π) + c_{FN} \cdot π}
358
358
 
359
+ .. warning::
360
+
361
+ This formula assumes ``y_score`` is a calibrated probability with
362
+ respect to a **balanced prior** (or equivalently, a raw likelihood
363
+ ratio). If your scores are calibrated to the deployment prior (e.g.,
364
+ via :func:`fit_platt_binary` on a representative validation set), the
365
+ prior is already incorporated into the score and applying this
366
+ formula will **double-count it**. For deployment-prior-calibrated
367
+ scores, use the simpler prior-independent form
368
+ ``t* = c_fp / (c_fp + c_fn)`` (no ``π`` term) — that's literal
369
+ Elkan 2001 §4. The function in this file is the prior-corrected
370
+ variant for raw / balanced-prior scores; see the Examples for both
371
+ usage patterns.
372
+
359
373
  Parameters
360
374
  ----------
361
375
  π : float
@@ -396,6 +410,29 @@ def bayes_optimal_threshold(π: float, c_fp: float, c_fn: float) -> float:
396
410
  >>> bayes_optimal_threshold(1.0, c_fp=1.0, c_fn=1.0)
397
411
  0.0
398
412
 
413
+ **Two correct usages, side by side.** The choice depends on what your
414
+ ``y_score`` is calibrated to.
415
+
416
+ Usage A — raw or balanced-prior scores (use this function, pass ``π``):
417
+
418
+ >>> # Score from a model trained on a balanced (50/50) corpus, deployed
419
+ >>> # at a 1% positive prior, with FN cost 10× the FP cost.
420
+ >>> t_balanced = bayes_optimal_threshold(0.01, c_fp=1.0, c_fn=10.0)
421
+ >>> round(t_balanced, 4)
422
+ 0.9083
423
+
424
+ Usage B — deployment-prior-calibrated scores (skip this function, use
425
+ the literal Elkan 2001 §4 prior-independent form):
426
+
427
+ >>> # Score already calibrated to the 1% deployment prior via
428
+ >>> # fit_platt_binary on a representative val slice — DO NOT pass π
429
+ >>> # to this function (you'd double-count it). Threshold the
430
+ >>> # already-prior-corrected probability against the cost ratio:
431
+ >>> c_fp, c_fn = 1.0, 10.0
432
+ >>> t_calibrated = c_fp / (c_fp + c_fn)
433
+ >>> round(t_calibrated, 4)
434
+ 0.0909
435
+
399
436
  Notes
400
437
  -----
401
438
  Symmetric costs (c_fp == c_fn) collapse the formula to t* = 1 - π.
@@ -407,9 +444,10 @@ def bayes_optimal_threshold(π: float, c_fp: float, c_fn: float) -> float:
407
444
  *Bayes-calibrated* posterior P(y=1 | x). The formula implemented here
408
445
  is the **prior-corrected** form for thresholding raw scores at a known
409
446
  deployment prior π, which agrees with Elkan only under symmetric costs.
410
- For our intended use (deployment prior + asymmetric costs) the
411
- prior-corrected form is what the user wants — but the citation should
412
- be read as "Elkan 2001 cost-sensitive framework", not literal §4.
447
+ For our intended use (deployment prior + asymmetric costs on raw /
448
+ balanced-prior scores) the prior-corrected form is what the user wants
449
+ — but the citation should be read as "Elkan 2001 cost-sensitive
450
+ framework", not literal §4.
413
451
 
414
452
  References
415
453
  ----------
@@ -89,7 +89,7 @@ def from_yaml[T](path: Path | str, cls: type[T]) -> T:
89
89
  import yaml # noqa: PLC0415
90
90
  except ImportError as exc:
91
91
  raise ImportError(
92
- "from_yaml requires pyyaml; install with `pip install eval-toolkit[yaml]`"
92
+ "from_yaml requires pyyaml. Install with: pip install eval-toolkit[yaml]"
93
93
  ) from exc
94
94
 
95
95
  if not is_dataclass(cls):
@@ -85,7 +85,7 @@ def make_minilm_embedder(
85
85
  except ImportError as e:
86
86
  raise ImportError(
87
87
  "make_minilm_embedder requires sentence-transformers. "
88
- "Install via: pip install eval-toolkit[embeddings]"
88
+ "Install with: pip install eval-toolkit[embeddings]"
89
89
  ) from e
90
90
 
91
91
  # sentence-transformers-active path: excluded from CI coverage