eval-toolkit 0.49.0__tar.gz → 0.50.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182) hide show
  1. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/CHANGELOG.md +56 -0
  2. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/PKG-INFO +3 -3
  3. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/README.md +2 -2
  4. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/STYLE.md +1 -1
  5. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/_rng.py +19 -0
  6. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/_version.py +1 -1
  7. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/analysis.py +5 -4
  8. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/bootstrap.py +42 -33
  9. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/harness.py +31 -24
  10. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/metrics.py +7 -5
  11. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/scorecards.py +14 -11
  12. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/stacking.py +16 -4
  13. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/thresholds.py +5 -4
  14. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/golden/bootstrap_ci/cases.json +6 -6
  15. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/golden/public_api/snapshot.json +14 -14
  16. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_analysis.py +5 -5
  17. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_block_bootstrap_on_folds.py +7 -7
  18. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_calibration_mc.py +4 -4
  19. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_edge_cases.py +3 -3
  20. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_golden.py +18 -18
  21. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_njobs.py +12 -12
  22. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_props.py +11 -11
  23. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_research_grounded.py +3 -3
  24. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_unit.py +18 -18
  25. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_calibration_bootstrap_chain.py +2 -2
  26. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_coverage_bootstrap.py +6 -6
  27. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_coverage_metrics.py +1 -1
  28. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_harness_fault_injection.py +2 -2
  29. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_harness_internals.py +3 -3
  30. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_harness_metric_options.py +2 -2
  31. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_harness_parallelism.py +10 -10
  32. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_harness_smoke.py +2 -2
  33. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_logging.py +1 -1
  34. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_pipeline_e2e.py +5 -5
  35. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_reference_equivalence.py +2 -2
  36. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_reproducibility_integration.py +10 -10
  37. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_scorecard.py +16 -16
  38. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_stacking.py +17 -19
  39. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_thresholds.py +2 -2
  40. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_thresholds_coverage.py +1 -1
  41. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_v09_contracts.py +3 -3
  42. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/.gitignore +0 -0
  43. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/LICENSE +0 -0
  44. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/docs/archive/README.md +0 -0
  45. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/docs/research/README.md +0 -0
  46. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/docs/research/datasets/README.md +0 -0
  47. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/docs/research/papers/data-integrity/README.md +0 -0
  48. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  49. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/docs/research/papers/inference/README.md +0 -0
  50. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/docs/research/papers/prompt-injection/README.md +0 -0
  51. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/docs/source/adr/README.md +0 -0
  52. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/docs/source/methodology/README.md +0 -0
  53. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/pyproject.toml +0 -0
  54. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/__init__.py +0 -0
  55. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/__main__.py +0 -0
  56. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/_deprecated.py +0 -0
  57. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/_parallel.py +0 -0
  58. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/_sweep.py +0 -0
  59. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/adversarial.py +0 -0
  60. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/artifacts.py +0 -0
  61. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/calibration.py +0 -0
  62. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/claims.py +0 -0
  63. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/config.py +0 -0
  64. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/docs.py +0 -0
  65. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/embeddings.py +0 -0
  66. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/evidence.py +0 -0
  67. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/leakage.py +0 -0
  68. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/loaders.py +0 -0
  69. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/losses.py +0 -0
  70. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/manifest.py +0 -0
  71. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/metric_specs.py +0 -0
  72. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/operating_points.py +0 -0
  73. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/paths.py +0 -0
  74. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/plotting.py +0 -0
  75. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/preprocessing.py +0 -0
  76. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/probes.py +0 -0
  77. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/protocols.py +0 -0
  78. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/provenance.py +0 -0
  79. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/py.typed +0 -0
  80. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  81. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  82. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  83. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  84. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  85. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  86. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/seeds.py +0 -0
  87. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/splits.py +0 -0
  88. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/src/eval_toolkit/text_dedup.py +0 -0
  89. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  90. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  91. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  92. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  93. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  94. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  95. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  96. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  97. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  98. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  99. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/benchmarks/__init__.py +0 -0
  100. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  101. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/conftest.py +0 -0
  102. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  103. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  104. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  105. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/golden/docs/expected.md +0 -0
  106. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/golden/docs/input.md +0 -0
  107. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/golden/docs/metrics.json +0 -0
  108. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  109. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/strategies.py +0 -0
  110. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_adversarial.py +0 -0
  111. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_artifacts.py +0 -0
  112. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_calibration_binary_adapters.py +0 -0
  113. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_calibration_determinism.py +0 -0
  114. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_calibration_optimization_failures.py +0 -0
  115. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_calibration_props.py +0 -0
  116. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_calibration_research_grounded.py +0 -0
  117. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_calibration_unit.py +0 -0
  118. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_claims.py +0 -0
  119. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_claims_coverage.py +0 -0
  120. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_claims_props.py +0 -0
  121. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_cli.py +0 -0
  122. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_config.py +0 -0
  123. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_coverage_calibration.py +0 -0
  124. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_coverage_harness.py +0 -0
  125. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_coverage_plotting.py +0 -0
  126. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_croissant_e2e.py +0 -0
  127. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  128. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_deprecated_scalars_shim.py +0 -0
  129. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_deprecations.py +0 -0
  130. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_docs_golden.py +0 -0
  131. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_docs_props.py +0 -0
  132. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_embeddings.py +0 -0
  133. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_evidence_validators.py +0 -0
  134. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_harness_edge_cases.py +0 -0
  135. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_harness_folded.py +0 -0
  136. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_import_boundaries.py +0 -0
  137. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_is_metric_defined_for_slice.py +0 -0
  138. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_lazy_extras_messages.py +0 -0
  139. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_leakage.py +0 -0
  140. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_leakage_error_paths.py +0 -0
  141. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_leakage_props.py +0 -0
  142. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_loaders.py +0 -0
  143. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_loaders_coverage.py +0 -0
  144. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_loaders_props.py +0 -0
  145. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_losses.py +0 -0
  146. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_manifest.py +0 -0
  147. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  148. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_manifest_props.py +0 -0
  149. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_manifest_validation.py +0 -0
  150. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_metrics_props.py +0 -0
  151. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_metrics_stratified_subsets.py +0 -0
  152. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_metrics_unit.py +0 -0
  153. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_misc_coverage.py +0 -0
  154. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_numeric_edge_cases.py +0 -0
  155. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_ood_loader.py +0 -0
  156. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_operating_points.py +0 -0
  157. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_operating_points_props.py +0 -0
  158. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_parallel.py +0 -0
  159. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_paths.py +0 -0
  160. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_plotting_edge.py +0 -0
  161. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_plotting_smoke.py +0 -0
  162. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_plotting_visual.py +0 -0
  163. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_preprocessing.py +0 -0
  164. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_probes.py +0 -0
  165. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_protocol_conformance.py +0 -0
  166. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_provenance.py +0 -0
  167. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_public_api.py +0 -0
  168. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_recall_at_fpr.py +0 -0
  169. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_schemas.py +0 -0
  170. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_seeds.py +0 -0
  171. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_splits.py +0 -0
  172. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_splits_leakage_integration.py +0 -0
  173. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_splits_props.py +0 -0
  174. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_sweep.py +0 -0
  175. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_text_dedup.py +0 -0
  176. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_text_dedup_coverage.py +0 -0
  177. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_text_dedup_props.py +0 -0
  178. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_text_dedup_strategies.py +0 -0
  179. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_thresholds_constant_score.py +0 -0
  180. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_thresholds_props.py +0 -0
  181. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_thresholds_research_grounded.py +0 -0
  182. {eval_toolkit-0.49.0 → eval_toolkit-0.50.0}/tests/test_tokenization_leakage_check.py +0 -0
@@ -5,6 +5,62 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.50.0] — 2026-05-23 — SPEC 7 `rng` parameter adoption
9
+
10
+ The SPEC 7 follow-up to v0.49.0. The `_rng.py` scaffold shipped at
11
+ v0.49.0 (SeedLike + RNGLike type aliases per
12
+ [Scientific Python SPEC 7](https://scientific-python.org/specs/spec-0007/))
13
+ is now wired into every Tier-1 public function that consumes a NumPy RNG.
14
+
15
+ ### BREAKING
16
+
17
+ **22 Tier-1 function signatures**: `seed: int = X` / `random_state: int | None` → `rng: RNGLike | SeedLike | None = X`. Pre-v1.0 SemVer-minor BREAKING (v0.34.0 precedent). Defaults preserved (still deterministic-by-default).
18
+
19
+ Affected functions:
20
+
21
+ - `bootstrap.py` (7 public + 1 private): `bootstrap_ci`, `paired_bootstrap_diff`, `paired_bootstrap_ece_diff`, `paired_bootstrap_op_point_diff`, `paired_mde`, `block_bootstrap_on_folds`, `cross_validate_metric`, `_bootstrap_t_ci`.
22
+ - `metrics.py:1063`: `expected_calibration_error_debiased`.
23
+ - `thresholds.py`: `selected_operating_point` + `_bootstrap_threshold_metric_cis`.
24
+ - `analysis.py`: `bootstrap_metric_from_predictions`, `paired_diff_from_prediction_refs`.
25
+ - `harness.py` (6 sites): `evaluate`, `evaluate_scorer_on_slice`, `_bootstrap_auc_ci`, `_evaluate_scores`, `_compute_paired_diffs`, `_score_all_slices`.
26
+ - `scorecards.py`: `scorecard`, `_evaluate_spec`.
27
+ - `stacking.py`: `LogisticStacker.random_state` → `LogisticStacker.rng` class-field rename (sklearn pass-through derives int at the boundary).
28
+
29
+ **Body refactors**:
30
+
31
+ - 4 SeedSequence.spawn() sites converted from `np.random.SeedSequence(seed).spawn(n)` to `rng.bit_generator.seed_seq.spawn(n)` (Option A — preserves existing worker SeedSequence signatures).
32
+ - 2 sklearn-bridge sites in `cross_validate_metric` derive int from rng before passing to `StratifiedKFold`/`KFold(random_state=...)` (defensive across sklearn versions <1.4).
33
+ - `LogisticStacker.fit` derives sklearn int from `self.rng` at the boundary.
34
+
35
+ **Config schema** (Tier-2 additive): `evaluate()` config dict key `"seed"` → `"rng"`. Generator-typed input serializes as `repr(rng)`; int/None serialize as-is (backward-compatible for prior int-seed usage).
36
+
37
+ ### Added
38
+
39
+ - **Docstrings**: NumPy-style parameter doc for every renamed function now references `rng : RNGLike | SeedLike | None` with explicit link to SPEC 7.
40
+ - **STYLE.md §3a** + **ADR 0004 D4**: `rng` row flipped from "target convention; adopted in v0.50.0" → "**canonical** convention (adopted v0.50.0)".
41
+
42
+ ### Changed
43
+
44
+ - **Test sweep** (~230+ test sites): `seed=X` → `rng=X` in test kwarg calls, EXCEPT in test files that test legitimate `seed`-as-int contexts (`test_adversarial.py` for Python `random.Random`, `test_seeds.py` for `set_global_seeds`, `test_splits*.py` for Splitter dataclass fields, `test_text_dedup*.py` for MinHashLSHStrategy class field).
45
+ - **CHANGELOG header**: this release.
46
+
47
+ ### Exceptions to SPEC 7 (KEPT `seed:` — documented in STYLE.md §3a + ADR 0004 D4)
48
+
49
+ - `seeds.set_global_seeds(seed: int)` — global-state setter, not per-function RNG.
50
+ - `adversarial.py` dataclass fields + functional wrappers — use Python stdlib `random.Random(seed)`, not NumPy.
51
+ - `splits.py` Splitter dataclass class-fields (`HoldoutSplitter.seed`, `StratifiedKFoldSplitter.seed`, etc.) — configuration storage, not user-facing RNG parameter.
52
+ - `loaders.py:903` YAML config schema key — declarative; renaming would break consumer YAMLs.
53
+
54
+ ### Migration
55
+
56
+ - Consumer (`prompt-injection-detection-submission`) lockstep: bump dep pin `>=0.49.0` → `>=0.50.0`; rename `seed=` → `rng=` on eval-toolkit-bound call sites (estimated 5-8 sites).
57
+ - Bit-for-bit reproducibility preserved when migrating `seed=42` → `rng=42` (int seed is SeedLike; `np.random.default_rng(42)` is the canonical normalization).
58
+
59
+ ### Notes
60
+
61
+ - Ships in parallel with Round 8 audit STOP-GATE (Decision Y.2); R8 briefing at commit `6f6839a`, awaiting Codex+Gemini reports.
62
+ - Memory pattern captured at v0.49.0: pre-flight grep MUST cover `README.md`, `.doctest-modules`, and any config files (per `feedback_sybil_runs_readme.md`). Applied to v0.50.0 pre-flight.
63
+
8
64
  ## [0.49.0] — 2026-05-23 — Global naming-standards sweep + final cleanup before v1.0
9
65
 
10
66
  Final pre-v1.0 minor consolidating the naming-convention standardization
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.49.0
3
+ Version: 0.50.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -233,12 +233,12 @@ print(f"ECE (10 bins): {expected_calibration_error(y, s, n_bins=10):.3f}")
233
233
  from eval_toolkit import bootstrap_ci, paired_bootstrap_diff
234
234
  from eval_toolkit.metrics import pr_auc
235
235
 
236
- ci = bootstrap_ci(y, s, pr_auc, n_resamples=1000, seed=42)
236
+ ci = bootstrap_ci(y, s, pr_auc, n_resamples=1000, rng=42)
237
237
  print(f"PR-AUC: {ci.point_estimate:.3f} 95% CI: [{ci.ci_low:.3f}, {ci.ci_high:.3f}]")
238
238
 
239
239
  # Paired bootstrap on the lift between two scorers (s_baseline must be in [0, 1] too).
240
240
  s_baseline = np.clip(rng.normal(0.5, 0.3, size=200), 0, 1)
241
- diff = paired_bootstrap_diff(y, s_baseline, s, pr_auc, n_resamples=1000, seed=42)
241
+ diff = paired_bootstrap_diff(y, s_baseline, s, pr_auc, n_resamples=1000, rng=42)
242
242
  print(f"Δ PR-AUC: {diff.delta:.3f} overlaps zero: {diff.overlaps_zero}")
243
243
  ```
244
244
 
@@ -150,12 +150,12 @@ print(f"ECE (10 bins): {expected_calibration_error(y, s, n_bins=10):.3f}")
150
150
  from eval_toolkit import bootstrap_ci, paired_bootstrap_diff
151
151
  from eval_toolkit.metrics import pr_auc
152
152
 
153
- ci = bootstrap_ci(y, s, pr_auc, n_resamples=1000, seed=42)
153
+ ci = bootstrap_ci(y, s, pr_auc, n_resamples=1000, rng=42)
154
154
  print(f"PR-AUC: {ci.point_estimate:.3f} 95% CI: [{ci.ci_low:.3f}, {ci.ci_high:.3f}]")
155
155
 
156
156
  # Paired bootstrap on the lift between two scorers (s_baseline must be in [0, 1] too).
157
157
  s_baseline = np.clip(rng.normal(0.5, 0.3, size=200), 0, 1)
158
- diff = paired_bootstrap_diff(y, s_baseline, s, pr_auc, n_resamples=1000, seed=42)
158
+ diff = paired_bootstrap_diff(y, s_baseline, s, pr_auc, n_resamples=1000, rng=42)
159
159
  print(f"Δ PR-AUC: {diff.delta:.3f} overlaps zero: {diff.overlaps_zero}")
160
160
  ```
161
161
 
@@ -76,7 +76,7 @@ them; deviations need justification in the PR description.
76
76
  | `n_jobs` | Parallelism (joblib + sklearn convention) |
77
77
  | `ax` | Matplotlib axis (matplotlib convention) |
78
78
  | `metric` | Callable `(y_true, y_score) -> float` |
79
- | `rng` | RNG argument per [SPEC 7](https://scientific-python.org/specs/spec-0007/) — target convention; adopted in v0.50.0 |
79
+ | `rng` | RNG argument per [SPEC 7](https://scientific-python.org/specs/spec-0007/) — **canonical** convention (adopted v0.50.0). Accepts `int`, `np.random.Generator`, `BitGenerator`, `SeedSequence`, or `None`. |
80
80
 
81
81
  The v0.50.0 SPEC 7 adoption preserves two `seed: int` exceptions:
82
82
  `set_global_seeds(seed: int)` (global-state setter, not per-function
@@ -26,6 +26,7 @@ Exceptions to the SPEC 7 convention — documented in STYLE.md §3a:
26
26
  from __future__ import annotations
27
27
 
28
28
  from collections.abc import Sequence
29
+ from typing import cast
29
30
 
30
31
  import numpy as np
31
32
 
@@ -44,3 +45,21 @@ type RNGLike = np.random.Generator | np.random.BitGenerator
44
45
  ``Generator`` inputs and lifts ``BitGenerator`` inputs into a
45
46
  ``Generator`` — both forms compose cleanly.
46
47
  """
48
+
49
+
50
+ def spawn_seed_sequences(rng: RNGLike | SeedLike | None, n: int) -> list[np.random.SeedSequence]:
51
+ """Spawn ``n`` independent SeedSequences from any SPEC 7 ``rng`` input.
52
+
53
+ Normalizes the input to a ``Generator``, then extracts the underlying
54
+ ``SeedSequence`` via the bit-generator and spawns ``n`` children.
55
+ The cast satisfies mypy strict: the ``seed_seq`` attribute on a
56
+ concrete BitGenerator is a ``SeedSequence`` instance, but the type
57
+ stub on ``BitGenerator.seed_seq`` returns the abstract
58
+ ``ISeedSequence`` interface (which lacks ``spawn``).
59
+
60
+ Used by the bootstrap parallel workers (which take spawned
61
+ ``SeedSequence`` objects to seed their internal ``default_rng()`` calls).
62
+ """
63
+ gen = np.random.default_rng(rng)
64
+ seed_seq = cast(np.random.SeedSequence, gen.bit_generator.seed_seq)
65
+ return seed_seq.spawn(n)
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "0.49.0"
5
+ __version__ = "0.50.0"
@@ -11,6 +11,7 @@ from typing import Any
11
11
 
12
12
  import numpy as np
13
13
 
14
+ from eval_toolkit._rng import RNGLike, SeedLike
14
15
  from eval_toolkit.bootstrap import bootstrap_ci, paired_bootstrap_diff
15
16
  from eval_toolkit.metrics import pr_auc
16
17
  from eval_toolkit.protocols import PredictionReader
@@ -121,7 +122,7 @@ def bootstrap_metric_from_predictions(
121
122
  *,
122
123
  reader: PredictionReader | None = None,
123
124
  n_resamples: int = 1000,
124
- seed: int = 42,
125
+ rng: RNGLike | SeedLike | None = 42,
125
126
  ) -> dict[str, object]:
126
127
  """Compute a PR-AUC bootstrap CI from one prediction ref."""
127
128
  arrays = load_prediction_arrays(ref, reader=reader)
@@ -130,7 +131,7 @@ def bootstrap_metric_from_predictions(
130
131
  arrays.scores,
131
132
  pr_auc,
132
133
  n_resamples=n_resamples,
133
- seed=seed,
134
+ rng=rng,
134
135
  ).to_dict()
135
136
 
136
137
 
@@ -141,7 +142,7 @@ def paired_diff_from_prediction_refs(
141
142
  baseline_reader: PredictionReader | None = None,
142
143
  candidate_reader: PredictionReader | None = None,
143
144
  n_resamples: int = 1000,
144
- seed: int = 42,
145
+ rng: RNGLike | SeedLike | None = 42,
145
146
  ) -> dict[str, object]:
146
147
  """Compute paired PR-AUC delta from two prediction refs.
147
148
 
@@ -172,7 +173,7 @@ def paired_diff_from_prediction_refs(
172
173
  candidate.scores,
173
174
  pr_auc,
174
175
  n_resamples=n_resamples,
175
- seed=seed,
176
+ rng=rng,
176
177
  ).to_dict()
177
178
 
178
179
 
@@ -31,6 +31,7 @@ from scipy.stats import norm as _scipy_norm
31
31
  from scipy.stats import rankdata as _scipy_rankdata
32
32
 
33
33
  from eval_toolkit._parallel import parallel_map
34
+ from eval_toolkit._rng import RNGLike, SeedLike, spawn_seed_sequences
34
35
 
35
36
  _logger = logging.getLogger(__name__)
36
37
 
@@ -236,7 +237,7 @@ def bootstrap_ci(
236
237
  n_resamples: int = DEFAULT_N_RESAMPLES,
237
238
  confidence: float = DEFAULT_CONFIDENCE,
238
239
  method: Literal["BCa", "percentile", "studentized"] = DEFAULT_METHOD,
239
- seed: int = DEFAULT_SEED,
240
+ rng: RNGLike | SeedLike | None = DEFAULT_SEED,
240
241
  n_jobs: int = 1,
241
242
  ) -> BootstrapCI:
242
243
  """Per-condition CI via :func:`scipy.stats.bootstrap`.
@@ -257,8 +258,9 @@ def bootstrap_ci(
257
258
  Two-sided confidence level (default 0.95).
258
259
  method : {"BCa", "percentile", "studentized"}, optional
259
260
  Default "BCa".
260
- seed : int, optional
261
- RNG seed for reproducibility.
261
+ rng : RNGLike | SeedLike | None, optional
262
+ RNG argument per `Scientific Python SPEC 7 <https://scientific-python.org/specs/spec-0007/>`_.
263
+ Int seed (default ``DEFAULT_SEED=42``), ``Generator``, or ``None`` (entropy).
262
264
  n_jobs : int, optional
263
265
  Parallel workers (default 1 — sequential). Only effective when
264
266
  ``method='studentized'`` (which has the only Python-level outer loop
@@ -284,7 +286,7 @@ def bootstrap_ci(
284
286
  >>> rng = np.random.default_rng(42)
285
287
  >>> y = rng.integers(0, 2, size=200)
286
288
  >>> s = y + rng.normal(0, 0.3, size=200)
287
- >>> ci = bootstrap_ci(y, s, metric=pr_auc, n_resamples=200, seed=42)
289
+ >>> ci = bootstrap_ci(y, s, metric=pr_auc, n_resamples=200, rng=42)
288
290
  >>> ci.ci_low <= ci.point_estimate <= ci.ci_high
289
291
  True
290
292
 
@@ -319,13 +321,13 @@ def bootstrap_ci(
319
321
  )
320
322
 
321
323
  _logger.debug(
322
- "bootstrap_ci: metric=%s n=%d n_resamples=%d method=%s confidence=%.3f seed=%d n_jobs=%d",
324
+ "bootstrap_ci: metric=%s n=%d n_resamples=%d method=%s confidence=%.3f rng=%r n_jobs=%d",
323
325
  getattr(metric, "__name__", repr(metric)),
324
326
  n,
325
327
  n_resamples,
326
328
  method,
327
329
  confidence,
328
- seed,
330
+ rng,
329
331
  n_jobs,
330
332
  )
331
333
 
@@ -342,11 +344,11 @@ def bootstrap_ci(
342
344
  point,
343
345
  n_resamples=n_resamples,
344
346
  confidence=confidence,
345
- seed=seed,
347
+ rng=rng,
346
348
  n_jobs=n_jobs,
347
349
  )
348
350
  else:
349
- rng = np.random.default_rng(seed)
351
+ rng = np.random.default_rng(rng)
350
352
  res = _scipy_bootstrap(
351
353
  (y_true_arr, y_score_arr),
352
354
  statistic=_statistic,
@@ -423,7 +425,7 @@ def _bootstrap_t_ci(
423
425
  *,
424
426
  n_resamples: int,
425
427
  confidence: float,
426
- seed: int,
428
+ rng: RNGLike | SeedLike | None,
427
429
  n_jobs: int = 1,
428
430
  ) -> tuple[float, float]:
429
431
  r"""Studentized bootstrap-t CI per Algeshiemer 2024 / Davison & Hinkley §5.2.
@@ -441,7 +443,7 @@ def _bootstrap_t_ci(
441
443
  Skips degenerate resamples (single-class draws causing the metric to
442
444
  raise); raises if > 5% of resamples are degenerate.
443
445
  """
444
- seed_seqs = np.random.SeedSequence(seed).spawn(n_resamples)
446
+ seed_seqs = spawn_seed_sequences(rng, n_resamples)
445
447
  step = functools.partial(_bootstrap_t_step, y_true=y_true, y_score=y_score, metric=metric)
446
448
  raw_results = parallel_map(step, seed_seqs, n_jobs=n_jobs, description="bootstrap_t")
447
449
  valid_pairs = [r for r, _ in raw_results if r is not None]
@@ -505,7 +507,7 @@ def paired_bootstrap_diff(
505
507
  *,
506
508
  n_resamples: int = DEFAULT_N_RESAMPLES,
507
509
  confidence: float = DEFAULT_CONFIDENCE,
508
- seed: int = DEFAULT_SEED,
510
+ rng: RNGLike | SeedLike | None = DEFAULT_SEED,
509
511
  n_jobs: int = 1,
510
512
  ) -> PairedBootstrapCI:
511
513
  """Paired-bootstrap CI on ``metric(B) − metric(A)`` using the same resample indices.
@@ -518,7 +520,7 @@ def paired_bootstrap_diff(
518
520
  Scores from two scorers on the same rows.
519
521
  metric : callable ``(y_true, y_score) -> float``
520
522
  Must be picklable when ``n_jobs != 1`` (lambdas not supported).
521
- n_resamples, confidence, seed : standard bootstrap params.
523
+ n_resamples, confidence, rng : standard bootstrap params (``rng`` per SPEC 7).
522
524
  n_jobs : int, optional
523
525
  Parallel workers (default 1 — sequential). ``n_jobs > 1`` uses
524
526
  joblib loky; ``n_jobs=-1`` uses all cores; ``n_jobs=0`` is rejected.
@@ -547,7 +549,7 @@ def paired_bootstrap_diff(
547
549
  >>> y = rng.integers(0, 2, size=200)
548
550
  >>> s_a = rng.normal(0, 1, size=200) # random scorer
549
551
  >>> s_b = y + rng.normal(0, 0.3, size=200) # signal scorer
550
- >>> diff = paired_bootstrap_diff(y, s_a, s_b, pr_auc, n_resamples=200, seed=42)
552
+ >>> diff = paired_bootstrap_diff(y, s_a, s_b, pr_auc, n_resamples=200, rng=42)
551
553
  >>> diff.delta > 0 # B beats A
552
554
  True
553
555
 
@@ -581,7 +583,7 @@ def paired_bootstrap_diff(
581
583
  raise ValueError(f"n={n} too small for paired bootstrap; need ≥ 10")
582
584
 
583
585
  delta_point = float(metric(y_true_arr, b)) - float(metric(y_true_arr, a))
584
- seed_seqs = np.random.SeedSequence(seed).spawn(n_resamples)
586
+ seed_seqs = spawn_seed_sequences(rng, n_resamples)
585
587
  step = functools.partial(
586
588
  _paired_bootstrap_diff_step,
587
589
  y_true_arr=y_true_arr,
@@ -654,7 +656,7 @@ def paired_bootstrap_ece_diff(
654
656
  ece_fn: Callable[[np.ndarray, np.ndarray, int], float],
655
657
  n_resamples: int = DEFAULT_N_RESAMPLES,
656
658
  confidence: float = DEFAULT_CONFIDENCE,
657
- seed: int = DEFAULT_SEED,
659
+ rng: RNGLike | SeedLike | None = DEFAULT_SEED,
658
660
  n_bins: int = 10,
659
661
  n_jobs: int = 1,
660
662
  ) -> PairedBootstrapCI:
@@ -677,7 +679,7 @@ def paired_bootstrap_ece_diff(
677
679
  does not depend on calibration. Typical use:
678
680
  ``from eval_toolkit.metrics import expected_calibration_error``,
679
681
  then pass ``ece_fn=expected_calibration_error``.
680
- n_resamples, confidence, seed : standard bootstrap params.
682
+ n_resamples, confidence, rng : standard bootstrap params (``rng`` per SPEC 7).
681
683
  n_bins : int, optional
682
684
  Number of ECE bins (passed through to ``ece_fn``).
683
685
  n_jobs : int, optional
@@ -715,7 +717,7 @@ def paired_bootstrap_ece_diff(
715
717
  raise ValueError(f"n={n} too small for paired bootstrap; need >= 10")
716
718
 
717
719
  delta_point = float(ece_fn(y_true_arr, b, n_bins)) - float(ece_fn(y_true_arr, a, n_bins))
718
- seed_seqs = np.random.SeedSequence(seed).spawn(n_resamples)
720
+ seed_seqs = spawn_seed_sequences(rng, n_resamples)
719
721
  step = functools.partial(
720
722
  _paired_bootstrap_ece_diff_step,
721
723
  y_true_arr=y_true_arr,
@@ -798,7 +800,7 @@ def paired_bootstrap_op_point_diff(
798
800
  *,
799
801
  n_resamples: int = DEFAULT_N_RESAMPLES,
800
802
  confidence: float = DEFAULT_CONFIDENCE,
801
- seed: int = DEFAULT_SEED,
803
+ rng: RNGLike | SeedLike | None = DEFAULT_SEED,
802
804
  n_jobs: int = 1,
803
805
  ) -> PairedBootstrapCI:
804
806
  r"""Two-level paired bootstrap for operating-point lifts.
@@ -826,7 +828,7 @@ def paired_bootstrap_op_point_diff(
826
828
  ``lambda y, s: MaxF1Selector().select(y, s).threshold``).
827
829
  metric_fn : callable ``(y_true, y_score, threshold) -> float``
828
830
  Operating-point metric (e.g., F1, precision) at the given threshold.
829
- n_resamples, confidence, seed : standard bootstrap params.
831
+ n_resamples, confidence, rng : standard bootstrap params (``rng`` per SPEC 7).
830
832
  n_jobs : int, optional
831
833
  Parallel workers (default 1 — sequential). See
832
834
  :ref:`methodology/parallelism`. Both ``threshold_fn`` and
@@ -913,7 +915,7 @@ def paired_bootstrap_op_point_diff(
913
915
  metric_fn(test_y_arr, test_a, thr_a_full)
914
916
  )
915
917
 
916
- seed_seqs = np.random.SeedSequence(seed).spawn(n_resamples)
918
+ seed_seqs = spawn_seed_sequences(rng, n_resamples)
917
919
  step = functools.partial(
918
920
  _paired_bootstrap_op_point_diff_step,
919
921
  val_y_arr=val_y_arr,
@@ -1132,7 +1134,7 @@ def paired_mde(
1132
1134
  alpha: float = 0.05,
1133
1135
  power: float = 0.80,
1134
1136
  n_resamples: int = DEFAULT_N_RESAMPLES,
1135
- seed: int = DEFAULT_SEED,
1137
+ rng: RNGLike | SeedLike | None = DEFAULT_SEED,
1136
1138
  n_jobs: int = 1,
1137
1139
  ) -> MDEEstimate:
1138
1140
  r"""Minimum detectable paired Δ at (α, power).
@@ -1174,7 +1176,7 @@ def paired_mde(
1174
1176
  metric,
1175
1177
  n_resamples=n_resamples,
1176
1178
  confidence=0.95,
1177
- seed=seed,
1179
+ rng=rng,
1178
1180
  n_jobs=n_jobs,
1179
1181
  )
1180
1182
  est = mde_from_ci(paired, alpha=alpha, power=power)
@@ -1306,7 +1308,7 @@ def block_bootstrap_on_folds(
1306
1308
  *,
1307
1309
  n_resamples: int = DEFAULT_N_RESAMPLES,
1308
1310
  confidence: float = DEFAULT_CONFIDENCE,
1309
- seed: int = DEFAULT_SEED,
1311
+ rng: RNGLike | SeedLike | None = DEFAULT_SEED,
1310
1312
  ) -> BootstrapCI:
1311
1313
  r"""Block bootstrap on folds: resample K folds with replacement; percentile CI on mean.
1312
1314
 
@@ -1341,8 +1343,9 @@ def block_bootstrap_on_folds(
1341
1343
  the cross-fold sensitivity-check use case (runs in O(seconds)).
1342
1344
  confidence : float, optional
1343
1345
  Two-sided confidence level (default 0.95).
1344
- seed : int, optional
1345
- RNG seed for reproducibility.
1346
+ rng : RNGLike | SeedLike | None, optional
1347
+ RNG argument per `Scientific Python SPEC 7 <https://scientific-python.org/specs/spec-0007/>`_.
1348
+ Int seed (default ``DEFAULT_SEED=42``), ``Generator``, or ``None`` (entropy).
1346
1349
 
1347
1350
  Returns
1348
1351
  -------
@@ -1360,7 +1363,7 @@ def block_bootstrap_on_folds(
1360
1363
  --------
1361
1364
  >>> import numpy as np
1362
1365
  >>> folds = np.array([0.83, 0.81, 0.85, 0.79, 0.84])
1363
- >>> ci = block_bootstrap_on_folds(folds, n_resamples=2000, seed=42)
1366
+ >>> ci = block_bootstrap_on_folds(folds, n_resamples=2000, rng=42)
1364
1367
  >>> ci.method
1365
1368
  'block_bootstrap'
1366
1369
  >>> bool(ci.ci_low <= ci.point_estimate <= ci.ci_high)
@@ -1389,7 +1392,7 @@ def block_bootstrap_on_folds(
1389
1392
  if not 0.0 < confidence < 1.0:
1390
1393
  raise ValueError(f"confidence must be in (0, 1); got {confidence}")
1391
1394
 
1392
- rng = np.random.default_rng(seed)
1395
+ rng = np.random.default_rng(rng)
1393
1396
  # Vectorized: (n_resamples, K) index draws, gather, mean along axis 1.
1394
1397
  idx = rng.integers(0, K, size=(n_resamples, K))
1395
1398
  resample_means = arr[idx].mean(axis=1)
@@ -1412,7 +1415,7 @@ def cross_validate_metric(
1412
1415
  metric: MetricFn,
1413
1416
  k: int = 5,
1414
1417
  stratified: bool = True,
1415
- seed: int = DEFAULT_SEED,
1418
+ rng: RNGLike | SeedLike | None = DEFAULT_SEED,
1416
1419
  ) -> np.ndarray:
1417
1420
  r"""K-fold cross-validation of a metric on caller-supplied scores.
1418
1421
 
@@ -1444,8 +1447,8 @@ def cross_validate_metric(
1444
1447
  If ``True`` (default), use ``StratifiedKFold`` so each fold
1445
1448
  preserves the class balance. Recommended for binary
1446
1449
  classification under class imbalance.
1447
- seed : int, optional
1448
- Shuffle seed for fold assignment.
1450
+ rng : RNGLike | SeedLike | None, optional
1451
+ RNG per SPEC 7 — derived to int at the sklearn ``KFold/StratifiedKFold`` boundary.
1449
1452
 
1450
1453
  Returns
1451
1454
  -------
@@ -1467,7 +1470,7 @@ def cross_validate_metric(
1467
1470
  >>> n = 200
1468
1471
  >>> y = rng.binomial(1, 0.3, size=n).astype(int)
1469
1472
  >>> s = np.clip(y * 0.6 + rng.normal(0, 0.3, n), 0, 1)
1470
- >>> folds = cross_validate_metric(y, s, metric=pr_auc, k=5, seed=42)
1473
+ >>> folds = cross_validate_metric(y, s, metric=pr_auc, k=5, rng=42)
1471
1474
  >>> folds.shape
1472
1475
  (5,)
1473
1476
  >>> bool(np.all(0.0 <= folds[~np.isnan(folds)]))
@@ -1491,12 +1494,18 @@ def cross_validate_metric(
1491
1494
  if k > n:
1492
1495
  raise ValueError(f"k={k} exceeds n={n}")
1493
1496
 
1497
+ # Derive an int seed for sklearn — sklearn KFold's random_state accepts
1498
+ # int | None | RandomState (not Generator) across versions <1.4; safer to
1499
+ # derive at the boundary than pin a higher sklearn minimum.
1500
+ rng = np.random.default_rng(rng)
1501
+ sklearn_seed = int(rng.integers(0, 2**31 - 1))
1502
+
1494
1503
  splitter: KFold | StratifiedKFold
1495
1504
  if stratified:
1496
- splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
1505
+ splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=sklearn_seed)
1497
1506
  fold_iter = splitter.split(np.zeros(n), y_arr)
1498
1507
  else:
1499
- splitter = KFold(n_splits=k, shuffle=True, random_state=seed)
1508
+ splitter = KFold(n_splits=k, shuffle=True, random_state=sklearn_seed)
1500
1509
  fold_iter = splitter.split(np.zeros(n))
1501
1510
 
1502
1511
  fold_metrics = np.full(k, np.nan, dtype=np.float64)