eval-toolkit 1.6.0__tar.gz → 1.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201)
  1. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/CHANGELOG.md +26 -0
  2. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/PKG-INFO +1 -1
  3. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/__init__.py +2 -0
  4. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/_version.py +1 -1
  5. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/bootstrap.py +389 -1
  6. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/benchmarks/test_kernel_benchmarks.py +2 -2
  7. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/golden/public_api/snapshot.json +13 -1
  8. eval_toolkit-1.8.0/tests/test_cluster_bootstrap.py +131 -0
  9. eval_toolkit-1.8.0/tests/test_stratified_cluster_bootstrap.py +153 -0
  10. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/.gitignore +0 -0
  11. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/LICENSE +0 -0
  12. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/README.md +0 -0
  13. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/STYLE.md +0 -0
  14. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/docs/archive/README.md +0 -0
  15. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/docs/research/README.md +0 -0
  16. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/docs/research/datasets/README.md +0 -0
  17. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/docs/research/papers/data-integrity/README.md +0 -0
  18. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  19. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/docs/research/papers/inference/README.md +0 -0
  20. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/docs/research/papers/prompt-injection/README.md +0 -0
  21. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/docs/source/adr/README.md +0 -0
  22. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/docs/source/methodology/README.md +0 -0
  23. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/pyproject.toml +0 -0
  24. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/__main__.py +0 -0
  25. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/_deprecated.py +0 -0
  26. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/_narrative.py +0 -0
  27. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/_parallel.py +0 -0
  28. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/_rng.py +0 -0
  29. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/_sweep.py +0 -0
  30. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/adversarial.py +0 -0
  31. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/analysis.py +0 -0
  32. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/artifacts.py +0 -0
  33. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/audit_citation_alignment.py +0 -0
  34. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/audit_sister_doc_concept_drift.py +0 -0
  35. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/audit_value_bindings.py +0 -0
  36. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/calibration.py +0 -0
  37. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/claims.py +0 -0
  38. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/config.py +0 -0
  39. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/docs.py +0 -0
  40. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/eda/__init__.py +0 -0
  41. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/eda/data_audit.py +0 -0
  42. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/eda/distribution_shift.py +0 -0
  43. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/eda/lexical_association.py +0 -0
  44. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/eda/obfuscation.py +0 -0
  45. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/embeddings.py +0 -0
  46. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/evidence.py +0 -0
  47. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/harness.py +0 -0
  48. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/leakage.py +0 -0
  49. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/loaders.py +0 -0
  50. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/losses.py +0 -0
  51. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/manifest.py +0 -0
  52. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/metric_specs.py +0 -0
  53. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/metrics.py +0 -0
  54. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/operating_points.py +0 -0
  55. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/paths.py +0 -0
  56. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/plotting.py +0 -0
  57. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/preprocessing.py +0 -0
  58. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/probes.py +0 -0
  59. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/protocols.py +0 -0
  60. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/provenance.py +0 -0
  61. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/py.typed +0 -0
  62. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  63. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  64. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  65. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  66. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  67. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  68. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/scorecards.py +0 -0
  69. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/seeds.py +0 -0
  70. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/splits.py +0 -0
  71. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/stacking.py +0 -0
  72. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/text_dedup.py +0 -0
  73. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/src/eval_toolkit/thresholds.py +0 -0
  74. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  75. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  76. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  77. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  78. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  79. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  80. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  81. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  82. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  83. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  84. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/benchmarks/__init__.py +0 -0
  85. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/conftest.py +0 -0
  86. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  87. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  88. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  89. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  90. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/golden/docs/expected.md +0 -0
  91. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/golden/docs/input.md +0 -0
  92. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/golden/docs/metrics.json +0 -0
  93. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  94. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/strategies.py +0 -0
  95. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_adversarial.py +0 -0
  96. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_analysis.py +0 -0
  97. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_artifacts.py +0 -0
  98. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_audit_citation_alignment.py +0 -0
  99. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_audit_sister_doc_concept_drift.py +0 -0
  100. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_audit_value_bindings.py +0 -0
  101. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  102. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  103. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_bootstrap_edge_cases.py +0 -0
  104. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_bootstrap_golden.py +0 -0
  105. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_bootstrap_njobs.py +0 -0
  106. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_bootstrap_props.py +0 -0
  107. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_bootstrap_research_grounded.py +0 -0
  108. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_bootstrap_unit.py +0 -0
  109. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_calibration_binary_adapters.py +0 -0
  110. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  111. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_calibration_determinism.py +0 -0
  112. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_calibration_optimization_failures.py +0 -0
  113. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_calibration_props.py +0 -0
  114. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_calibration_research_grounded.py +0 -0
  115. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_calibration_unit.py +0 -0
  116. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_claims.py +0 -0
  117. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_claims_coverage.py +0 -0
  118. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_claims_props.py +0 -0
  119. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_cli.py +0 -0
  120. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_config.py +0 -0
  121. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_coverage_bootstrap.py +0 -0
  122. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_coverage_calibration.py +0 -0
  123. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_coverage_harness.py +0 -0
  124. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_coverage_metrics.py +0 -0
  125. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_coverage_plotting.py +0 -0
  126. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_croissant_e2e.py +0 -0
  127. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  128. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_deprecated_scalars_shim.py +0 -0
  129. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_deprecations.py +0 -0
  130. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_docs_golden.py +0 -0
  131. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_docs_props.py +0 -0
  132. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_eda.py +0 -0
  133. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_eda_distribution_shift.py +0 -0
  134. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_eda_lexical_association.py +0 -0
  135. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_eda_obfuscation.py +0 -0
  136. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_embeddings.py +0 -0
  137. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_evidence_validators.py +0 -0
  138. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_harness_edge_cases.py +0 -0
  139. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_harness_fault_injection.py +0 -0
  140. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_harness_folded.py +0 -0
  141. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_harness_internals.py +0 -0
  142. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_harness_metric_options.py +0 -0
  143. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_harness_parallelism.py +0 -0
  144. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_harness_smoke.py +0 -0
  145. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_import_boundaries.py +0 -0
  146. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_is_metric_defined_for_slice.py +0 -0
  147. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_lazy_extras_messages.py +0 -0
  148. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_leakage.py +0 -0
  149. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_leakage_error_paths.py +0 -0
  150. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_leakage_props.py +0 -0
  151. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_loaders.py +0 -0
  152. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_loaders_coverage.py +0 -0
  153. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_loaders_props.py +0 -0
  154. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_logging.py +0 -0
  155. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_losses.py +0 -0
  156. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_manifest.py +0 -0
  157. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  158. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_manifest_props.py +0 -0
  159. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_manifest_validation.py +0 -0
  160. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_metrics_props.py +0 -0
  161. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_metrics_stratified_subsets.py +0 -0
  162. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_metrics_unit.py +0 -0
  163. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_misc_coverage.py +0 -0
  164. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_numeric_edge_cases.py +0 -0
  165. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_ood_loader.py +0 -0
  166. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_operating_points.py +0 -0
  167. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_operating_points_props.py +0 -0
  168. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_parallel.py +0 -0
  169. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_paths.py +0 -0
  170. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_pipeline_e2e.py +0 -0
  171. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_plotting_edge.py +0 -0
  172. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_plotting_smoke.py +0 -0
  173. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_plotting_visual.py +0 -0
  174. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_preprocessing.py +0 -0
  175. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_probes.py +0 -0
  176. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_protocol_conformance.py +0 -0
  177. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_provenance.py +0 -0
  178. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_public_api.py +0 -0
  179. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_recall_at_fpr.py +0 -0
  180. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_reference_equivalence.py +0 -0
  181. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_reproducibility_integration.py +0 -0
  182. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_rng.py +0 -0
  183. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_schemas.py +0 -0
  184. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_scorecard.py +0 -0
  185. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_seeds.py +0 -0
  186. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_splits.py +0 -0
  187. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_splits_leakage_integration.py +0 -0
  188. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_splits_props.py +0 -0
  189. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_stacking.py +0 -0
  190. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_sweep.py +0 -0
  191. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_text_dedup.py +0 -0
  192. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_text_dedup_coverage.py +0 -0
  193. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_text_dedup_props.py +0 -0
  194. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_text_dedup_strategies.py +0 -0
  195. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_thresholds.py +0 -0
  196. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_thresholds_constant_score.py +0 -0
  197. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_thresholds_coverage.py +0 -0
  198. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_thresholds_props.py +0 -0
  199. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_thresholds_research_grounded.py +0 -0
  200. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_tokenization_leakage_check.py +0 -0
  201. {eval_toolkit-1.6.0 → eval_toolkit-1.8.0}/tests/test_v09_contracts.py +0 -0
@@ -5,6 +5,32 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.8.0] — 2026-06-04 — composite multi-stratum cluster bootstrap (#92)
9
+
10
+ ### Added — `bootstrap.stratified_cluster_bootstrap_ci` (composite multi-stratum cluster bootstrap)
11
+
12
+ `eval_toolkit.bootstrap` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — backward-compatible. Generalises the v1.7.0 single-block `cluster_bootstrap_ci` to the shape leave-one-group-out transfer gaps actually take: a **composite statistic reduced over several independently-resampled cluster strata**.
13
+
14
+ - **`stratified_cluster_bootstrap_ci(strata, per_stratum_metric, combine, *, resample_labels=(0,1), …)`** — `strata` is a mapping `{key: (y, score, groups)}` of independent resample-units (e.g. `seed`, `(carrier, seed)`, `(attack_type, seed)`); each bootstrap iteration resamples every stratum's `(label, group)` clusters, computes `per_stratum_metric` on each, and reduces the `{key: metric}` map with `combine` to one scalar (a seed-averaged ROC-AUC gap, a mean-over-carriers gap, a top−bottom per-type AUPRC contrast, …). Percentile `BootstrapCI` (`method="stratified_cluster_percentile"`). `cluster_bootstrap_ci` is the single-stratum, identity-reduce special case.
15
+ - **Why:** the v1.7.0 single-block primitive could not express the **seed-averaging** that real LODO estimators do inside the bootstrap (`Gx = val − mean_seed(test_roc)`), so it did not actually fit the consumer portfolio's attack-type / carrier / dialect bootstraps. This is the correct primitive for them.
16
+ - **Parallel + reproducible:** built on `parallel_map` + `spawn_seed_sequences` ⇒ `n_jobs` gives bit-for-bit-identical CIs; `n_jobs=-1` all cores.
17
+ - Exported via `from eval_toolkit import stratified_cluster_bootstrap_ci`; `__all__` + `_EXPORTS` + doctest + n_jobs-reproducibility / seed-averaged / composite-statistic tests; mypy-strict clean.
18
+
19
+ ## [1.7.0] — 2026-06-04 — label-stratified cluster bootstrap (#90, #91)
20
+
21
+ ### Added — `bootstrap.cluster_bootstrap_ci` (label-stratified cluster bootstrap)
22
+
23
+ `eval_toolkit.bootstrap` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — backward-compatible. Closes the gap between the row-level (`bootstrap_ci`) and fold-level (`block_bootstrap_on_folds`) resamplers: **the missing middle — resampling clusters of rows.**
24
+
25
+ - **`cluster_bootstrap_ci(y_true, y_score, groups, statistic, *, resample_labels=(0, 1), …)`** — percentile CI for a single-condition metric that resamples whole **clusters** (`groups`) with replacement, so the CI is honest under intra-cluster correlation (prompts sharing one attack payload; a document contributing both a poisoned and a benign row). The resample unit is `(label, group)`: by default positive- and negative-clusters are resampled **separately** (never a single-class draw); `resample_labels=(1,)` resamples only positive clusters with negatives held fixed (the payload-cluster convention). Returns a `BootstrapCI` with `method="cluster_percentile"`.
26
+ - **Parallel + reproducible:** built on `parallel_map` + `spawn_seed_sequences`, so `n_jobs` gives bit-for-bit-identical CIs across worker counts (the v0.34.0 reproducibility contract). `n_jobs=-1` uses all cores.
27
+ - Motivation: the analytic row-level AUROC-difference CI (`delong_roc_variance`) assumes row independence and under-covers on clustered eval data (LODO transfer gaps with payload/document/page clusters). Dogfooded by the consumer portfolio's attack-type / carrier / dialect leave-one-group-out bootstraps (Rule of Three).
28
+ - Exported via `from eval_toolkit import cluster_bootstrap_ci`; `__all__` + `_EXPORTS` updated; doctest + n_jobs-reproducibility tests; mypy-strict clean.
29
+
30
+ ### Fixed — stale `seed=` kwarg in 2 bootstrap benchmarks (#91)
31
+
32
+ `tests/benchmarks/test_kernel_benchmarks.py` passed `seed=` to `bootstrap_ci` / `paired_bootstrap_diff`, but those parameters migrated to `rng=` (SPEC 7) — the two bootstrap benchmark tests `TypeError`'d on the nightly-benchmarks workflow (excluded from PR CI via `-m "not benchmark"`, so it went unnoticed). 2-line `seed=`→`rng=` rename.
33
+
8
34
  ## [1.6.0] — 2026-05-29 — Tier-2 `eda` Job-2 + Job-3: shortcut + shift diagnostics (#86, #87)
9
35
 
10
36
  `eval_toolkit.eda.*` ADDITIVE per [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md) — Tier-2, backward-compatible. Completes the EDA-first analytic layer above the v1.5.0 Job-1 integrity gate: **Job-2** lexical shortcut diagnostics (`lexical_association`, #86) and **Job-3** distribution-shift quantification (`distribution_shift`, #87). Both are dogfooded by the consumer portfolio's pre-modeling OOD-wall prediction (V5 + V9).
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 1.6.0
3
+ Version: 1.8.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -129,6 +129,7 @@ _EXPORTS: dict[str, str] = {
129
129
  "block_bootstrap_on_folds": "eval_toolkit.bootstrap",
130
130
  "bonferroni_correct": "eval_toolkit.bootstrap",
131
131
  "bootstrap_ci": "eval_toolkit.bootstrap",
132
+ "cluster_bootstrap_ci": "eval_toolkit.bootstrap",
132
133
  "correct_p_values": "eval_toolkit.bootstrap",
133
134
  "cross_validate_metric": "eval_toolkit.bootstrap",
134
135
  "cv_clt_ci": "eval_toolkit.bootstrap",
@@ -139,6 +140,7 @@ _EXPORTS: dict[str, str] = {
139
140
  "paired_bootstrap_ece_diff": "eval_toolkit.bootstrap",
140
141
  "paired_bootstrap_op_point_diff": "eval_toolkit.bootstrap",
141
142
  "paired_mde": "eval_toolkit.bootstrap",
143
+ "stratified_cluster_bootstrap_ci": "eval_toolkit.bootstrap",
142
144
  # --- calibration ---
143
145
  "DEFAULT_FN_COST": "eval_toolkit.calibration",
144
146
  "DEFAULT_FP_COST": "eval_toolkit.calibration",
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "1.6.0"
5
+ __version__ = "1.8.0"
@@ -22,7 +22,7 @@ from __future__ import annotations
22
22
  import functools
23
23
  import logging
24
24
  import warnings
25
- from collections.abc import Callable
25
+ from collections.abc import Callable, Hashable, Mapping
26
26
  from dataclasses import dataclass
27
27
  from typing import Final, Literal
28
28
 
@@ -52,6 +52,7 @@ __all__ = [
52
52
  "block_bootstrap_on_folds",
53
53
  "bonferroni_correct",
54
54
  "bootstrap_ci",
55
+ "cluster_bootstrap_ci",
55
56
  "correct_p_values",
56
57
  "cross_validate_metric",
57
58
  "cv_clt_ci",
@@ -62,6 +63,7 @@ __all__ = [
62
63
  "paired_bootstrap_ece_diff",
63
64
  "paired_bootstrap_op_point_diff",
64
65
  "paired_mde",
66
+ "stratified_cluster_bootstrap_ci",
65
67
  ]
66
68
 
67
69
  DEFAULT_N_RESAMPLES: Final[int] = 1000
@@ -1447,6 +1449,392 @@ def block_bootstrap_on_folds(
1447
1449
  )
1448
1450
 
1449
1451
 
1452
+ def _label_cluster_units(y_true: np.ndarray, groups: np.ndarray) -> dict[int, list[np.ndarray]]:
1453
+ """Index rows by ``(label, group)``: per label, a list of per-group row-index arrays.
1454
+
1455
+ A group that appears under both labels contributes a **separate** index array to each label's
1456
+ list — the resample unit is ``(label, group)``, so a mixed-label group (e.g. a document with
1457
+ both a poisoned and a benign variant sharing one id) splits into one positive unit and one
1458
+ negative unit, resampled independently. Helper for :func:`cluster_bootstrap_ci`.
1459
+ """
1460
+ units: dict[int, list[np.ndarray]] = {}
1461
+ for lab in np.unique(y_true):
1462
+ lab_rows = np.flatnonzero(y_true == lab)
1463
+ order = np.argsort(groups[lab_rows], kind="stable")
1464
+ sorted_rows = lab_rows[order]
1465
+ sorted_groups = groups[lab_rows][order]
1466
+ # Split at the boundaries between consecutive distinct group ids (post-sort).
1467
+ cut = np.flatnonzero(sorted_groups[1:] != sorted_groups[:-1]) + 1
1468
+ units[int(lab)] = np.split(sorted_rows, cut)
1469
+ return units
1470
+
1471
+
1472
+ def _cluster_bootstrap_step(
1473
+ seed_seq: np.random.SeedSequence,
1474
+ *,
1475
+ y_true: np.ndarray,
1476
+ y_score: np.ndarray,
1477
+ units: dict[int, list[np.ndarray]],
1478
+ statistic: MetricFn,
1479
+ resample_labels: tuple[int, ...],
1480
+ ) -> float | None:
1481
+ """One cluster-resampled draw of ``statistic`` (module-level for parallel_map picklability).
1482
+
1483
+ For each label, its ``(label, group)`` units are resampled with replacement when the label is
1484
+ in ``resample_labels``, else all its rows are held fixed. Returns the statistic on the gathered
1485
+ rows, or ``None`` if the draw is degenerate (statistic raises — e.g. a single-class draw).
1486
+ """
1487
+ rng = np.random.default_rng(seed_seq)
1488
+ parts: list[np.ndarray] = []
1489
+ for lab, group_rows in units.items():
1490
+ if lab in resample_labels:
1491
+ chosen = rng.integers(0, len(group_rows), size=len(group_rows))
1492
+ parts.extend(group_rows[c] for c in chosen)
1493
+ else:
1494
+ parts.extend(group_rows)
1495
+ idx = np.concatenate(parts)
1496
+ try:
1497
+ return float(statistic(y_true[idx], y_score[idx]))
1498
+ except (ValueError, RuntimeError):
1499
+ return None
1500
+
1501
+
1502
+ def cluster_bootstrap_ci(
1503
+ y_true: np.ndarray,
1504
+ y_score: np.ndarray,
1505
+ groups: np.ndarray,
1506
+ statistic: MetricFn,
1507
+ *,
1508
+ resample_labels: tuple[int, ...] = (0, 1),
1509
+ n_resamples: int = DEFAULT_N_RESAMPLES,
1510
+ confidence: float = DEFAULT_CONFIDENCE,
1511
+ rng: RNGLike | SeedLike | None = DEFAULT_SEED,
1512
+ n_jobs: int = 1,
1513
+ ) -> BootstrapCI:
1514
+ r"""Label-stratified **cluster** (group) bootstrap percentile CI for a single-condition metric.
1515
+
1516
+ Resamples whole ``groups`` (clusters) with replacement rather than individual rows, so the CI
1517
+ is honest under intra-cluster correlation (multiple prompts sharing one attack payload; a
1518
+ document contributing both a poisoned and a benign row). The resample unit is ``(label,
1519
+ group)``: by default (``resample_labels=(0, 1)``) positive-clusters and negative-clusters are
1520
+ resampled **separately**, preserving the per-class cluster split so a draw is never
1521
+ single-class. Pass ``resample_labels=(1,)`` to resample only the positive clusters while holding
1522
+ all negatives fixed (the payload-cluster convention).
1523
+
1524
+ Where :func:`bootstrap_ci` resamples **rows** (i.i.d. assumption) and
1525
+ :func:`block_bootstrap_on_folds` resamples **per-fold scalars**, this resamples **clusters of
1526
+ rows** — the missing middle for grouped eval data. The analytic row-level AUROC-difference CI
1527
+ (:func:`delong_roc_variance`) assumes row independence and so under-covers on clustered data,
1528
+ which is the motivation for this estimator.
1529
+
1530
+ Parameters
1531
+ ----------
1532
+ y_true : np.ndarray, shape (n,)
1533
+ Binary labels in ``{0, 1}``.
1534
+ y_score : np.ndarray, shape (n,)
1535
+ Scores aligned with ``y_true``.
1536
+ groups : np.ndarray, shape (n,)
1537
+ Cluster id per row (any sortable dtype — ints or strings).
1538
+ statistic : callable ``(y_true, y_score) -> float``
1539
+ Metric to bootstrap (e.g. ``roc_auc``). Must be **picklable** when ``n_jobs != 1`` (a named
1540
+ top-level function — lambdas / closures are rejected).
1541
+ resample_labels : tuple[int, ...], optional
1542
+ Which label strata are cluster-resampled (default ``(0, 1)`` — both). Labels not listed are
1543
+ held fixed (all their rows always included). Must be non-empty.
1544
+ n_resamples, confidence, rng : standard bootstrap params (``rng`` per SPEC 7).
1545
+ n_jobs : int, optional
1546
+ Parallel workers (default 1 — sequential). ``n_jobs=-1`` uses all cores; ``n_jobs=0`` is
1547
+ rejected. Per-resample seeding via :func:`spawn_seed_sequences` makes the CI **bit-for-bit
1548
+ identical across ``n_jobs``** for a fixed ``rng``. See :ref:`methodology/parallelism`.
1549
+
1550
+ Returns
1551
+ -------
1552
+ BootstrapCI
1553
+ ``method="cluster_percentile"``; ``point_estimate = statistic(y_true, y_score)`` on the full
1554
+ data; ``[alpha/2, 1 - alpha/2]`` percentile CI over the cluster-resampled distribution.
1555
+
1556
+ Raises
1557
+ ------
1558
+ ValueError
1559
+ On shape mismatch, non-1-D input, ``n < 10``, ``confidence`` outside (0, 1), empty
1560
+ ``resample_labels``, a ``resample_labels`` entry absent from ``y_true``, ``n_jobs == 0``, or
1561
+ > 5% degenerate resamples.
1562
+ TypeError
1563
+ If ``n_jobs != 1`` and ``statistic`` is not picklable.
1564
+
1565
+ Examples
1566
+ --------
1567
+ >>> import numpy as np
1568
+ >>> from eval_toolkit.metrics import roc_auc
1569
+ >>> rng = np.random.default_rng(0)
1570
+ >>> groups = np.repeat(np.arange(40), 5) # 40 clusters of 5 rows
1571
+ >>> y = (groups % 2).astype(int) # cluster-pure labels
1572
+ >>> s = y + rng.normal(0, 0.3, size=y.size)
1573
+ >>> ci = cluster_bootstrap_ci(y, s, groups, roc_auc, n_resamples=200, rng=0)
1574
+ >>> ci.method
1575
+ 'cluster_percentile'
1576
+ >>> bool(0.0 <= ci.ci_low <= ci.ci_high <= 1.0)
1577
+ True
1578
+
1579
+ Notes
1580
+ -----
1581
+ For a *gap* statistic with a fixed offset (e.g. ``Gx = val_auc − test_auc`` with ``val_auc``
1582
+ held fixed), bootstrap the variable term and shift the bounds: ``Gx_low = val_auc − ci.ci_high``,
1583
+ ``Gx_high = val_auc − ci.ci_low``. For a one-sided 95% bound, pass ``confidence=0.90`` and read
1584
+ the relevant bound.
1585
+
1586
+ References
1587
+ ----------
1588
+ .. [1] Efron, B. & Tibshirani, R. "An Introduction to the Bootstrap." Chapman & Hall, 1993.
1589
+ (§8 — bootstrapping stratified / clustered data.)
1590
+ .. [2] Field, C. A. & Welsh, A. H. "Bootstrapping clustered data." JRSS-B 69(3), 2007.
1591
+ """
1592
+ y_true_arr = np.asarray(y_true)
1593
+ y_score_arr = np.asarray(y_score)
1594
+ groups_arr = np.asarray(groups)
1595
+ if not (y_true_arr.shape == y_score_arr.shape == groups_arr.shape):
1596
+ raise ValueError(
1597
+ f"shapes mismatch: y_true {y_true_arr.shape}, y_score {y_score_arr.shape}, "
1598
+ f"groups {groups_arr.shape}"
1599
+ )
1600
+ if y_true_arr.ndim != 1:
1601
+ raise ValueError(f"inputs must be 1-D; got shape {y_true_arr.shape}")
1602
+ n = int(y_true_arr.size)
1603
+ if n < 10:
1604
+ raise ValueError(f"n={n} too small for cluster bootstrap; need ≥ 10")
1605
+ if not 0.0 < confidence < 1.0:
1606
+ raise ValueError(f"confidence must be in (0, 1), got {confidence}")
1607
+ resample_labels = tuple(int(x) for x in resample_labels)
1608
+ if not resample_labels:
1609
+ raise ValueError("resample_labels must be non-empty (nothing would be resampled)")
1610
+ present = {int(v) for v in np.unique(y_true_arr).tolist()}
1611
+ missing = set(resample_labels) - present
1612
+ if missing:
1613
+ raise ValueError(
1614
+ f"resample_labels {sorted(missing)} absent from y_true (present: {sorted(present)})"
1615
+ )
1616
+
1617
+ point = float(statistic(y_true_arr, y_score_arr))
1618
+ units = _label_cluster_units(y_true_arr, groups_arr)
1619
+ seed_seqs = spawn_seed_sequences(rng, n_resamples)
1620
+ step = functools.partial(
1621
+ _cluster_bootstrap_step,
1622
+ y_true=y_true_arr,
1623
+ y_score=y_score_arr,
1624
+ units=units,
1625
+ statistic=statistic,
1626
+ resample_labels=resample_labels,
1627
+ )
1628
+ raw = parallel_map(step, seed_seqs, n_jobs=n_jobs, description="cluster_bootstrap_ci")
1629
+ failures = sum(1 for r in raw if r is None)
1630
+ vals = [r for r in raw if r is not None]
1631
+ if failures > 0.05 * n_resamples:
1632
+ raise ValueError(
1633
+ f"cluster_bootstrap_ci: {failures}/{n_resamples} resamples degenerate "
1634
+ "(statistic raised — e.g. single-class draws); refusing to compute CI on > 5% degenerate"
1635
+ )
1636
+ if not vals:
1637
+ raise ValueError("cluster_bootstrap_ci: no usable resamples")
1638
+ arr = np.asarray(vals, dtype=np.float64)
1639
+ alpha = 1.0 - confidence
1640
+ ci_low, ci_high = np.quantile(arr, [alpha / 2.0, 1.0 - alpha / 2.0])
1641
+ return BootstrapCI(
1642
+ point_estimate=point,
1643
+ ci_low=float(ci_low),
1644
+ ci_high=float(ci_high),
1645
+ confidence=confidence,
1646
+ n_resamples=int(len(vals)),
1647
+ method="cluster_percentile",
1648
+ )
1649
+
1650
+
1651
def _stratified_cluster_step(
    seed_seq: np.random.SeedSequence,
    *,
    strata_data: dict[Hashable, tuple[np.ndarray, np.ndarray]],
    strata_units: dict[Hashable, dict[int, list[np.ndarray]]],
    per_stratum_metric: MetricFn,
    combine: Callable[[Mapping[Hashable, float]], float],
    resample_labels: tuple[int, ...],
) -> float | None:
    """One stratified-cluster draw of ``combine({key: metric})`` (module-level for picklability).

    Each stratum's ``(label, group)`` units are resampled independently (per ``resample_labels``;
    labels not listed are held fixed) with a single per-resample RNG, the per-stratum metric is
    computed on the gathered rows, and ``combine`` reduces the ``{key: metric}`` map to one scalar.

    Parameters
    ----------
    seed_seq : np.random.SeedSequence
        Seed for this draw's private generator; one generator is shared by all strata in the draw.
    strata_data : dict[key, (y_true, y_score)]
        Per-stratum label and score arrays, indexed by the row indices stored in ``strata_units``.
    strata_units : dict[key, dict[label, list[np.ndarray]]]
        Per-stratum, per-label list of row-index arrays (one array per cluster).
    per_stratum_metric : callable ``(y_true, y_score) -> float``
        Metric evaluated on each stratum's gathered rows.
    combine : callable ``Mapping[key, float] -> float``
        Reduction of the per-stratum metrics to the composite statistic.
    resample_labels : tuple[int, ...]
        Labels whose clusters are drawn with replacement; other labels keep every cluster once.

    Returns
    -------
    float or None
        The combined statistic for this draw, or ``None`` for a degenerate draw: a stratum with
        no rows to gather, a per-stratum metric raising ``ValueError`` / ``RuntimeError`` /
        ``ZeroDivisionError``, or ``combine`` raising one of those or ``KeyError``.
    """
    rng = np.random.default_rng(seed_seq)
    by_key: dict[Hashable, float] = {}
    for key, units in strata_units.items():
        y_k, s_k = strata_data[key]
        parts: list[np.ndarray] = []
        for lab, group_rows in units.items():
            if lab in resample_labels:
                # Draw as many clusters as the label has, with replacement. The order of RNG
                # calls is part of the reproducibility contract, so it must not be rearranged.
                chosen = rng.integers(0, len(group_rows), size=len(group_rows))
                parts.extend(group_rows[c] for c in chosen)
            else:
                parts.extend(group_rows)
        try:
            # ``np.concatenate`` raises ValueError on an empty list (a stratum with no rows);
            # keeping it inside the ``try`` reports that as a degenerate draw rather than letting
            # NumPy's message escape from the worker.
            idx = np.concatenate(parts)
            by_key[key] = float(per_stratum_metric(y_k[idx], s_k[idx]))
        except (ValueError, RuntimeError, ZeroDivisionError):
            # ZeroDivisionError is handled here as it is for ``combine`` below: a ratio metric
            # dividing by zero on an unlucky draw is a degenerate draw, not a reason to abort
            # the whole bootstrap.
            return None
    try:
        return float(combine(by_key))
    except (ValueError, RuntimeError, KeyError, ZeroDivisionError):
        return None
1687
+
1688
+
1689
def stratified_cluster_bootstrap_ci(
    strata: Mapping[Hashable, tuple[np.ndarray, np.ndarray, np.ndarray]],
    per_stratum_metric: MetricFn,
    combine: Callable[[Mapping[Hashable, float]], float],
    *,
    resample_labels: tuple[int, ...] = (0, 1),
    n_resamples: int = DEFAULT_N_RESAMPLES,
    confidence: float = DEFAULT_CONFIDENCE,
    rng: RNGLike | SeedLike | None = DEFAULT_SEED,
    n_jobs: int = 1,
) -> BootstrapCI:
    r"""Cluster bootstrap of a **composite** statistic over several independent strata.

    Generalises :func:`cluster_bootstrap_ci` (one condition, one metric) to a statistic that is a
    user-supplied **reduction over several independently-resampled cluster strata** — the shape that
    leave-one-group-out transfer gaps take in practice: a per-seed (and per-group) cluster resample,
    averaged/combined into one scalar (a seed-averaged ROC-AUC gap, a mean-over-carriers gap, a
    top-minus-bottom per-type AUPRC contrast, …). Each bootstrap iteration resamples every stratum's
    ``(label, group)`` clusters (label-stratified, per ``resample_labels``; labels not listed are
    held fixed), computes ``per_stratum_metric`` on each, and reduces the ``{key: metric}`` map with
    ``combine``; the percentile CI is over those reduced values.

    :func:`cluster_bootstrap_ci` is the single-stratum, identity-reduce special case
    (``strata={0: (y, score, groups)}``, ``combine=lambda m: m[0]``).

    Parameters
    ----------
    strata : Mapping[key, (y_true, y_score, groups)]
        Independent resample-units keyed by any hashable (e.g. ``seed`` / ``(carrier, seed)`` /
        ``(attack_type, seed)``). Each value is three aligned 1-D arrays. Iteration order is the
        mapping's order (stable ⇒ deterministic).
    per_stratum_metric : callable ``(y_true, y_score) -> float``
        Metric computed on each stratum's resampled rows (e.g. ``roc_auc``, ``pr_auc``). Must be
        **picklable** when ``n_jobs != 1``.
    combine : callable ``Mapping[key, float] -> float``
        Reduces the per-stratum metrics to the composite statistic (e.g.
        ``val − mean_seed(m)``; ``mean_carrier(val[c] − mean_seed(m[c, ·]))``; a top−bottom contrast).
        Closes over any fixed quantities (val ROC, the type partition) — pass a **picklable**
        top-level function or ``functools.partial`` when ``n_jobs != 1`` (lambdas are fine at
        ``n_jobs == 1``).
    resample_labels : tuple[int, ...], optional
        Which label strata are cluster-resampled within each stratum (default ``(0, 1)``);
        ``(1,)`` resamples only positive clusters, holding negatives fixed (the payload-cluster
        convention). Labels not present in a given stratum are simply skipped there, but at least
        one listed label must occur in at least one stratum.
    n_resamples, confidence, rng : standard bootstrap params (``rng`` per SPEC 7).
    n_jobs : int, optional
        Parallel workers (default 1). Per-resample seeding via :func:`spawn_seed_sequences` makes the
        CI **bit-for-bit identical across ``n_jobs``**; ``n_jobs=-1`` uses all cores. See
        :ref:`methodology/parallelism`.

    Returns
    -------
    BootstrapCI
        ``method="stratified_cluster_percentile"``; ``point_estimate = combine({key:
        per_stratum_metric(y, score)})`` on the full data; ``[alpha/2, 1 - alpha/2]`` percentile CI.
        ``n_resamples`` on the result is the number of *usable* (non-degenerate) draws.

    Raises
    ------
    ValueError
        On empty ``strata``, a stratum whose arrays mismatch shape / are not 1-D, empty
        ``resample_labels``, ``resample_labels`` with no entry present in any stratum,
        ``confidence`` outside (0, 1), ``n_jobs == 0``, or > 5% degenerate resamples.
    TypeError
        If ``n_jobs != 1`` and ``per_stratum_metric`` / ``combine`` are not picklable.

    Examples
    --------
    >>> import numpy as np
    >>> from eval_toolkit.metrics import roc_auc
    >>> def _block(seed):
    ...     g = np.repeat(np.arange(20), 4)  # 20 clusters of 4
    ...     y = (g % 2).astype(int)
    ...     s = y + np.random.default_rng(seed).normal(0, 0.3, size=y.size)
    ...     return y, s, g
    >>> strata = {0: _block(0), 1: _block(1)}  # two seeds
    >>> ci = stratified_cluster_bootstrap_ci(
    ...     strata, roc_auc, lambda m: float(np.mean(list(m.values()))),
    ...     n_resamples=200, rng=0)
    >>> ci.method
    'stratified_cluster_percentile'
    >>> bool(0.0 <= ci.ci_low <= ci.ci_high <= 1.0)
    True

    Notes
    -----
    For a *gap* with a fixed offset (``Gx = val − stat``, ``val`` fixed), fold the offset into
    ``combine`` (``combine`` returns ``val − mean(m.values())``) so the CI is on ``Gx`` directly.

    References
    ----------
    .. [1] Efron, B. & Tibshirani, R. "An Introduction to the Bootstrap." Chapman & Hall, 1993.
       (§8 — stratified / clustered resampling.)
    .. [2] Field, C. A. & Welsh, A. H. "Bootstrapping clustered data." JRSS-B 69(3), 2007.
    """
    if not strata:
        raise ValueError("strata must be non-empty")
    if not 0.0 < confidence < 1.0:
        raise ValueError(f"confidence must be in (0, 1), got {confidence}")
    resample_labels = tuple(int(x) for x in resample_labels)
    if not resample_labels:
        raise ValueError("resample_labels must be non-empty (nothing would be resampled)")

    # Split each stratum into the arrays the metric sees and the units the step function
    # resamples, validating alignment once up front rather than inside every draw.
    strata_data: dict[Hashable, tuple[np.ndarray, np.ndarray]] = {}
    strata_units: dict[Hashable, dict[int, list[np.ndarray]]] = {}
    for key, triple in strata.items():
        y_a, s_a, g_a = (np.asarray(triple[0]), np.asarray(triple[1]), np.asarray(triple[2]))
        if not (y_a.shape == s_a.shape == g_a.shape) or y_a.ndim != 1:
            raise ValueError(
                f"stratum {key!r}: y/score/groups must be aligned 1-D arrays; got "
                f"{y_a.shape}, {s_a.shape}, {g_a.shape}"
            )
        strata_data[key] = (y_a, s_a)
        strata_units[key] = _label_cluster_units(y_a, g_a)

    # A label missing from one stratum is tolerated (documented above). If no listed label occurs
    # in any stratum, though, every draw would gather each stratum's rows unresampled and the
    # "CI" would carry no sampling variability at all: the same failure as an empty
    # ``resample_labels``, so it is rejected the same way. The membership test mirrors the one
    # used per draw in ``_stratified_cluster_step``.
    if not any(lab in resample_labels for units in strata_units.values() for lab in units):
        raise ValueError(
            f"resample_labels {sorted(resample_labels)} absent from every stratum "
            "(nothing would be resampled)"
        )

    # Point estimate on the full data; deliberately not wrapped in try/except, so a metric that
    # fails on the un-resampled data surfaces immediately.
    point = float(combine({k: float(per_stratum_metric(*strata_data[k])) for k in strata_data}))
    seed_seqs = spawn_seed_sequences(rng, n_resamples)
    step = functools.partial(
        _stratified_cluster_step,
        strata_data=strata_data,
        strata_units=strata_units,
        per_stratum_metric=per_stratum_metric,
        combine=combine,
        resample_labels=resample_labels,
    )
    raw = parallel_map(
        step, seed_seqs, n_jobs=n_jobs, description="stratified_cluster_bootstrap_ci"
    )
    # ``None`` marks a degenerate draw; tolerate up to 5% of them, then refuse.
    failures = sum(1 for r in raw if r is None)
    vals = [r for r in raw if r is not None]
    if failures > 0.05 * n_resamples:
        raise ValueError(
            f"stratified_cluster_bootstrap_ci: {failures}/{n_resamples} resamples degenerate "
            "(a per-stratum metric or combine raised); refusing CI on > 5% degenerate"
        )
    if not vals:
        raise ValueError("stratified_cluster_bootstrap_ci: no usable resamples")
    arr = np.asarray(vals, dtype=np.float64)
    alpha = 1.0 - confidence
    ci_low, ci_high = np.quantile(arr, [alpha / 2.0, 1.0 - alpha / 2.0])
    return BootstrapCI(
        point_estimate=point,
        ci_low=float(ci_low),
        ci_high=float(ci_high),
        confidence=confidence,
        n_resamples=int(len(vals)),
        method="stratified_cluster_percentile",
    )
1836
+
1837
+
1450
1838
  def cross_validate_metric(
1451
1839
  y_true: np.ndarray,
1452
1840
  y_score: np.ndarray,
@@ -136,7 +136,7 @@ def test_benchmark_bootstrap_ci_pr_auc_n200(
136
136
  y, s = yspc_n200
137
137
 
138
138
  def _run() -> float:
139
- return bootstrap_ci(y, s, metric=pr_auc, n_resamples=200, seed=42).point_estimate
139
+ return bootstrap_ci(y, s, metric=pr_auc, n_resamples=200, rng=42).point_estimate
140
140
 
141
141
  result = benchmark(_run)
142
142
  assert 0.0 <= result <= 1.0
@@ -155,7 +155,7 @@ def test_benchmark_paired_bootstrap_diff_pr_auc_n200(
155
155
  y, s_a, s_b = y_two_scorers_n200
156
156
 
157
157
  def _run() -> float:
158
- return paired_bootstrap_diff(y, s_a, s_b, metric=pr_auc, n_resamples=200, seed=42).delta
158
+ return paired_bootstrap_diff(y, s_a, s_b, metric=pr_auc, n_resamples=200, rng=42).delta
159
159
 
160
160
  result = benchmark(_run)
161
161
  # Delta can be negative (B worse than A) — just verify it's finite + in expected range
@@ -148,6 +148,7 @@
148
148
  "bootstrap_metric_from_predictions",
149
149
  "brier_decomposition",
150
150
  "capture_git_sha",
151
+ "cluster_bootstrap_ci",
151
152
  "compute_file_hash",
152
153
  "compute_label_overlap",
153
154
  "correct_p_values",
@@ -241,6 +242,7 @@
241
242
  "skipped_metric",
242
243
  "source_role_gate",
243
244
  "split_provenance_config",
245
+ "stratified_cluster_bootstrap_ci",
244
246
  "stratified_recall",
245
247
  "strict_artifact_gate",
246
248
  "sweep",
@@ -1429,7 +1431,7 @@
1429
1431
  "doc_first_line": "str(object='') -> str",
1430
1432
  "kind": "value",
1431
1433
  "type": "str",
1432
- "value": "'1.6.0'"
1434
+ "value": "'1.8.0'"
1433
1435
  },
1434
1436
  "apply_operating_points": {
1435
1437
  "doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
@@ -1476,6 +1478,11 @@
1476
1478
  "kind": "function",
1477
1479
  "signature": "(repo_root: 'Path | str | None' = None) -> 'str | None'"
1478
1480
  },
1481
+ "cluster_bootstrap_ci": {
1482
+ "doc_first_line": "Label-stratified **cluster** (group) bootstrap percentile CI for a single-condition metric.",
1483
+ "kind": "function",
1484
+ "signature": "(y_true: 'np.ndarray', y_score: 'np.ndarray', groups: 'np.ndarray', statistic: 'MetricFn', *, resample_labels: 'tuple[int, ...]' = (0, 1), n_resamples: 'int' = 1000, confidence: 'float' = 0.95, rng: 'RNGLike | SeedLike | None' = 42, n_jobs: 'int' = 1) -> 'BootstrapCI'"
1485
+ },
1479
1486
  "compute_file_hash": {
1480
1487
  "doc_first_line": "SHA-256 hex digest of an existing file (sentinel-typed).",
1481
1488
  "kind": "function",
@@ -1941,6 +1948,11 @@
1941
1948
  "kind": "function",
1942
1949
  "signature": "(config: 'Mapping[str, Any]', repo_root: 'Path | str | None' = None, *, path_keys: 'tuple[str, ...]' = ('path', 'dir', 'file', 'splits_dir', 'model_path')) -> 'dict[str, Any]'"
1943
1950
  },
1951
+ "stratified_cluster_bootstrap_ci": {
1952
+ "doc_first_line": "Cluster bootstrap of a **composite** statistic over several independent strata.",
1953
+ "kind": "function",
1954
+ "signature": "(strata: 'Mapping[Hashable, tuple[np.ndarray, np.ndarray, np.ndarray]]', per_stratum_metric: 'MetricFn', combine: 'Callable[[Mapping[Hashable, float]], float]', *, resample_labels: 'tuple[int, ...]' = (0, 1), n_resamples: 'int' = 1000, confidence: 'float' = 0.95, rng: 'RNGLike | SeedLike | None' = 42, n_jobs: 'int' = 1) -> 'BootstrapCI'"
1955
+ },
1944
1956
  "stratified_recall": {
1945
1957
  "doc_first_line": "Recall (TPR) per categorical stratum.",
1946
1958
  "kind": "function",