eval-toolkit 0.48.0__tar.gz → 0.50.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/CHANGELOG.md +163 -0
  2. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/PKG-INFO +6 -6
  3. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/README.md +5 -5
  4. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/STYLE.md +103 -4
  5. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/pyproject.toml +7 -8
  6. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/__init__.py +8 -8
  7. eval_toolkit-0.50.0/src/eval_toolkit/_rng.py +65 -0
  8. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/_version.py +1 -1
  9. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/adversarial.py +18 -18
  10. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/analysis.py +5 -4
  11. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/bootstrap.py +42 -33
  12. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/harness.py +31 -24
  13. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/leakage.py +5 -17
  14. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/manifest.py +10 -10
  15. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/metric_specs.py +1 -1
  16. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/metrics.py +7 -5
  17. eval_toolkit-0.48.0/src/eval_toolkit/_scorecard.py → eval_toolkit-0.50.0/src/eval_toolkit/scorecards.py +19 -15
  18. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/stacking.py +16 -4
  19. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/thresholds.py +5 -4
  20. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/golden/bootstrap_ci/cases.json +6 -6
  21. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/golden/public_api/snapshot.json +26 -26
  22. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_adversarial.py +17 -17
  23. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_analysis.py +5 -5
  24. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_block_bootstrap_on_folds.py +7 -7
  25. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_calibration_mc.py +4 -4
  26. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_edge_cases.py +3 -3
  27. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_golden.py +18 -18
  28. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_njobs.py +12 -12
  29. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_props.py +11 -11
  30. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_research_grounded.py +3 -3
  31. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_bootstrap_unit.py +18 -18
  32. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_calibration_bootstrap_chain.py +2 -2
  33. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_coverage_bootstrap.py +6 -6
  34. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_coverage_metrics.py +1 -1
  35. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_harness_fault_injection.py +2 -2
  36. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_harness_internals.py +3 -3
  37. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_harness_metric_options.py +2 -2
  38. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_harness_parallelism.py +10 -10
  39. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_harness_smoke.py +2 -2
  40. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_lazy_extras_messages.py +2 -2
  41. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_logging.py +1 -1
  42. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_manifest.py +43 -43
  43. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_manifest_contamination_round_trip.py +6 -6
  44. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_manifest_props.py +11 -11
  45. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_manifest_validation.py +4 -4
  46. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_pipeline_e2e.py +5 -5
  47. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_preprocessing.py +2 -2
  48. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_reference_equivalence.py +2 -2
  49. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_reproducibility_integration.py +10 -10
  50. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_schemas.py +6 -6
  51. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_scorecard.py +16 -16
  52. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_stacking.py +17 -19
  53. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_thresholds.py +2 -2
  54. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_thresholds_coverage.py +1 -1
  55. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_v09_contracts.py +5 -5
  56. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/.gitignore +0 -0
  57. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/LICENSE +0 -0
  58. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/docs/archive/README.md +0 -0
  59. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/docs/research/README.md +0 -0
  60. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/docs/research/datasets/README.md +0 -0
  61. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/docs/research/papers/data-integrity/README.md +0 -0
  62. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  63. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/docs/research/papers/inference/README.md +0 -0
  64. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/docs/research/papers/prompt-injection/README.md +0 -0
  65. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/docs/source/adr/README.md +0 -0
  66. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/docs/source/methodology/README.md +0 -0
  67. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/__main__.py +0 -0
  68. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/_deprecated.py +0 -0
  69. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/_parallel.py +0 -0
  70. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/_sweep.py +0 -0
  71. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/artifacts.py +0 -0
  72. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/calibration.py +0 -0
  73. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/claims.py +0 -0
  74. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/config.py +0 -0
  75. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/docs.py +0 -0
  76. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/embeddings.py +0 -0
  77. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/evidence.py +0 -0
  78. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/loaders.py +0 -0
  79. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/losses.py +0 -0
  80. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/operating_points.py +0 -0
  81. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/paths.py +0 -0
  82. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/plotting.py +0 -0
  83. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/preprocessing.py +0 -0
  84. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/probes.py +0 -0
  85. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/protocols.py +0 -0
  86. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/provenance.py +0 -0
  87. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/py.typed +0 -0
  88. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  89. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  90. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  91. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  92. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  93. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  94. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/seeds.py +0 -0
  95. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/splits.py +0 -0
  96. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/src/eval_toolkit/text_dedup.py +0 -0
  97. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  98. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  99. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  100. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  101. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  102. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  103. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  104. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  105. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  106. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  107. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/benchmarks/__init__.py +0 -0
  108. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  109. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/conftest.py +0 -0
  110. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  111. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  112. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  113. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/golden/docs/expected.md +0 -0
  114. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/golden/docs/input.md +0 -0
  115. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/golden/docs/metrics.json +0 -0
  116. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  117. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/strategies.py +0 -0
  118. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_artifacts.py +0 -0
  119. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_calibration_binary_adapters.py +0 -0
  120. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_calibration_determinism.py +0 -0
  121. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_calibration_optimization_failures.py +0 -0
  122. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_calibration_props.py +0 -0
  123. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_calibration_research_grounded.py +0 -0
  124. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_calibration_unit.py +0 -0
  125. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_claims.py +0 -0
  126. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_claims_coverage.py +0 -0
  127. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_claims_props.py +0 -0
  128. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_cli.py +0 -0
  129. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_config.py +0 -0
  130. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_coverage_calibration.py +0 -0
  131. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_coverage_harness.py +0 -0
  132. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_coverage_plotting.py +0 -0
  133. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_croissant_e2e.py +0 -0
  134. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  135. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_deprecated_scalars_shim.py +0 -0
  136. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_deprecations.py +0 -0
  137. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_docs_golden.py +0 -0
  138. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_docs_props.py +0 -0
  139. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_embeddings.py +0 -0
  140. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_evidence_validators.py +0 -0
  141. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_harness_edge_cases.py +0 -0
  142. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_harness_folded.py +0 -0
  143. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_import_boundaries.py +0 -0
  144. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_is_metric_defined_for_slice.py +0 -0
  145. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_leakage.py +0 -0
  146. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_leakage_error_paths.py +0 -0
  147. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_leakage_props.py +0 -0
  148. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_loaders.py +0 -0
  149. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_loaders_coverage.py +0 -0
  150. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_loaders_props.py +0 -0
  151. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_losses.py +0 -0
  152. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_metrics_props.py +0 -0
  153. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_metrics_stratified_subsets.py +0 -0
  154. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_metrics_unit.py +0 -0
  155. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_misc_coverage.py +0 -0
  156. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_numeric_edge_cases.py +0 -0
  157. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_ood_loader.py +0 -0
  158. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_operating_points.py +0 -0
  159. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_operating_points_props.py +0 -0
  160. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_parallel.py +0 -0
  161. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_paths.py +0 -0
  162. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_plotting_edge.py +0 -0
  163. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_plotting_smoke.py +0 -0
  164. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_plotting_visual.py +0 -0
  165. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_probes.py +0 -0
  166. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_protocol_conformance.py +0 -0
  167. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_provenance.py +0 -0
  168. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_public_api.py +0 -0
  169. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_recall_at_fpr.py +0 -0
  170. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_seeds.py +0 -0
  171. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_splits.py +0 -0
  172. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_splits_leakage_integration.py +0 -0
  173. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_splits_props.py +0 -0
  174. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_sweep.py +0 -0
  175. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_text_dedup.py +0 -0
  176. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_text_dedup_coverage.py +0 -0
  177. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_text_dedup_props.py +0 -0
  178. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_text_dedup_strategies.py +0 -0
  179. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_thresholds_constant_score.py +0 -0
  180. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_thresholds_props.py +0 -0
  181. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_thresholds_research_grounded.py +0 -0
  182. {eval_toolkit-0.48.0 → eval_toolkit-0.50.0}/tests/test_tokenization_leakage_check.py +0 -0
@@ -5,6 +5,169 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.50.0] — 2026-05-23 — SPEC 7 `rng` parameter adoption
9
+
10
+ The SPEC 7 follow-up to v0.49.0. The `_rng.py` scaffold shipped at
11
+ v0.49.0 (SeedLike + RNGLike type aliases per
12
+ [Scientific Python SPEC 7](https://scientific-python.org/specs/spec-0007/))
13
+ is now wired into every Tier-1 public function that consumes a NumPy RNG.
14
+
15
+ ### BREAKING
16
+
17
+ **22 signature sites** (14 Tier-1 public functions, 7 private helpers, 1 class field): `seed: int = X` / `random_state: int | None` → `rng: RNGLike | SeedLike | None = X`. Pre-v1.0 SemVer-minor BREAKING (v0.34.0 precedent). Defaults preserved (still deterministic-by-default).
18
+
19
+ Affected functions:
20
+
21
+ - `bootstrap.py` (7 public + 1 private): `bootstrap_ci`, `paired_bootstrap_diff`, `paired_bootstrap_ece_diff`, `paired_bootstrap_op_point_diff`, `paired_mde`, `block_bootstrap_on_folds`, `cross_validate_metric`, `_bootstrap_t_ci`.
22
+ - `metrics.py:1063`: `expected_calibration_error_debiased`.
23
+ - `thresholds.py`: `selected_operating_point` + `_bootstrap_threshold_metric_cis`.
24
+ - `analysis.py`: `bootstrap_metric_from_predictions`, `paired_diff_from_prediction_refs`.
25
+ - `harness.py` (6 sites): `evaluate`, `evaluate_scorer_on_slice`, `_bootstrap_auc_ci`, `_evaluate_scores`, `_compute_paired_diffs`, `_score_all_slices`.
26
+ - `scorecards.py`: `scorecard`, `_evaluate_spec`.
27
+ - `stacking.py`: `LogisticStacker.random_state` → `LogisticStacker.rng` class-field rename (sklearn pass-through derives int at the boundary).
28
+
29
+ **Body refactors**:
30
+
31
+ - 4 SeedSequence.spawn() sites converted from `np.random.SeedSequence(seed).spawn(n)` to `rng.bit_generator.seed_seq.spawn(n)` (Option A — preserves existing worker SeedSequence signatures).
32
+ - 2 sklearn-bridge sites in `cross_validate_metric` derive int from rng before passing to `StratifiedKFold`/`KFold(random_state=...)` (defensive across sklearn versions <1.4).
33
+ - `LogisticStacker.fit` derives sklearn int from `self.rng` at the boundary.
34
+
35
+ **Config schema** (Tier-2 additive): `evaluate()` config dict key `"seed"` → `"rng"`. Generator-typed input serializes as `repr(rng)`; int/None serialize as-is (backward-compatible for prior int-seed usage).
36
+
37
+ ### Added
38
+
39
+ - **Docstrings**: NumPy-style parameter doc for every renamed function now references `rng : RNGLike | SeedLike | None` with an explicit link to SPEC 7.
40
+ - **STYLE.md §3a** + **ADR 0004 D4**: `rng` row flipped from "target convention; adopted in v0.50.0" → "**canonical** convention (adopted v0.50.0)".
41
+
42
+ ### Changed
43
+
44
+ - **Test sweep** (230+ test sites): `seed=X` → `rng=X` in test kwarg calls, EXCEPT in test files that test legitimate `seed`-as-int contexts (`test_adversarial.py` for Python `random.Random`, `test_seeds.py` for `set_global_seeds`, `test_splits*.py` for Splitter dataclass fields, `test_text_dedup*.py` for MinHashLSHStrategy class field).
45
+ - **CHANGELOG header**: this release.
46
+
47
+ ### Exceptions to SPEC 7 (KEPT `seed:` — documented in STYLE.md §3a + ADR 0004 D4)
48
+
49
+ - `seeds.set_global_seeds(seed: int)` — global-state setter, not per-function RNG.
50
+ - `adversarial.py` dataclass fields + functional wrappers — use Python stdlib `random.Random(seed)`, not NumPy.
51
+ - `splits.py` Splitter dataclass class-fields (`HoldoutSplitter.seed`, `StratifiedKFoldSplitter.seed`, etc.) — configuration storage, not user-facing RNG parameter.
52
+ - `loaders.py:903` YAML config schema key — declarative; renaming would break consumer YAMLs.
53
+
54
+ ### Migration
55
+
56
+ - Consumer (`prompt-injection-detection-submission`) lockstep: bump dep pin `>=0.49.0` → `>=0.50.0`; rename `seed=` → `rng=` on eval-toolkit-bound call sites (estimated 5-8 sites).
57
+ - Bit-for-bit reproducibility preserved when migrating `seed=42` → `rng=42` (int seed is SeedLike; `np.random.default_rng(42)` is the canonical normalization).
58
+
59
+ ### Notes
60
+
61
+ - Ships in parallel with Round 8 audit STOP-GATE (Decision Y.2); R8 briefing at commit `6f6839a`, awaiting Codex+Gemini reports.
62
+ - Memory pattern captured at v0.49.0: pre-flight grep MUST cover `README.md`, `.doctest-modules`, and any config files (per `feedback_sybil_runs_readme.md`). Applied to v0.50.0 pre-flight.
63
+
64
+ ## [0.49.0] — 2026-05-23 — Global naming-standards sweep + final cleanup before v1.0
65
+
66
+ Final pre-v1.0 minor consolidating the naming-convention standardization
67
+ that locks the v1.0 Tier-1 contract. Audit + industry-research pass
68
+ (PEP 8, scikit-learn, NumPy, Google Python Style Guide, Scientific
69
+ Python SPEC 7) found the repo already 95-99% consistent; this release
70
+ closes the small remaining gaps + documents the conventions as
71
+ [ADR 0004](docs/source/adr/0004-naming-conventions.md). The SPEC 7
72
+ ``rng`` parameter convention is documented here and adopted in v0.50.0.
73
+
74
+ ### BREAKING
75
+
76
+ Five Tier-1 renames for naming consistency (pre-v1.0; SemVer-minor per
77
+ the v0.34.0 BREAKING-minor precedent). Single-consumer lockstep bump in
78
+ ``prompt-injection-detection-submission``; no deprecation aliases.
79
+
80
+ - **``build_manifest`` → ``make_manifest``** (manifest.py). Aligns
81
+ with ``make_minilm_embedder`` / ``make_palette`` / ``make_run_dir``
82
+ factory pattern. ``build_*`` was the only outlier.
83
+ - **``CaseRandomization`` → ``CaseInjection``** (adversarial.py).
84
+ Aligns with ``*Injection`` / ``*Substitution`` adversarial suffix
85
+ convention.
86
+ - **``TokenSplitting`` → ``TokenSplittingInjection``** (adversarial.py).
87
+ Same rationale.
88
+ - **``UnicodeNormalization`` → ``UnicodeNormalizationInjection``**
89
+ (adversarial.py). Same rationale.
90
+ - **``eval_toolkit/_scorecard.py`` → ``eval_toolkit/scorecards.py``**
91
+ (private → public module promotion). The 4 top-level symbols
92
+ (``scorecard``, ``Scorecard``, ``MetricSpec``, ``MetricResult``)
93
+ remain top-level Tier-1; the new public submodule path
94
+ ``from eval_toolkit.scorecards import Scorecard`` is now stable.
95
+ ``_scorecard.py`` is gone — old import paths raise
96
+ ``ModuleNotFoundError``. Per the asymmetric-promotion principle in
97
+ [ADR 0001](docs/source/adr/0001-flat-module-layout.md): promote
98
+ collection-of-types modules, keep single-function modules underscore
99
+ (``_sweep.py`` stays private).
100
+
101
+ ### Added
102
+
103
+ - **[ADR 0004](docs/source/adr/0004-naming-conventions.md)** — Naming
104
+ conventions decision record with industry citations. Covers module
105
+ naming (singular vs plural), class suffixes by domain, function
106
+ verb-prefix conventions, canonical parameter list, fitted-attribute
107
+ trailing underscore (sklearn convention), TypeVar leading underscore
108
+ (Google convention), and the SPEC 7 ``rng`` parameter convention
109
+ (adopted in v0.50.0).
110
+ - **STYLE.md** extended with §3a-d (parameter naming, class suffixes
111
+ by domain, module naming, asymmetric promotion), §4a-b
112
+ (fitted-attribute trailing underscore + TypeVar), §12 (75-col
113
+ docstring prose rule), §14 (test naming convention).
114
+ - **CONTRIBUTING.md** cross-link to ADR 0004 + STYLE.md.
115
+ - **[docs/source/api/strict_tier2_protocols.md](docs/source/api/strict_tier2_protocols.md)** —
116
+ new docs page enumerating the 9 strict Tier-2 Protocols + 1 opt-in
117
+ per [ADR 0003 §1](docs/source/adr/0003-stability-contract-and-gate3-methodology.md),
118
+ with canonical top-level import paths. Resolves #69's discoverability
119
+ concern without breaking the lightweight design intent of
120
+ ``eval_toolkit.protocols`` (per ``protocols.py:1-5``).
121
+ - **``src/eval_toolkit/_rng.py``** — private module with SPEC 7 type
122
+ aliases (``SeedLike``, ``RNGLike``). Not yet referenced; scaffold for
123
+ the v0.50.0 SPEC 7 adoption.
124
+ - **[ADR 0001](docs/source/adr/0001-flat-module-layout.md)** amendment
125
+ — added the asymmetric-promotion sub-rule (collection-of-types MAY
126
+ promote, single-function SHOULD stay underscore).
127
+
128
+ ### Changed
129
+
130
+ - **Duplicate-type consolidation** (single source of truth):
131
+ - ``Versioned`` Protocol — canonical at ``protocols.py:64``; the
132
+ duplicate at ``leakage.py:82`` removed. Removed
133
+ ``"Versioned"`` from ``leakage.__all__``; previously-unused
134
+ ``from eval_toolkit.leakage import Versioned`` now raises
135
+ ``ImportError``. Use ``from eval_toolkit.protocols import Versioned``
136
+ or top-level ``from eval_toolkit import Versioned``.
137
+ - ``MetricStatus`` ``Literal`` — canonical at ``artifacts.py:30``; the
138
+ duplicate at ``scorecards.py:78`` removed; ``scorecards`` now
139
+ imports from ``artifacts``.
140
+ - **[validation] optional extra** reclassified from "active deprecation
141
+ with removal target v0.33.0" → "permanent no-op kept for backward
142
+ compatibility." Hard removal would break consumer pip pins of the
143
+ form ``eval-toolkit[validation]`` for zero functional benefit
144
+ (R3 in DEPRECATION.md).
145
+ - **Sphinx cross-references** updated from
146
+ ``eval_toolkit.leakage.Versioned`` → ``eval_toolkit.protocols.Versioned``
147
+ in ``manifest.py`` docstrings.
148
+
149
+ ### Deferred to v0.50.0
150
+
151
+ - **SPEC 7 ``rng`` parameter adoption** across ~30 NumPy-RNG functions.
152
+ Scope deferred from v0.49.0 after the planning audit revealed the
153
+ full blast radius (~30 signature sites + 247 test kwarg sites +
154
+ 7 internal helpers + SeedSequence/Generator/sklearn-bridge
155
+ conversions). Splitting matches the "one cleanup per minor" pattern
156
+ per ``feedback_staggered_breaking_releases``. ``_rng.py`` ships in
157
+ v0.49.0 as the scaffold; v0.50.0 wires it into every applicable
158
+ function.
159
+
160
+ ### Notes
161
+
162
+ - Round 8 audit STOP-GATE per Decision Y.2 — briefing committed at
163
+ v0.48.0 (commit ``6f6839a``); v0.49.0 ships in parallel since the
164
+ audit-trail synthesis confirmed R8 audits the existing contract
165
+ (does not prescribe new changes). Any R8 finding folds into a v0.49.1
166
+ hotfix if needed.
167
+ - Issue #69 closed by the new strict-Tier-2-Protocols docs page; see
168
+ ``docs/source/api/strict_tier2_protocols.md`` and the close
169
+ rationale on the issue itself.
170
+
8
171
  ## [0.48.0] — 2026-05-22 — Polish + audit-driven tightening before v1.0 (Round 7 follow-on + cross-API consistency + doc-execution gates)
9
172
 
10
173
  Third + final BREAKING minor of the staggered v0.45 → v0.46 → v0.46.1 → v0.47
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.48.0
3
+ Version: 0.50.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -233,12 +233,12 @@ print(f"ECE (10 bins): {expected_calibration_error(y, s, n_bins=10):.3f}")
233
233
  from eval_toolkit import bootstrap_ci, paired_bootstrap_diff
234
234
  from eval_toolkit.metrics import pr_auc
235
235
 
236
- ci = bootstrap_ci(y, s, pr_auc, n_resamples=1000, seed=42)
236
+ ci = bootstrap_ci(y, s, pr_auc, n_resamples=1000, rng=42)
237
237
  print(f"PR-AUC: {ci.point_estimate:.3f} 95% CI: [{ci.ci_low:.3f}, {ci.ci_high:.3f}]")
238
238
 
239
239
  # Paired bootstrap on the lift between two scorers (s_baseline must be in [0, 1] too).
240
240
  s_baseline = np.clip(rng.normal(0.5, 0.3, size=200), 0, 1)
241
- diff = paired_bootstrap_diff(y, s_baseline, s, pr_auc, n_resamples=1000, seed=42)
241
+ diff = paired_bootstrap_diff(y, s_baseline, s, pr_auc, n_resamples=1000, rng=42)
242
242
  print(f"Δ PR-AUC: {diff.delta:.3f} overlaps zero: {diff.overlaps_zero}")
243
243
  ```
244
244
 
@@ -261,13 +261,13 @@ print(f"NLL: {result['nll_pre']:.3f} -> {result['nll_post']:.3f}")
261
261
  ```python
262
262
  import tempfile
263
263
  from pathlib import Path
264
- from eval_toolkit import build_manifest, write_manifest
264
+ from eval_toolkit import make_manifest, write_manifest
265
265
 
266
266
  with tempfile.TemporaryDirectory() as run_dir:
267
267
  # data_files: {name: path} → eval_toolkit hashes the files for you;
268
268
  # versioned: any object with a `version` attribute (e.g. a scorer or
269
269
  # leakage check) is captured by name → version in the manifest.
270
- manifest = build_manifest(
270
+ manifest = make_manifest(
271
271
  run_id="quickstart-demo",
272
272
  config={"threshold_criterion": "max_f1", "seed": 42},
273
273
  seeds={"global": 42, "bootstrap": 42},
@@ -290,7 +290,7 @@ with tempfile.TemporaryDirectory() as run_dir:
290
290
  | `eval_toolkit.leakage` | `LeakageCheck` Protocol + 7 reference impls (exact / near / encoding-obfuscated / cross-split / label-conflict / group / temporal); `Versioned` opt-in Protocol |
291
291
  | `eval_toolkit.splits` | `Splitter` Protocol + 5 reference impls (holdout / stratified / group / source-disjoint / time-series) |
292
292
  | `eval_toolkit.loaders` | `DatasetLoader` Protocol + 4 reference impls (DataFrame / SingleSlice / ParquetGlob / HF datasets) with Croissant-compatible `describe()` |
293
- | `eval_toolkit.manifest` | `RunManifest` (NeurIPS-aligned) + source-role / guardrail metadata + `build_manifest` / `write_manifest` |
293
+ | `eval_toolkit.manifest` | `RunManifest` (NeurIPS-aligned) + source-role / guardrail metadata + `make_manifest` / `write_manifest` |
294
294
  | `eval_toolkit.claims` | `EvidenceGate` class (frozen dataclass: name + callable check + severity), reference gate factories (`required_metric_gate`, `minimum_slice_size_gate`, `metric_threshold_gate`, etc.), `evaluate_claims()`, and `ClaimReport` for claim-mode vs exploratory-mode checks. See [`docs/extending.md`](docs/extending.md) for writing custom gates and [`docs/examples/claims_and_gates.md`](docs/examples/claims_and_gates.md) for a worked end-to-end example. |
295
295
  | `eval_toolkit.text_dedup` | `SimilarityStrategy` Protocol + 5 strategies (TF-IDF / hash / embedding / Jaccard / MinHash-LSH); `near_dedup` / `cross_dedup` orchestrators |
296
296
  | `eval_toolkit.plotting` | PR curves, reliability diagrams, confusion matrices, score histograms, lift CIs |
@@ -150,12 +150,12 @@ print(f"ECE (10 bins): {expected_calibration_error(y, s, n_bins=10):.3f}")
150
150
  from eval_toolkit import bootstrap_ci, paired_bootstrap_diff
151
151
  from eval_toolkit.metrics import pr_auc
152
152
 
153
- ci = bootstrap_ci(y, s, pr_auc, n_resamples=1000, seed=42)
153
+ ci = bootstrap_ci(y, s, pr_auc, n_resamples=1000, rng=42)
154
154
  print(f"PR-AUC: {ci.point_estimate:.3f} 95% CI: [{ci.ci_low:.3f}, {ci.ci_high:.3f}]")
155
155
 
156
156
  # Paired bootstrap on the lift between two scorers (s_baseline must be in [0, 1] too).
157
157
  s_baseline = np.clip(rng.normal(0.5, 0.3, size=200), 0, 1)
158
- diff = paired_bootstrap_diff(y, s_baseline, s, pr_auc, n_resamples=1000, seed=42)
158
+ diff = paired_bootstrap_diff(y, s_baseline, s, pr_auc, n_resamples=1000, rng=42)
159
159
  print(f"Δ PR-AUC: {diff.delta:.3f} overlaps zero: {diff.overlaps_zero}")
160
160
  ```
161
161
 
@@ -178,13 +178,13 @@ print(f"NLL: {result['nll_pre']:.3f} -> {result['nll_post']:.3f}")
178
178
  ```python
179
179
  import tempfile
180
180
  from pathlib import Path
181
- from eval_toolkit import build_manifest, write_manifest
181
+ from eval_toolkit import make_manifest, write_manifest
182
182
 
183
183
  with tempfile.TemporaryDirectory() as run_dir:
184
184
  # data_files: {name: path} → eval_toolkit hashes the files for you;
185
185
  # versioned: any object with a `version` attribute (e.g. a scorer or
186
186
  # leakage check) is captured by name → version in the manifest.
187
- manifest = build_manifest(
187
+ manifest = make_manifest(
188
188
  run_id="quickstart-demo",
189
189
  config={"threshold_criterion": "max_f1", "seed": 42},
190
190
  seeds={"global": 42, "bootstrap": 42},
@@ -207,7 +207,7 @@ with tempfile.TemporaryDirectory() as run_dir:
207
207
  | `eval_toolkit.leakage` | `LeakageCheck` Protocol + 7 reference impls (exact / near / encoding-obfuscated / cross-split / label-conflict / group / temporal); `Versioned` opt-in Protocol |
208
208
  | `eval_toolkit.splits` | `Splitter` Protocol + 5 reference impls (holdout / stratified / group / source-disjoint / time-series) |
209
209
  | `eval_toolkit.loaders` | `DatasetLoader` Protocol + 4 reference impls (DataFrame / SingleSlice / ParquetGlob / HF datasets) with Croissant-compatible `describe()` |
210
- | `eval_toolkit.manifest` | `RunManifest` (NeurIPS-aligned) + source-role / guardrail metadata + `build_manifest` / `write_manifest` |
210
+ | `eval_toolkit.manifest` | `RunManifest` (NeurIPS-aligned) + source-role / guardrail metadata + `make_manifest` / `write_manifest` |
211
211
  | `eval_toolkit.claims` | `EvidenceGate` class (frozen dataclass: name + callable check + severity), reference gate factories (`required_metric_gate`, `minimum_slice_size_gate`, `metric_threshold_gate`, etc.), `evaluate_claims()`, and `ClaimReport` for claim-mode vs exploratory-mode checks. See [`docs/extending.md`](docs/extending.md) for writing custom gates and [`docs/examples/claims_and_gates.md`](docs/examples/claims_and_gates.md) for a worked end-to-end example. |
212
212
  | `eval_toolkit.text_dedup` | `SimilarityStrategy` Protocol + 5 strategies (TF-IDF / hash / embedding / Jaccard / MinHash-LSH); `near_dedup` / `cross_dedup` orchestrators |
213
213
  | `eval_toolkit.plotting` | PR curves, reliability diagrams, confusion matrices, score histograms, lift CIs |
@@ -36,6 +36,11 @@ Run via `make lint` (= `ruff check + black --check + mypy`) and `make test`.
36
36
 
37
37
  ## 3. Naming
38
38
 
39
+ For the full decision record and industry citations, see
40
+ [ADR 0004 — Naming conventions](docs/source/adr/0004-naming-conventions.md).
41
+ This section is the day-to-day quick reference; the ADR is the
42
+ authoritative source.
43
+
39
44
  - Module names: `snake_case`, lowercase package (`eval_toolkit`).
40
45
  - Class names: `PascalCase`. Suffixes used in this repo:
41
46
  - `*Config` — frozen dataclass for settings
@@ -55,6 +60,68 @@ Run via `make lint` (= `ruff check + black --check + mypy`) and `make test`.
55
60
  - Mutation marking: not used. Mutating functions return `None` (Pythonic over
56
61
  Julia's `_inplace` suffix).
57
62
 
63
+ ### 3a. Parameter naming (canonical list, locked at v1.0)
64
+
65
+ These names mean these things, everywhere. Future functions MUST use
66
+ them; deviations need justification in the PR description.
67
+
68
+ | Parameter | Meaning |
69
+ |---|---|
70
+ | `y_true` | Ground-truth labels (binary, shape `(n,)`) |
71
+ | `y_score` | Continuous score / probability (shape `(n,)`) |
72
+ | `y_pred` | Discrete prediction (threshold-dependent) |
73
+ | `n_resamples` | Bootstrap iteration count |
74
+ | `confidence` | Two-sided confidence level (0.95 default) |
75
+ | `n_bins` | Binning count for calibration / ECE |
76
+ | `n_jobs` | Parallelism (joblib + sklearn convention) |
77
+ | `ax` | Matplotlib axis (matplotlib convention) |
78
+ | `metric` | Callable `(y_true, y_score) -> float` |
79
+ | `rng` | RNG argument per [SPEC 7](https://scientific-python.org/specs/spec-0007/) — **canonical** convention (adopted v0.50.0). Accepts `int`, `np.random.Generator`, `BitGenerator`, `SeedSequence`, or `None`. |
80
+
81
+ The v0.50.0 SPEC 7 adoption preserves two `seed: int` exceptions:
82
+ `set_global_seeds(seed: int)` (global-state setter, not per-function
83
+ RNG; SPEC 7 doesn't apply) and adversarial dataclass fields (use Python
84
+ `random.Random(seed)`; not NumPy-RNG, so SPEC 7's typing doesn't fit).
85
+
86
+ ### 3b. Class suffixes by domain
87
+
88
+ Each suffix maps to a Protocol contract. Stay within the pattern:
89
+
90
+ | Suffix | Domain | Protocol |
91
+ |---|---|---|
92
+ | `*Selector` | Threshold selection | `ThresholdSelector` |
93
+ | `*Splitter` | Cross-validation splits | `Splitter` |
94
+ | `*Check` | Leakage detection | `LeakageCheck` |
95
+ | `*Loader` | Dataset loading | `DatasetLoader` |
96
+ | `*Reader` | Prediction artifact reading | `PredictionReader` |
97
+ | `*Variant` | Preprocessing variant | (functional API) |
98
+ | `*Strategy` | Dedup similarity backend | `SimilarityStrategy` |
99
+ | `*Injection` / `*Substitution` | Adversarial char-injection / -substitution | `TextTransform` |
100
+
101
+ ### 3c. Module naming (singular vs plural)
102
+
103
+ - **Plural noun** for collection-of-types modules: `metrics`,
104
+ `loaders`, `protocols`, `losses`, `probes`, `splits`, `paths`,
105
+ `seeds`, `thresholds`, `artifacts`, `claims`, `embeddings`,
106
+ `scorecards`.
107
+ - **Singular noun** for domain-concept modules: `harness`,
108
+ `bootstrap`, `manifest`, `calibration`, `leakage`, `analysis`,
109
+ `provenance`, `evidence`, `stacking`, `text_dedup`.
110
+ - **Gerund** for process-domain modules: `preprocessing`.
111
+
112
+ ### 3d. Asymmetric module promotion (private → public)
113
+
114
+ Collection-of-types private modules MAY be promoted to plural-public
115
+ when they hold ≥2 user-relevant types. Single-function private
116
+ modules SHOULD stay underscore. See
117
+ [ADR 0001](docs/source/adr/0001-flat-module-layout.md) for the trigger
118
+ analysis.
119
+
120
+ Examples:
121
+
122
+ - `_scorecard.py` (4 public exports) → `scorecards.py` at v0.49.0. ✓ promote.
123
+ - `_sweep.py` (1 public function `sweep`) → stays `_sweep.py`. ✓ keep private.
124
+
58
125
  ## 4. Type hints
59
126
 
60
127
  - Every public function has fully typed parameters and return.
@@ -79,10 +146,13 @@ Run via `make lint` (= `ruff check + black --check + mypy`) and `make test`.
79
146
  for 4 reference impls.
80
147
  - `SimilarityStrategy` (`text_dedup.py`) — pluggable similarity backend for
81
148
  `near_dedup` / `cross_dedup` / `NearDuplicateCheck` / `CrossSplitLeakageCheck`.
82
- - `Versioned` (`leakage.py`) — opt-in single-attribute Protocol; any Tier-2
83
- implementation may expose `version: str`. `RunManifest.versioned_objects`
84
- auto-collects them. Mirrors the `lm-evaluation-harness` task `VERSION`
85
- pattern. See `docs/methodology/versioning.md`.
149
+ - `Versioned` (`protocols.py`) — opt-in single-attribute Protocol; any
150
+ Tier-2 implementation may expose `version: str`.
151
+ `RunManifest.versioned_objects` auto-collects them. Mirrors the
152
+ `lm-evaluation-harness` task `VERSION` pattern. See
153
+ `docs/methodology/versioning.md`. (Single source of truth at
154
+ `protocols.py:64` since v0.49.0; the duplicate previously in
155
+ `leakage.py:82` was removed.)
86
156
  - All seams are `@runtime_checkable` so callers can `isinstance(obj, Protocol)`.
87
157
  - Reference impls are `@dataclass(frozen=True, slots=True)` with config in the
88
158
  constructor (`TargetRecallSelector(recall=0.90)`) and the Protocol method as
@@ -90,6 +160,25 @@ Run via `make lint` (= `ruff check + black --check + mypy`) and `make test`.
90
160
  - `NamedTuple` for stable public records that benefit from positional access;
91
161
  frozen dataclasses with `slots=True` otherwise.
92
162
 
163
+ ### 4a. Fitted-attribute trailing underscore (sklearn convention)
164
+
165
+ Estimator-style classes (`fit`/`predict` pattern) that store
166
+ **learned-from-data attributes** use trailing underscore per scikit-learn
167
+ convention: `coef_`, `classes_`, `n_features_in_`, `feature_importances_`.
168
+ These attributes MUST NOT be set in `__init__` — set them only in `fit()`.
169
+
170
+ Frozen reference-impl dataclasses (`@dataclass(frozen=True, slots=True)`)
171
+ are **exempt** — they hold config, not fitted state.
172
+
173
+ Current canonical example: `stacking.LogisticStacker`.
174
+
175
+ ### 4b. TypeVar naming
176
+
177
+ Internal (private) `TypeVar`s use a leading underscore per Google Python
178
+ Style Guide §3.19.10: `_T = TypeVar("_T")`. Public, constrained `TypeVar`s
179
+ without the underscore are allowed only when explicitly part of an
180
+ exported generic API.
181
+
93
182
  ## 5. Dataclasses
94
183
 
95
184
  1. **`slots=True` always** on repo-owned dataclasses. Catches typos at
@@ -220,6 +309,10 @@ def fit_temperature(val_logits, val_labels, bounds=(0.05, 20.0)):
220
309
  - **References** cites arXiv IDs / DOIs / journal cites.
221
310
  - For modules where doctests would be contrived (`plotting`, `harness`,
222
311
  `provenance`), Examples are optional.
312
+ - **Docstring prose wraps at 75 cols** (numpydoc convention) so that
313
+ `help()` is readable in a terminal. Doctest code blocks inside the
314
+ docstring follow the 100-col Black rule (code stays comfortable in an
315
+ editor even though prose around it is narrower).
223
316
 
224
317
  ## 13. Comments
225
318
 
@@ -228,6 +321,12 @@ restate what the code says.
228
321
 
229
322
  ## 14. Tests
230
323
 
324
+ - **File naming**: `tests/test_<module>.py` mirrors
325
+ `src/eval_toolkit/<module>.py`. Auxiliary tests per module use
326
+ suffixes (`test_<module>_props.py`, `test_<module>_validation.py`,
327
+ `test_<module>_golden.py`).
328
+ - **Function naming**: `test_<thing_under_test>_<scenario>`. No
329
+ class-based test grouping unless fixtures truly demand it (rare).
231
330
  - **Markers**: `unit`, `property`, `smoke`, `golden`.
232
331
  - **Sklearn-reference + analytical** as the unit-test oracle where available.
233
332
  - **Hypothesis** required for math/stat invariants. Strategies use
@@ -74,15 +74,14 @@ probes = ["torch>=2.0", "transformers>=4.40"]
74
74
  # (granular extras — losses callers should not have to install the larger
75
75
  # transformers stack). Shares the torch version pin with [probes].
76
76
  losses = ["torch>=2.0"]
77
- # DEPRECATED (announced v0.30.1, removal v0.33.0).
77
+ # NO-OP extra kept for backward compatibility (R3 at v0.49.0).
78
78
  #
79
- # Retained as a transitive no-op so `pip install eval-toolkit[validation]`
80
- # / dev still resolve cleanly. jsonschema moved to the base deps in
81
- # v0.16.0; this extra has been a no-op ever since. The 2-minor-version
82
- # window (v0.30.1 announce v0.33.0 remove) matches the @deprecated
83
- # policy in docs/DEPRECATION.md. Extras can't trigger import-time
84
- # DeprecationWarnings, so the deprecation is documentation-only here +
85
- # in CHANGELOG ### Deprecated + docs/DEPRECATION.md.
79
+ # jsonschema>=4.21 moved to base deps at v0.16.0; this extra has been a
80
+ # no-op ever since. Originally announced as deprecated in v0.30.1 with
81
+ # target removal at v0.33.0, but reclassified at v0.49.0 (R3 in
82
+ # docs/DEPRECATION.md) as a permanent no-op — hard removal would break
83
+ # consumer pip pins of the form `eval-toolkit[validation]` for zero
84
+ # functional benefit. Retained indefinitely.
86
85
  validation = []
87
86
  # v0.31.0 docs site: Sphinx + pydata-sphinx-theme (replaces v0.28.0's
88
87
  # mkdocs-material). Migration drivers — pain points Q1 in the v0.31.0
@@ -38,15 +38,15 @@ _EXPORTS: dict[str, str] = {
38
38
  "ALL_TECHNIQUES": "eval_toolkit.adversarial",
39
39
  "BidiRTLInjection": "eval_toolkit.adversarial",
40
40
  "CORE_TECHNIQUES": "eval_toolkit.adversarial",
41
- "CaseRandomization": "eval_toolkit.adversarial",
41
+ "CaseInjection": "eval_toolkit.adversarial",
42
42
  "DiacriticInjection": "eval_toolkit.adversarial",
43
43
  "HomoglyphSubstitution": "eval_toolkit.adversarial",
44
44
  "InvisibleCharsInjection": "eval_toolkit.adversarial",
45
45
  "PunctuationInjection": "eval_toolkit.adversarial",
46
46
  "SynonymSubstitution": "eval_toolkit.adversarial",
47
47
  "TagStrippingInjection": "eval_toolkit.adversarial",
48
- "TokenSplitting": "eval_toolkit.adversarial",
49
- "UnicodeNormalization": "eval_toolkit.adversarial",
48
+ "TokenSplittingInjection": "eval_toolkit.adversarial",
49
+ "UnicodeNormalizationInjection": "eval_toolkit.adversarial",
50
50
  "WhitespaceInjection": "eval_toolkit.adversarial",
51
51
  "ZeroWidthSpaceInjection": "eval_toolkit.adversarial",
52
52
  # CharacterInjectionStrategy + character_injection SimpleNamespace
@@ -202,7 +202,7 @@ _EXPORTS: dict[str, str] = {
202
202
  "MANIFEST_SCHEMA_VERSION": "eval_toolkit.manifest",
203
203
  "RunManifest": "eval_toolkit.manifest",
204
204
  "SourceRoleRecord": "eval_toolkit.manifest",
205
- "build_manifest": "eval_toolkit.manifest",
205
+ "make_manifest": "eval_toolkit.manifest",
206
206
  "validate_source_roles": "eval_toolkit.manifest",
207
207
  "write_manifest": "eval_toolkit.manifest",
208
208
  # --- metrics ---
@@ -315,10 +315,10 @@ _EXPORTS: dict[str, str] = {
315
315
  "wilson_interval": "eval_toolkit.thresholds",
316
316
  "LogisticStacker": "eval_toolkit.stacking",
317
317
  "MetaLearner": "eval_toolkit.stacking",
318
- "MetricResult": "eval_toolkit._scorecard",
319
- "MetricSpec": "eval_toolkit._scorecard",
320
- "Scorecard": "eval_toolkit._scorecard",
321
- "scorecard": "eval_toolkit._scorecard",
318
+ "MetricResult": "eval_toolkit.scorecards",
319
+ "MetricSpec": "eval_toolkit.scorecards",
320
+ "Scorecard": "eval_toolkit.scorecards",
321
+ "scorecard": "eval_toolkit.scorecards",
322
322
  # --- sweep (top-level v0.47 unification — Decision K + Decision D) ---
323
323
  "sweep": "eval_toolkit._sweep",
324
324
  }
@@ -0,0 +1,65 @@
1
+ """Private RNG-parameter type aliases per Scientific-Python SPEC 7.
2
+
3
+ This module centralizes the type aliases used to annotate user-facing RNG
4
+ parameters across the toolkit. Per `SPEC 7 — Seeding PRNG
5
+ <https://scientific-python.org/specs/spec-0007/>`_ (Endorsed) eval-toolkit
6
+ exposes a single canonical parameter name ``rng`` typed as
7
+ ``RNGLike | SeedLike | None`` on every function that consumes a NumPy
8
+ ``Generator``. Bodies normalize via ``np.random.default_rng(rng)``.
9
+
10
+ This module is private (underscore prefix) so the aliases stay an
11
+ implementation detail — public symbols use them only in their annotations.
12
+ If a Tier-2 consumer ever needs them exposed for their own callsite type
13
+ annotations, promote them via ``eval_toolkit.protocols`` per the
14
+ asymmetric-promotion principle in ADR 0001 + STYLE.md §3d.
15
+
16
+ Exceptions to the SPEC 7 convention — documented in STYLE.md §3a:
17
+
18
+ - ``seeds.set_global_seeds(seed: int)`` — global-state setter, not a
19
+ per-function RNG parameter; SPEC 7 is scoped to per-function RNG inputs.
20
+ - ``adversarial.*Injection`` / ``*Substitution`` / ``CaseInjection``
21
+ dataclass fields — they use Python's stdlib ``random.Random(seed)``,
22
+ not NumPy. SPEC 7's typing (``RNGLike = np.random.Generator | ...``) is
23
+ strictly NumPy-scoped.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ from collections.abc import Sequence
29
+ from typing import cast
30
+
31
+ import numpy as np
32
+
33
+ type SeedLike = int | np.integer | Sequence[int] | np.random.SeedSequence
34
+ """Anything that can seed a NumPy bit generator.
35
+
36
+ Per SPEC 7, ``np.random.default_rng`` accepts any of these as a seed
37
+ without further conversion. ``Sequence[int]`` is the entropy-vector form
38
+ used by ``np.random.SeedSequence``.
39
+ """
40
+
41
+ type RNGLike = np.random.Generator | np.random.BitGenerator
42
+ """An already-instantiated NumPy bit generator or generator wrapper.
43
+
44
+ ``np.random.default_rng(rng)`` is the identity function on
45
+ ``Generator`` inputs and lifts ``BitGenerator`` inputs into a
46
+ ``Generator`` — both forms compose cleanly.
47
+ """
48
+
49
+
50
+ def spawn_seed_sequences(rng: RNGLike | SeedLike | None, n: int) -> list[np.random.SeedSequence]:
51
+ """Spawn ``n`` independent SeedSequences from any SPEC 7 ``rng`` input.
52
+
53
+ Normalizes the input to a ``Generator``, then extracts the underlying
54
+ ``SeedSequence`` via the bit-generator and spawns ``n`` children.
55
+ The cast satisfies mypy strict: the ``seed_seq`` attribute on a
56
+ concrete BitGenerator is a ``SeedSequence`` instance, but the type
57
+ stub on ``BitGenerator.seed_seq`` returns the abstract
58
+ ``ISeedSequence`` interface (which lacks ``spawn``).
59
+
60
+ Used by the bootstrap parallel workers (which take spawned
61
+ ``SeedSequence`` objects to seed their internal ``default_rng()`` calls).
62
+ """
63
+ gen = np.random.default_rng(rng)
64
+ seed_seq = cast(np.random.SeedSequence, gen.bit_generator.seed_seq)
65
+ return seed_seq.spawn(n)
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "0.48.0"
5
+ __version__ = "0.50.0"