eval-toolkit 0.48.0__tar.gz → 0.49.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/CHANGELOG.md +107 -0
  2. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/PKG-INFO +4 -4
  3. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/README.md +3 -3
  4. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/STYLE.md +103 -4
  5. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/pyproject.toml +7 -8
  6. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/__init__.py +8 -8
  7. eval_toolkit-0.49.0/src/eval_toolkit/_rng.py +46 -0
  8. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/_version.py +1 -1
  9. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/adversarial.py +18 -18
  10. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/leakage.py +5 -17
  11. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/manifest.py +10 -10
  12. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/metric_specs.py +1 -1
  13. eval_toolkit-0.48.0/src/eval_toolkit/_scorecard.py → eval_toolkit-0.49.0/src/eval_toolkit/scorecards.py +5 -4
  14. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/golden/public_api/snapshot.json +13 -13
  15. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_adversarial.py +17 -17
  16. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_lazy_extras_messages.py +2 -2
  17. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_manifest.py +43 -43
  18. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_manifest_contamination_round_trip.py +6 -6
  19. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_manifest_props.py +11 -11
  20. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_manifest_validation.py +4 -4
  21. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_preprocessing.py +2 -2
  22. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_schemas.py +6 -6
  23. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_v09_contracts.py +2 -2
  24. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/.gitignore +0 -0
  25. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/LICENSE +0 -0
  26. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/docs/archive/README.md +0 -0
  27. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/docs/research/README.md +0 -0
  28. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/docs/research/datasets/README.md +0 -0
  29. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/docs/research/papers/data-integrity/README.md +0 -0
  30. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  31. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/docs/research/papers/inference/README.md +0 -0
  32. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/docs/research/papers/prompt-injection/README.md +0 -0
  33. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/docs/source/adr/README.md +0 -0
  34. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/docs/source/methodology/README.md +0 -0
  35. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/__main__.py +0 -0
  36. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/_deprecated.py +0 -0
  37. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/_parallel.py +0 -0
  38. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/_sweep.py +0 -0
  39. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/analysis.py +0 -0
  40. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/artifacts.py +0 -0
  41. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/bootstrap.py +0 -0
  42. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/calibration.py +0 -0
  43. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/claims.py +0 -0
  44. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/config.py +0 -0
  45. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/docs.py +0 -0
  46. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/embeddings.py +0 -0
  47. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/evidence.py +0 -0
  48. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/harness.py +0 -0
  49. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/loaders.py +0 -0
  50. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/losses.py +0 -0
  51. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/metrics.py +0 -0
  52. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/operating_points.py +0 -0
  53. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/paths.py +0 -0
  54. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/plotting.py +0 -0
  55. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/preprocessing.py +0 -0
  56. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/probes.py +0 -0
  57. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/protocols.py +0 -0
  58. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/provenance.py +0 -0
  59. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/py.typed +0 -0
  60. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  61. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  62. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  63. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  64. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  65. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  66. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/seeds.py +0 -0
  67. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/splits.py +0 -0
  68. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/stacking.py +0 -0
  69. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/text_dedup.py +0 -0
  70. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/src/eval_toolkit/thresholds.py +0 -0
  71. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  72. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  73. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  74. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  75. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  76. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  77. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  78. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  79. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  80. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  81. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/benchmarks/__init__.py +0 -0
  82. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  83. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/conftest.py +0 -0
  84. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  85. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  86. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  87. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  88. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/golden/docs/expected.md +0 -0
  89. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/golden/docs/input.md +0 -0
  90. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/golden/docs/metrics.json +0 -0
  91. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  92. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/strategies.py +0 -0
  93. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_analysis.py +0 -0
  94. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_artifacts.py +0 -0
  95. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  96. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  97. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_bootstrap_edge_cases.py +0 -0
  98. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_bootstrap_golden.py +0 -0
  99. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_bootstrap_njobs.py +0 -0
  100. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_bootstrap_props.py +0 -0
  101. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_bootstrap_research_grounded.py +0 -0
  102. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_bootstrap_unit.py +0 -0
  103. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_calibration_binary_adapters.py +0 -0
  104. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  105. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_calibration_determinism.py +0 -0
  106. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_calibration_optimization_failures.py +0 -0
  107. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_calibration_props.py +0 -0
  108. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_calibration_research_grounded.py +0 -0
  109. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_calibration_unit.py +0 -0
  110. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_claims.py +0 -0
  111. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_claims_coverage.py +0 -0
  112. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_claims_props.py +0 -0
  113. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_cli.py +0 -0
  114. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_config.py +0 -0
  115. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_coverage_bootstrap.py +0 -0
  116. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_coverage_calibration.py +0 -0
  117. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_coverage_harness.py +0 -0
  118. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_coverage_metrics.py +0 -0
  119. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_coverage_plotting.py +0 -0
  120. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_croissant_e2e.py +0 -0
  121. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  122. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_deprecated_scalars_shim.py +0 -0
  123. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_deprecations.py +0 -0
  124. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_docs_golden.py +0 -0
  125. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_docs_props.py +0 -0
  126. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_embeddings.py +0 -0
  127. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_evidence_validators.py +0 -0
  128. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_harness_edge_cases.py +0 -0
  129. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_harness_fault_injection.py +0 -0
  130. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_harness_folded.py +0 -0
  131. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_harness_internals.py +0 -0
  132. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_harness_metric_options.py +0 -0
  133. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_harness_parallelism.py +0 -0
  134. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_harness_smoke.py +0 -0
  135. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_import_boundaries.py +0 -0
  136. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_is_metric_defined_for_slice.py +0 -0
  137. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_leakage.py +0 -0
  138. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_leakage_error_paths.py +0 -0
  139. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_leakage_props.py +0 -0
  140. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_loaders.py +0 -0
  141. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_loaders_coverage.py +0 -0
  142. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_loaders_props.py +0 -0
  143. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_logging.py +0 -0
  144. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_losses.py +0 -0
  145. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_metrics_props.py +0 -0
  146. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_metrics_stratified_subsets.py +0 -0
  147. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_metrics_unit.py +0 -0
  148. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_misc_coverage.py +0 -0
  149. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_numeric_edge_cases.py +0 -0
  150. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_ood_loader.py +0 -0
  151. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_operating_points.py +0 -0
  152. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_operating_points_props.py +0 -0
  153. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_parallel.py +0 -0
  154. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_paths.py +0 -0
  155. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_pipeline_e2e.py +0 -0
  156. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_plotting_edge.py +0 -0
  157. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_plotting_smoke.py +0 -0
  158. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_plotting_visual.py +0 -0
  159. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_probes.py +0 -0
  160. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_protocol_conformance.py +0 -0
  161. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_provenance.py +0 -0
  162. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_public_api.py +0 -0
  163. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_recall_at_fpr.py +0 -0
  164. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_reference_equivalence.py +0 -0
  165. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_reproducibility_integration.py +0 -0
  166. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_scorecard.py +0 -0
  167. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_seeds.py +0 -0
  168. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_splits.py +0 -0
  169. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_splits_leakage_integration.py +0 -0
  170. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_splits_props.py +0 -0
  171. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_stacking.py +0 -0
  172. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_sweep.py +0 -0
  173. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_text_dedup.py +0 -0
  174. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_text_dedup_coverage.py +0 -0
  175. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_text_dedup_props.py +0 -0
  176. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_text_dedup_strategies.py +0 -0
  177. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_thresholds.py +0 -0
  178. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_thresholds_constant_score.py +0 -0
  179. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_thresholds_coverage.py +0 -0
  180. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_thresholds_props.py +0 -0
  181. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_thresholds_research_grounded.py +0 -0
  182. {eval_toolkit-0.48.0 → eval_toolkit-0.49.0}/tests/test_tokenization_leakage_check.py +0 -0
@@ -5,6 +5,113 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.49.0] — 2026-05-23 — Global naming-standards sweep + final cleanup before v1.0
9
+
10
+ Final pre-v1.0 minor consolidating the naming-convention standardization
11
+ that locks the v1.0 Tier-1 contract. Audit + industry-research pass
12
+ (PEP 8, scikit-learn, NumPy, Google Python Style Guide, Scientific
13
+ Python SPEC 7) found the repo already 95-99% consistent; this release
14
+ closes the small remaining gaps + documents the conventions as
15
+ [ADR 0004](docs/source/adr/0004-naming-conventions.md). The SPEC 7
16
+ ``rng`` parameter convention is documented here; its adoption is deferred to v0.50.0.
17
+
18
+ ### BREAKING
19
+
20
+ Five Tier-1 renames for naming consistency (pre-v1.0; SemVer-minor per
21
+ the v0.34.0 BREAKING-minor precedent). Single-consumer lockstep bump in
22
+ ``prompt-injection-detection-submission``; no deprecation aliases.
23
+
24
+ - **``build_manifest`` → ``make_manifest``** (manifest.py). Aligns
25
+ with ``make_minilm_embedder`` / ``make_palette`` / ``make_run_dir``
26
+ factory pattern. ``build_*`` was the only outlier.
27
+ - **``CaseRandomization`` → ``CaseInjection``** (adversarial.py).
28
+ Aligns with ``*Injection`` / ``*Substitution`` adversarial suffix
29
+ convention.
30
+ - **``TokenSplitting`` → ``TokenSplittingInjection``** (adversarial.py).
31
+ Same rationale.
32
+ - **``UnicodeNormalization`` → ``UnicodeNormalizationInjection``**
33
+ (adversarial.py). Same rationale.
34
+ - **``eval_toolkit._scorecard`` → ``eval_toolkit.scorecards``**
35
+ (private → public module promotion). The 4 top-level symbols
36
+ (``scorecard``, ``Scorecard``, ``MetricSpec``, ``MetricResult``)
37
+ remain top-level Tier-1; the new public submodule path
38
+ ``from eval_toolkit.scorecards import Scorecard`` is now stable.
39
+ ``_scorecard.py`` is gone — old import paths raise
40
+ ``ModuleNotFoundError``. Per the asymmetric-promotion principle in
41
+ [ADR 0001](docs/source/adr/0001-flat-module-layout.md): promote
42
+ collection-of-types modules, keep single-function modules underscore
43
+ (``_sweep.py`` stays private).
44
+
45
+ ### Added
46
+
47
+ - **[ADR 0004](docs/source/adr/0004-naming-conventions.md)** — Naming
48
+ conventions decision record with industry citations. Covers module
49
+ naming (singular vs plural), class suffixes by domain, function
50
+ verb-prefix conventions, canonical parameter list, fitted-attribute
51
+ trailing underscore (sklearn convention), TypeVar leading underscore
52
+ (Google convention), and the SPEC 7 ``rng`` parameter convention
53
+ (to be adopted in v0.50.0).
54
+ - **STYLE.md** extended with §3a-d (parameter naming, class suffixes
55
+ by domain, module naming, asymmetric promotion), §4a-b
56
+ (fitted-attribute trailing underscore + TypeVar), §12 (75-col
57
+ docstring prose rule), §14 (test naming convention).
58
+ - **CONTRIBUTING.md** cross-link to ADR 0004 + STYLE.md.
59
+ - **[docs/source/api/strict_tier2_protocols.md](docs/source/api/strict_tier2_protocols.md)** —
60
+ new docs page enumerating the 9 strict Tier-2 Protocols + 1 opt-in
61
+ per [ADR 0003 §1](docs/source/adr/0003-stability-contract-and-gate3-methodology.md),
62
+ with canonical top-level import paths. Resolves #69's discoverability
63
+ concern without breaking the lightweight design intent of
64
+ ``eval_toolkit.protocols`` (per ``protocols.py:1-5``).
65
+ - **``src/eval_toolkit/_rng.py``** — private module with SPEC 7 type
66
+ aliases (``SeedLike``, ``RNGLike``). Not yet referenced; scaffold for
67
+ the v0.50.0 SPEC 7 adoption.
68
+ - **[ADR 0001](docs/source/adr/0001-flat-module-layout.md)** amendment
69
+ — added the asymmetric-promotion sub-rule (collection-of-types MAY
70
+ promote, single-function SHOULD stay underscore).
71
+
72
+ ### Changed
73
+
74
+ - **Duplicate-type consolidation** (single source of truth):
75
+ - ``Versioned`` Protocol — canonical at ``protocols.py:64``; the
76
+ duplicate at ``leakage.py:82`` removed. Removed
77
+ ``"Versioned"`` from ``leakage.__all__``; the previously unused import
78
+ ``from eval_toolkit.leakage import Versioned`` now raises
79
+ ``ImportError``. Use ``from eval_toolkit.protocols import Versioned``
80
+ or top-level ``from eval_toolkit import Versioned``.
81
+ - ``MetricStatus`` ``Literal`` — canonical at ``artifacts.py:30``; the
82
+ duplicate at ``scorecards.py:78`` removed; ``scorecards`` now
83
+ imports from ``artifacts``.
84
+ - **[validation] optional extra** reclassified from "active deprecation
85
+ with removal target v0.33.0" → "permanent no-op kept for backward
86
+ compatibility." Hard removal would break consumer pip pins of the
87
+ form ``eval-toolkit[validation]`` for zero functional benefit
88
+ (R3 in DEPRECATION.md).
89
+ - **Sphinx cross-references** updated from
90
+ ``eval_toolkit.leakage.Versioned`` → ``eval_toolkit.protocols.Versioned``
91
+ in ``manifest.py`` docstrings.
92
+
93
+ ### Deferred to v0.50.0
94
+
95
+ - **SPEC 7 ``rng`` parameter adoption** across ~30 NumPy-RNG functions.
96
+ Scope deferred from v0.49.0 after the planning audit revealed the
97
+ full blast radius (~30 signature sites + 247 test kwarg sites +
98
+ 7 internal helpers + SeedSequence/Generator/sklearn-bridge
99
+ conversions). Splitting matches the "one cleanup per minor" pattern
100
+ per [feedback_staggered_breaking_releases]. ``_rng.py`` ships in
101
+ v0.49.0 as the scaffold; v0.50.0 wires it into every applicable
102
+ function.
103
+
104
+ ### Notes
105
+
106
+ - Round 8 audit STOP-GATE per Decision Y.2 — briefing committed at
107
+ v0.48.0 (commit ``6f6839a``); v0.49.0 ships in parallel since the
108
+ audit-trail synthesis confirmed R8 audits the existing contract
109
+ (does not prescribe new changes). Any R8 finding folds into v0.49.1
110
+ hotfix if needed.
111
+ - Issue #69 closed by the new strict-Tier-2-Protocols docs page; see
112
+ ``docs/source/api/strict_tier2_protocols.md`` and the close
113
+ rationale on the issue itself.
114
+
8
115
  ## [0.48.0] — 2026-05-22 — Polish + audit-driven tightening before v1.0 (Round 7 follow-on + cross-API consistency + doc-execution gates)
9
116
 
10
117
  Third + final BREAKING minor of the staggered v0.45 → v0.46 → v0.46.1 → v0.47
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.48.0
3
+ Version: 0.49.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -261,13 +261,13 @@ print(f"NLL: {result['nll_pre']:.3f} -> {result['nll_post']:.3f}")
261
261
  ```python
262
262
  import tempfile
263
263
  from pathlib import Path
264
- from eval_toolkit import build_manifest, write_manifest
264
+ from eval_toolkit import make_manifest, write_manifest
265
265
 
266
266
  with tempfile.TemporaryDirectory() as run_dir:
267
267
  # data_files: {name: path} → eval_toolkit hashes the files for you;
268
268
  # versioned: any object with a `version` attribute (e.g. a scorer or
269
269
  # leakage check) is captured by name → version in the manifest.
270
- manifest = build_manifest(
270
+ manifest = make_manifest(
271
271
  run_id="quickstart-demo",
272
272
  config={"threshold_criterion": "max_f1", "seed": 42},
273
273
  seeds={"global": 42, "bootstrap": 42},
@@ -290,7 +290,7 @@ with tempfile.TemporaryDirectory() as run_dir:
290
290
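  | `eval_toolkit.leakage` | `LeakageCheck` Protocol + 7 reference impls (exact / near / encoding-obfuscated / cross-split / label-conflict / group / temporal); `Versioned` opt-in Protocol (canonical in `eval_toolkit.protocols` since v0.49.0; no longer importable from `eval_toolkit.leakage`) |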
291
291
  | `eval_toolkit.splits` | `Splitter` Protocol + 5 reference impls (holdout / stratified / group / source-disjoint / time-series) |
292
292
  | `eval_toolkit.loaders` | `DatasetLoader` Protocol + 4 reference impls (DataFrame / SingleSlice / ParquetGlob / HF datasets) with Croissant-compatible `describe()` |
293
- | `eval_toolkit.manifest` | `RunManifest` (NeurIPS-aligned) + source-role / guardrail metadata + `build_manifest` / `write_manifest` |
293
+ | `eval_toolkit.manifest` | `RunManifest` (NeurIPS-aligned) + source-role / guardrail metadata + `make_manifest` / `write_manifest` |
294
294
  | `eval_toolkit.claims` | `EvidenceGate` class (frozen dataclass: name + callable check + severity), reference gate factories (`required_metric_gate`, `minimum_slice_size_gate`, `metric_threshold_gate`, etc.), `evaluate_claims()`, and `ClaimReport` for claim-mode vs exploratory-mode checks. See [`docs/extending.md`](docs/extending.md) for writing custom gates and [`docs/examples/claims_and_gates.md`](docs/examples/claims_and_gates.md) for a worked end-to-end example. |
295
295
  | `eval_toolkit.text_dedup` | `SimilarityStrategy` Protocol + 5 strategies (TF-IDF / hash / embedding / Jaccard / MinHash-LSH); `near_dedup` / `cross_dedup` orchestrators |
296
296
  | `eval_toolkit.plotting` | PR curves, reliability diagrams, confusion matrices, score histograms, lift CIs |
@@ -178,13 +178,13 @@ print(f"NLL: {result['nll_pre']:.3f} -> {result['nll_post']:.3f}")
178
178
  ```python
179
179
  import tempfile
180
180
  from pathlib import Path
181
- from eval_toolkit import build_manifest, write_manifest
181
+ from eval_toolkit import make_manifest, write_manifest
182
182
 
183
183
  with tempfile.TemporaryDirectory() as run_dir:
184
184
  # data_files: {name: path} → eval_toolkit hashes the files for you;
185
185
  # versioned: any object with a `version` attribute (e.g. a scorer or
186
186
  # leakage check) is captured by name → version in the manifest.
187
- manifest = build_manifest(
187
+ manifest = make_manifest(
188
188
  run_id="quickstart-demo",
189
189
  config={"threshold_criterion": "max_f1", "seed": 42},
190
190
  seeds={"global": 42, "bootstrap": 42},
@@ -207,7 +207,7 @@ with tempfile.TemporaryDirectory() as run_dir:
207
207
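  | `eval_toolkit.leakage` | `LeakageCheck` Protocol + 7 reference impls (exact / near / encoding-obfuscated / cross-split / label-conflict / group / temporal); `Versioned` opt-in Protocol (canonical in `eval_toolkit.protocols` since v0.49.0; no longer importable from `eval_toolkit.leakage`) |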
208
208
  | `eval_toolkit.splits` | `Splitter` Protocol + 5 reference impls (holdout / stratified / group / source-disjoint / time-series) |
209
209
  | `eval_toolkit.loaders` | `DatasetLoader` Protocol + 4 reference impls (DataFrame / SingleSlice / ParquetGlob / HF datasets) with Croissant-compatible `describe()` |
210
- | `eval_toolkit.manifest` | `RunManifest` (NeurIPS-aligned) + source-role / guardrail metadata + `build_manifest` / `write_manifest` |
210
+ | `eval_toolkit.manifest` | `RunManifest` (NeurIPS-aligned) + source-role / guardrail metadata + `make_manifest` / `write_manifest` |
211
211
  | `eval_toolkit.claims` | `EvidenceGate` class (frozen dataclass: name + callable check + severity), reference gate factories (`required_metric_gate`, `minimum_slice_size_gate`, `metric_threshold_gate`, etc.), `evaluate_claims()`, and `ClaimReport` for claim-mode vs exploratory-mode checks. See [`docs/extending.md`](docs/extending.md) for writing custom gates and [`docs/examples/claims_and_gates.md`](docs/examples/claims_and_gates.md) for a worked end-to-end example. |
212
212
  | `eval_toolkit.text_dedup` | `SimilarityStrategy` Protocol + 5 strategies (TF-IDF / hash / embedding / Jaccard / MinHash-LSH); `near_dedup` / `cross_dedup` orchestrators |
213
213
  | `eval_toolkit.plotting` | PR curves, reliability diagrams, confusion matrices, score histograms, lift CIs |
@@ -36,6 +36,11 @@ Run via `make lint` (= `ruff check + black --check + mypy`) and `make test`.
36
36
 
37
37
  ## 3. Naming
38
38
 
39
+ For the full decision record and industry citations, see
40
+ [ADR 0004 — Naming conventions](docs/source/adr/0004-naming-conventions.md).
41
+ This section is the day-to-day quick reference; the ADR is the
42
+ authoritative source.
43
+
39
44
  - Module names: `snake_case`, lowercase package (`eval_toolkit`).
40
45
  - Class names: `PascalCase`. Suffixes used in this repo:
41
46
  - `*Config` — frozen dataclass for settings
@@ -55,6 +60,68 @@ Run via `make lint` (= `ruff check + black --check + mypy`) and `make test`.
55
60
  - Mutation marking: not used. Mutating functions return `None` (Pythonic over
56
61
  Julia's `_inplace` suffix).
57
62
 
63
+ ### 3a. Parameter naming (canonical list, locked at v1.0)
64
+
65
+ These names mean these things, everywhere. Future functions MUST use
66
+ them; deviations need justification in the PR description.
67
+
68
+ | Parameter | Meaning |
69
+ |---|---|
70
+ | `y_true` | Ground-truth labels (binary, shape `(n,)`) |
71
+ | `y_score` | Continuous score / probability (shape `(n,)`) |
72
+ | `y_pred` | Discrete prediction (threshold-dependent) |
73
+ | `n_resamples` | Bootstrap iteration count |
74
+ | `confidence` | Two-sided confidence level (0.95 default) |
75
+ | `n_bins` | Binning count for calibration / ECE |
76
+ | `n_jobs` | Parallelism (joblib + sklearn convention) |
77
+ | `ax` | Matplotlib axis (matplotlib convention) |
78
+ | `metric` | Callable `(y_true, y_score) -> float` |
79
+ | `rng` | RNG argument per [SPEC 7](https://scientific-python.org/specs/spec-0007/) — target convention; to be adopted in v0.50.0 |
80
+
81
+ The v0.50.0 SPEC 7 adoption preserves two `seed: int` exceptions:
82
+ `set_global_seeds(seed: int)` (global-state setter, not per-function
83
+ RNG; SPEC 7 doesn't apply) and adversarial dataclass fields (use Python
84
+ `random.Random(seed)`; not NumPy-RNG, so SPEC 7's typing doesn't fit).
85
+
86
+ ### 3b. Class suffixes by domain
87
+
88
+ Each suffix maps to a Protocol contract (or, for `*Variant`, the functional API). Stay within the pattern:
89
+
90
+ | Suffix | Domain | Protocol |
91
+ |---|---|---|
92
+ | `*Selector` | Threshold selection | `ThresholdSelector` |
93
+ | `*Splitter` | Cross-validation splits | `Splitter` |
94
+ | `*Check` | Leakage detection | `LeakageCheck` |
95
+ | `*Loader` | Dataset loading | `DatasetLoader` |
96
+ | `*Reader` | Prediction artifact reading | `PredictionReader` |
97
+ | `*Variant` | Preprocessing variant | (functional API) |
98
+ | `*Strategy` | Dedup similarity backend | `SimilarityStrategy` |
99
+ | `*Injection` / `*Substitution` | Adversarial char-injection / -substitution | `TextTransform` |
100
+
101
+ ### 3c. Module naming (singular vs plural)
102
+
103
+ - **Plural noun** for collection-of-types modules: `metrics`,
104
+ `loaders`, `protocols`, `losses`, `probes`, `splits`, `paths`,
105
+ `seeds`, `thresholds`, `artifacts`, `claims`, `embeddings`,
106
+ `scorecards`.
107
+ - **Singular noun** for domain-concept modules: `harness`,
108
+ `bootstrap`, `manifest`, `calibration`, `leakage`, `analysis`,
109
+ `provenance`, `evidence`, `stacking`, `text_dedup`.
110
+ - **Gerund** for process-domain modules: `preprocessing`.
111
+
112
+ ### 3d. Asymmetric module promotion (private → public)
113
+
114
+ Collection-of-types private modules MAY be promoted to plural-public
115
+ when they hold ≥2 user-relevant types. Single-function private
116
+ modules SHOULD stay underscore. See
117
+ [ADR 0001](docs/source/adr/0001-flat-module-layout.md) for the trigger
118
+ analysis.
119
+
120
+ Examples:
121
+
122
+ - `_scorecard.py` (4 public exports) → `scorecards.py` at v0.49.0. ✓ promote.
123
+ - `_sweep.py` (1 public function `sweep`) → stays `_sweep.py`. ✓ keep private.
124
+
58
125
  ## 4. Type hints
59
126
 
60
127
  - Every public function has fully typed parameters and return.
@@ -79,10 +146,13 @@ Run via `make lint` (= `ruff check + black --check + mypy`) and `make test`.
79
146
  for 4 reference impls.
80
147
  - `SimilarityStrategy` (`text_dedup.py`) — pluggable similarity backend for
81
148
  `near_dedup` / `cross_dedup` / `NearDuplicateCheck` / `CrossSplitLeakageCheck`.
82
- - `Versioned` (`leakage.py`) — opt-in single-attribute Protocol; any Tier-2
83
- implementation may expose `version: str`. `RunManifest.versioned_objects`
84
- auto-collects them. Mirrors the `lm-evaluation-harness` task `VERSION`
85
- pattern. See `docs/methodology/versioning.md`.
149
+ - `Versioned` (`protocols.py`) — opt-in single-attribute Protocol; any
150
+ Tier-2 implementation may expose `version: str`.
151
+ `RunManifest.versioned_objects` auto-collects them. Mirrors the
152
+ `lm-evaluation-harness` task `VERSION` pattern. See
153
+ `docs/methodology/versioning.md`. (Single source of truth at
154
+ `protocols.py:64` since v0.49.0; the duplicate previously in
155
+ `leakage.py:82` was removed.)
86
156
  - All seams are `@runtime_checkable` so callers can `isinstance(obj, Protocol)`.
87
157
  - Reference impls are `@dataclass(frozen=True, slots=True)` with config in the
88
158
  constructor (`TargetRecallSelector(recall=0.90)`) and the Protocol method as
@@ -90,6 +160,25 @@ Run via `make lint` (= `ruff check + black --check + mypy`) and `make test`.
90
160
  - `NamedTuple` for stable public records that benefit from positional access;
91
161
  frozen dataclasses with `slots=True` otherwise.
92
162
 
163
+ ### 4a. Fitted-attribute trailing underscore (sklearn convention)
164
+
165
+ Estimator-style classes (`fit`/`predict` pattern) that store
166
+ **learned-from-data attributes** use trailing underscore per scikit-learn
167
+ convention: `coef_`, `classes_`, `n_features_in_`, `feature_importances_`.
168
+ These attributes MUST NOT be set in `__init__` — set them only in `fit()`.
169
+
170
+ Frozen reference-impl dataclasses (`@dataclass(frozen=True, slots=True)`)
171
+ are **exempt** — they hold config, not fitted state.
172
+
173
+ Current canonical example: `stacking.LogisticStacker`.
174
+
175
+ ### 4b. TypeVar naming
176
+
177
+ Internal (private) `TypeVar`s use a leading underscore per Google Python
178
+ Style Guide §3.19.10: `_T = TypeVar("_T")`. Public, constrained `TypeVar`s
179
+ without the underscore are allowed only when explicitly part of an
180
+ exported generic API.
181
+
93
182
  ## 5. Dataclasses
94
183
 
95
184
  1. **`slots=True` always** on repo-owned dataclasses. Catches typos at
@@ -220,6 +309,10 @@ def fit_temperature(val_logits, val_labels, bounds=(0.05, 20.0)):
220
309
  - **References** cites arXiv IDs / DOIs / journal cites.
221
310
  - For modules where doctests would be contrived (`plotting`, `harness`,
222
311
  `provenance`), Examples are optional.
312
+ - **Docstring prose wraps at 75 cols** (numpydoc convention) so that
313
+ `help()` is readable in a terminal. Doctest code blocks inside the
314
+ docstring follow the 100-col Black rule (code stays comfortable in an
315
+ editor even though prose around it is narrower).
223
316
 
224
317
  ## 13. Comments
225
318
 
@@ -228,6 +321,12 @@ restate what the code says.
228
321
 
229
322
  ## 14. Tests
230
323
 
324
+ - **File naming**: `tests/test_<module>.py` mirrors
325
+ `src/eval_toolkit/<module>.py`. Auxiliary tests per module use
326
+ suffixes (`test_<module>_props.py`, `test_<module>_validation.py`,
327
+ `test_<module>_golden.py`).
328
+ - **Function naming**: `test_<thing_under_test>_<scenario>`. No
329
+ class-based test grouping unless fixtures truly demand it (rare).
231
330
  - **Markers**: `unit`, `property`, `smoke`, `golden`.
232
331
  - **Sklearn-reference + analytical** as the unit-test oracle where available.
233
332
  - **Hypothesis** required for math/stat invariants. Strategies use
@@ -74,15 +74,14 @@ probes = ["torch>=2.0", "transformers>=4.40"]
74
74
  # (granular extras — losses callers should not have to install the larger
75
75
  # transformers stack). Shares the torch version pin with [probes].
76
76
  losses = ["torch>=2.0"]
77
- # DEPRECATED (announced v0.30.1, removal v0.33.0).
77
+ # NO-OP extra kept for backward compatibility (R3 at v0.49.0).
78
78
  #
79
- # Retained as a transitive no-op so `pip install eval-toolkit[validation]`
80
- # / dev still resolve cleanly. jsonschema moved to the base deps in
81
- # v0.16.0; this extra has been a no-op ever since. The 2-minor-version
82
- # window (v0.30.1 announce v0.33.0 remove) matches the @deprecated
83
- # policy in docs/DEPRECATION.md. Extras can't trigger import-time
84
- # DeprecationWarnings, so the deprecation is documentation-only here +
85
- # in CHANGELOG ### Deprecated + docs/DEPRECATION.md.
79
+ # jsonschema>=4.21 moved to base deps at v0.16.0; this extra has been a
80
+ # no-op ever since. Originally announced as deprecated in v0.30.1 with
81
+ # target removal at v0.33.0, but reclassified at v0.49.0 (R3 in
82
+ # docs/DEPRECATION.md) as a permanent no-op: hard removal would break
83
+ # consumer pip pins of the form `eval-toolkit[validation]` for zero
84
+ # functional benefit. Retained indefinitely.
86
85
  validation = []
87
86
  # v0.31.0 docs site: Sphinx + pydata-sphinx-theme (replaces v0.28.0's
88
87
  # mkdocs-material). Migration drivers — pain points Q1 in the v0.31.0
@@ -38,15 +38,15 @@ _EXPORTS: dict[str, str] = {
38
38
  "ALL_TECHNIQUES": "eval_toolkit.adversarial",
39
39
  "BidiRTLInjection": "eval_toolkit.adversarial",
40
40
  "CORE_TECHNIQUES": "eval_toolkit.adversarial",
41
- "CaseRandomization": "eval_toolkit.adversarial",
41
+ "CaseInjection": "eval_toolkit.adversarial",
42
42
  "DiacriticInjection": "eval_toolkit.adversarial",
43
43
  "HomoglyphSubstitution": "eval_toolkit.adversarial",
44
44
  "InvisibleCharsInjection": "eval_toolkit.adversarial",
45
45
  "PunctuationInjection": "eval_toolkit.adversarial",
46
46
  "SynonymSubstitution": "eval_toolkit.adversarial",
47
47
  "TagStrippingInjection": "eval_toolkit.adversarial",
48
- "TokenSplitting": "eval_toolkit.adversarial",
49
- "UnicodeNormalization": "eval_toolkit.adversarial",
48
+ "TokenSplittingInjection": "eval_toolkit.adversarial",
49
+ "UnicodeNormalizationInjection": "eval_toolkit.adversarial",
50
50
  "WhitespaceInjection": "eval_toolkit.adversarial",
51
51
  "ZeroWidthSpaceInjection": "eval_toolkit.adversarial",
52
52
  # CharacterInjectionStrategy + character_injection SimpleNamespace
@@ -202,7 +202,7 @@ _EXPORTS: dict[str, str] = {
202
202
  "MANIFEST_SCHEMA_VERSION": "eval_toolkit.manifest",
203
203
  "RunManifest": "eval_toolkit.manifest",
204
204
  "SourceRoleRecord": "eval_toolkit.manifest",
205
- "build_manifest": "eval_toolkit.manifest",
205
+ "make_manifest": "eval_toolkit.manifest",
206
206
  "validate_source_roles": "eval_toolkit.manifest",
207
207
  "write_manifest": "eval_toolkit.manifest",
208
208
  # --- metrics ---
@@ -315,10 +315,10 @@ _EXPORTS: dict[str, str] = {
315
315
  "wilson_interval": "eval_toolkit.thresholds",
316
316
  "LogisticStacker": "eval_toolkit.stacking",
317
317
  "MetaLearner": "eval_toolkit.stacking",
318
- "MetricResult": "eval_toolkit._scorecard",
319
- "MetricSpec": "eval_toolkit._scorecard",
320
- "Scorecard": "eval_toolkit._scorecard",
321
- "scorecard": "eval_toolkit._scorecard",
318
+ "MetricResult": "eval_toolkit.scorecards",
319
+ "MetricSpec": "eval_toolkit.scorecards",
320
+ "Scorecard": "eval_toolkit.scorecards",
321
+ "scorecard": "eval_toolkit.scorecards",
322
322
  # --- sweep (top-level v0.47 unification — Decision K + Decision D) ---
323
323
  "sweep": "eval_toolkit._sweep",
324
324
  }
@@ -0,0 +1,46 @@
1
+ """Private RNG-parameter type aliases per Scientific-Python SPEC 7.
2
+
3
+ This module centralizes the type aliases used to annotate user-facing RNG
4
+ parameters across the toolkit. Per `SPEC 7 — Seeding PRNG
5
+ <https://scientific-python.org/specs/spec-0007/>`_ (Endorsed), eval-toolkit
6
+ exposes a single canonical parameter name ``rng`` typed as
7
+ ``RNGLike | SeedLike | None`` on every function that consumes a NumPy
8
+ ``Generator``. Bodies normalize via ``np.random.default_rng(rng)``.
9
+
10
+ This module is private (underscore prefix) so the aliases stay an
11
+ implementation detail — public symbols use them only in their annotations.
12
+ If a Tier-2 consumer ever needs them exposed for their own callsite type
13
+ annotations, promote them via ``eval_toolkit.protocols`` per the
14
+ asymmetric-promotion principle in ADR 0001 + STYLE.md §3d.
15
+
16
+ Exceptions to the SPEC 7 convention — documented in STYLE.md §3a:
17
+
18
+ - ``seeds.set_global_seeds(seed: int)`` — global-state setter, not a
19
+ per-function RNG parameter; SPEC 7 is scoped to per-function RNG inputs.
20
+ - ``adversarial.*Injection`` / ``*Substitution`` / ``CaseInjection``
21
+ dataclass fields — they use Python's stdlib ``random.Random(seed)``,
22
+ not NumPy. SPEC 7's typing (``RNGLike = np.random.Generator | ...``) is
23
+ strictly NumPy-scoped.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ from collections.abc import Sequence
29
+
30
+ import numpy as np
31
+
32
+ type SeedLike = int | np.integer | Sequence[int] | np.random.SeedSequence
33
+ """Anything that can seed a NumPy bit generator.
34
+
35
+ Per SPEC 7, ``np.random.default_rng`` accepts any of these as a seed
36
+ without further conversion. ``Sequence[int]`` is the entropy-vector form
37
+ used by ``np.random.SeedSequence``.
38
+ """
39
+
40
+ type RNGLike = np.random.Generator | np.random.BitGenerator
41
+ """An already-instantiated NumPy bit generator or generator wrapper.
42
+
43
+ ``np.random.default_rng(rng)`` is the identity function on
44
+ ``Generator`` inputs and lifts ``BitGenerator`` inputs into a
45
+ ``Generator`` — both forms compose cleanly.
46
+ """
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "0.48.0"
5
+ __version__ = "0.49.0"
@@ -12,7 +12,7 @@ Core techniques (shipped in v0.43.0):
12
12
  - :class:`HomoglyphSubstitution` — Latin → Cyrillic/Greek lookalikes
13
13
  - :class:`DiacriticInjection` — combining-mark insertion (NFC bypass)
14
14
  - :class:`WhitespaceInjection` — variable whitespace padding (regular + NBSP)
15
- - :class:`CaseRandomization` — random case-flipping per character
15
+ - :class:`CaseInjection` — random case-flipping per character
16
16
  - :class:`PunctuationInjection` — non-semantic punctuation insertion
17
17
 
18
18
  Advanced techniques (shipped in v0.47 per Decision Q11.3):
@@ -20,8 +20,8 @@ Advanced techniques (shipped in v0.47 per Decision Q11.3):
20
20
  - :class:`BidiRTLInjection` — U+202E…U+202C override block
21
21
  - :class:`TagStrippingInjection` — ``<…>`` tag removal (idempotent)
22
22
  - :class:`SynonymSubstitution` — whitelisted-word swap, seed-deterministic
23
- - :class:`TokenSplitting` — mid-word single-space insertion
24
- - :class:`UnicodeNormalization` — NFC / NFD / NFKC / NFKD form switch
23
+ - :class:`TokenSplittingInjection` — mid-word single-space insertion
24
+ - :class:`UnicodeNormalizationInjection` — NFC / NFD / NFKC / NFKD form switch
25
25
  - :class:`InvisibleCharsInjection` — 5 invisible code points
26
26
 
27
27
  The convenience tuples :data:`CORE_TECHNIQUES` (6-tuple),
@@ -54,15 +54,15 @@ __all__ = [
54
54
  "ALL_TECHNIQUES",
55
55
  "BidiRTLInjection",
56
56
  "CORE_TECHNIQUES",
57
- "CaseRandomization",
57
+ "CaseInjection",
58
58
  "DiacriticInjection",
59
59
  "HomoglyphSubstitution",
60
60
  "InvisibleCharsInjection",
61
61
  "PunctuationInjection",
62
62
  "SynonymSubstitution",
63
63
  "TagStrippingInjection",
64
- "TokenSplitting",
65
- "UnicodeNormalization",
64
+ "TokenSplittingInjection",
65
+ "UnicodeNormalizationInjection",
66
66
  "WhitespaceInjection",
67
67
  "ZeroWidthSpaceInjection",
68
68
  ]
@@ -287,7 +287,7 @@ class WhitespaceInjection:
287
287
 
288
288
 
289
289
  @dataclass(frozen=True, slots=True)
290
- class CaseRandomization:
290
+ class CaseInjection:
291
291
  """Randomly flip the case of alphabetic characters.
292
292
 
293
293
  Deterministic given the seed. Numeric / punctuation / whitespace pass
@@ -311,7 +311,7 @@ class CaseRandomization:
311
311
 
312
312
  def __post_init__(self) -> None:
313
313
  if not 0.0 <= self.ratio <= 1.0:
314
- raise ValueError(f"CaseRandomization: ratio must be in [0, 1]; got {self.ratio}")
314
+ raise ValueError(f"CaseInjection: ratio must be in [0, 1]; got {self.ratio}")
315
315
 
316
316
  def transform(self, text: str) -> str:
317
317
  rng = random.Random(self.seed)
@@ -524,7 +524,7 @@ class SynonymSubstitution:
524
524
 
525
525
 
526
526
  @dataclass(frozen=True, slots=True)
527
- class TokenSplitting:
527
+ class TokenSplittingInjection:
528
528
  """Insert a single space inside each long enough word.
529
529
 
530
530
  Forces subword tokenizers to break a single token into two, often
@@ -552,10 +552,10 @@ class TokenSplitting:
552
552
  def __post_init__(self) -> None:
553
553
  if self.min_word_length < 2:
554
554
  raise ValueError(
555
- f"TokenSplitting: min_word_length must be >= 2; got {self.min_word_length}"
555
+ f"TokenSplittingInjection: min_word_length must be >= 2; got {self.min_word_length}"
556
556
  )
557
557
  if not 0.0 <= self.ratio <= 1.0:
558
- raise ValueError(f"TokenSplitting: ratio must be in [0, 1]; got {self.ratio}")
558
+ raise ValueError(f"TokenSplittingInjection: ratio must be in [0, 1]; got {self.ratio}")
559
559
 
560
560
  def transform(self, text: str) -> str:
561
561
  rng = random.Random(self.seed)
@@ -576,7 +576,7 @@ class TokenSplitting:
576
576
 
577
577
 
578
578
  @dataclass(frozen=True, slots=True)
579
- class UnicodeNormalization:
579
+ class UnicodeNormalizationInjection:
580
580
  """Apply a Unicode normalization form to the input.
581
581
 
582
582
  Defaults to NFKC which folds compatibility characters (e.g., ``ABC``
@@ -598,7 +598,7 @@ class UnicodeNormalization:
598
598
  def __post_init__(self) -> None:
599
599
  if self.form not in {"NFC", "NFD", "NFKC", "NFKD"}:
600
600
  raise ValueError(
601
- f"UnicodeNormalization: form must be NFC / NFD / NFKC / NFKD; got {self.form!r}"
601
+ f"UnicodeNormalizationInjection: form must be NFC / NFD / NFKC / NFKD; got {self.form!r}"
602
602
  )
603
603
 
604
604
  def transform(self, text: str) -> str:
@@ -659,15 +659,15 @@ CORE_TECHNIQUES: tuple[type[Any], ...] = (
659
659
  HomoglyphSubstitution,
660
660
  DiacriticInjection,
661
661
  WhitespaceInjection,
662
- CaseRandomization,
662
+ CaseInjection,
663
663
  PunctuationInjection,
664
664
  )
665
665
  ADVANCED_TECHNIQUES: tuple[type[Any], ...] = (
666
666
  BidiRTLInjection,
667
667
  TagStrippingInjection,
668
668
  SynonymSubstitution,
669
- TokenSplitting,
670
- UnicodeNormalization,
669
+ TokenSplittingInjection,
670
+ UnicodeNormalizationInjection,
671
671
  InvisibleCharsInjection,
672
672
  )
673
673
  ALL_TECHNIQUES: tuple[type[Any], ...] = CORE_TECHNIQUES + ADVANCED_TECHNIQUES
@@ -703,8 +703,8 @@ def _whitespace(
703
703
 
704
704
 
705
705
  def _case_random(text: str, ratio: float = 0.5, seed: int = 42) -> str:
706
- """Functional alias for :class:`CaseRandomization`."""
707
- return CaseRandomization(ratio=ratio, seed=seed).transform(text)
706
+ """Functional alias for :class:`CaseInjection`."""
707
+ return CaseInjection(ratio=ratio, seed=seed).transform(text)
708
708
 
709
709
 
710
710
  def _punctuation(text: str, ratio: float = 0.1, seed: int = 42) -> str:
@@ -71,28 +71,16 @@ __all__ = [
71
71
  "Severity",
72
72
  "TemporalLeakageCheck",
73
73
  "TokenizationLeakageCheck",
74
- "Versioned",
75
74
  "run_leakage_checks",
76
75
  ]
77
76
 
78
77
  Severity = Literal["error", "warning", "info"]
79
78
 
80
-
81
- @runtime_checkable
82
- class Versioned(Protocol):
83
- """Anything exposing a ``version: str`` attribute.
84
-
85
- Used by :class:`~eval_toolkit.manifest.RunManifest` to capture per-object
86
- versions of any Tier-2 implementation (Scorer, LeakageCheck, Splitter,
87
- ThresholdSelector, DatasetLoader). Mirrors the lm-evaluation-harness
88
- ``VERSION`` field pattern, which invalidates cross-version metric
89
- comparisons. Opt-in: implementations are not required to set ``version``.
90
- """
91
-
92
- @property
93
- def version(self) -> str: # pragma: no cover
94
- """Stable version string for this implementation."""
95
- ...
79
+ # `Versioned` Protocol previously had a duplicate definition here (v0.7+).
80
+ # Removed at v0.49.0 (N5 dedup) — canonical home is `eval_toolkit.protocols`
81
+ # per `protocols.py:1-5` ("Lightweight public Protocols with minimal dependency
82
+ # surface"). Use `from eval_toolkit.protocols import Versioned` (or top-level
83
+ # `from eval_toolkit import Versioned`).
96
84
 
97
85
 
98
86
  # ---------------------------------------------------------------------------