eval-toolkit 0.47.0__tar.gz → 0.49.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183) hide show
  1. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/CHANGELOG.md +197 -0
  2. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/PKG-INFO +4 -4
  3. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/README.md +3 -3
  4. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/STYLE.md +103 -4
  5. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/pyproject.toml +7 -8
  6. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/__init__.py +23 -17
  7. eval_toolkit-0.49.0/src/eval_toolkit/_rng.py +46 -0
  8. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/_sweep.py +120 -2
  9. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/_version.py +1 -1
  10. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/adversarial.py +56 -34
  11. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/bootstrap.py +69 -16
  12. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/calibration.py +41 -3
  13. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/config.py +1 -1
  14. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/embeddings.py +1 -1
  15. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/leakage.py +5 -17
  16. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/loaders.py +2 -3
  17. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/manifest.py +10 -10
  18. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/metric_specs.py +1 -1
  19. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/metrics.py +15 -0
  20. eval_toolkit-0.47.0/src/eval_toolkit/_scorecard.py → eval_toolkit-0.49.0/src/eval_toolkit/scorecards.py +37 -4
  21. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/golden/public_api/snapshot.json +13 -13
  22. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_adversarial.py +17 -17
  23. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_analysis.py +2 -3
  24. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_bootstrap_edge_cases.py +57 -0
  25. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_bootstrap_unit.py +39 -6
  26. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_coverage_bootstrap.py +4 -2
  27. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_harness_metric_options.py +4 -2
  28. eval_toolkit-0.49.0/tests/test_lazy_extras_messages.py +283 -0
  29. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_loaders.py +8 -2
  30. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_manifest.py +43 -43
  31. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_manifest_contamination_round_trip.py +6 -6
  32. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_manifest_props.py +11 -11
  33. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_manifest_validation.py +4 -4
  34. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_metrics_unit.py +153 -0
  35. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_preprocessing.py +2 -2
  36. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_schemas.py +6 -6
  37. eval_toolkit-0.49.0/tests/test_sweep.py +426 -0
  38. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_v09_contracts.py +2 -2
  39. eval_toolkit-0.47.0/tests/test_sweep.py +0 -180
  40. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/.gitignore +0 -0
  41. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/LICENSE +0 -0
  42. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/docs/archive/README.md +0 -0
  43. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/docs/research/README.md +0 -0
  44. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/docs/research/datasets/README.md +0 -0
  45. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/docs/research/papers/data-integrity/README.md +0 -0
  46. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  47. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/docs/research/papers/inference/README.md +0 -0
  48. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/docs/research/papers/prompt-injection/README.md +0 -0
  49. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/docs/source/adr/README.md +0 -0
  50. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/docs/source/methodology/README.md +0 -0
  51. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/__main__.py +0 -0
  52. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/_deprecated.py +0 -0
  53. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/_parallel.py +0 -0
  54. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/analysis.py +0 -0
  55. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/artifacts.py +0 -0
  56. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/claims.py +0 -0
  57. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/docs.py +0 -0
  58. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/evidence.py +0 -0
  59. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/harness.py +0 -0
  60. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/losses.py +0 -0
  61. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/operating_points.py +0 -0
  62. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/paths.py +0 -0
  63. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/plotting.py +0 -0
  64. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/preprocessing.py +0 -0
  65. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/probes.py +0 -0
  66. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/protocols.py +0 -0
  67. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/provenance.py +0 -0
  68. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/py.typed +0 -0
  69. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  70. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  71. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  72. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  73. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  74. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  75. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/seeds.py +0 -0
  76. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/splits.py +0 -0
  77. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/stacking.py +0 -0
  78. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/text_dedup.py +0 -0
  79. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/src/eval_toolkit/thresholds.py +0 -0
  80. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  81. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  82. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  83. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  84. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  85. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  86. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  87. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  88. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  89. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  90. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/benchmarks/__init__.py +0 -0
  91. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  92. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/conftest.py +0 -0
  93. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  94. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  95. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  96. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  97. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/golden/docs/expected.md +0 -0
  98. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/golden/docs/input.md +0 -0
  99. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/golden/docs/metrics.json +0 -0
  100. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  101. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/strategies.py +0 -0
  102. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_artifacts.py +0 -0
  103. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  104. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  105. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_bootstrap_golden.py +0 -0
  106. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_bootstrap_njobs.py +0 -0
  107. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_bootstrap_props.py +0 -0
  108. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_bootstrap_research_grounded.py +0 -0
  109. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_calibration_binary_adapters.py +0 -0
  110. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  111. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_calibration_determinism.py +0 -0
  112. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_calibration_optimization_failures.py +0 -0
  113. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_calibration_props.py +0 -0
  114. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_calibration_research_grounded.py +0 -0
  115. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_calibration_unit.py +0 -0
  116. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_claims.py +0 -0
  117. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_claims_coverage.py +0 -0
  118. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_claims_props.py +0 -0
  119. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_cli.py +0 -0
  120. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_config.py +0 -0
  121. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_coverage_calibration.py +0 -0
  122. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_coverage_harness.py +0 -0
  123. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_coverage_metrics.py +0 -0
  124. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_coverage_plotting.py +0 -0
  125. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_croissant_e2e.py +0 -0
  126. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  127. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_deprecated_scalars_shim.py +0 -0
  128. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_deprecations.py +0 -0
  129. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_docs_golden.py +0 -0
  130. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_docs_props.py +0 -0
  131. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_embeddings.py +0 -0
  132. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_evidence_validators.py +0 -0
  133. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_harness_edge_cases.py +0 -0
  134. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_harness_fault_injection.py +0 -0
  135. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_harness_folded.py +0 -0
  136. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_harness_internals.py +0 -0
  137. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_harness_parallelism.py +0 -0
  138. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_harness_smoke.py +0 -0
  139. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_import_boundaries.py +0 -0
  140. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_is_metric_defined_for_slice.py +0 -0
  141. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_leakage.py +0 -0
  142. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_leakage_error_paths.py +0 -0
  143. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_leakage_props.py +0 -0
  144. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_loaders_coverage.py +0 -0
  145. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_loaders_props.py +0 -0
  146. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_logging.py +0 -0
  147. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_losses.py +0 -0
  148. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_metrics_props.py +0 -0
  149. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_metrics_stratified_subsets.py +0 -0
  150. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_misc_coverage.py +0 -0
  151. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_numeric_edge_cases.py +0 -0
  152. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_ood_loader.py +0 -0
  153. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_operating_points.py +0 -0
  154. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_operating_points_props.py +0 -0
  155. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_parallel.py +0 -0
  156. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_paths.py +0 -0
  157. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_pipeline_e2e.py +0 -0
  158. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_plotting_edge.py +0 -0
  159. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_plotting_smoke.py +0 -0
  160. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_plotting_visual.py +0 -0
  161. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_probes.py +0 -0
  162. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_protocol_conformance.py +0 -0
  163. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_provenance.py +0 -0
  164. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_public_api.py +0 -0
  165. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_recall_at_fpr.py +0 -0
  166. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_reference_equivalence.py +0 -0
  167. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_reproducibility_integration.py +0 -0
  168. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_scorecard.py +0 -0
  169. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_seeds.py +0 -0
  170. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_splits.py +0 -0
  171. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_splits_leakage_integration.py +0 -0
  172. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_splits_props.py +0 -0
  173. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_stacking.py +0 -0
  174. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_text_dedup.py +0 -0
  175. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_text_dedup_coverage.py +0 -0
  176. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_text_dedup_props.py +0 -0
  177. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_text_dedup_strategies.py +0 -0
  178. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_thresholds.py +0 -0
  179. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_thresholds_constant_score.py +0 -0
  180. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_thresholds_coverage.py +0 -0
  181. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_thresholds_props.py +0 -0
  182. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_thresholds_research_grounded.py +0 -0
  183. {eval_toolkit-0.47.0 → eval_toolkit-0.49.0}/tests/test_tokenization_leakage_check.py +0 -0
@@ -5,6 +5,203 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.49.0] — 2026-05-23 — Global naming-standards sweep + final cleanup before v1.0
9
+
10
+ Final pre-v1.0 minor consolidating the naming-convention standardization
11
+ that locks the v1.0 Tier-1 contract. Audit + industry-research pass
12
+ (PEP 8, scikit-learn, NumPy, Google Python Style Guide, Scientific
13
+ Python SPEC 7) found the repo already 95-99% consistent; this release
14
+ closes the small remaining gaps + documents the conventions as
15
+ [ADR 0004](docs/source/adr/0004-naming-conventions.md). The SPEC 7
16
+ ``rng`` parameter convention is documented here; its adoption is deferred to v0.50.0.
17
+
18
+ ### BREAKING
19
+
20
+ Five Tier-1 renames for naming consistency (pre-v1.0; SemVer-minor per
21
+ the v0.34.0 BREAKING-minor precedent). Single-consumer lockstep bump in
22
+ ``prompt-injection-detection-submission``; no deprecation aliases.
23
+
24
+ - **``build_manifest`` → ``make_manifest``** (manifest.py). Aligns
25
+ with ``make_minilm_embedder`` / ``make_palette`` / ``make_run_dir``
26
+ factory pattern. ``build_*`` was the only outlier.
27
+ - **``CaseRandomization`` → ``CaseInjection``** (adversarial.py).
28
+ Aligns with ``*Injection`` / ``*Substitution`` adversarial suffix
29
+ convention.
30
+ - **``TokenSplitting`` → ``TokenSplittingInjection``** (adversarial.py).
31
+ Same rationale.
32
+ - **``UnicodeNormalization`` → ``UnicodeNormalizationInjection``**
33
+ (adversarial.py). Same rationale.
34
+ - **``eval_toolkit/_scorecard.py`` → ``eval_toolkit/scorecards.py``**
35
+ (private → public module promotion). The 4 top-level symbols
36
+ (``scorecard``, ``Scorecard``, ``MetricSpec``, ``MetricResult``)
37
+ remain top-level Tier-1; the new public submodule path
38
+ ``from eval_toolkit.scorecards import Scorecard`` is now stable.
39
+ ``_scorecard.py`` is gone — old import paths raise
40
+ ``ModuleNotFoundError``. Per the asymmetric-promotion principle in
41
+ [ADR 0001](docs/source/adr/0001-flat-module-layout.md): promote
42
+ collection-of-types modules, keep single-function modules underscore
43
+ (``_sweep.py`` stays private).
44
+
45
+ ### Added
46
+
47
+ - **[ADR 0004](docs/source/adr/0004-naming-conventions.md)** — Naming
48
+ conventions decision record with industry citations. Covers module
49
+ naming (singular vs plural), class suffixes by domain, function
50
+ verb-prefix conventions, canonical parameter list, fitted-attribute
51
+ trailing underscore (sklearn convention), TypeVar leading underscore
52
+ (Google convention), and the SPEC 7 ``rng`` parameter convention
53
+ (to be adopted in v0.50.0).
54
+ - **STYLE.md** extended with §3a-d (parameter naming, class suffixes
55
+ by domain, module naming, asymmetric promotion), §4a-b
56
+ (fitted-attribute trailing underscore + TypeVar), §12 (75-col
57
+ docstring prose rule), §14 (test naming convention).
58
+ - **CONTRIBUTING.md** cross-link to ADR 0004 + STYLE.md.
59
+ - **[docs/source/api/strict_tier2_protocols.md](docs/source/api/strict_tier2_protocols.md)** —
60
+ new docs page enumerating the 9 strict Tier-2 Protocols + 1 opt-in
61
+ per [ADR 0003 §1](docs/source/adr/0003-stability-contract-and-gate3-methodology.md),
62
+ with canonical top-level import paths. Resolves #69's discoverability
63
+ concern without breaking the lightweight design intent of
64
+ ``eval_toolkit.protocols`` (per ``protocols.py:1-5``).
65
+ - **``src/eval_toolkit/_rng.py``** — private module with SPEC 7 type
66
+ aliases (``SeedLike``, ``RNGLike``). Not yet referenced; scaffold for
67
+ the v0.50.0 SPEC 7 adoption.
68
+ - **[ADR 0001](docs/source/adr/0001-flat-module-layout.md)** amendment
69
+ — added the asymmetric-promotion sub-rule (collection-of-types MAY
70
+ promote, single-function SHOULD stay underscore).
71
+
72
+ ### Changed
73
+
74
+ - **Duplicate-type consolidation** (single source of truth):
75
+ - ``Versioned`` Protocol — canonical at ``protocols.py:64``; the
76
+ duplicate at ``leakage.py:82`` removed. Removed
77
+ ``"Versioned"`` from ``leakage.__all__``; previously-unused
78
+ ``from eval_toolkit.leakage import Versioned`` now raises
79
+ ``ImportError``. Use ``from eval_toolkit.protocols import Versioned``
80
+ or top-level ``from eval_toolkit import Versioned``.
81
+ - ``MetricStatus`` ``Literal`` — canonical at ``artifacts.py:30``; the
82
+ duplicate at ``scorecards.py:78`` removed; ``scorecards`` now
83
+ imports from ``artifacts``.
84
+ - **[validation] optional extra** reclassified from "active deprecation
85
+ with removal target v0.33.0" → "permanent no-op kept for backward
86
+ compatibility." Hard removal would break consumer pip pins of the
87
+ form ``eval-toolkit[validation]`` for zero functional benefit
88
+ (R3 in DEPRECATION.md).
89
+ - **Sphinx cross-references** updated from
90
+ ``eval_toolkit.leakage.Versioned`` → ``eval_toolkit.protocols.Versioned``
91
+ in ``manifest.py`` docstrings.
92
+
93
+ ### Deferred to v0.50.0
94
+
95
+ - **SPEC 7 ``rng`` parameter adoption** across ~30 NumPy-RNG functions.
96
+ Scope deferred from v0.49.0 after the planning audit revealed the
97
+ full blast radius (~30 signature sites + 247 test kwarg sites +
98
+ 7 internal helpers + SeedSequence/Generator/sklearn-bridge
99
+ conversions). Splitting matches the "one cleanup per minor" pattern
100
+ per [feedback_staggered_breaking_releases]. ``_rng.py`` ships in
101
+ v0.49.0 as the scaffold; v0.50.0 wires it into every applicable
102
+ function.
103
+
104
+ ### Notes
105
+
106
+ - Round 8 audit STOP-GATE per Decision Y.2 — briefing committed at
107
+ v0.48.0 (commit ``6f6839a``); v0.49.0 ships in parallel since the
108
+ audit-trail synthesis confirmed R8 audits the existing contract
109
+ (does not prescribe new changes). Any R8 finding folds into a v0.49.1
110
+ hotfix if needed.
111
+ - Issue #69 closed by the new strict-Tier-2-Protocols docs page; see
112
+ ``docs/source/api/strict_tier2_protocols.md`` and the close
113
+ rationale on the issue itself.
114
+
115
+ ## [0.48.0] — 2026-05-22 — Polish + audit-driven tightening before v1.0 (Round 7 follow-on + cross-API consistency + doc-execution gates)
116
+
117
+ Third + final BREAKING minor of the staggered v0.45 → v0.46 → v0.46.1 → v0.47
118
+ → v0.48 → v1.0 release sequence (plan
119
+ ``~/.claude/plans/evaluate-all-the-work-twinkly-kite.md``, Step 4). Migration
120
+ guide: ``docs/source/migration/v0.48.md``.
121
+
122
+ Closes:
123
+
124
+ - Round 7 audit STOP-GATE per Decision Y.2 (Codex R7-F1/F2/F3 + 6 Gemini
125
+ observations; see ``docs/source/audit_findings.md`` for the per-finding
126
+ ledger).
127
+ - Audit-as-seed extensions surfaced during plan refinement: full
128
+ module-docstring sweep across ``src/eval_toolkit/``; expanded
129
+ ``.doctest-modules`` from 11 → 21 modules; comprehensive cross-API
130
+ shape-validation consistency sweep.
131
+ - Round 5 §5E-prep packet-drift fixes (7 methodology documentation
132
+ corrections).
133
+
134
+ After v0.48 observes ≥1 consumer cycle, the Round 8 audit STOP-GATE
135
+ opens before ``v1.0.0`` tag.
136
+
137
+ ### BREAKING
138
+
139
+ - **``BootstrapCI.to_dict()`` + ``PairedBootstrapCI.to_dict()`` schema
140
+ rewrite** (§5B). Pre-v0.48 hard-coded a ``"ci_95"`` key regardless of
141
+ the actual ``confidence`` field — the key contradicted the data.
142
+ v0.48 schema is self-describing:
143
+
144
+ Before: ``{"point_estimate": p, "ci_95": [l, h], "confidence": 0.95, ...}``
145
+ After: ``{"point": p, "low": l, "high": h, "confidence": 0.95, ...}``
146
+
147
+ Migration: ``d["point_estimate"]`` → ``d["point"]``; ``d["ci_95"]``
148
+ → ``(d["low"], d["high"])``. Same rewrite for ``PairedBootstrapCI``.
149
+ - **``sweep()`` schema grows by 1 column** (§5I, Decision R7-B option C).
150
+ New ``strategy_id`` column inserted between ``text_id`` and ``variant``
151
+ carries the canonical per-row identifier built from configured
152
+ kwargs. Callers indexing by column position must re-check offsets.
153
+ - **``sweep()`` rejects duplicate ``strategy_id``** (§5I). Mirrors
154
+ R6-B's duplicate ``MetricSpec.name`` rejection in ``scorecard()``.
155
+ - **``sweep()`` validates scorer output shape** (§5J, Decision R7-C).
156
+ Wrong-shape arrays from ``Scorer.predict_proba`` raise contextual
157
+ ``ValueError`` at the boundary. Pre-v0.48: silent truncation
158
+ (overlong), ``IndexError`` (short), or ``TypeError`` (matrix-shaped).
159
+ - **``paired_bootstrap_op_point_diff()`` rejects ``val_y is test_y``**
160
+ (§5E-prep). The two-level bootstrap assumes disjoint val + test
161
+ partitions; passing the same array causes ~63.2% silent overlap.
162
+
163
+ ### Added
164
+
165
+ - **``make pre-push``** Makefile target (§5L) running all 3
165
+ doc-execution surfaces — Sybil-collected ``.md`` fences, MyST-NB example
167
+ notebooks, and in-source ``>>>`` docstring examples. Closes the
168
+ v0.47 Sub-PR 7 incident class.
169
+ - **``nb_execution_raise_on_error = True``** in ``docs/source/conf.py``
170
+ (§5H, Decision R7-A). Docs CI now fails on notebook execution errors.
171
+ - **``.doctest-modules`` expanded** from 11 → 21 modules (§5M).
172
+
173
+ ### Changed
174
+
175
+ - **Cross-API shape-validation consistency** (§5N). Every public-API
176
+ surface with array inputs now validates shape + raises ``ValueError``
177
+ with context (rather than leaking low-level numpy/sklearn errors).
178
+ - **Standardized ``ImportError`` messages** across lazy-extras (§5C).
179
+ Canonical template: ``"<feature> requires <pkg>. Install with: pip
180
+ install eval-toolkit[<extra>]"``.
181
+ - **Pin-exact-key-set regression-guards** (§5A) for every dict-returning
182
+ metrics function. Audit revealed no drift; the tests pin existing
183
+ key sets so future drift fails CI loudly.
184
+ - **Docs polish** (§5K + §5E-prep): ``SynonymSubstitution`` whitelist
185
+ ``Notes``; ``Scorecard.to_pandas()`` dtype coercion ``Notes``;
186
+ ``CostSensitiveSelector`` calibrated-prior ``Warning``; ``cv_clt_ci``
187
+ docstring per Bayle et al. (2020) Theorem 3.1; ``methodology/parallelism.md``
188
+ post-v0.36 state; ``methodology/testing.md`` reference-equivalence-gap
189
+ framing; ``methodology/calibration.md`` 4-binary-adapter family;
190
+ ``methodology/bootstrap.md`` disjoint-split example; DeLong docs
191
+ aligned to shipped state (Decision U).
192
+
193
+ ### Fixed
194
+
195
+ - **R7-F1**: 6 MyST-NB example notebooks (``docs/source/examples/*.md``)
196
+ migrated to v0.47 API; 4 module-level docstrings rewritten; 5
197
+ drifted ``docs/source/api/*.md`` autosummary lists corrected;
198
+ 8 missing ``api/*.md`` pages created; roadmap "Sybil-validated
199
+ examples" wording corrected (§5G).
200
+ - **ADR 0001** (flat-module layout) + **ADR 0003** (stability contract
201
+ + Gate 3 methodology) finalized for v1.0 (§5E + §5F).
202
+ - **schemas.md** + **methodology/claims.md** + **getting-started.md**:
203
+ ``BootstrapCI`` schema references updated for the §5B rewrite.
204
+
8
205
  ## [0.47.0] — 2026-05-21 — Sweep unification + TextTransform + advanced-6 + cleanup + Round 6 follow-on
9
206
 
10
207
  Second BREAKING minor of the staggered v0.45 → v0.46 → v0.46.1 → v0.47 →
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.47.0
3
+ Version: 0.49.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -261,13 +261,13 @@ print(f"NLL: {result['nll_pre']:.3f} -> {result['nll_post']:.3f}")
261
261
  ```python
262
262
  import tempfile
263
263
  from pathlib import Path
264
- from eval_toolkit import build_manifest, write_manifest
264
+ from eval_toolkit import make_manifest, write_manifest
265
265
 
266
266
  with tempfile.TemporaryDirectory() as run_dir:
267
267
  # data_files: {name: path} → eval_toolkit hashes the files for you;
268
268
  # versioned: any object with a `version` attribute (e.g. a scorer or
269
269
  # leakage check) is captured by name → version in the manifest.
270
- manifest = build_manifest(
270
+ manifest = make_manifest(
271
271
  run_id="quickstart-demo",
272
272
  config={"threshold_criterion": "max_f1", "seed": 42},
273
273
  seeds={"global": 42, "bootstrap": 42},
@@ -290,7 +290,7 @@ with tempfile.TemporaryDirectory() as run_dir:
290
290
  | `eval_toolkit.leakage` | `LeakageCheck` Protocol + 7 reference impls (exact / near / encoding-obfuscated / cross-split / label-conflict / group / temporal); `Versioned` opt-in Protocol |
291
291
  | `eval_toolkit.splits` | `Splitter` Protocol + 5 reference impls (holdout / stratified / group / source-disjoint / time-series) |
292
292
  | `eval_toolkit.loaders` | `DatasetLoader` Protocol + 4 reference impls (DataFrame / SingleSlice / ParquetGlob / HF datasets) with Croissant-compatible `describe()` |
293
- | `eval_toolkit.manifest` | `RunManifest` (NeurIPS-aligned) + source-role / guardrail metadata + `build_manifest` / `write_manifest` |
293
+ | `eval_toolkit.manifest` | `RunManifest` (NeurIPS-aligned) + source-role / guardrail metadata + `make_manifest` / `write_manifest` |
294
294
  | `eval_toolkit.claims` | `EvidenceGate` class (frozen dataclass: name + callable check + severity), reference gate factories (`required_metric_gate`, `minimum_slice_size_gate`, `metric_threshold_gate`, etc.), `evaluate_claims()`, and `ClaimReport` for claim-mode vs exploratory-mode checks. See [`docs/extending.md`](docs/extending.md) for writing custom gates and [`docs/examples/claims_and_gates.md`](docs/examples/claims_and_gates.md) for a worked end-to-end example. |
295
295
  | `eval_toolkit.text_dedup` | `SimilarityStrategy` Protocol + 5 strategies (TF-IDF / hash / embedding / Jaccard / MinHash-LSH); `near_dedup` / `cross_dedup` orchestrators |
296
296
  | `eval_toolkit.plotting` | PR curves, reliability diagrams, confusion matrices, score histograms, lift CIs |
@@ -178,13 +178,13 @@ print(f"NLL: {result['nll_pre']:.3f} -> {result['nll_post']:.3f}")
178
178
  ```python
179
179
  import tempfile
180
180
  from pathlib import Path
181
- from eval_toolkit import build_manifest, write_manifest
181
+ from eval_toolkit import make_manifest, write_manifest
182
182
 
183
183
  with tempfile.TemporaryDirectory() as run_dir:
184
184
  # data_files: {name: path} → eval_toolkit hashes the files for you;
185
185
  # versioned: any object with a `version` attribute (e.g. a scorer or
186
186
  # leakage check) is captured by name → version in the manifest.
187
- manifest = build_manifest(
187
+ manifest = make_manifest(
188
188
  run_id="quickstart-demo",
189
189
  config={"threshold_criterion": "max_f1", "seed": 42},
190
190
  seeds={"global": 42, "bootstrap": 42},
@@ -207,7 +207,7 @@ with tempfile.TemporaryDirectory() as run_dir:
207
207
  | `eval_toolkit.leakage` | `LeakageCheck` Protocol + 7 reference impls (exact / near / encoding-obfuscated / cross-split / label-conflict / group / temporal); `Versioned` opt-in Protocol |
208
208
  | `eval_toolkit.splits` | `Splitter` Protocol + 5 reference impls (holdout / stratified / group / source-disjoint / time-series) |
209
209
  | `eval_toolkit.loaders` | `DatasetLoader` Protocol + 4 reference impls (DataFrame / SingleSlice / ParquetGlob / HF datasets) with Croissant-compatible `describe()` |
210
- | `eval_toolkit.manifest` | `RunManifest` (NeurIPS-aligned) + source-role / guardrail metadata + `build_manifest` / `write_manifest` |
210
+ | `eval_toolkit.manifest` | `RunManifest` (NeurIPS-aligned) + source-role / guardrail metadata + `make_manifest` / `write_manifest` |
211
211
  | `eval_toolkit.claims` | `EvidenceGate` class (frozen dataclass: name + callable check + severity), reference gate factories (`required_metric_gate`, `minimum_slice_size_gate`, `metric_threshold_gate`, etc.), `evaluate_claims()`, and `ClaimReport` for claim-mode vs exploratory-mode checks. See [`docs/extending.md`](docs/extending.md) for writing custom gates and [`docs/examples/claims_and_gates.md`](docs/examples/claims_and_gates.md) for a worked end-to-end example. |
212
212
  | `eval_toolkit.text_dedup` | `SimilarityStrategy` Protocol + 5 strategies (TF-IDF / hash / embedding / Jaccard / MinHash-LSH); `near_dedup` / `cross_dedup` orchestrators |
213
213
  | `eval_toolkit.plotting` | PR curves, reliability diagrams, confusion matrices, score histograms, lift CIs |
@@ -36,6 +36,11 @@ Run via `make lint` (= `ruff check + black --check + mypy`) and `make test`.
36
36
 
37
37
  ## 3. Naming
38
38
 
39
+ For the full decision record and industry citations, see
40
+ [ADR 0004 — Naming conventions](docs/source/adr/0004-naming-conventions.md).
41
+ This section is the day-to-day quick reference; the ADR is the
42
+ authoritative source.
43
+
39
44
  - Module names: `snake_case`, lowercase package (`eval_toolkit`).
40
45
  - Class names: `PascalCase`. Suffixes used in this repo:
41
46
  - `*Config` — frozen dataclass for settings
@@ -55,6 +60,68 @@ Run via `make lint` (= `ruff check + black --check + mypy`) and `make test`.
55
60
  - Mutation marking: not used. Mutating functions return `None` (Pythonic over
56
61
  Julia's `_inplace` suffix).
57
62
 
63
+ ### 3a. Parameter naming (canonical list, locked at v1.0)
64
+
65
+ These names mean these things, everywhere. Future functions MUST use
66
+ them; deviations need justification in the PR description.
67
+
68
+ | Parameter | Meaning |
69
+ |---|---|
70
+ | `y_true` | Ground-truth labels (binary, shape `(n,)`) |
71
+ | `y_score` | Continuous score / probability (shape `(n,)`) |
72
+ | `y_pred` | Discrete prediction (threshold-dependent) |
73
+ | `n_resamples` | Bootstrap iteration count |
74
+ | `confidence` | Two-sided confidence level (0.95 default) |
75
+ | `n_bins` | Binning count for calibration / ECE |
76
+ | `n_jobs` | Parallelism (joblib + sklearn convention) |
77
+ | `ax` | Matplotlib axis (matplotlib convention) |
78
+ | `metric` | Callable `(y_true, y_score) -> float` |
79
+ | `rng` | RNG argument per [SPEC 7](https://scientific-python.org/specs/spec-0007/) — target convention; adopted in v0.50.0 |
80
+
81
+ The v0.50.0 SPEC 7 adoption preserves two `seed: int` exceptions:
82
+ `set_global_seeds(seed: int)` (global-state setter, not per-function
83
+ RNG; SPEC 7 doesn't apply) and adversarial dataclass fields (use Python
84
+ `random.Random(seed)`; not NumPy-RNG, so SPEC 7's typing doesn't fit).
85
+
86
+ ### 3b. Class suffixes by domain
87
+
88
+ Each suffix maps to a Protocol contract. Stay within the pattern:
89
+
90
+ | Suffix | Domain | Protocol |
91
+ |---|---|---|
92
+ | `*Selector` | Threshold selection | `ThresholdSelector` |
93
+ | `*Splitter` | Cross-validation splits | `Splitter` |
94
+ | `*Check` | Leakage detection | `LeakageCheck` |
95
+ | `*Loader` | Dataset loading | `DatasetLoader` |
96
+ | `*Reader` | Prediction artifact reading | `PredictionReader` |
97
+ | `*Variant` | Preprocessing variant | (functional API) |
98
+ | `*Strategy` | Dedup similarity backend | `SimilarityStrategy` |
99
+ | `*Injection` / `*Substitution` | Adversarial char-injection / -substitution | `TextTransform` |
100
+
101
+ ### 3c. Module naming (singular vs plural)
102
+
103
+ - **Plural noun** for collection-of-types modules: `metrics`,
104
+ `loaders`, `protocols`, `losses`, `probes`, `splits`, `paths`,
105
+ `seeds`, `thresholds`, `artifacts`, `claims`, `embeddings`,
106
+ `scorecards`.
107
+ - **Singular noun** for domain-concept modules: `harness`,
108
+ `bootstrap`, `manifest`, `calibration`, `leakage`, `analysis`,
109
+ `provenance`, `evidence`, `stacking`, `text_dedup`.
110
+ - **Gerund** for process-domain modules: `preprocessing`.
111
+
112
+ ### 3d. Asymmetric module promotion (private → public)
113
+
114
+ Collection-of-types private modules MAY be promoted to plural-public
115
+ when they hold ≥2 user-relevant types. Single-function private
116
+ modules SHOULD stay underscore. See
117
+ [ADR 0001](docs/source/adr/0001-flat-module-layout.md) for the trigger
118
+ analysis.
119
+
120
+ Examples:
121
+
122
+ - `_scorecard.py` (4 public exports) → `scorecards.py` at v0.49.0. ✓ promote.
123
+ - `_sweep.py` (1 public function `sweep`) → stays `_sweep.py`. ✓ keep private.
124
+
58
125
  ## 4. Type hints
59
126
 
60
127
  - Every public function has fully typed parameters and return.
@@ -79,10 +146,13 @@ Run via `make lint` (= `ruff check + black --check + mypy`) and `make test`.
79
146
  for 4 reference impls.
80
147
  - `SimilarityStrategy` (`text_dedup.py`) — pluggable similarity backend for
81
148
  `near_dedup` / `cross_dedup` / `NearDuplicateCheck` / `CrossSplitLeakageCheck`.
82
- - `Versioned` (`leakage.py`) — opt-in single-attribute Protocol; any Tier-2
83
- implementation may expose `version: str`. `RunManifest.versioned_objects`
84
- auto-collects them. Mirrors the `lm-evaluation-harness` task `VERSION`
85
- pattern. See `docs/methodology/versioning.md`.
149
+ - `Versioned` (`protocols.py`) — opt-in single-attribute Protocol; any
150
+ Tier-2 implementation may expose `version: str`.
151
+ `RunManifest.versioned_objects` auto-collects them. Mirrors the
152
+ `lm-evaluation-harness` task `VERSION` pattern. See
153
+ `docs/methodology/versioning.md`. (Single source of truth at
154
+ `protocols.py:64` since v0.49.0; the duplicate previously in
155
+ `leakage.py:82` was removed.)
86
156
  - All seams are `@runtime_checkable` so callers can `isinstance(obj, Protocol)`.
87
157
  - Reference impls are `@dataclass(frozen=True, slots=True)` with config in the
88
158
  constructor (`TargetRecallSelector(recall=0.90)`) and the Protocol method as
@@ -90,6 +160,25 @@ Run via `make lint` (= `ruff check + black --check + mypy`) and `make test`.
90
160
  - `NamedTuple` for stable public records that benefit from positional access;
91
161
  frozen dataclasses with `slots=True` otherwise.
92
162
 
163
+ ### 4a. Fitted-attribute trailing underscore (sklearn convention)
164
+
165
+ Estimator-style classes (`fit`/`predict` pattern) that store
166
+ **learned-from-data attributes** use trailing underscore per scikit-learn
167
+ convention: `coef_`, `classes_`, `n_features_in_`, `feature_importances_`.
168
+ These attributes MUST NOT be set in `__init__` — set them only in `fit()`.
169
+
170
+ Frozen reference-impl dataclasses (`@dataclass(frozen=True, slots=True)`)
171
+ are **exempt** — they hold config, not fitted state.
172
+
173
+ Current canonical example: `stacking.LogisticStacker`.
174
+
175
+ ### 4b. TypeVar naming
176
+
177
+ Internal (private) `TypeVar`s use a leading underscore per Google Python
178
+ Style Guide §3.19.10: `_T = TypeVar("_T")`. Public, constrained `TypeVar`s
179
+ without the underscore are allowed only when explicitly part of an
180
+ exported generic API.
181
+
93
182
  ## 5. Dataclasses
94
183
 
95
184
  1. **`slots=True` always** on repo-owned dataclasses. Catches typos at
@@ -220,6 +309,10 @@ def fit_temperature(val_logits, val_labels, bounds=(0.05, 20.0)):
220
309
  - **References** cites arXiv IDs / DOIs / journal cites.
221
310
  - For modules where doctests would be contrived (`plotting`, `harness`,
222
311
  `provenance`), Examples are optional.
312
+ - **Docstring prose wraps at 75 cols** (numpydoc convention) so that
313
+ `help()` is readable in a terminal. Doctest code blocks inside the
314
+ docstring follow the 100-col Black rule (code stays comfortable in an
315
+ editor even though prose around it is narrower).
223
316
 
224
317
  ## 13. Comments
225
318
 
@@ -228,6 +321,12 @@ restate what the code says.
228
321
 
229
322
  ## 14. Tests
230
323
 
324
+ - **File naming**: `tests/test_<module>.py` mirrors
325
+ `src/eval_toolkit/<module>.py`. Auxiliary tests per module use
326
+ suffixes (`test_<module>_props.py`, `test_<module>_validation.py`,
327
+ `test_<module>_golden.py`).
328
+ - **Function naming**: `test_<thing_under_test>_<scenario>`. No
329
+ class-based test grouping unless fixtures truly demand it (rare).
231
330
  - **Markers**: `unit`, `property`, `smoke`, `golden`.
232
331
  - **Sklearn-reference + analytical** as the unit-test oracle where available.
233
332
  - **Hypothesis** required for math/stat invariants. Strategies use
@@ -74,15 +74,14 @@ probes = ["torch>=2.0", "transformers>=4.40"]
74
74
  # (granular extras — losses callers should not have to install the larger
75
75
  # transformers stack). Shares the torch version pin with [probes].
76
76
  losses = ["torch>=2.0"]
77
- # DEPRECATED (announced v0.30.1, removal v0.33.0).
77
+ # NO-OP extra kept for backward compatibility (R3 at v0.49.0).
78
78
  #
79
- # Retained as a transitive no-op so `pip install eval-toolkit[validation]`
80
- # / dev still resolve cleanly. jsonschema moved to the base deps in
81
- # v0.16.0; this extra has been a no-op ever since. The 2-minor-version
82
- # window (v0.30.1 announce v0.33.0 remove) matches the @deprecated
83
- # policy in docs/DEPRECATION.md. Extras can't trigger import-time
84
- # DeprecationWarnings, so the deprecation is documentation-only here +
85
- # in CHANGELOG ### Deprecated + docs/DEPRECATION.md.
79
+ # jsonschema>=4.21 moved to base deps at v0.16.0; this extra has been a
80
+ # no-op ever since. Originally announced as deprecated in v0.30.1 with
81
+ # target removal at v0.33.0, but reclassified at v0.49.0 (R3 in
82
+ # docs/DEPRECATION.md) as a permanent no-op — hard removal would break
83
+ # consumer pip pins of the form `eval-toolkit[validation]` for zero
84
+ # functional benefit. Retained indefinitely.
86
85
  validation = []
87
86
  # v0.31.0 docs site: Sphinx + pydata-sphinx-theme (replaces v0.28.0's
88
87
  # mkdocs-material). Migration drivers — pain points Q1 in the v0.31.0
@@ -1,9 +1,12 @@
1
1
  """eval-toolkit — reusable evaluation contracts for binary classification.
2
2
 
3
- Public API remains available from ``eval_toolkit`` and from submodules:
3
+ The v1.0 primary metric surface is :func:`~eval_toolkit.scorecard` plus the
4
+ :mod:`~eval_toolkit.metric_specs` namespace (ADR 0002). Submodule paths
5
+ remain available for scalar primitives and adapter authors:
4
6
 
5
- from eval_toolkit import pr_auc, bootstrap_ci, BootstrapCI
6
- from eval_toolkit.metrics import pr_auc
7
+ from eval_toolkit import scorecard, metric_specs as ms
8
+ from eval_toolkit import bootstrap_ci, BootstrapCI
9
+ from eval_toolkit.metrics import pr_auc # internal API, ADR 0002
7
10
 
8
11
  The package root uses lazy exports so importing ``eval_toolkit`` does not
9
12
  eagerly import optional-heavy modules such as plotting, loaders, or harnesses.
@@ -35,15 +38,15 @@ _EXPORTS: dict[str, str] = {
35
38
  "ALL_TECHNIQUES": "eval_toolkit.adversarial",
36
39
  "BidiRTLInjection": "eval_toolkit.adversarial",
37
40
  "CORE_TECHNIQUES": "eval_toolkit.adversarial",
38
- "CaseRandomization": "eval_toolkit.adversarial",
41
+ "CaseInjection": "eval_toolkit.adversarial",
39
42
  "DiacriticInjection": "eval_toolkit.adversarial",
40
43
  "HomoglyphSubstitution": "eval_toolkit.adversarial",
41
44
  "InvisibleCharsInjection": "eval_toolkit.adversarial",
42
45
  "PunctuationInjection": "eval_toolkit.adversarial",
43
46
  "SynonymSubstitution": "eval_toolkit.adversarial",
44
47
  "TagStrippingInjection": "eval_toolkit.adversarial",
45
- "TokenSplitting": "eval_toolkit.adversarial",
46
- "UnicodeNormalization": "eval_toolkit.adversarial",
48
+ "TokenSplittingInjection": "eval_toolkit.adversarial",
49
+ "UnicodeNormalizationInjection": "eval_toolkit.adversarial",
47
50
  "WhitespaceInjection": "eval_toolkit.adversarial",
48
51
  "ZeroWidthSpaceInjection": "eval_toolkit.adversarial",
49
52
  # CharacterInjectionStrategy + character_injection SimpleNamespace
@@ -199,7 +202,7 @@ _EXPORTS: dict[str, str] = {
199
202
  "MANIFEST_SCHEMA_VERSION": "eval_toolkit.manifest",
200
203
  "RunManifest": "eval_toolkit.manifest",
201
204
  "SourceRoleRecord": "eval_toolkit.manifest",
202
- "build_manifest": "eval_toolkit.manifest",
205
+ "make_manifest": "eval_toolkit.manifest",
203
206
  "validate_source_roles": "eval_toolkit.manifest",
204
207
  "write_manifest": "eval_toolkit.manifest",
205
208
  # --- metrics ---
@@ -207,12 +210,15 @@ _EXPORTS: dict[str, str] = {
207
210
  "SINGLE_CLASS_INCOMPATIBLE_METRICS": "eval_toolkit.metrics",
208
211
  "ThresholdResult": "eval_toolkit.metrics",
209
212
  "brier_decomposition": "eval_toolkit.metrics",
210
- # `brier_score`, `pr_auc`, `roc_auc`, and the 5 ECE variants removed from
211
- # `_EXPORTS` at v0.46 (Decision L). They remain reachable at the top
212
- # level via the `__getattr__` deprecation branch (emits
213
- # `DeprecationWarning`; branch removed at v0.47) and via the metrics
214
- # submodule (`from eval_toolkit.metrics import pr_auc` — internal API
215
- # per ADR 0002, not part of the v1.0 stability contract).
213
+ # `brier_score`, `pr_auc`, `roc_auc`, and the 5 ECE variants were removed
214
+ # from `_EXPORTS` at v0.46 (Decision L); the v0.46 `__getattr__`
215
+ # deprecation branch that kept them reachable with `DeprecationWarning`
216
+ # was removed at v0.47. They now raise `AttributeError` at the top level.
217
+ # The metrics submodule (`from eval_toolkit.metrics import pr_auc`)
218
+ # remains the only working import path for scalar primitives — internal
219
+ # API per ADR 0002, not part of the v1.0 stability contract. The
220
+ # `scorecard()` + `metric_specs` surface is the primary path going
221
+ # forward (`metric_specs.pr_auc`, `metric_specs.roc_auc`, etc.).
216
222
  "headline_metrics": "eval_toolkit.metrics",
217
223
  "is_metric_defined_for_slice": "eval_toolkit.metrics",
218
224
  "metrics_at_threshold": "eval_toolkit.metrics",
@@ -309,10 +315,10 @@ _EXPORTS: dict[str, str] = {
309
315
  "wilson_interval": "eval_toolkit.thresholds",
310
316
  "LogisticStacker": "eval_toolkit.stacking",
311
317
  "MetaLearner": "eval_toolkit.stacking",
312
- "MetricResult": "eval_toolkit._scorecard",
313
- "MetricSpec": "eval_toolkit._scorecard",
314
- "Scorecard": "eval_toolkit._scorecard",
315
- "scorecard": "eval_toolkit._scorecard",
318
+ "MetricResult": "eval_toolkit.scorecards",
319
+ "MetricSpec": "eval_toolkit.scorecards",
320
+ "Scorecard": "eval_toolkit.scorecards",
321
+ "scorecard": "eval_toolkit.scorecards",
316
322
  # --- sweep (top-level v0.47 unification — Decision K + Decision D) ---
317
323
  "sweep": "eval_toolkit._sweep",
318
324
  }
@@ -0,0 +1,46 @@
1
+ """Private RNG-parameter type aliases per Scientific-Python SPEC 7.
2
+
3
+ This module centralizes the type aliases used to annotate user-facing RNG
4
+ parameters across the toolkit. Per `SPEC 7 — Seeding PRNG
5
+ <https://scientific-python.org/specs/spec-0007/>`_ (Endorsed), eval-toolkit
6
+ exposes a single canonical parameter name ``rng`` typed as
7
+ ``RNGLike | SeedLike | None`` on every function that consumes a NumPy
8
+ ``Generator``. Bodies normalize via ``np.random.default_rng(rng)``.
9
+
10
+ This module is private (underscore prefix) so the aliases stay an
11
+ implementation detail — public symbols use them only in their annotations.
12
+ If a Tier-2 consumer ever needs them exposed for their own callsite type
13
+ annotations, promote them via ``eval_toolkit.protocols`` per the
14
+ asymmetric-promotion principle in ADR 0001 + STYLE.md §3d.
15
+
16
+ Exceptions to the SPEC 7 convention — documented in STYLE.md §3a:
17
+
18
+ - ``seeds.set_global_seeds(seed: int)`` — global-state setter, not a
19
+ per-function RNG parameter; SPEC 7 is scoped to per-function RNG inputs.
20
+ - ``adversarial.*Injection`` / ``*Substitution`` / ``CaseInjection``
21
+ dataclass fields — they use Python's stdlib ``random.Random(seed)``,
22
+ not NumPy. SPEC 7's typing (``RNGLike = np.random.Generator | ...``) is
23
+ strictly NumPy-scoped.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ from collections.abc import Sequence
29
+
30
+ import numpy as np
31
+
32
+ type SeedLike = int | np.integer | Sequence[int] | np.random.SeedSequence
33
+ """Anything that can seed a NumPy bit generator.
34
+
35
+ Per SPEC 7, ``np.random.default_rng`` accepts any of these as a seed
36
+ without further conversion. ``Sequence[int]`` is the entropy-vector form
37
+ used by ``np.random.SeedSequence``.
38
+ """
39
+
40
+ type RNGLike = np.random.Generator | np.random.BitGenerator
41
+ """An already-instantiated NumPy bit generator or generator wrapper.
42
+
43
+ ``np.random.default_rng(rng)`` is the identity function on
44
+ ``Generator`` inputs and lifts ``BitGenerator`` inputs into a
45
+ ``Generator`` — both forms compose cleanly.
46
+ """