eval-toolkit 0.50.0__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (183) hide show
  1. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/.gitignore +8 -0
  2. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/CHANGELOG.md +261 -0
  3. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/PKG-INFO +40 -43
  4. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/README.md +39 -42
  5. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/_rng.py +28 -9
  6. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/_sweep.py +31 -3
  7. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/_version.py +1 -1
  8. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/analysis.py +24 -1
  9. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/bootstrap.py +43 -4
  10. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/calibration.py +29 -0
  11. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/claims.py +19 -1
  12. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/harness.py +84 -5
  13. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/losses.py +10 -0
  14. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/metric_specs.py +7 -0
  15. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/protocols.py +19 -1
  16. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/splits.py +38 -5
  17. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/thresholds.py +49 -19
  18. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/golden/public_api/snapshot.json +2 -2
  19. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_analysis.py +20 -0
  20. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_bootstrap_calibration_mc.py +12 -5
  21. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_bootstrap_unit.py +67 -0
  22. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_calibration_unit.py +18 -0
  23. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_harness_folded.py +76 -0
  24. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_harness_parallelism.py +77 -0
  25. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_losses.py +14 -0
  26. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_protocol_conformance.py +46 -0
  27. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_recall_at_fpr.py +45 -0
  28. eval_toolkit-1.0.0/tests/test_rng.py +97 -0
  29. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_scorecard.py +12 -0
  30. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_sweep.py +54 -0
  31. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/LICENSE +0 -0
  32. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/STYLE.md +0 -0
  33. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/docs/archive/README.md +0 -0
  34. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/docs/research/README.md +0 -0
  35. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/docs/research/datasets/README.md +0 -0
  36. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/docs/research/papers/data-integrity/README.md +0 -0
  37. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  38. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/docs/research/papers/inference/README.md +0 -0
  39. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/docs/research/papers/prompt-injection/README.md +0 -0
  40. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/docs/source/adr/README.md +0 -0
  41. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/docs/source/methodology/README.md +0 -0
  42. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/pyproject.toml +0 -0
  43. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/__init__.py +0 -0
  44. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/__main__.py +0 -0
  45. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/_deprecated.py +0 -0
  46. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/_parallel.py +0 -0
  47. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/adversarial.py +0 -0
  48. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/artifacts.py +0 -0
  49. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/config.py +0 -0
  50. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/docs.py +0 -0
  51. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/embeddings.py +0 -0
  52. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/evidence.py +0 -0
  53. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/leakage.py +0 -0
  54. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/loaders.py +0 -0
  55. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/manifest.py +0 -0
  56. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/metrics.py +0 -0
  57. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/operating_points.py +0 -0
  58. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/paths.py +0 -0
  59. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/plotting.py +0 -0
  60. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/preprocessing.py +0 -0
  61. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/probes.py +0 -0
  62. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/provenance.py +0 -0
  63. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/py.typed +0 -0
  64. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  65. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  66. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  67. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/schemas/ood_manifest.v1.json +0 -0
  68. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  69. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  70. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/scorecards.py +0 -0
  71. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/seeds.py +0 -0
  72. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/stacking.py +0 -0
  73. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/src/eval_toolkit/text_dedup.py +0 -0
  74. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  75. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  76. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  77. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  78. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  79. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  80. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  81. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  82. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  83. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  84. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/benchmarks/__init__.py +0 -0
  85. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  86. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/conftest.py +0 -0
  87. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  88. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  89. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  90. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  91. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/golden/docs/expected.md +0 -0
  92. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/golden/docs/input.md +0 -0
  93. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/golden/docs/metrics.json +0 -0
  94. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  95. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/strategies.py +0 -0
  96. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_adversarial.py +0 -0
  97. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_artifacts.py +0 -0
  98. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  99. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_bootstrap_edge_cases.py +0 -0
  100. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_bootstrap_golden.py +0 -0
  101. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_bootstrap_njobs.py +0 -0
  102. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_bootstrap_props.py +0 -0
  103. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_bootstrap_research_grounded.py +0 -0
  104. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_calibration_binary_adapters.py +0 -0
  105. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  106. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_calibration_determinism.py +0 -0
  107. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_calibration_optimization_failures.py +0 -0
  108. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_calibration_props.py +0 -0
  109. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_calibration_research_grounded.py +0 -0
  110. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_claims.py +0 -0
  111. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_claims_coverage.py +0 -0
  112. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_claims_props.py +0 -0
  113. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_cli.py +0 -0
  114. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_config.py +0 -0
  115. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_coverage_bootstrap.py +0 -0
  116. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_coverage_calibration.py +0 -0
  117. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_coverage_harness.py +0 -0
  118. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_coverage_metrics.py +0 -0
  119. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_coverage_plotting.py +0 -0
  120. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_croissant_e2e.py +0 -0
  121. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  122. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_deprecated_scalars_shim.py +0 -0
  123. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_deprecations.py +0 -0
  124. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_docs_golden.py +0 -0
  125. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_docs_props.py +0 -0
  126. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_embeddings.py +0 -0
  127. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_evidence_validators.py +0 -0
  128. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_harness_edge_cases.py +0 -0
  129. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_harness_fault_injection.py +0 -0
  130. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_harness_internals.py +0 -0
  131. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_harness_metric_options.py +0 -0
  132. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_harness_smoke.py +0 -0
  133. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_import_boundaries.py +0 -0
  134. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_is_metric_defined_for_slice.py +0 -0
  135. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_lazy_extras_messages.py +0 -0
  136. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_leakage.py +0 -0
  137. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_leakage_error_paths.py +0 -0
  138. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_leakage_props.py +0 -0
  139. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_loaders.py +0 -0
  140. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_loaders_coverage.py +0 -0
  141. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_loaders_props.py +0 -0
  142. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_logging.py +0 -0
  143. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_manifest.py +0 -0
  144. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  145. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_manifest_props.py +0 -0
  146. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_manifest_validation.py +0 -0
  147. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_metrics_props.py +0 -0
  148. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_metrics_stratified_subsets.py +0 -0
  149. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_metrics_unit.py +0 -0
  150. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_misc_coverage.py +0 -0
  151. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_numeric_edge_cases.py +0 -0
  152. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_ood_loader.py +0 -0
  153. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_operating_points.py +0 -0
  154. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_operating_points_props.py +0 -0
  155. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_parallel.py +0 -0
  156. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_paths.py +0 -0
  157. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_pipeline_e2e.py +0 -0
  158. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_plotting_edge.py +0 -0
  159. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_plotting_smoke.py +0 -0
  160. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_plotting_visual.py +0 -0
  161. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_preprocessing.py +0 -0
  162. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_probes.py +0 -0
  163. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_provenance.py +0 -0
  164. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_public_api.py +0 -0
  165. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_reference_equivalence.py +0 -0
  166. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_reproducibility_integration.py +0 -0
  167. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_schemas.py +0 -0
  168. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_seeds.py +0 -0
  169. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_splits.py +0 -0
  170. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_splits_leakage_integration.py +0 -0
  171. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_splits_props.py +0 -0
  172. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_stacking.py +0 -0
  173. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_text_dedup.py +0 -0
  174. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_text_dedup_coverage.py +0 -0
  175. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_text_dedup_props.py +0 -0
  176. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_text_dedup_strategies.py +0 -0
  177. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_thresholds.py +0 -0
  178. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_thresholds_constant_score.py +0 -0
  179. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_thresholds_coverage.py +0 -0
  180. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_thresholds_props.py +0 -0
  181. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_thresholds_research_grounded.py +0 -0
  182. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_tokenization_leakage_check.py +0 -0
  183. {eval_toolkit-0.50.0 → eval_toolkit-1.0.0}/tests/test_v09_contracts.py +0 -0
@@ -54,6 +54,14 @@ mutants/
54
54
  gate3-audit-prompt.md
55
55
  gate3-audit-report.md
56
56
  gate3-audit-round-*-report.md
57
+ # R8-C10 audit fix: extend to cover the comprehensive-audit-* and
58
+ # audit-verification-* naming conventions introduced at v0.50/v0.51.
59
+ codex-comprehensive-audit-*-report.md
60
+ codex-microaudit-*.md
61
+ gemini-microaudit-*.md
62
+ audit-gemini.md
63
+ comprehensive-audit-codex.md
64
+ audit-verification-*.md
57
65
 
58
66
  # Claude Code project settings (machine-local)
59
67
  .claude/
@@ -5,6 +5,267 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.0.0] — 2026-05-25 — Stability contract activates per ADR 0003
9
+
10
+ v1.0 is a **stability-contract activation**, not a code delta from v0.51.
11
+ Every fix that landed at v0.51 is what v1.0 ships; the new thing at v1.0
12
+ is that the [ADR 0003](docs/source/adr/0003-stability-contract-and-gate3-methodology.md)
13
+ Tier 1 / Tier 2 / Tier 3 stability contract becomes load-bearing.
14
+ Breaking changes to Tier-1 surfaces after v1.0 require a major bump (v2.0).
15
+
16
+ ### Contract activation
17
+
18
+ - **Tier 1 STRICT** — public-API signatures captured in
19
+ `tests/golden/public_api/snapshot.json`. Any signature drift bumps to v2.0.
20
+ - **Tier 2 ADDITIVE** — the 9 strict Protocols (`Scorer`, `LeakageCheck`,
21
+ `Splitter`, `ThresholdSelector`, `DatasetLoader`, `MetricSpec`,
22
+ `MetaLearner`, `Probe`, `TextTransform`) + 1 opt-in (`Versioned`).
23
+ Method shapes are frozen; additive subprotocols + new Protocols allowed.
24
+ - **Tier 3 FREE** — internal modules (prefixed `_`). Refactors don't
25
+ bump major.
26
+
27
+ ### Gate 3 audit closure (Rounds 5 → 10)
28
+
29
+ The multi-LLM cross-review sequence is closed. Per ADR 0003, Gate 3
30
+ substitutes Codex + Gemini + Claude independent reads for external
31
+ academic peer review. Outcomes:
32
+
33
+ - **Round 8** (against v0.50): 13 confirmed → fixed in v0.51; 3 refuted;
34
+ 2 v1.x-deferred (R8-G3 custom exceptions; R8-G4 joblib OOM capping).
35
+ - **Round 9** (against v0.51 RC): 6 confirmed of 10 source items + 3
36
+ third-audit findings in modules neither auditor cited. 2 candidate-
37
+ blocker items fixed in-PR before tag.
38
+ - **Round 10** (micro-audit on R9 follow-on commit): 3 Codex confirmed →
39
+ fixed in v0.51; 1 accept-as-design (Gemini); 1 refuted (Gemini
40
+ Pattern-1 violation).
41
+
42
+ Full ledger at `docs/source/audit_findings.md` Rounds 5 → 10. v1.0.1
43
+ cleanup batch tracked at GH issue #76.
44
+
45
+ ### Carried-over deprecations
46
+
47
+ The R8-C1 `DeprecationWarning` on multi-seed `evaluate_folded(seeds=...)`
48
+ calls without an explicit `reseed_splitter` callback **persists past v1.0
49
+ by design** (pre-v1.0 deprecation window is one minor; `DEPRECATION.md`
50
+ requires ≥2 minors to close a cycle). Single-seed callers see no change;
51
+ multi-seed callers should pass `reseed_splitter` for true seed variance.
52
+
53
+ ### Migration
54
+
55
+ If your consumer is on v0.51, nothing changes. If on v0.50 or earlier,
56
+ follow [`docs/source/migration/v0.51.md`](docs/source/migration/v0.51.md)
57
+ for the actual migration steps. Downstream projects should pin
58
+ `eval-toolkit>=1.0,<2.0` to opt into the stability contract.
59
+
60
+ ## [0.51.0] — 2026-05-24 — Round 8 rectification batch
61
+
62
+ The 18-item rectification batch following the Round 8 multi-LLM audit
63
+ (Codex + Gemini reports verified at
64
+ `audit-verification-codex-gemini-v0.50.0.md`, 2026-05-24). Per Decision
65
+ Y.2 + the staggered-pre-v1.0 plan, v0.51.0 is a BREAKING-allowed
66
+ minor bundling all fixes before v1.0 is tagged. The Round 9 audit runs against
67
+ the v0.51 RC.
68
+
69
+ **Audit outcome**: 13 confirmed → fixed in this release; 2 deferred
70
+ (R8-G3 custom exceptions, R8-G4 joblib OOM capping) to v1.x as Tier-2
71
+ additive; 3 refuted (R8-G2 cyclic-import framing; R8-G5 cherry-picked
72
+ weak test; R8-V1 + R8-V2 over-confident Gemini validations). See
73
+ `docs/source/audit_findings.md` Round 8 section for the full ledger.
74
+
75
+ **Round 9 follow-on**: a Round 9 multi-LLM cross-review (Codex + Gemini)
76
+ ran against the v0.51 RC pre-tag. Verified by Claude at
77
+ `audit-verification-round-9-v0.51.0.md` (6 confirmed / 3 refuted / 1
78
+ partial; plus 3 third-audit fixes in modules neither auditor cited).
79
+ **Two third-audit findings + one source-report regression fix shipped
80
+ in this RC pre-tag** (commit-graph below); the remaining 4 deferred
81
+ items go to v1.0.1. See `audit_findings.md` Round 9 section for the
82
+ full ledger.
83
+
84
+ ### Added (Round 9 follow-on)
85
+
86
+ - **R9-F-sweep-1** (CANDIDATE v1.0 BLOCKER closed) — `_sweep.py:
87
+ _validate_scorer_output()` now validates scorer output is finite
88
+ (no NaN / +inf / -inf), not just shape. Pre-R9 follow-on, NaN/inf
89
+ scores passed R7-C's shape check and silently propagated into the
90
+ sweep DataFrame, then silently zeroed the ASR flag (NaN >= threshold
91
+ is False). Closes the "no silent failures" invariant gap R7-C
92
+ established for shape but didn't extend to finiteness. Brings sweep
93
+ validation to parity with `stacking.py`'s `_validate_fit_inputs` /
94
+ `_validate_predict_inputs`. Tier-2 additive — callers whose scorers
95
+ were silently producing NaN now get a clear `ValueError` with
96
+ diagnostic context.
97
+
98
+ - **R9-F-bootstrap-1** — `bootstrap.bootstrap_ci(...)` emits a
99
+ `UserWarning` when scipy's BCa method degenerates (returns
100
+ `ci_low == ci_high == point` or non-finite bounds). Pre-R9, the
101
+ R8-C4(b) RNG bug spuriously varied bootstrap streams and could mask
102
+ BCa degeneracy on small-n + ceiling/floor-metric inputs; post-R8 with
103
+ correct RNG, the brittleness is exposed. Warning text recommends
104
+ `method='percentile'` as the safer fallback at small n. The default
105
+ remains `method='BCa'` (preserves bit-stability for non-degenerate
106
+ cases); auto-fallback is deferred to v1.0.1 if there is user demand.
107
+
108
+ - **R9-F-bootstrap-2** — `bootstrap.mde_from_ci(...)` now explicitly
109
+ rejects NaN CI width with `RuntimeError`. Pre-R9, NaN width
110
+ (possible when scipy BCa returns NaN bounds) bypassed the
111
+ `if width <= 0` check (NaN <= 0 is False in IEEE float) and
112
+ silently returned `MDEEstimate.mde = NaN`. Bundled with F-bootstrap-1.
113
+
114
+ ### Fixed (Round 10 follow-on)
115
+
116
+ Pre-tag scoped Codex + Gemini micro-audit on `edadddc` surfaced 3
117
+ Codex-confirmed findings (all fix-recommended / minor; no v1.0
118
+ blockers). Verified by Claude; 1 Gemini accept-as-design + 1 Gemini
119
+ refuted (Pattern-1 violation; calibration record in
120
+ `audit_findings.md` Round 10 section). All 3 confirmed findings
121
+ shipped in this RC pre-tag:
122
+
123
+ - **R10-F1** — `protocols.Scorer.predict_proba` docstring + `_sweep.py`
124
+ error message clarification. Pre-R10, `_validate_scorer_output`'s
125
+ runtime error said "finite floats in [0, 1]" but the boundary check
126
+ only enforced finiteness (no range validation); the Scorer Protocol
127
+ docstring also lacked an explicit `[0, 1]` contract statement. R10-F1
128
+ extends the Protocol docstring to document calibrated-probability
129
+ semantics and rewords the sweep runtime message to drop the unenforced
130
+ `[0, 1]` claim. Range enforcement is intentionally deferred to a
131
+ future minor once consumer usage patterns clarify whether the
132
+ Protocol should be strict (`[0, 1]`) or permissive (ranking scores).
133
+
134
+ - **R10-F2** — `tests/test_bootstrap_unit.py::test_bootstrap_ci_bca_degeneracy_emits_warning`
135
+ test predicate hardening. Pre-R10, the test's assertion block used
136
+ `if ci.ci_low == ci.ci_high == ci.point_estimate:` — but NaN==NaN is
137
+ False in IEEE float, so the assertions were silently skipped on the
138
+ current scipy fixture (which returns NaN bounds). The test passed
139
+ WITHOUT proving the warning fires for the common degeneracy mode.
140
+ R10-F2 mirrors the production predicate exactly:
141
+ `(not np.isfinite(low)) or (not np.isfinite(high)) or (low == high == point)`.
142
+ The assertion block now runs whenever ANY degeneracy mode fires.
143
+
144
+ - **R10-F3** — `bootstrap.mde_from_ci` docstring update for the
145
+ R9-F-bootstrap-2 non-finite-width branch. Pre-R10, the Raises section
146
+ said "non-positive width" only; the implementation has also been
147
+ rejecting non-finite width since `edadddc` but the docstring lagged.
148
+ R10-F3 updates the Raises text to "non-positive or non-finite width"
149
+ and adds a 4-line note explaining the scipy-BCa NaN-bound motivation
150
+ so callers understand the new behavior is intentional, not incidental.
151
+
152
+ ### Added
153
+
154
+ - **R8-C6** — `calibration.reliability_curve(...)` and
155
+ `calibration.maximum_calibration_error(...)` now call
156
+ `_validate_calibrated_score(y_score)` BEFORE the sklearn dispatch.
157
+ Pre-v0.51 these functions silently accepted raw logits (any range);
158
+ sibling `metrics.expected_calibration_error*` variants already
159
+ validated input range via the same helper. Now symmetric — out-of-range
160
+ scores raise `ValueError` with the same actionable diagnostic. Also,
161
+ `calibration.fit_temperature(...)` now validates the `bounds` tuple
162
+ (finiteness, positivity, `lo < hi`) BEFORE forwarding to
163
+ `scipy.optimize.minimize_scalar` — cryptic optimizer errors replaced
164
+ with actionable input-validation errors.
165
+
166
+ - **R8-F1** — `losses.RecallAtLowFPR.__init__(...)` now validates
167
+ `pos_weight > 0` at construction time, matching the sibling-kwarg
168
+ validators for `fpr_target` and `fpr_smoothing_beta`. Pre-v0.51
169
+ non-positive `pos_weight` produced degenerate-but-bounded loss
170
+ values silently.
171
+
172
+ - **R8-F2** — `metric_specs.ece(n_bins=, strategy=)` factory now validates
173
+ `n_bins` eagerly at spec-construction time (matches the eager
174
+ `strategy` validation already present). Pre-v0.51 `n_bins`
175
+ validation was deferred to compute time.
176
+
177
+ - **R8-F3** — `analysis.CsvPredictionReader.read_predictions(...)` now
178
+ detects missing CSV columns at read time and raises a
179
+ `ValueError(f"CSV file at {uri!r} is missing required column(s) ...")`
180
+ with the file path + available columns. Pre-v0.51 missing columns
181
+ were silently filled with empty strings, causing cryptic
182
+ `ValueError: invalid literal for int() with base 10: ''` downstream
183
+ in `load_prediction_arrays`'s dtype conversion. Root cause now
184
+ surfaces at the boundary.
185
+
186
+ - **R8-C1** — `harness.evaluate_folded(...)` now accepts an optional
187
+ `reseed_splitter: Callable[[Splitter, int], Splitter] | None`
188
+ callback. When provided, each seed iteration calls
189
+ `reseed_splitter(splitter, seed)` to produce a fresh splitter for
190
+ that seed's fold iteration. Default `None` preserves the historical
191
+ behavior (the same splitter instance is reused across the seed loop,
192
+ so multi-seed × CV only varies the bootstrap RNG, not fold
193
+ partitions) AND emits a `DeprecationWarning` when `len(seeds) > 1`.
194
+ The warning persists past v1.0 because the pre-v1.0 deprecation
195
+ window (v0.51 → v1.0) is one minor and ADR 0003 / DEPRECATION.md
196
+ require ≥2 minors to close a cycle. Migration example::
197
+
198
+ from dataclasses import replace
199
+ evaluate_folded(
200
+ scorers, splitter, slice_,
201
+ seeds=(1, 2, 3),
202
+ reseed_splitter=lambda sp, s: replace(sp, seed=s),
203
+ ...
204
+ )
205
+
206
+ R8-C1 audit fix.
207
+
208
+ ### BREAKING
209
+
210
+ - **R8-C2** — `SourceDisjointKFoldSplitter.iter_folds(...)` now caps
211
+ the fold count at `min(self.k, n_sources)` (matching
212
+ `get_n_splits(...)`). Pre-v0.51 the loop ran `range(self.k)` and
213
+ yielded EMPTY test partitions for the surplus folds when
214
+ `k > n_sources` while `get_n_splits` returned `min(k, n_sources)`
215
+ — the two methods silently disagreed on fold count. v0.51 caps both
216
+ at the same value AND emits a `UserWarning` when `k > n_sources` so
217
+ the caller knows the cap was applied. Callers that consumed the
218
+ surplus empty-test folds will see fewer iterations now; that was
219
+ the bug. (Probe-verified at
220
+ `audit-verification-codex-gemini-v0.50.0.md`.)
221
+
222
+ - **R8-C3** — `thresholds.recall_at_fpr(...)` fallback semantics changed
223
+ when no threshold satisfies `target_fpr`. Pre-v0.51 the fallback set
224
+ `threshold = 1.0` and then computed `y_pred = (y_score >= 1.0)` —
225
+ inclusive comparator — which classified any negative-class sample
226
+ with score exactly 1.0 as predicted-positive. The probe
227
+ `recall_at_fpr(y=[0,1], scores=[1.0,1.0], target_fpr=0.0)` returned
228
+ `actual_fpr=1.0, fp=1` in silent violation of the function's
229
+ FPR-ceiling invariant. v0.51 returns a SENTINEL
230
+ `RecallAtFprResult(threshold=np.inf, recall=0.0, actual_fpr=0.0,
231
+ fp=0, tn=n_val_neg)` whenever the constraint is unsatisfiable.
232
+ Callers detect via `np.isinf(result.threshold)`. The
233
+ `actual_fpr ≤ target_fpr` invariant is now preserved by construction.
234
+ Migration: any caller filtering on `result.threshold` should add an
235
+ `np.isinf(...)` branch — pre-v0.51 the sentinel value was `1.0`.
236
+ (Verified at `audit-verification-codex-gemini-v0.50.0.md`.)
237
+
238
+ - **R8-C4(a)** — `harness.evaluate(...)` with a `Generator`-typed `rng`
239
+ is now bit-stable across `n_jobs` values. Prior to v0.51, the same
240
+ `rng` object was attached to every `(slice, scorer)` work_unit;
241
+ joblib forked copies at the SAME generator state into N parallel
242
+ workers, so every worker used identical bootstrap sample streams —
243
+ silently producing non-independent CIs across `(slice, scorer)`
244
+ pairs in parallel mode and divergent results vs sequential mode.
245
+ The v0.51 implementation spawns one independent `SeedSequence` per
246
+ work unit at the dispatch boundary in `_score_all_slices` (depends
247
+ on the R8-C4(b) `spawn_seed_sequences` fix). Each pair now sees an
248
+ independent bootstrap stream; sequential (`n_jobs=1`) and parallel
249
+ (`n_jobs>1`) modes produce bit-identical CIs per the SPEC 7
250
+ contract at `docs/source/methodology/parallelism.md`. Integer `rng`
251
+ callers (the common case) are unaffected. (Verified by multi-slice
252
+ probe at `audit-verification-codex-gemini-v0.50.0.md`.)
253
+
254
+ - **R8-C4(b)** — `eval_toolkit._rng.spawn_seed_sequences(rng, n)` now
255
+ respects `Generator` state. Prior to v0.51, the function extracted
256
+ the bit-generator's seed_seq and called `.spawn(n)` on it — so a
257
+ `Generator` advanced by prior draws produced the same children as a
258
+ fresh `Generator` with the same construction seed. The new
259
+ implementation draws `n` fresh entropy values FROM the generator
260
+ via `rng.integers(0, 2**63-1, size=n)` and wraps each in a
261
+ `SeedSequence`. Each call advances generator state, so repeated
262
+ calls on the same instance yield different children. This was the
263
+ root cause of bootstrap non-independence across `(slice, scorer)`
264
+ pairs in `harness.evaluate` — when the same `Generator` was shared
265
+ across bootstrap callsites, all callsites silently used the same
266
+ resample stream. (Probe-verified at
267
+ `audit-verification-codex-gemini-v0.50.0.md`.)
268
+
8
269
  ## [0.50.0] — 2026-05-23 — SPEC 7 `rng` parameter adoption
9
270
 
10
271
  The SPEC 7 follow-up to v0.49.0. The `_rng.py` scaffold shipped at
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.50.0
3
+ Version: 1.0.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -114,7 +114,8 @@ format changes.
114
114
  │ gpu_info + leakage_report (NeurIPS-aligned) │
115
115
  ├─ Tier 2 ─ Protocol-based orchestration ────────────────┤
116
116
  │ Scorer / SliceAwareScorer / LeakageCheck / Splitter │
117
- │ ThresholdSelector / DatasetLoader / SimilarityStrategy
117
+ │ ThresholdSelector / DatasetLoader / MetricSpec
118
+ │ MetaLearner / Probe / TextTransform (9 strict) │
118
119
  │ Versioned (opt-in: per-object versions in manifest) │
119
120
  ├─ Tier 1 ─ Functional core ─────────────────────────────┤
120
121
  │ pr_auc / roc_auc / ECE variants / Brier / bootstrap_ci│
@@ -129,69 +130,65 @@ run: capture the manifest.
129
130
 
130
131
  ## Documentation
131
132
 
132
- - **[Getting started](docs/getting-started.md)** — end-to-end
133
+ - **[Getting started](docs/source/getting-started.md)** — end-to-end
133
134
  walkthrough for new users: install, define a Scorer, build slices,
134
135
  run `evaluate()`, persist results, add a claim, render a plot.
135
- - **[Methodology curriculum](docs/methodology/README.md)** — 16
136
+ - **[Methodology curriculum](docs/source/methodology/README.md)** — 16
136
137
  chapters on splits, metrics, calibration, evidence gates,
137
138
  prediction artifacts, and more.
138
- - **[Schema reference](docs/schemas.md)** — field-by-field semantics
139
+ - **[Schema reference](docs/source/schemas.md)** — field-by-field semantics
139
140
  for `results.v1.json`, `results_full.v1.json`, `manifest.v1.json`.
140
- - **[Migration guides](docs/MIGRATION.md)** — v0.6→v0.7, v0.7→v0.8,
141
- v0.8→v0.9.
142
- - **[Extending](docs/extending.md)** — Protocol-by-Protocol guide for
141
+ - **[Migration guides](docs/source/MIGRATION.md)** — per-version migration
142
+ hub (v0.7 onward).
143
+ - **[Extending](docs/source/extending.md)** — Protocol-by-Protocol guide for
143
144
  custom Scorers, Splitters, LeakageChecks, ThresholdSelectors,
144
145
  DatasetLoaders, EvidenceGates.
145
- - **[Repo strategy](docs/repo-strategy.md)** — how the package is
146
- organized, the 6-bucket target shape, and the checklist that
147
- governs when to extract a sub-package into its own repo.
146
+ - **[Repo strategy](docs/source/repo-strategy.md)** — how the package is
147
+ organized, the flat-module layout per ADR 0001, and the v2.0 trigger
148
+ criteria for any future subpackage split.
148
149
 
149
150
  ## Methodology
150
151
 
151
152
  What good binary-classification evaluation looks like, with each
152
153
  concern mapped to the toolkit primitive that operationalizes it.
153
154
 
154
- - [`docs/methodology/`](docs/methodology/README.md) — the curriculum
155
- (16 chapters). Recommended reading order:
156
- [`leakage`](docs/methodology/leakage.md) →
157
- [`splits`](docs/methodology/splits.md) →
158
- [`thresholds`](docs/methodology/thresholds.md) →
159
- [`calibration`](docs/methodology/calibration.md) →
160
- [`comparison`](docs/methodology/comparison.md) →
161
- [`bootstrap`](docs/methodology/bootstrap.md) →
162
- [`length_stratification`](docs/methodology/length_stratification.md) →
163
- [`text_dedup`](docs/methodology/text_dedup.md) →
164
- [`versioning`](docs/methodology/versioning.md) →
165
- [`fairness`](docs/methodology/fairness.md) →
166
- [`reproducibility`](docs/methodology/reproducibility.md) →
167
- [`testing`](docs/methodology/testing.md) →
168
- [`reading_list`](docs/methodology/reading_list.md).
169
- - [`docs/MIGRATION.md`](docs/MIGRATION.md) — per-version migration
170
- guides (v0.6→v0.7, v0.7→v0.8).
171
- - [`docs/roadmap.md`](docs/roadmap.md) — forward-looking tracker;
172
- v1.0.0 path; consumer gap-doc cross-links.
155
+ - [`docs/source/methodology/`](docs/source/methodology/README.md) — the
156
+ curriculum (16 chapters). Recommended reading order:
157
+ [`leakage`](docs/source/methodology/leakage.md) →
158
+ [`splits`](docs/source/methodology/splits.md) →
159
+ [`thresholds`](docs/source/methodology/thresholds.md) →
160
+ [`calibration`](docs/source/methodology/calibration.md) →
161
+ [`comparison`](docs/source/methodology/comparison.md) →
162
+ [`bootstrap`](docs/source/methodology/bootstrap.md) →
163
+ [`length_stratification`](docs/source/methodology/length_stratification.md) →
164
+ [`text_dedup`](docs/source/methodology/text_dedup.md) →
165
+ [`versioning`](docs/source/methodology/versioning.md) →
166
+ [`fairness`](docs/source/methodology/fairness.md) →
167
+ [`reproducibility`](docs/source/methodology/reproducibility.md) →
168
+ [`testing`](docs/source/methodology/testing.md) →
169
+ [`reading_list`](docs/source/methodology/reading_list.md).
170
+ - [`docs/source/MIGRATION.md`](docs/source/MIGRATION.md) — per-version
171
+ migration guides (v0.7 onward; v0.49 / v0.50 / v0.51 included as of
172
+ v0.51.0).
173
+ - [`docs/source/roadmap.md`](docs/source/roadmap.md) — forward-looking
174
+ tracker; v1.0.0 path; consumer gap-doc cross-links.
173
175
 
174
176
  ## Extending eval-toolkit
175
177
 
176
178
  How to plug your own scorers / leakage checks / splitters / loaders /
177
179
  threshold selectors into the harness.
178
180
 
179
- - [`docs/extending.md`](docs/extending.md) — Protocol-by-Protocol
181
+ - [`docs/source/extending.md`](docs/source/extending.md) — Protocol-by-Protocol
180
182
  guide, ~50-line full-harness recipe, project-layout pointer.
181
183
 
182
184
  ## Worked examples
183
185
 
184
- - [`docs/examples/prompt_injection_walkthrough.md`](docs/examples/prompt_injection_walkthrough.md)
185
- — End-to-end prompt-injection eval on a synthetic OWASP LLM01:2025
186
- fixture; cross-links to the
187
- [showcase repo](https://github.com/brandon-behring/prompt_injection_classifier_showcase)
188
- for the real Lakera PINT walkthrough.
189
- - [`docs/examples/pytorch_scorer_example.md`](docs/examples/pytorch_scorer_example.md)
190
- — HuggingFace transformer + LoRA `Scorer` adapter (batched inference,
191
- GPU/CPU placement, deterministic-mode setup).
192
- - [`docs/examples/claims_and_gates.md`](docs/examples/claims_and_gates.md)
193
- — Composing reference + custom `EvidenceGate`s into a `ClaimSpec` and
194
- running `evaluate_claims()` for release-time go/no-go checks.
186
+ - [`docs/source/examples/`](docs/source/examples/index.md) — Sphinx /
187
+ MyST-NB executable notebooks covering: the evaluation harness,
188
+ metrics + bootstrap, calibration, claims-and-gates, leakage
189
+ detection, cross-corpus contamination scanning, character-injection
190
+ adversarial sweeps, callable-embedder dedup, and the activation-delta
191
+ probe.
195
192
 
196
193
  ## Install
197
194
 
@@ -291,7 +288,7 @@ with tempfile.TemporaryDirectory() as run_dir:
291
288
  | `eval_toolkit.splits` | `Splitter` Protocol + 5 reference impls (holdout / stratified / group / source-disjoint / time-series) |
292
289
  | `eval_toolkit.loaders` | `DatasetLoader` Protocol + 4 reference impls (DataFrame / SingleSlice / ParquetGlob / HF datasets) with Croissant-compatible `describe()` |
293
290
  | `eval_toolkit.manifest` | `RunManifest` (NeurIPS-aligned) + source-role / guardrail metadata + `make_manifest` / `write_manifest` |
294
- | `eval_toolkit.claims` | `EvidenceGate` class (frozen dataclass: name + callable check + severity), reference gate factories (`required_metric_gate`, `minimum_slice_size_gate`, `metric_threshold_gate`, etc.), `evaluate_claims()`, and `ClaimReport` for claim-mode vs exploratory-mode checks. See [`docs/extending.md`](docs/extending.md) for writing custom gates and [`docs/examples/claims_and_gates.md`](docs/examples/claims_and_gates.md) for a worked end-to-end example. |
291
+ | `eval_toolkit.claims` | `EvidenceGate` class (frozen dataclass: name + callable check + severity), reference gate factories (`required_metric_gate`, `minimum_slice_size_gate`, `metric_threshold_gate`, etc.), `evaluate_claims()`, and `ClaimReport` for claim-mode vs exploratory-mode checks. See [`docs/source/extending.md`](docs/source/extending.md) for writing custom gates and [`docs/source/examples/claims_and_gates.md`](docs/source/examples/claims_and_gates.md) for a worked end-to-end example. |
295
292
  | `eval_toolkit.text_dedup` | `SimilarityStrategy` Protocol + 5 strategies (TF-IDF / hash / embedding / Jaccard / MinHash-LSH); `near_dedup` / `cross_dedup` orchestrators |
296
293
  | `eval_toolkit.plotting` | PR curves, reliability diagrams, confusion matrices, score histograms, lift CIs |
297
294
  | `eval_toolkit.provenance` | File hashing, run-directory layout, figure metadata sidecar |
@@ -31,7 +31,8 @@ format changes.
31
31
  │ gpu_info + leakage_report (NeurIPS-aligned) │
32
32
  ├─ Tier 2 ─ Protocol-based orchestration ────────────────┤
33
33
  │ Scorer / SliceAwareScorer / LeakageCheck / Splitter │
34
- │ ThresholdSelector / DatasetLoader / SimilarityStrategy
34
+ │ ThresholdSelector / DatasetLoader / MetricSpec
35
+ │ MetaLearner / Probe / TextTransform (9 strict) │
35
36
  │ Versioned (opt-in: per-object versions in manifest) │
36
37
  ├─ Tier 1 ─ Functional core ─────────────────────────────┤
37
38
  │ pr_auc / roc_auc / ECE variants / Brier / bootstrap_ci│
@@ -46,69 +47,65 @@ run: capture the manifest.
46
47
 
47
48
  ## Documentation
48
49
 
49
- - **[Getting started](docs/getting-started.md)** — end-to-end
50
+ - **[Getting started](docs/source/getting-started.md)** — end-to-end
50
51
  walkthrough for new users: install, define a Scorer, build slices,
51
52
  run `evaluate()`, persist results, add a claim, render a plot.
52
- - **[Methodology curriculum](docs/methodology/README.md)** — 16
53
+ - **[Methodology curriculum](docs/source/methodology/README.md)** — 16
53
54
  chapters on splits, metrics, calibration, evidence gates,
54
55
  prediction artifacts, and more.
55
- - **[Schema reference](docs/schemas.md)** — field-by-field semantics
56
+ - **[Schema reference](docs/source/schemas.md)** — field-by-field semantics
56
57
  for `results.v1.json`, `results_full.v1.json`, `manifest.v1.json`.
57
- - **[Migration guides](docs/MIGRATION.md)** — v0.6→v0.7, v0.7→v0.8,
58
- v0.8→v0.9.
59
- - **[Extending](docs/extending.md)** — Protocol-by-Protocol guide for
58
+ - **[Migration guides](docs/source/MIGRATION.md)** — per-version migration
59
+ hub (v0.7 onward).
60
+ - **[Extending](docs/source/extending.md)** — Protocol-by-Protocol guide for
60
61
  custom Scorers, Splitters, LeakageChecks, ThresholdSelectors,
61
62
  DatasetLoaders, EvidenceGates.
62
- - **[Repo strategy](docs/repo-strategy.md)** — how the package is
63
- organized, the 6-bucket target shape, and the checklist that
64
- governs when to extract a sub-package into its own repo.
63
+ - **[Repo strategy](docs/source/repo-strategy.md)** — how the package is
64
+ organized, the flat-module layout per ADR 0001, and the v2.0 trigger
65
+ criteria for any future subpackage split.
65
66
 
66
67
  ## Methodology
67
68
 
68
69
  What good binary-classification evaluation looks like, with each
69
70
  concern mapped to the toolkit primitive that operationalizes it.
70
71
 
71
- - [`docs/methodology/`](docs/methodology/README.md) — the curriculum
72
- (16 chapters). Recommended reading order:
73
- [`leakage`](docs/methodology/leakage.md) →
74
- [`splits`](docs/methodology/splits.md) →
75
- [`thresholds`](docs/methodology/thresholds.md) →
76
- [`calibration`](docs/methodology/calibration.md) →
77
- [`comparison`](docs/methodology/comparison.md) →
78
- [`bootstrap`](docs/methodology/bootstrap.md) →
79
- [`length_stratification`](docs/methodology/length_stratification.md) →
80
- [`text_dedup`](docs/methodology/text_dedup.md) →
81
- [`versioning`](docs/methodology/versioning.md) →
82
- [`fairness`](docs/methodology/fairness.md) →
83
- [`reproducibility`](docs/methodology/reproducibility.md) →
84
- [`testing`](docs/methodology/testing.md) →
85
- [`reading_list`](docs/methodology/reading_list.md).
86
- - [`docs/MIGRATION.md`](docs/MIGRATION.md) — per-version migration
87
- guides (v0.6→v0.7, v0.7→v0.8).
88
- - [`docs/roadmap.md`](docs/roadmap.md) — forward-looking tracker;
89
- v1.0.0 path; consumer gap-doc cross-links.
72
+ - [`docs/source/methodology/`](docs/source/methodology/README.md) — the
73
+ curriculum (16 chapters). Recommended reading order:
74
+ [`leakage`](docs/source/methodology/leakage.md) →
75
+ [`splits`](docs/source/methodology/splits.md) →
76
+ [`thresholds`](docs/source/methodology/thresholds.md) →
77
+ [`calibration`](docs/source/methodology/calibration.md) →
78
+ [`comparison`](docs/source/methodology/comparison.md) →
79
+ [`bootstrap`](docs/source/methodology/bootstrap.md) →
80
+ [`length_stratification`](docs/source/methodology/length_stratification.md) →
81
+ [`text_dedup`](docs/source/methodology/text_dedup.md) →
82
+ [`versioning`](docs/source/methodology/versioning.md) →
83
+ [`fairness`](docs/source/methodology/fairness.md) →
84
+ [`reproducibility`](docs/source/methodology/reproducibility.md) →
85
+ [`testing`](docs/source/methodology/testing.md) →
86
+ [`reading_list`](docs/source/methodology/reading_list.md).
87
+ - [`docs/source/MIGRATION.md`](docs/source/MIGRATION.md) — per-version
88
+ migration guides (v0.7 onward; v0.49 / v0.50 / v0.51 included as of
89
+ v0.51.0).
90
+ - [`docs/source/roadmap.md`](docs/source/roadmap.md) — forward-looking
91
+ tracker; v1.0.0 path; consumer gap-doc cross-links.
90
92
 
91
93
  ## Extending eval-toolkit
92
94
 
93
95
  How to plug your own scorers / leakage checks / splitters / loaders /
94
96
  threshold selectors into the harness.
95
97
 
96
- - [`docs/extending.md`](docs/extending.md) — Protocol-by-Protocol
98
+ - [`docs/source/extending.md`](docs/source/extending.md) — Protocol-by-Protocol
97
99
  guide, ~50-line full-harness recipe, project-layout pointer.
98
100
 
99
101
  ## Worked examples
100
102
 
101
- - [`docs/examples/prompt_injection_walkthrough.md`](docs/examples/prompt_injection_walkthrough.md)
102
- — End-to-end prompt-injection eval on a synthetic OWASP LLM01:2025
103
- fixture; cross-links to the
104
- [showcase repo](https://github.com/brandon-behring/prompt_injection_classifier_showcase)
105
- for the real Lakera PINT walkthrough.
106
- - [`docs/examples/pytorch_scorer_example.md`](docs/examples/pytorch_scorer_example.md)
107
- — HuggingFace transformer + LoRA `Scorer` adapter (batched inference,
108
- GPU/CPU placement, deterministic-mode setup).
109
- - [`docs/examples/claims_and_gates.md`](docs/examples/claims_and_gates.md)
110
- — Composing reference + custom `EvidenceGate`s into a `ClaimSpec` and
111
- running `evaluate_claims()` for release-time go/no-go checks.
103
+ - [`docs/source/examples/`](docs/source/examples/index.md) — Sphinx /
104
+ MyST-NB executable notebooks covering: the evaluation harness,
105
+ metrics + bootstrap, calibration, claims-and-gates, leakage
106
+ detection, cross-corpus contamination scanning, character-injection
107
+ adversarial sweeps, callable-embedder dedup, and the activation-delta
108
+ probe.
112
109
 
113
110
  ## Install
114
111
 
@@ -208,7 +205,7 @@ with tempfile.TemporaryDirectory() as run_dir:
208
205
  | `eval_toolkit.splits` | `Splitter` Protocol + 5 reference impls (holdout / stratified / group / source-disjoint / time-series) |
209
206
  | `eval_toolkit.loaders` | `DatasetLoader` Protocol + 4 reference impls (DataFrame / SingleSlice / ParquetGlob / HF datasets) with Croissant-compatible `describe()` |
210
207
  | `eval_toolkit.manifest` | `RunManifest` (NeurIPS-aligned) + source-role / guardrail metadata + `make_manifest` / `write_manifest` |
211
- | `eval_toolkit.claims` | `EvidenceGate` class (frozen dataclass: name + callable check + severity), reference gate factories (`required_metric_gate`, `minimum_slice_size_gate`, `metric_threshold_gate`, etc.), `evaluate_claims()`, and `ClaimReport` for claim-mode vs exploratory-mode checks. See [`docs/extending.md`](docs/extending.md) for writing custom gates and [`docs/examples/claims_and_gates.md`](docs/examples/claims_and_gates.md) for a worked end-to-end example. |
208
+ | `eval_toolkit.claims` | `EvidenceGate` class (frozen dataclass: name + callable check + severity), reference gate factories (`required_metric_gate`, `minimum_slice_size_gate`, `metric_threshold_gate`, etc.), `evaluate_claims()`, and `ClaimReport` for claim-mode vs exploratory-mode checks. See [`docs/source/extending.md`](docs/source/extending.md) for writing custom gates and [`docs/source/examples/claims_and_gates.md`](docs/source/examples/claims_and_gates.md) for a worked end-to-end example. |
212
209
  | `eval_toolkit.text_dedup` | `SimilarityStrategy` Protocol + 5 strategies (TF-IDF / hash / embedding / Jaccard / MinHash-LSH); `near_dedup` / `cross_dedup` orchestrators |
213
210
  | `eval_toolkit.plotting` | PR curves, reliability diagrams, confusion matrices, score histograms, lift CIs |
214
211
  | `eval_toolkit.provenance` | File hashing, run-directory layout, figure metadata sidecar |
@@ -26,7 +26,6 @@ Exceptions to the SPEC 7 convention — documented in STYLE.md §3a:
26
26
  from __future__ import annotations
27
27
 
28
28
  from collections.abc import Sequence
29
- from typing import cast
30
29
 
31
30
  import numpy as np
32
31
 
@@ -50,16 +49,36 @@ type RNGLike = np.random.Generator | np.random.BitGenerator
50
49
  def spawn_seed_sequences(rng: RNGLike | SeedLike | None, n: int) -> list[np.random.SeedSequence]:
51
50
  """Spawn ``n`` independent SeedSequences from any SPEC 7 ``rng`` input.
52
51
 
53
- Normalizes the input to a ``Generator``, then extracts the underlying
54
- ``SeedSequence`` via the bit-generator and spawns ``n`` children.
55
- The cast satisfies mypy strict: the ``seed_seq`` attribute on a
56
- concrete BitGenerator is a ``SeedSequence`` instance, but the type
57
- stub on ``BitGenerator.seed_seq`` returns the abstract
58
- ``ISeedSequence`` interface (which lacks ``spawn``).
52
+ Normalizes the input to a ``Generator``, then draws ``n`` random
53
+ non-negative 63-bit entropy values FROM the generator and wraps each in a fresh
54
+ ``SeedSequence``. The draws advance the generator's internal state
55
+ so subsequent calls to ``spawn_seed_sequences`` on the same
56
+ ``Generator`` instance produce different children.
59
57
 
60
58
  Used by the bootstrap parallel workers (which take spawned
61
59
  ``SeedSequence`` objects to seed their internal ``default_rng()`` calls).
60
+
61
+ Notes
62
+ -----
63
+ Prior to v0.51, this function extracted the
64
+ ``bit_generator.seed_seq`` from the input and called ``.spawn(n)``
65
+ on it. That implementation IGNORED Generator state — a ``Generator``
66
+ that had been advanced by prior draws produced the same children as
67
+ a fresh ``Generator`` with the same construction seed. Two callers
68
+ sharing one ``Generator`` therefore got identical bootstrap streams,
69
+ silently violating bootstrap independence across (slice, scorer)
70
+ pairs in ``harness.evaluate`` and across resamples in
71
+ ``bootstrap.bootstrap_ci``. The v0.51 implementation draws fresh
72
+ entropy from the generator on every call so spawning is
73
+ state-respecting; this also restores the SPEC 7 bit-for-bit identity
74
+ contract between sequential and parallel modes when the seeds are
75
+ fanned out at a single batch boundary.
62
76
  """
63
77
  gen = np.random.default_rng(rng)
64
- seed_seq = cast(np.random.SeedSequence, gen.bit_generator.seed_seq)
65
- return seed_seq.spawn(n)
78
+ # Draw n independent non-negative 63-bit entropy values FROM the generator.
79
+ # Each draw advances generator state, so repeated calls on the same
80
+ # Generator instance yield different children. Each entropy value seeds
81
+ # an independent SeedSequence (SeedSequence's bit-mixing guarantees
82
+ # downstream independence across the n children).
83
+ seeds = gen.integers(0, 2**63 - 1, size=n, dtype=np.int64)
84
+ return [np.random.SeedSequence(int(s)) for s in seeds]