eval-toolkit 0.33.0__tar.gz → 0.34.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/CHANGELOG.md +140 -0
  2. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/PKG-INFO +3 -1
  3. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/pyproject.toml +7 -1
  4. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/__init__.py +5 -0
  5. eval_toolkit-0.34.0/src/eval_toolkit/_parallel.py +129 -0
  6. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/_version.py +1 -1
  7. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/bootstrap.py +406 -120
  8. eval_toolkit-0.34.0/src/eval_toolkit/embeddings.py +108 -0
  9. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/manifest.py +32 -1
  10. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/thresholds.py +129 -0
  11. eval_toolkit-0.34.0/tests/golden/data/dedup_holdout.jsonl +51 -0
  12. eval_toolkit-0.34.0/tests/golden/data/dedup_holdout_expected.json +44 -0
  13. eval_toolkit-0.34.0/tests/golden/data/dedup_holdout_provenance.md +89 -0
  14. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/golden/public_api/snapshot.json +36 -9
  15. eval_toolkit-0.34.0/tests/golden/test_dedup_holdout_calibration.py +212 -0
  16. eval_toolkit-0.34.0/tests/test_block_bootstrap_on_folds.py +114 -0
  17. eval_toolkit-0.34.0/tests/test_bootstrap_njobs.py +165 -0
  18. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_bootstrap_unit.py +67 -0
  19. eval_toolkit-0.34.0/tests/test_embeddings.py +87 -0
  20. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_manifest.py +50 -0
  21. eval_toolkit-0.34.0/tests/test_parallel.py +150 -0
  22. eval_toolkit-0.34.0/tests/test_recall_at_fpr.py +97 -0
  23. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/.gitignore +0 -0
  24. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/LICENSE +0 -0
  25. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/README.md +0 -0
  26. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/STYLE.md +0 -0
  27. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/docs/archive/README.md +0 -0
  28. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/docs/research/README.md +0 -0
  29. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/docs/research/datasets/README.md +0 -0
  30. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/docs/research/papers/data-integrity/README.md +0 -0
  31. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  32. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/docs/research/papers/inference/README.md +0 -0
  33. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/docs/research/papers/prompt-injection/README.md +0 -0
  34. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/docs/source/methodology/README.md +0 -0
  35. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/__main__.py +0 -0
  36. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/_deprecated.py +0 -0
  37. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/analysis.py +0 -0
  38. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/artifacts.py +0 -0
  39. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/calibration.py +0 -0
  40. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/claims.py +0 -0
  41. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/config.py +0 -0
  42. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/docs.py +0 -0
  43. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/evidence.py +0 -0
  44. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/harness.py +0 -0
  45. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/leakage.py +0 -0
  46. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/loaders.py +0 -0
  47. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/metrics.py +0 -0
  48. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/operating_points.py +0 -0
  49. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/paths.py +0 -0
  50. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/plotting.py +0 -0
  51. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/protocols.py +0 -0
  52. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/provenance.py +0 -0
  53. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/py.typed +0 -0
  54. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  55. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  56. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  57. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  58. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  59. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/seeds.py +0 -0
  60. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/splits.py +0 -0
  61. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/text_dedup.py +0 -0
  62. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  63. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  64. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  65. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  66. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  67. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  68. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  69. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  70. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  71. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  72. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/benchmarks/__init__.py +0 -0
  73. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  74. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/conftest.py +0 -0
  75. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  76. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/golden/docs/expected.md +0 -0
  77. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/golden/docs/input.md +0 -0
  78. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/golden/docs/metrics.json +0 -0
  79. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/strategies.py +0 -0
  80. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_analysis.py +0 -0
  81. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_artifacts.py +0 -0
  82. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  83. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_bootstrap_edge_cases.py +0 -0
  84. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_bootstrap_golden.py +0 -0
  85. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_bootstrap_props.py +0 -0
  86. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_bootstrap_research_grounded.py +0 -0
  87. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  88. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_calibration_determinism.py +0 -0
  89. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_calibration_optimization_failures.py +0 -0
  90. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_calibration_props.py +0 -0
  91. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_calibration_research_grounded.py +0 -0
  92. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_calibration_unit.py +0 -0
  93. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_claims.py +0 -0
  94. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_claims_coverage.py +0 -0
  95. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_claims_props.py +0 -0
  96. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_cli.py +0 -0
  97. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_config.py +0 -0
  98. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_coverage_bootstrap.py +0 -0
  99. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_coverage_calibration.py +0 -0
  100. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_coverage_harness.py +0 -0
  101. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_coverage_metrics.py +0 -0
  102. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_coverage_plotting.py +0 -0
  103. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  104. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_deprecations.py +0 -0
  105. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_docs_golden.py +0 -0
  106. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_docs_props.py +0 -0
  107. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_evidence_validators.py +0 -0
  108. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_harness_edge_cases.py +0 -0
  109. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_harness_fault_injection.py +0 -0
  110. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_harness_folded.py +0 -0
  111. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_harness_internals.py +0 -0
  112. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_harness_metric_options.py +0 -0
  113. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_harness_smoke.py +0 -0
  114. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_import_boundaries.py +0 -0
  115. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_leakage.py +0 -0
  116. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_leakage_error_paths.py +0 -0
  117. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_leakage_props.py +0 -0
  118. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_loaders.py +0 -0
  119. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_loaders_coverage.py +0 -0
  120. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_loaders_props.py +0 -0
  121. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_logging.py +0 -0
  122. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  123. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_manifest_props.py +0 -0
  124. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_manifest_validation.py +0 -0
  125. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_metrics_props.py +0 -0
  126. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_metrics_stratified_subsets.py +0 -0
  127. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_metrics_unit.py +0 -0
  128. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_misc_coverage.py +0 -0
  129. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_numeric_edge_cases.py +0 -0
  130. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_operating_points.py +0 -0
  131. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_operating_points_props.py +0 -0
  132. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_paths.py +0 -0
  133. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_pipeline_e2e.py +0 -0
  134. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_plotting_edge.py +0 -0
  135. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_plotting_smoke.py +0 -0
  136. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_plotting_visual.py +0 -0
  137. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_protocol_conformance.py +0 -0
  138. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_provenance.py +0 -0
  139. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_public_api.py +0 -0
  140. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_reference_equivalence.py +0 -0
  141. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_reproducibility_integration.py +0 -0
  142. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_schemas.py +0 -0
  143. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_seeds.py +0 -0
  144. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_splits.py +0 -0
  145. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_splits_leakage_integration.py +0 -0
  146. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_splits_props.py +0 -0
  147. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_text_dedup.py +0 -0
  148. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_text_dedup_coverage.py +0 -0
  149. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_text_dedup_props.py +0 -0
  150. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_text_dedup_strategies.py +0 -0
  151. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_thresholds.py +0 -0
  152. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_thresholds_constant_score.py +0 -0
  153. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_thresholds_coverage.py +0 -0
  154. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_thresholds_props.py +0 -0
  155. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_thresholds_research_grounded.py +0 -0
  156. {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_v09_contracts.py +0 -0
@@ -7,6 +7,146 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.34.0] — 2026-05-17 — Phase 4 stats unblockers + unified parallelism + cookbook (BREAKING)
11
+
12
+ Closes all 7 open backlog issues in one consumer-unblocking release. Also
13
+ lands the toolkit's first unified parallelism story: a shared internal
14
+ `parallel_map` helper + `n_jobs` kwarg on all 5 public bootstrap
15
+ functions. Future iterations will mechanically extend the same helper to
16
+ harness + operating-points (follow-up issues filed).
17
+
18
+ ### Breaking changes
19
+
20
+ - **`eval_toolkit.bootstrap.mde_from_ci`**: parameter renamed from
21
+ `paired` to `ci` and type widened to `BootstrapCI | PairedBootstrapCI`
22
+ (was `PairedBootstrapCI`-only). Positional callers unaffected; keyword
23
+ callers must update:
24
+ ```python
25
+ mde_from_ci(paired=x) # v0.33.x and earlier
26
+ mde_from_ci(ci=x) # v0.34.0+
27
+ mde_from_ci(x) # positional form, unchanged
28
+ ```
29
+ This is a one-time exception to the repo's 2-minor-version deprecation
30
+ warning policy (justification + criteria recorded in
31
+ [`docs/source/DEPRECATION.md`](docs/source/DEPRECATION.md#one-time-exceptions-to-the-2-minor-version-warning-policy)).
32
+ Notification issues filed on the 2 known toolkit consumers
33
+ (`prompt-injection-detection-submission`, `post-transformers`). Audit
34
+ confirms both use positional form — zero actual breakage in practice.
35
+
36
+ ### Added
37
+
38
+ - `eval_toolkit.bootstrap.block_bootstrap_on_folds` — CV-aware sibling
39
+ to `cv_clt_ci`; resamples K folds with replacement; returns
40
+ `BootstrapCI(method="block_bootstrap")`. The A-008 sensitivity-check
41
+ pattern (block-bootstrap halfwidth / cv_clt halfwidth > 1.5 flags
42
+ LODO non-exchangeability) is the prototypical use. Closes #21.
43
+ - `eval_toolkit.RecallAtFprResult` (frozen dataclass) +
44
+ `eval_toolkit.recall_at_fpr(y_true, y_score, target_fpr)` — one-shot
45
+ recall + actual_fpr + FP/TN at the smallest threshold meeting FPR ≤
46
+ target. Use `.to_dict()` for JSON / pandas-row integration. Closes #9.
47
+ - New optional `n_jobs: int = 1` kwarg on 5 bootstrap functions:
48
+ `bootstrap_ci`, `paired_bootstrap_diff`, `paired_bootstrap_ece_diff`,
49
+ `paired_bootstrap_op_point_diff`, `paired_mde` (via `_bootstrap_t_ci`
50
+ internal helper). Backed by a new internal `_parallel.parallel_map`
51
+ helper (joblib loky; not exported). `n_jobs > 1` reproduces `n_jobs=1`
52
+ result bit-for-bit for the same seed (via
53
+ `np.random.SeedSequence.spawn`). Lambda metrics rejected at call time
54
+ with helpful `TypeError`. `n_jobs > os.cpu_count()` is auto-capped
55
+ with WARNING log; `n_jobs=0` raises `ValueError`. Closes #17.
56
+ - 6 new pages in `docs/source/examples/`:
57
+ - **Cookbook** (closes #19): `nested_seed_split.md`,
58
+ `callable_embedder_dedup.md`, `cross_corpus_contamination_scan.md`.
59
+ - **Plotting walkthroughs**: `plot_roc_curve_walkthrough.md`,
60
+ `plot_pareto_frontier_walkthrough.md`,
61
+ `plot_slice_metric_heatmap_walkthrough.md` (backfills the v0.33.0
62
+ docs gap).
63
+ - `docs/source/methodology/parallelism.md` — design rationale + caller
64
+ contract for the toolkit-wide parallelism story. Documents the 6
65
+ design principles (single backend, single helper, opt-in per-fn,
66
+ default sequential, reproducibility via SeedSequence, picklability
67
+ surface) and the checklist for adding `n_jobs` to a new function.
68
+
69
+ ### Changed
70
+
71
+ - `eval_toolkit.bootstrap.mde_from_ci` now accepts
72
+ `BootstrapCI | PairedBootstrapCI` (was paired-only). See **Breaking
73
+ changes** above. Closes #20.
74
+ - `eval_toolkit.build_manifest` gains `config_path: Path | str | None`
75
+ kwarg; when supplied, `config_hash` is computed as
76
+ `sha256(Path(config_path).read_bytes()).hexdigest()` — capturing the
77
+ exact YAML file bytes including comments + key ordering (which the
78
+ default canonical-JSON path strips during parse). Default behavior
79
+ preserved when `config_path is None`. Closes #10.
80
+
81
+ ### Internal
82
+
83
+ - New `src/eval_toolkit/_parallel.py` (internal; not exported) — single
84
+ source of truth for parallelism. Future per-function `n_jobs`
85
+ additions will reuse this helper. The toolkit's first INFO-level
86
+ log site is here (once-per-process guidance log when `n_jobs=1` AND
87
+ iteration count ≥ 1000). New `tests/test_parallel.py` covers smart-
88
+ default semantics + reproducibility contract.
89
+ - New golden test `tests/golden/test_dedup_holdout_calibration.py`
90
+ exercising 3 deterministic `SimilarityStrategy` variants against a
91
+ migrated 50-pair adversarial fixture at thresholds {0.75, 0.80, 0.85}
92
+ (strict snapshot at `tests/golden/data/dedup_holdout_expected.json`)
93
+ plus an `EmbeddingCosineStrategy` soft-bound check (FPR < 0.5,
94
+ FNR < 0.5 at threshold 0.80) gated by `pytest.importorskip` +
95
+ `@pytest.mark.slow`. Refresh helper at
96
+ `scripts/refresh_dedup_holdout.py`. Closes #18.
97
+ - `CONTRIBUTING.md` + `docs/source/repo-strategy.md` updated with
98
+ an explicit "Parallelism" section (parallelism was an implicit
99
+ anti-pattern before; v0.34.0 codifies the new opt-in design).
100
+ - `docs/source/DEPRECATION.md` extended with a "One-time exceptions"
101
+ section documenting the `mde_from_ci` rename + criteria future
102
+ exceptions must satisfy.
103
+ - RNG-stream note: the 5 wired bootstrap fns now derive per-resample
104
+ seeds via `np.random.SeedSequence(seed).spawn(n_resamples)` instead
105
+ of sequential calls on a single `Generator`. The bootstrap output is
106
+ *statistically equivalent* (both are valid bootstraps) but the exact
107
+ numerical CI bounds for the same caller-supplied `seed` will differ
108
+ slightly from v0.33.x. Existing tests use behavioral assertions
109
+ (`overlaps_zero`, `delta`, etc.) that are robust to the RNG-stream
110
+ change; tests that pin exact CI bounds (e.g., consumer golden tests)
111
+ may need regen on upgrade.
112
+
113
+ ## [0.33.1] — 2026-05-17 — MiniLM convenience embedder
114
+
115
+ Closes the last open item in the v0.33 milestone (deferred from v0.33.0
116
+ per the planned split). Ships the canonical semantic-dedup recipe
117
+ (`sentence-transformers/all-MiniLM-L6-v2` at cosine ≥ 0.80, per ADR-027)
118
+ pre-wired for `EmbeddingCosineStrategy` so consumers stop reinventing the
119
+ embedder-wrapping boilerplate.
120
+
121
+ No breaking changes. Public API gains 1 new export
122
+ (`make_minilm_embedder`) and 1 new optional dependency extra
123
+ (`[embeddings]`). Existing `EmbeddingCosineStrategy` callers that already
124
+ ship their own embedder are unaffected.
125
+
126
+ ### Added
127
+
128
+ - `eval_toolkit.embeddings.make_minilm_embedder` — factory returning a
129
+ `Callable[[Sequence[str]], np.ndarray]` that loads
130
+ `sentence-transformers/all-MiniLM-L6-v2` (configurable), memoises model
131
+ loads via `functools.lru_cache(maxsize=8)`, and emits `(n, 384)`
132
+ `float64` embeddings ready for `EmbeddingCosineStrategy`. Raises a
133
+ helpful `ImportError` with the install hint when the optional dep is
134
+ absent. Closes #3.
135
+ - New optional dependency extra `[embeddings]` →
136
+ `sentence-transformers>=3.0`. Intentionally **not** in `[all]` / `[dev]`
137
+ because the transitive `torch` install (~700MB) would balloon
138
+ contributor setup.
139
+
140
+ ### Internal
141
+
142
+ - `docs/source/api/embeddings.md` Sphinx page added (autosummary stub);
143
+ wired into the API toctree alongside the other module pages.
144
+ - `docs/source/api/plotting.md` autosummary backfilled with
145
+ `plot_roc_curve`, `plot_pareto_frontier`, `plot_slice_metric_heatmap`
146
+ (missed in v0.33.0).
147
+ - `tool.mypy.overrides` extended with `sentence_transformers.*` (matches
148
+ the existing pattern for untyped third-party libs).
149
+
10
150
  ## [0.33.0] — 2026-05-17 — Plotting batch + ax= parity + CI quality-of-life
11
151
 
12
152
  Consumer-unblocking release: closes the four upstream-gap TODOs in
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.33.0
3
+ Version: 0.34.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -60,6 +60,8 @@ Requires-Dist: sphinx-autodoc-typehints>=2.0; extra == 'docs'
60
60
  Requires-Dist: sphinx-copybutton>=0.5; extra == 'docs'
61
61
  Requires-Dist: sphinx-design>=0.6; extra == 'docs'
62
62
  Requires-Dist: sphinx>=7.3; extra == 'docs'
63
+ Provides-Extra: embeddings
64
+ Requires-Dist: sentence-transformers>=3.0; extra == 'embeddings'
63
65
  Provides-Extra: parquet
64
66
  Requires-Dist: pyarrow>=15.0; extra == 'parquet'
65
67
  Provides-Extra: plotting
@@ -50,6 +50,12 @@ plotting = ["matplotlib>=3.8", "pillow>=10.0"]
50
50
  property = ["hypothesis>=6.100"]
51
51
  yaml = ["pyyaml>=6.0"]
52
52
  parquet = ["pyarrow>=15.0"]
53
+ # v0.33.1: MiniLM convenience embedder for EmbeddingCosineStrategy.
54
+ # sentence-transformers transitively pulls torch + tokenizers (~700MB).
55
+ # Intentionally NOT in [all] / [dev] — opt-in only to keep contributor
56
+ # setup small. The canonical semantic-dedup recipe (all-MiniLM-L6-v2 +
57
+ # cosine@0.80) is what `make_minilm_embedder` pre-wires for callers.
58
+ embeddings = ["sentence-transformers>=3.0"]
53
59
  # DEPRECATED (announced v0.30.1, removal v0.33.0).
54
60
  #
55
61
  # Retained as a transitive no-op so `pip install eval-toolkit[validation]`
@@ -158,7 +164,7 @@ warn_no_return = true
158
164
  strict_equality = true
159
165
 
160
166
  [[tool.mypy.overrides]]
161
- module = ["scipy.*", "sklearn.*", "matplotlib.*", "pandas.*", "yaml.*"]
167
+ module = ["scipy.*", "sklearn.*", "matplotlib.*", "pandas.*", "yaml.*", "sentence_transformers.*", "joblib.*"]
162
168
  ignore_missing_imports = true
163
169
 
164
170
  [tool.pytest.ini_options]
@@ -62,6 +62,7 @@ _EXPORTS: dict[str, str] = {
62
62
  "PairedBootstrapCI": "eval_toolkit.bootstrap",
63
63
  "ThresholdedMetricFn": "eval_toolkit.bootstrap",
64
64
  "ThresholdFn": "eval_toolkit.bootstrap",
65
+ "block_bootstrap_on_folds": "eval_toolkit.bootstrap",
65
66
  "bonferroni_correct": "eval_toolkit.bootstrap",
66
67
  "bootstrap_ci": "eval_toolkit.bootstrap",
67
68
  "correct_p_values": "eval_toolkit.bootstrap",
@@ -117,6 +118,8 @@ _EXPORTS: dict[str, str] = {
117
118
  "render_files": "eval_toolkit.docs",
118
119
  "render_text": "eval_toolkit.docs",
119
120
  "walk_path": "eval_toolkit.docs",
121
+ # --- embeddings ---
122
+ "make_minilm_embedder": "eval_toolkit.embeddings",
120
123
  # --- evidence ---
121
124
  "AggregateEvidence": "eval_toolkit.evidence",
122
125
  "EvidenceAxis": "eval_toolkit.evidence",
@@ -250,6 +253,7 @@ _EXPORTS: dict[str, str] = {
250
253
  "CISafeThresholdSelector": "eval_toolkit.thresholds",
251
254
  "CostSensitiveSelector": "eval_toolkit.thresholds",
252
255
  "MaxF1Selector": "eval_toolkit.thresholds",
256
+ "RecallAtFprResult": "eval_toolkit.thresholds",
253
257
  "TargetFPRSelector": "eval_toolkit.thresholds",
254
258
  "TargetPrecisionSelector": "eval_toolkit.thresholds",
255
259
  "TargetRecallSelector": "eval_toolkit.thresholds",
@@ -257,6 +261,7 @@ _EXPORTS: dict[str, str] = {
257
261
  "ThresholdSelector": "eval_toolkit.thresholds",
258
262
  "WilsonInterval": "eval_toolkit.thresholds",
259
263
  "YoudenJSelector": "eval_toolkit.thresholds",
264
+ "recall_at_fpr": "eval_toolkit.thresholds",
260
265
  "select_threshold": "eval_toolkit.thresholds",
261
266
  "wilson_interval": "eval_toolkit.thresholds",
262
267
  }
@@ -0,0 +1,129 @@
1
+ """Toolkit-internal parallel-map helper. Single source of truth for parallelism.
2
+
3
+ This module is *internal* (not exported via ``__all__`` or
4
+ ``__init__._EXPORTS``); future per-function ``n_jobs`` additions across the
5
+ toolkit call into this helper rather than each inventing their own
6
+ parallelism backend.
7
+
8
+ See ``docs/source/methodology/parallelism.md`` for the design rationale +
9
+ caller contract (reproducibility, picklability, smart defaults).
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+ import os
16
+ import pickle
17
+ from collections.abc import Callable, Iterable, Sized
18
+
19
+ _logger = logging.getLogger(__name__)
20
+
21
+ _GUIDANCE_THRESHOLD = 1000
22
+
23
+ _GUIDANCE_EMITTED = False
24
+
25
+
26
+ def parallel_map[T, R](
27
+ fn: Callable[[T], R],
28
+ items: Iterable[T],
29
+ *,
30
+ n_jobs: int = 1,
31
+ description: str = "work",
32
+ ) -> list[R]:
33
+ """Map ``fn`` over ``items``; parallel when ``n_jobs != 1``.
34
+
35
+ Design contract (see ``docs/source/methodology/parallelism.md``):
36
+
37
+ - ``n_jobs == 1`` (default) — pure-Python serial; preserves tracebacks.
38
+ If ``len(items) >= 1000``, emits an INFO log on the first qualifying
39
+ call per Python process suggesting ``n_jobs > 1`` (silent thereafter).
40
+ - ``n_jobs == -1`` — joblib loky with all cores.
41
+ - ``n_jobs > 1`` — joblib loky; values exceeding ``os.cpu_count()``
42
+ are capped (with a WARNING log) to avoid oversubscribing the CPU.
43
+ - ``n_jobs == 0`` — raises ``ValueError`` (likely a typo for 1 or -1).
44
+ - ``fn`` MUST be picklable when ``n_jobs != 1`` (lambdas and closures
45
+ over local state are rejected at call time with a helpful
46
+ ``TypeError``).
47
+ - Reproducibility: caller is responsible for deterministic per-item
48
+ state (use ``np.random.SeedSequence(seed).spawn(n)`` for resample
49
+ loops so ``n_jobs > 1`` produces identical results to ``n_jobs == 1``
50
+ for the same seed).
51
+
52
+ Parameters
53
+ ----------
54
+ fn : Callable[[T], R]
55
+ Picklable callable to apply to each item. Lambdas and closures
56
+ over local state are rejected when ``n_jobs != 1``.
57
+ items : Iterable[T]
58
+ Work items. Materialised internally; pass any iterable.
59
+ n_jobs : int, optional
60
+ Default 1 (sequential). Set to -1 for all cores, or a positive int.
61
+ ``n_jobs=0`` is rejected (use 1 or -1).
62
+ description : str, optional
63
+ Used in log messages and error messages for context (e.g.,
64
+ ``"paired bootstrap"``).
65
+
66
+ Returns
67
+ -------
68
+ list[R]
69
+ Results in the order of ``items``.
70
+
71
+ Raises
72
+ ------
73
+ ValueError
74
+ If ``n_jobs == 0``.
75
+ TypeError
76
+ If ``n_jobs != 1`` and ``fn`` is not picklable.
77
+
78
+ Examples
79
+ --------
80
+ >>> def square(x): return x * x
81
+ >>> parallel_map(square, [1, 2, 3], n_jobs=1)
82
+ [1, 4, 9]
83
+ """
84
+ if n_jobs == 0:
85
+ raise ValueError(
86
+ f"n_jobs=0 is not allowed for {description}; use 1 (sequential), "
87
+ "-1 (all cores), or a positive integer."
88
+ )
89
+
90
+ items_list = items if isinstance(items, list) else list(items)
91
+ n_items = len(items_list) if isinstance(items_list, Sized) else 0
92
+
93
+ if n_jobs == 1:
94
+ global _GUIDANCE_EMITTED
95
+ if n_items >= _GUIDANCE_THRESHOLD and not _GUIDANCE_EMITTED:
96
+ _logger.info(
97
+ "%s: running %d items sequentially (n_jobs=1). For parallel "
98
+ "speedup set n_jobs > 1 (typical wall-clock 3-5x on 8 cores). "
99
+ "(Shown once per process.)",
100
+ description,
101
+ n_items,
102
+ )
103
+ _GUIDANCE_EMITTED = True
104
+ return [fn(item) for item in items_list]
105
+
106
+ if n_jobs > 0:
107
+ cpu_count = os.cpu_count() or 1
108
+ if n_jobs > cpu_count:
109
+ _logger.warning(
110
+ "%s: capping n_jobs from %d to %d (os.cpu_count()).",
111
+ description,
112
+ n_jobs,
113
+ cpu_count,
114
+ )
115
+ n_jobs = cpu_count
116
+
117
+ try:
118
+ pickle.dumps(fn)
119
+ except (pickle.PicklingError, AttributeError, TypeError) as e:
120
+ raise TypeError(
121
+ f"parallel_map of {description}: callable is not picklable "
122
+ f"(lambdas and closures over local state are not supported "
123
+ f"with n_jobs != 1). Define a named top-level function. "
124
+ f"Underlying error: {e}"
125
+ ) from e
126
+
127
+ from joblib import Parallel, delayed # noqa: PLC0415
128
+
129
+ return list(Parallel(n_jobs=n_jobs, backend="loky")(delayed(fn)(item) for item in items_list))
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "0.33.0"
5
+ __version__ = "0.34.0"