eval-toolkit 0.34.0__tar.gz → 0.37.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158) hide show
  1. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/.gitignore +1 -0
  2. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/CHANGELOG.md +140 -0
  3. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/PKG-INFO +3 -1
  4. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/pyproject.toml +7 -0
  5. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/__init__.py +2 -0
  6. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/_version.py +1 -1
  7. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/calibration.py +97 -0
  8. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/embeddings.py +7 -4
  9. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/harness.py +227 -35
  10. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/leakage.py +142 -2
  11. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/protocols.py +10 -0
  12. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/seeds.py +11 -7
  13. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/golden/public_api/snapshot.json +18 -3
  14. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_calibration_unit.py +126 -0
  15. eval_toolkit-0.37.0/tests/test_harness_parallelism.py +266 -0
  16. eval_toolkit-0.37.0/tests/test_tokenization_leakage_check.py +194 -0
  17. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/LICENSE +0 -0
  18. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/README.md +0 -0
  19. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/STYLE.md +0 -0
  20. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/docs/archive/README.md +0 -0
  21. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/docs/research/README.md +0 -0
  22. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/docs/research/datasets/README.md +0 -0
  23. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/docs/research/papers/data-integrity/README.md +0 -0
  24. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  25. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/docs/research/papers/inference/README.md +0 -0
  26. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/docs/research/papers/prompt-injection/README.md +0 -0
  27. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/docs/source/methodology/README.md +0 -0
  28. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/__main__.py +0 -0
  29. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/_deprecated.py +0 -0
  30. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/_parallel.py +0 -0
  31. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/analysis.py +0 -0
  32. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/artifacts.py +0 -0
  33. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/bootstrap.py +0 -0
  34. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/claims.py +0 -0
  35. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/config.py +0 -0
  36. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/docs.py +0 -0
  37. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/evidence.py +0 -0
  38. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/loaders.py +0 -0
  39. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/manifest.py +0 -0
  40. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/metrics.py +0 -0
  41. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/operating_points.py +0 -0
  42. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/paths.py +0 -0
  43. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/plotting.py +0 -0
  44. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/provenance.py +0 -0
  45. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/py.typed +0 -0
  46. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  47. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  48. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  49. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  50. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  51. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/splits.py +0 -0
  52. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/text_dedup.py +0 -0
  53. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/thresholds.py +0 -0
  54. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  55. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  56. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  57. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  58. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  59. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  60. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  61. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  62. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  63. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  64. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/benchmarks/__init__.py +0 -0
  65. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  66. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/conftest.py +0 -0
  67. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  68. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  69. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  70. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  71. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/golden/docs/expected.md +0 -0
  72. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/golden/docs/input.md +0 -0
  73. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/golden/docs/metrics.json +0 -0
  74. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  75. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/strategies.py +0 -0
  76. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_analysis.py +0 -0
  77. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_artifacts.py +0 -0
  78. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  79. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  80. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_bootstrap_edge_cases.py +0 -0
  81. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_bootstrap_golden.py +0 -0
  82. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_bootstrap_njobs.py +0 -0
  83. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_bootstrap_props.py +0 -0
  84. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_bootstrap_research_grounded.py +0 -0
  85. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_bootstrap_unit.py +0 -0
  86. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  87. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_calibration_determinism.py +0 -0
  88. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_calibration_optimization_failures.py +0 -0
  89. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_calibration_props.py +0 -0
  90. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_calibration_research_grounded.py +0 -0
  91. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_claims.py +0 -0
  92. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_claims_coverage.py +0 -0
  93. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_claims_props.py +0 -0
  94. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_cli.py +0 -0
  95. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_config.py +0 -0
  96. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_coverage_bootstrap.py +0 -0
  97. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_coverage_calibration.py +0 -0
  98. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_coverage_harness.py +0 -0
  99. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_coverage_metrics.py +0 -0
  100. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_coverage_plotting.py +0 -0
  101. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  102. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_deprecations.py +0 -0
  103. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_docs_golden.py +0 -0
  104. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_docs_props.py +0 -0
  105. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_embeddings.py +0 -0
  106. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_evidence_validators.py +0 -0
  107. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_harness_edge_cases.py +0 -0
  108. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_harness_fault_injection.py +0 -0
  109. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_harness_folded.py +0 -0
  110. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_harness_internals.py +0 -0
  111. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_harness_metric_options.py +0 -0
  112. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_harness_smoke.py +0 -0
  113. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_import_boundaries.py +0 -0
  114. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_leakage.py +0 -0
  115. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_leakage_error_paths.py +0 -0
  116. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_leakage_props.py +0 -0
  117. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_loaders.py +0 -0
  118. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_loaders_coverage.py +0 -0
  119. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_loaders_props.py +0 -0
  120. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_logging.py +0 -0
  121. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_manifest.py +0 -0
  122. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  123. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_manifest_props.py +0 -0
  124. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_manifest_validation.py +0 -0
  125. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_metrics_props.py +0 -0
  126. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_metrics_stratified_subsets.py +0 -0
  127. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_metrics_unit.py +0 -0
  128. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_misc_coverage.py +0 -0
  129. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_numeric_edge_cases.py +0 -0
  130. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_operating_points.py +0 -0
  131. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_operating_points_props.py +0 -0
  132. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_parallel.py +0 -0
  133. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_paths.py +0 -0
  134. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_pipeline_e2e.py +0 -0
  135. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_plotting_edge.py +0 -0
  136. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_plotting_smoke.py +0 -0
  137. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_plotting_visual.py +0 -0
  138. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_protocol_conformance.py +0 -0
  139. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_provenance.py +0 -0
  140. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_public_api.py +0 -0
  141. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_recall_at_fpr.py +0 -0
  142. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_reference_equivalence.py +0 -0
  143. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_reproducibility_integration.py +0 -0
  144. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_schemas.py +0 -0
  145. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_seeds.py +0 -0
  146. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_splits.py +0 -0
  147. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_splits_leakage_integration.py +0 -0
  148. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_splits_props.py +0 -0
  149. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_text_dedup.py +0 -0
  150. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_text_dedup_coverage.py +0 -0
  151. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_text_dedup_props.py +0 -0
  152. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_text_dedup_strategies.py +0 -0
  153. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_thresholds.py +0 -0
  154. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_thresholds_constant_score.py +0 -0
  155. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_thresholds_coverage.py +0 -0
  156. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_thresholds_props.py +0 -0
  157. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_thresholds_research_grounded.py +0 -0
  158. {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_v09_contracts.py +0 -0
@@ -22,6 +22,7 @@ wheels/
22
22
  .coverage.*
23
23
  htmlcov/
24
24
  coverage.xml
25
+ coverage.json
25
26
  .hypothesis/
26
27
 
27
28
  # Type-checker / linter caches
@@ -7,6 +7,146 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.37.0] — 2026-05-18 — TokenizationLeakageCheck + per-module coverage floors
11
+
12
+ Two-issue bundle (#35 + #37) plus housekeeping closure of stale items
13
+ (PR #27, #38) that turned out to have been resolved by v0.33.x without
14
+ being checked off. Roadmap refresh in `3d40796` (this minor's
15
+ predecessor commit) replaced the version-keyed candidate list with
16
+ issue-keyed tracking, so this class of stale-roadmap bug shouldn't
17
+ recur.
18
+
19
+ ### Added
20
+
21
+ - **`eval_toolkit.leakage.TokenizationLeakageCheck`** — new within-split
22
+ `LeakageCheck` that dedups on tokenizer output rather than raw text.
23
+ Catches encoding-obfuscated dupes that survive
24
+ `NormalizedFormLeakageCheck` but collapse to identical `input_ids`
25
+ under a transformer's BPE / SentencePiece / WordPiece tokenizer.
26
+ Accepts any `Callable[[str], Mapping[str, object]]` returning HF-style
27
+ output with an `"input_ids"` key — does **not** import `transformers`
28
+ itself; consumers pass an already-instantiated tokenizer. Default
29
+ severity `"error"` (mirrors `NormalizedFormLeakageCheck`). Closes #35.
30
+ - New optional install extra **`[transformers]`** (`transformers>=4.0`).
31
+ Intentionally **not** in `[all]` / `[dev]` — mirrors the `[embeddings]`
32
+ precedent from v0.33.1 to keep contributor setup small (transformers
33
+ transitively pulls torch ~700MB).
34
+
35
+ ### Test
36
+
37
+ - **Per-module coverage floors restored.** `scripts/check_module_floors.py`
38
+ enforces an 85 % per-file floor (coverage.py natively only ships
39
+ global `--fail-under`). Hooked into `make coverage` via a post-pytest
40
+ invocation. Closes #37.
41
+ - **`# pragma: no cover` on optional-dep-active paths** in `seeds.py`
42
+ (torch) and `embeddings.py` (sentence-transformers). Reflects the
43
+ reality that these branches execute in user code, not CI. Both
44
+ modules now report 100 % coverage; previously sat at ~70 % which
45
+ obscured per-module floor enforcement.
46
+
47
+ ### Fixed
48
+
49
+ - **`make coverage` Makefile parity with PR CI.** PR #27 (external
50
+ contributor @leno23, draft) proposed adding `-m "not monte_carlo and
51
+ not benchmark"` to the `coverage` target. Audit found the same fix
52
+ had landed in v0.33.0 commit `9e375a8` ahead of the PR being filed;
53
+ closed PR #27 as superseded with thanks. No change in this release.
54
+
55
+ ### Closed (already-resolved)
56
+
57
+ - **#38 — CI doctests for `paths.py` / `provenance.py` / `seeds.py` /
58
+ `docs.py`.** All four modules were added to `.doctest-modules` in
59
+ `a26fd44` (2026-05-14, v0.32.x era); 7 doctests collected across the
60
+ named modules in current CI. Closed as already-resolved.
61
+
62
+ ### Test coverage
63
+
64
+ Test count 1376 → 1387 (+11). Aggregate 95.65 % → 95.69 %. All 28
65
+ modules ≥ 90 % individually post-pragma.
66
+
67
+ ## [0.36.0] — 2026-05-18 — harness parallelization (#29, #30) + Node 24 actions
68
+
69
+ Wires the v0.34.0 unified parallelism pattern into the harness evaluation
70
+ loop. `evaluate()` and `evaluate_folded()` now accept an `n_jobs` kwarg
71
+ (default `1` preserves bit-identical sequential behavior); under
72
+ `n_jobs != 1`, the `(slice × scorer)` work-unit loop in
73
+ `_score_all_slices` and the `(spec × scorer)` fit phase in
74
+ `_attach_transferred_operating_points` dispatch through joblib loky via
75
+ the existing `_parallel.parallel_map` helper.
76
+
77
+ ### Added
78
+
79
+ - `evaluate(..., n_jobs: int = 1)` and `evaluate_folded(..., n_jobs: int = 1)`
80
+ — keyword-only kwarg per Principle #3 of `methodology/parallelism.md`.
81
+ `n_jobs=1` (default) runs the existing pure-Python sequential loop
82
+ (Principle #4 — bit-identical to v0.35). `n_jobs > 1` uses joblib loky;
83
+ `n_jobs=-1` uses all cores; `n_jobs=0` is rejected. Closes #29, #30.
84
+ - Strict-pickle Scorer sniff at `evaluate()` entry when `n_jobs != 1`:
85
+ raises a clean `TypeError` referencing
86
+ `methodology/parallelism.md#scorer-picklability` with the underlying
87
+ pickle error attached. Reuses the v0.35 ADR contract; no new exception
88
+ class. Catches non-picklable scorers up front rather than relying on
89
+ joblib's more permissive cloudpickle path (which would silently absorb
90
+ closures and obscure the contract documented in v0.35).
91
+
92
+ ### Internal
93
+
94
+ - New module-scope step functions `_score_one_pair` and
95
+ `_fit_one_op_point_pair` in `harness.py` (picklable; required by loky).
96
+ - `_score_all_slices` and `_attach_transferred_operating_points`
97
+ refactored to use flat work-unit dispatch via `parallel_map`.
98
+
99
+ ### Tests
100
+
101
+ - New `tests/test_harness_parallelism.py` (7 tests): bit-identical
102
+ reproducibility across `n_jobs=1` vs `n_jobs=2` for `evaluate`
103
+ (basic, paired-diffs, operating-points), `evaluate_folded`,
104
+ picklability rejection (closure scorer), `n_jobs=0` rejection,
105
+ `n_jobs=-1` smoke. All 66 harness tests pass (7 new + 59 existing).
106
+
107
+ ### Infrastructure
108
+
109
+ - Bumped `actions/upload-artifact` and `actions/download-artifact` from
110
+ `@v5` → `@v6` across `publish.yml` / `nightly-mc.yml` /
111
+ `nightly-benchmarks.yml`. The v6 majors run on Node.js 24
112
+ (GitHub deprecates Node 20 actions from 2026-06-02). Other pinned
113
+ actions (`checkout@v6`, `setup-uv@v8.1.0`, `codeql-action@v3`,
114
+ `deploy-pages@v4`, `upload-pages-artifact@v3`) were not flagged in
115
+ the v0.35 publish annotation and are deferred to a separate audit.
116
+
117
+ ## [0.35.0] — 2026-05-18 — `fit_temperature_binary` + Scorer picklability ADR
118
+
119
+ Small, additive release. Adds a binary-classification calibration helper
120
+ that lets consumers drop the ~50 LOC scalar-proba adapter many were
121
+ carrying, plus a design ADR that unblocks the v0.36 harness / operating-
122
+ point parallelization work (#29, #30) without re-litigating picklability.
123
+
124
+ ### Added
125
+
126
+ - `eval_toolkit.fit_temperature_binary(y_true, y_score)` — scalar-proba
127
+ adapter for the multi-class `fit_temperature` fitter. Converts `(n,)`
128
+ probabilities of class 1 to a 2-column logit array via clipped logit
129
+ (`[0, logit(p)]`, so column 1 of the row-wise softmax reproduces `p`), delegates to the
130
+ deployment-quality fitter, and returns `(T_opt, apply)` where
131
+ `apply: (n,) -> (n,)` does scalar-in / scalar-out T-scaling. Unlike
132
+ `fit_temperature_oracle`, no warning — the contract assumes val / test
133
+ separation (deployment-quality calibration, not fit-on-test). Closes
134
+ #28.
135
+
136
+ ### Documentation
137
+
138
+ - `docs/source/methodology/parallelism.md` — new `## Scorer picklability`
139
+ sub-section documenting the Scorer protocol's picklability contract
140
+ for `n_jobs > 1` usage. Includes worked picklable / broken-closure /
141
+ fix examples plus a list of common non-picklable patterns to watch for
142
+ in user-supplied Scorers (closures, lambdas on instances, local-scope
143
+ classes, attributes holding live sockets / file handles). Anchors on
144
+ the existing v0.34.0 `parallel_map` pickle sniff + `TypeError`
145
+ channel — no new exception class. Unblocks v0.36 implementation of
146
+ #29 and #30.
147
+ - `eval_toolkit.protocols.Scorer` docstring — Notes block pointing at
148
+ the new methodology section.
149
+
10
150
  ## [0.34.0] — 2026-05-17 — Phase 4 stats unblockers + unified parallelism + cookbook (BREAKING)
11
151
 
12
152
  Closes all 7 open backlog issues in one consumer-closing release. Also
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.34.0
3
+ Version: 0.37.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -69,6 +69,8 @@ Requires-Dist: matplotlib>=3.8; extra == 'plotting'
69
69
  Requires-Dist: pillow>=10.0; extra == 'plotting'
70
70
  Provides-Extra: property
71
71
  Requires-Dist: hypothesis>=6.100; extra == 'property'
72
+ Provides-Extra: transformers
73
+ Requires-Dist: transformers>=4.0; extra == 'transformers'
72
74
  Provides-Extra: validation
73
75
  Provides-Extra: yaml
74
76
  Requires-Dist: pyyaml>=6.0; extra == 'yaml'
@@ -56,6 +56,13 @@ parquet = ["pyarrow>=15.0"]
56
56
  # setup small. The canonical semantic-dedup recipe (all-MiniLM-L6-v2 +
57
57
  # cosine@0.80) is what this factory pre-wires for callers.
58
58
  embeddings = ["sentence-transformers>=3.0"]
59
+ # v0.37.0: TokenizationLeakageCheck — HF-tokenizer-aware dedup.
60
+ # transformers transitively pulls torch + tokenizers (~700MB) so we
61
+ # follow the [embeddings] precedent: opt-in only, NOT in [all] / [dev].
62
+ # Consumers pass an already-instantiated tokenizer callable; the check
63
+ # itself does not import transformers, so the optional install is
64
+ # strictly for callers wanting AutoTokenizer.from_pretrained(...).
65
+ transformers = ["transformers>=4.0"]
59
66
  # DEPRECATED (announced v0.30.1, removal v0.33.0).
60
67
  #
61
68
  # Retained as a transitive no-op so `pip install eval-toolkit[validation]`
@@ -87,6 +87,7 @@ _EXPORTS: dict[str, str] = {
87
87
  "fit_isotonic_calibrator": "eval_toolkit.calibration",
88
88
  "fit_platt_calibrator": "eval_toolkit.calibration",
89
89
  "fit_temperature": "eval_toolkit.calibration",
90
+ "fit_temperature_binary": "eval_toolkit.calibration",
90
91
  "fit_temperature_oracle": "eval_toolkit.calibration",
91
92
  "reliability_curve": "eval_toolkit.calibration",
92
93
  "reliability_diagram_data": "eval_toolkit.calibration",
@@ -146,6 +147,7 @@ _EXPORTS: dict[str, str] = {
146
147
  "NearDuplicateCheck": "eval_toolkit.leakage",
147
148
  "NormalizedFormLeakageCheck": "eval_toolkit.leakage",
148
149
  "TemporalLeakageCheck": "eval_toolkit.leakage",
150
+ "TokenizationLeakageCheck": "eval_toolkit.leakage",
149
151
  "run_leakage_checks": "eval_toolkit.leakage",
150
152
  # --- loaders ---
151
153
  "DataFrameLoader": "eval_toolkit.loaders",
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "0.34.0"
5
+ __version__ = "0.37.0"
@@ -57,6 +57,7 @@ __all__ = [
57
57
  "fit_isotonic_calibrator",
58
58
  "fit_platt_calibrator",
59
59
  "fit_temperature",
60
+ "fit_temperature_binary",
60
61
  "fit_temperature_oracle",
61
62
  "maximum_calibration_error",
62
63
  "reliability_curve",
@@ -1038,6 +1039,102 @@ def _negative_log_likelihood(t: float, logits: np.ndarray, labels: np.ndarray) -
1038
1039
  return float(-log_probs[np.arange(len(labels)), labels].mean())
1039
1040
 
1040
1041
 
1042
+ def fit_temperature_binary(
1043
+ y_true: np.ndarray,
1044
+ y_score: np.ndarray,
1045
+ *,
1046
+ bounds: tuple[float, float] = (0.05, 20.0),
1047
+ ) -> tuple[float, Callable[[np.ndarray], np.ndarray]]:
1048
+ r"""Binary-probability adapter for :func:`fit_temperature` (Guo et al. 2017 [#guo]_).
1049
+
1050
+ Fits a scalar T > 0 on *validation* probabilities of class 1 and returns
1051
+ both T and a callable that applies the same T-scaling to test
1052
+ probabilities. Internally:
1053
+
1054
+ 1. Clips ``y_score`` to ``[1e-7, 1-1e-7]`` for finite logit inversion.
1055
+ 2. Builds a 2-column logit array ``[0, logit(p)]`` so column 1 of the
1056
+ row-wise softmax reproduces the (clipped) ``p`` exactly.
1057
+ 3. Delegates to :func:`fit_temperature` for the bounded NLL minimization.
1058
+ 4. Returns ``(T, apply)`` where ``apply(p_test) = sigmoid(logit(p_test)/T)``.
1059
+
1060
+ Unlike :func:`fit_temperature_oracle`, this does NOT emit a warning — the
1061
+ contract is that ``y_true`` / ``y_score`` come from a held-out validation
1062
+ set and ``apply`` is invoked on a separate test set (deployment-quality
1063
+ calibration, not fit-on-test).
1064
+
1065
+ Parameters
1066
+ ----------
1067
+ y_true : np.ndarray, shape (n,)
1068
+ Binary validation labels in {0, 1}.
1069
+ y_score : np.ndarray, shape (n,)
1070
+ Validation predicted probabilities of class 1, in [0, 1]. Values at
1071
+ the extremes are clipped to ``[1e-7, 1 - 1e-7]``.
1072
+ bounds : tuple of float, optional
1073
+ ``(lo, hi)`` bracket for T. Default ``(0.05, 20.0)``, matches
1074
+ :func:`fit_temperature`.
1075
+
1076
+ Returns
1077
+ -------
1078
+ tuple
1079
+ ``(T_optimal, apply)`` where ``apply: (n,) -> (n,)`` maps any input
1080
+ probability array through :math:`\sigma(\mathrm{logit}(p) / T)`.
1081
+
1082
+ Raises
1083
+ ------
1084
+ ValueError
1085
+ On shape mismatch, empty input, non-finite scores, or single-class
1086
+ ``y_true``.
1087
+ RuntimeError
1088
+ If the bounded scalar optimizer fails to converge.
1089
+
1090
+ Examples
1091
+ --------
1092
+ >>> import numpy as np
1093
+ >>> rng = np.random.default_rng(0)
1094
+ >>> n = 500
1095
+ >>> y_val = rng.binomial(1, 0.3, size=n).astype(int)
1096
+ >>> p_val = np.clip(y_val * 0.6 + rng.normal(0, 0.2, n), 0.01, 0.99)
1097
+ >>> T, apply = fit_temperature_binary(y_val, p_val)
1098
+ >>> T > 0
1099
+ True
1100
+ >>> p_test = np.array([0.1, 0.5, 0.9])
1101
+ >>> apply(p_test).shape == (3,)
1102
+ True
1103
+
1104
+ See Also
1105
+ --------
1106
+ fit_temperature : underlying multi-class fitter (operates on 2-col logits)
1107
+ fit_temperature_oracle : diagnostic-only variant that fits T on the same
1108
+ probabilities it scores
1109
+
1110
+ References
1111
+ ----------
1112
+ .. [#guo] Guo, C., Pleiss, G., Sun, Y., & Weinberger, K. Q. "On
1113
+ calibration of modern neural networks." ICML 2017. arXiv:1706.04599.
1114
+ """
1115
+ y_true_arr, y_score_arr = _validate_calibrator_inputs(y_true, y_score)
1116
+
1117
+ # Build 2-col logits [0, logit(p)] so softmax([0, logit(p)])[1] == p exactly.
1118
+ s_clipped = np.clip(y_score_arr, _SCORE_CLIP_LO, _SCORE_CLIP_HI)
1119
+ logit_pos = np.log(s_clipped / (1.0 - s_clipped))
1120
+ val_logits_2col = np.column_stack([np.zeros_like(logit_pos), logit_pos])
1121
+
1122
+ result = fit_temperature(val_logits_2col, y_true_arr, bounds=bounds)
1123
+ t_optimal = float(result["temperature"])
1124
+
1125
+ def apply(scores: np.ndarray) -> np.ndarray:
1126
+ arr = np.asarray(scores, dtype=float).ravel()
1127
+ if not np.isfinite(arr).all():
1128
+ raise ValueError("scores contains NaN or inf")
1129
+ clipped = np.clip(arr, _SCORE_CLIP_LO, _SCORE_CLIP_HI)
1130
+ logit = np.log(clipped / (1.0 - clipped))
1131
+ scaled = logit / t_optimal
1132
+ out: np.ndarray = (1.0 / (1.0 + np.exp(-scaled))).astype(float)
1133
+ return out
1134
+
1135
+ return t_optimal, apply
1136
+
1137
+
1041
1138
  def fit_temperature_oracle(
1042
1139
  y_true: np.ndarray, y_score: np.ndarray
1043
1140
  ) -> tuple[float, Callable[[np.ndarray], np.ndarray]]:
@@ -88,15 +88,18 @@ def make_minilm_embedder(
88
88
  "Install via: pip install eval-toolkit[embeddings]"
89
89
  ) from e
90
90
 
91
- _logger.debug(
91
+ # sentence-transformers-active path: excluded from CI coverage
92
+ # because [embeddings] is intentionally kept out of [dev]/[all]
93
+ # (transitive torch cost ~700MB per the v0.33.1 design note).
94
+ _logger.debug( # pragma: no cover
92
95
  "loading SentenceTransformer model_id=%s device=%s batch_size=%d",
93
96
  model_id,
94
97
  device,
95
98
  batch_size,
96
99
  )
97
- model = SentenceTransformer(model_id, device=device)
100
+ model = SentenceTransformer(model_id, device=device) # pragma: no cover
98
101
 
99
- def embedder(texts: Sequence[str]) -> np.ndarray:
102
+ def embedder(texts: Sequence[str]) -> np.ndarray: # pragma: no cover
100
103
  result = model.encode(
101
104
  list(texts),
102
105
  convert_to_numpy=True,
@@ -105,4 +108,4 @@ def make_minilm_embedder(
105
108
  )
106
109
  return np.asarray(result, dtype=np.float64)
107
110
 
108
- return embedder
111
+ return embedder # pragma: no cover