eval-toolkit 0.35.0__tar.gz → 0.38.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/.gitignore +1 -0
  2. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/CHANGELOG.md +147 -0
  3. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/PKG-INFO +3 -1
  4. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/pyproject.toml +7 -0
  5. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/__init__.py +1 -0
  6. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/_version.py +1 -1
  7. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/embeddings.py +7 -4
  8. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/harness.py +227 -35
  9. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/leakage.py +142 -2
  10. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/seeds.py +11 -7
  11. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/golden/public_api/snapshot.json +12 -3
  12. eval_toolkit-0.38.0/tests/test_harness_parallelism.py +266 -0
  13. eval_toolkit-0.38.0/tests/test_tokenization_leakage_check.py +194 -0
  14. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/LICENSE +0 -0
  15. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/README.md +0 -0
  16. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/STYLE.md +0 -0
  17. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/docs/archive/README.md +0 -0
  18. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/docs/research/README.md +0 -0
  19. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/docs/research/datasets/README.md +0 -0
  20. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/docs/research/papers/data-integrity/README.md +0 -0
  21. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  22. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/docs/research/papers/inference/README.md +0 -0
  23. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/docs/research/papers/prompt-injection/README.md +0 -0
  24. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/docs/source/methodology/README.md +0 -0
  25. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/__main__.py +0 -0
  26. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/_deprecated.py +0 -0
  27. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/_parallel.py +0 -0
  28. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/analysis.py +0 -0
  29. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/artifacts.py +0 -0
  30. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/bootstrap.py +0 -0
  31. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/calibration.py +0 -0
  32. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/claims.py +0 -0
  33. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/config.py +0 -0
  34. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/docs.py +0 -0
  35. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/evidence.py +0 -0
  36. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/loaders.py +0 -0
  37. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/manifest.py +0 -0
  38. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/metrics.py +0 -0
  39. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/operating_points.py +0 -0
  40. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/paths.py +0 -0
  41. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/plotting.py +0 -0
  42. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/protocols.py +0 -0
  43. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/provenance.py +0 -0
  44. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/py.typed +0 -0
  45. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  46. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  47. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  48. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  49. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  50. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/splits.py +0 -0
  51. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/text_dedup.py +0 -0
  52. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/src/eval_toolkit/thresholds.py +0 -0
  53. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  54. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  55. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  56. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  57. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  58. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  59. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  60. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  61. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  62. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  63. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/benchmarks/__init__.py +0 -0
  64. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  65. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/conftest.py +0 -0
  66. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  67. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
  68. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
  69. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
  70. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/golden/docs/expected.md +0 -0
  71. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/golden/docs/input.md +0 -0
  72. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/golden/docs/metrics.json +0 -0
  73. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
  74. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/strategies.py +0 -0
  75. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_analysis.py +0 -0
  76. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_artifacts.py +0 -0
  77. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_block_bootstrap_on_folds.py +0 -0
  78. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  79. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_bootstrap_edge_cases.py +0 -0
  80. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_bootstrap_golden.py +0 -0
  81. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_bootstrap_njobs.py +0 -0
  82. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_bootstrap_props.py +0 -0
  83. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_bootstrap_research_grounded.py +0 -0
  84. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_bootstrap_unit.py +0 -0
  85. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  86. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_calibration_determinism.py +0 -0
  87. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_calibration_optimization_failures.py +0 -0
  88. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_calibration_props.py +0 -0
  89. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_calibration_research_grounded.py +0 -0
  90. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_calibration_unit.py +0 -0
  91. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_claims.py +0 -0
  92. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_claims_coverage.py +0 -0
  93. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_claims_props.py +0 -0
  94. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_cli.py +0 -0
  95. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_config.py +0 -0
  96. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_coverage_bootstrap.py +0 -0
  97. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_coverage_calibration.py +0 -0
  98. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_coverage_harness.py +0 -0
  99. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_coverage_metrics.py +0 -0
  100. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_coverage_plotting.py +0 -0
  101. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  102. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_deprecations.py +0 -0
  103. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_docs_golden.py +0 -0
  104. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_docs_props.py +0 -0
  105. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_embeddings.py +0 -0
  106. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_evidence_validators.py +0 -0
  107. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_harness_edge_cases.py +0 -0
  108. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_harness_fault_injection.py +0 -0
  109. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_harness_folded.py +0 -0
  110. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_harness_internals.py +0 -0
  111. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_harness_metric_options.py +0 -0
  112. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_harness_smoke.py +0 -0
  113. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_import_boundaries.py +0 -0
  114. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_leakage.py +0 -0
  115. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_leakage_error_paths.py +0 -0
  116. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_leakage_props.py +0 -0
  117. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_loaders.py +0 -0
  118. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_loaders_coverage.py +0 -0
  119. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_loaders_props.py +0 -0
  120. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_logging.py +0 -0
  121. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_manifest.py +0 -0
  122. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  123. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_manifest_props.py +0 -0
  124. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_manifest_validation.py +0 -0
  125. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_metrics_props.py +0 -0
  126. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_metrics_stratified_subsets.py +0 -0
  127. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_metrics_unit.py +0 -0
  128. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_misc_coverage.py +0 -0
  129. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_numeric_edge_cases.py +0 -0
  130. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_operating_points.py +0 -0
  131. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_operating_points_props.py +0 -0
  132. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_parallel.py +0 -0
  133. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_paths.py +0 -0
  134. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_pipeline_e2e.py +0 -0
  135. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_plotting_edge.py +0 -0
  136. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_plotting_smoke.py +0 -0
  137. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_plotting_visual.py +0 -0
  138. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_protocol_conformance.py +0 -0
  139. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_provenance.py +0 -0
  140. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_public_api.py +0 -0
  141. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_recall_at_fpr.py +0 -0
  142. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_reference_equivalence.py +0 -0
  143. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_reproducibility_integration.py +0 -0
  144. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_schemas.py +0 -0
  145. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_seeds.py +0 -0
  146. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_splits.py +0 -0
  147. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_splits_leakage_integration.py +0 -0
  148. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_splits_props.py +0 -0
  149. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_text_dedup.py +0 -0
  150. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_text_dedup_coverage.py +0 -0
  151. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_text_dedup_props.py +0 -0
  152. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_text_dedup_strategies.py +0 -0
  153. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_thresholds.py +0 -0
  154. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_thresholds_constant_score.py +0 -0
  155. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_thresholds_coverage.py +0 -0
  156. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_thresholds_props.py +0 -0
  157. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_thresholds_research_grounded.py +0 -0
  158. {eval_toolkit-0.35.0 → eval_toolkit-0.38.0}/tests/test_v09_contracts.py +0 -0
@@ -22,6 +22,7 @@ wheels/
22
22
  .coverage.*
23
23
  htmlcov/
24
24
  coverage.xml
25
+ coverage.json
25
26
  .hypothesis/
26
27
 
27
28
  # Type-checker / linter caches
@@ -7,6 +7,153 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.38.0] — 2026-05-18 — executable examples (myst-nb migration)
11
+
12
+ Docs-only minor. Migrates the 14 walkthrough pages in
13
+ `docs/source/examples/` from sybil-validated `` ```python `` blocks to
14
+ myst-nb `{code-cell}` directives. Cells now execute during
15
+ `sphinx-build` (`nb_execution_mode = "cache"`) rather than during
16
+ `pytest` via sybil. Cell outputs (printed text, tables, figures)
17
+ render inline in the published HTML, so the docs site reflects the
18
+ actual library behavior rather than what was captured in the last manual
19
+ screenshot. Closes #31 (deferred from v0.34.1 and v0.35).
20
+
21
+ No public API changes.
22
+
23
+ ### Changed
24
+
25
+ - **14 example pages migrated** to myst-nb (`kernelspec` frontmatter +
26
+ `{code-cell}` directives in place of `` ```python ``). 73 code blocks
27
+ converted in total.
28
+ - **Two pages skip execution at page level** (`mystnb.execution_mode:
29
+ 'off'`) because they require optional deps kept out of `[dev]`:
30
+ - `pytorch_scorer_example.md` (needs `torch`)
31
+ - `callable_embedder_dedup.md` (needs `[embeddings]` /
32
+ `sentence-transformers`)
33
+ Both pages render their code statically.
34
+ - **`docs/source/examples/index.md`** — "How these run" section
35
+ rewritten to reflect myst-nb instead of sybil; new "skip-execed
36
+ pages" callout.
37
+ - **`conftest.py`** — dropped `docs/source/examples/*.md` from sybil
38
+ patterns. Sybil still covers `README`, `methodology/`, `migration/`,
39
+ `getting-started`, etc. (parts without executable-notebook value).
40
+
41
+ ### Why this matters
42
+
43
+ myst-nb infrastructure has been wired since v0.31.0 (the Sphinx docs
44
+ migration) but was underutilized — all example pages used static
45
+ `` ```python `` blocks. This release closes that gap. API drift in
46
+ the future will fail the docs build via runtime-output verification
47
+ (in addition to sybil's existing Python-level error catch on the
48
+ other doc trees).
49
+
50
+ ## [0.37.0] — 2026-05-18 — TokenizationLeakageCheck + per-module coverage floors
51
+
52
+ Two-issue bundle (#35 + #37) plus housekeeping closure of stale items
53
+ (PR #27, #38) that turned out to have been resolved in v0.32.x–v0.33.x without
54
+ being checked off. Roadmap refresh in `3d40796` (this minor's
55
+ predecessor commit) replaced the version-keyed candidate list with
56
+ issue-keyed tracking, so this class of stale-roadmap bug shouldn't
57
+ recur.
58
+
59
+ ### Added
60
+
61
+ - **`eval_toolkit.leakage.TokenizationLeakageCheck`** — new within-split
62
+ `LeakageCheck` that dedups on tokenizer output rather than raw text.
63
+ Catches encoding-obfuscated dupes that survive
64
+ `NormalizedFormLeakageCheck` but collapse to identical `input_ids`
65
+ under a transformer's BPE / SentencePiece / WordPiece tokenizer.
66
+ Accepts any `Callable[[str], Mapping[str, object]]` returning HF-style
67
+ output with an `"input_ids"` key — does **not** import `transformers`
68
+ itself; consumers pass an already-instantiated tokenizer. Default
69
+ severity `"error"` (mirrors `NormalizedFormLeakageCheck`). Closes #35.
70
+ - New optional install extra **`[transformers]`** (`transformers>=4.0`).
71
+ Intentionally **not** in `[all]` / `[dev]` — mirrors the `[embeddings]`
72
+ precedent from v0.33.1 to keep contributor setup small (transformers
73
+ transitively pulls torch ~700MB).
74
+
75
+ ### Test
76
+
77
+ - **Per-module coverage floors restored.** `scripts/check_module_floors.py`
78
+ enforces an 85 % per-file floor (coverage.py natively only ships
79
+ global `--fail-under`). Hooked into `make coverage` via a post-pytest
80
+ invocation. Closes #37.
81
+ - **`# pragma: no cover` on optional-dep-active paths** in `seeds.py`
82
+ (torch) and `embeddings.py` (sentence-transformers). Reflects the
83
+ reality that these branches execute in user code, not CI. Both
84
+ modules now report 100 % coverage; they previously sat at ~70 %, which
85
+ obscured per-module floor enforcement.
86
+
87
+ ### Fixed
88
+
89
+ - **`make coverage` Makefile parity with PR CI.** PR #27 (external
90
+ contributor @leno23, draft) proposed adding `-m "not monte_carlo and
91
+ not benchmark"` to the `coverage` target. Audit found the same fix
92
+ had landed in v0.33.0 commit `9e375a8` ahead of the PR being filed;
93
+ closed PR #27 as superseded with thanks. No change in this release.
94
+
95
+ ### Closed (already-resolved)
96
+
97
+ - **#38 — CI doctests for `paths.py` / `provenance.py` / `seeds.py` /
98
+ `docs.py`.** All four modules were added to `.doctest-modules` in
99
+ `a26fd44` (2026-05-14, v0.32.x era); 7 doctests collected across the
100
+ named modules in current CI. Closed as already-resolved.
101
+
102
+ ### Test coverage
103
+
104
+ Test count 1376 → 1387 (+11). Aggregate 95.65 % → 95.69 %. All 28
105
+ modules ≥ 90 % individually post-pragma.
106
+
107
+ ## [0.36.0] — 2026-05-18 — harness parallelization (#29, #30) + Node 24 actions
108
+
109
+ Wires the v0.34.0 unified parallelism pattern into the harness evaluation
110
+ loop. `evaluate()` and `evaluate_folded()` now accept an `n_jobs` kwarg
111
+ (default `1` preserves bit-identical sequential behavior); under
112
+ `n_jobs != 1`, the `(slice × scorer)` work-unit loop in
113
+ `_score_all_slices` and the `(spec × scorer)` fit phase in
114
+ `_attach_transferred_operating_points` dispatch through joblib loky via
115
+ the existing `_parallel.parallel_map` helper.
116
+
117
+ ### Added
118
+
119
+ - `evaluate(..., n_jobs: int = 1)` and `evaluate_folded(..., n_jobs: int = 1)`
120
+ — keyword-only kwarg per Principle #3 of `methodology/parallelism.md`.
121
+ `n_jobs=1` (default) runs the existing pure-Python sequential loop
122
+ (Principle #4 — bit-identical to v0.35). `n_jobs > 1` uses joblib loky;
123
+ `n_jobs=-1` uses all cores; `n_jobs=0` is rejected. Closes #29, #30.
124
+ - Strict-pickle Scorer sniff at `evaluate()` entry when `n_jobs != 1`:
125
+ raises a clean `TypeError` referencing
126
+ `methodology/parallelism.md#scorer-picklability` with the underlying
127
+ pickle error attached. Reuses the v0.35 ADR contract; no new exception
128
+ class. Catches non-picklable scorers up front rather than relying on
129
+ joblib's more permissive cloudpickle path (which would silently absorb
130
+ closures and obscure the contract documented in v0.35).
131
+
132
+ ### Internal
133
+
134
+ - New module-scope step functions `_score_one_pair` and
135
+ `_fit_one_op_point_pair` in `harness.py` (picklable; required by loky).
136
+ - `_score_all_slices` and `_attach_transferred_operating_points`
137
+ refactored to use flat work-unit dispatch via `parallel_map`.
138
+
139
+ ### Tests
140
+
141
+ - New `tests/test_harness_parallelism.py` (7 tests): bit-identical
142
+ reproducibility across `n_jobs=1` vs `n_jobs=2` for `evaluate`
143
+ (basic, paired-diffs, operating-points), `evaluate_folded`,
144
+ picklability rejection (closure scorer), `n_jobs=0` rejection,
145
+ `n_jobs=-1` smoke. All 66 harness tests pass (7 new + 59 existing).
146
+
147
+ ### Infrastructure
148
+
149
+ - Bumped `actions/upload-artifact` and `actions/download-artifact` from
150
+ `@v5` → `@v6` across `publish.yml` / `nightly-mc.yml` /
151
+ `nightly-benchmarks.yml`. The v6 majors run on Node.js 24
152
+ (GitHub deprecates Node 20 actions from 2026-06-02). Other pinned
153
+ actions (`checkout@v6`, `setup-uv@v8.1.0`, `codeql-action@v3`,
154
+ `deploy-pages@v4`, `upload-pages-artifact@v3`) were not flagged in
155
+ the v0.35 publish annotation and are deferred to a separate audit.
156
+
10
157
  ## [0.35.0] — 2026-05-18 — `fit_temperature_binary` + Scorer picklability ADR
11
158
 
12
159
  Small, additive release. Adds a binary-classification calibration helper
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.35.0
3
+ Version: 0.38.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -69,6 +69,8 @@ Requires-Dist: matplotlib>=3.8; extra == 'plotting'
69
69
  Requires-Dist: pillow>=10.0; extra == 'plotting'
70
70
  Provides-Extra: property
71
71
  Requires-Dist: hypothesis>=6.100; extra == 'property'
72
+ Provides-Extra: transformers
73
+ Requires-Dist: transformers>=4.0; extra == 'transformers'
72
74
  Provides-Extra: validation
73
75
  Provides-Extra: yaml
74
76
  Requires-Dist: pyyaml>=6.0; extra == 'yaml'
@@ -56,6 +56,13 @@ parquet = ["pyarrow>=15.0"]
56
56
  # setup small. The canonical semantic-dedup recipe (all-MiniLM-L6-v2 +
57
57
  # cosine@0.80) is what this factory pre-wires for callers.
58
58
  embeddings = ["sentence-transformers>=3.0"]
59
+ # v0.37.0: TokenizationLeakageCheck — HF-tokenizer-aware dedup.
60
+ # transformers transitively pulls torch + tokenizers (~700MB) so we
61
+ # follow the [embeddings] precedent: opt-in only, NOT in [all] / [dev].
62
+ # Consumers pass an already-instantiated tokenizer callable; the check
63
+ # itself does not import transformers, so the optional install is
64
+ # strictly for callers wanting AutoTokenizer.from_pretrained(...).
65
+ transformers = ["transformers>=4.0"]
59
66
  # DEPRECATED (announced v0.30.1; removal was slated for v0.33.0 but is still pending).
60
67
  #
61
68
  # Retained as a transitive no-op so `pip install eval-toolkit[validation]`
@@ -147,6 +147,7 @@ _EXPORTS: dict[str, str] = {
147
147
  "NearDuplicateCheck": "eval_toolkit.leakage",
148
148
  "NormalizedFormLeakageCheck": "eval_toolkit.leakage",
149
149
  "TemporalLeakageCheck": "eval_toolkit.leakage",
150
+ "TokenizationLeakageCheck": "eval_toolkit.leakage",
150
151
  "run_leakage_checks": "eval_toolkit.leakage",
151
152
  # --- loaders ---
152
153
  "DataFrameLoader": "eval_toolkit.loaders",
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "0.35.0"
5
+ __version__ = "0.38.0"
@@ -88,15 +88,18 @@ def make_minilm_embedder(
88
88
  "Install via: pip install eval-toolkit[embeddings]"
89
89
  ) from e
90
90
 
91
- _logger.debug(
91
+ # sentence-transformers-active path: excluded from CI coverage
92
+ # because [embeddings] is intentionally kept out of [dev]/[all]
93
+ # (transitive torch cost ~700MB per the v0.33.1 design note).
94
+ _logger.debug( # pragma: no cover
92
95
  "loading SentenceTransformer model_id=%s device=%s batch_size=%d",
93
96
  model_id,
94
97
  device,
95
98
  batch_size,
96
99
  )
97
- model = SentenceTransformer(model_id, device=device)
100
+ model = SentenceTransformer(model_id, device=device) # pragma: no cover
98
101
 
99
- def embedder(texts: Sequence[str]) -> np.ndarray:
102
+ def embedder(texts: Sequence[str]) -> np.ndarray: # pragma: no cover
100
103
  result = model.encode(
101
104
  list(texts),
102
105
  convert_to_numpy=True,
@@ -105,4 +108,4 @@ def make_minilm_embedder(
105
108
  )
106
109
  return np.asarray(result, dtype=np.float64)
107
110
 
108
- return embedder
111
+ return embedder # pragma: no cover
@@ -32,6 +32,7 @@ v0.7.0 additions:
32
32
  from __future__ import annotations
33
33
 
34
34
  import logging
35
+ import pickle
35
36
  import time
36
37
  import traceback
37
38
  from collections.abc import Mapping, Sequence
@@ -41,6 +42,7 @@ from typing import TYPE_CHECKING, Final, Literal, cast
41
42
 
42
43
  import numpy as np
43
44
 
45
+ from eval_toolkit._parallel import parallel_map
44
46
  from eval_toolkit.artifacts import (
45
47
  error_metric,
46
48
  sanitize_for_json,
@@ -62,7 +64,7 @@ from eval_toolkit.operating_points import (
62
64
  fit_operating_points,
63
65
  )
64
66
  from eval_toolkit.protocols import Scorer, SliceAwareScorer
65
- from eval_toolkit.thresholds import TargetFPRSelector
67
+ from eval_toolkit.thresholds import TargetFPRSelector, ThresholdSelector
66
68
 
67
69
  if TYPE_CHECKING:
68
70
  import pandas as pd
@@ -278,6 +280,31 @@ def _object_to_dict(obj: object, *, what: str) -> dict[str, object]:
278
280
  raise TypeError(f"expected {what} mapping or object with to_dict(), got {type(obj).__name__}")
279
281
 
280
282
 
283
+ def _assert_scorers_picklable(scorers: Mapping[str, Scorer]) -> None:
284
+ """Strict-pickle sniff for Scorer args when ``n_jobs != 1``.
285
+
286
+ joblib's loky backend uses cloudpickle (which absorbs closures + local
287
+ classes), but the v0.35 Scorer picklability ADR
288
+ (``methodology/parallelism.md#scorer-picklability``) is a *strict* pickle
289
+ contract — cloudpickle behavior is platform-dependent and the more
290
+ permissive failure modes are harder to debug. Fail fast at
291
+ :func:`evaluate` entry with the same ``TypeError`` style as
292
+ :func:`eval_toolkit._parallel.parallel_map`'s fn-sniff (no new exception
293
+ class — single channel for the picklability contract).
294
+ """
295
+ for sname, scorer in scorers.items():
296
+ try:
297
+ pickle.dumps(scorer)
298
+ except (pickle.PicklingError, AttributeError, TypeError) as exc:
299
+ raise TypeError(
300
+ f"evaluate(n_jobs != 1): scorer {sname!r} "
301
+ f"({type(scorer).__name__}) is not picklable. See "
302
+ f"methodology/parallelism.md#scorer-picklability for the "
303
+ f"contract and worked picklable / broken / fix examples. "
304
+ f"Underlying error: {exc}"
305
+ ) from exc
306
+
307
+
281
308
  def _should_score_slice(scorer: Scorer, slice_name: str) -> bool:
282
309
  """Honor optional slice-aware scorer hooks without widening the base Protocol."""
283
310
  should_score = getattr(scorer, "should_score_slice", None)
@@ -696,6 +723,36 @@ def _run_leakage_phase(
696
723
  )
697
724
 
698
725
 
726
+ # Tuple shape for the flat `(slice × scorer)` work-unit dispatched to
727
+ # parallel_map by `_score_all_slices`. Defined at module scope so workers
728
+ # can pickle the function reference.
729
+ _ScoreOnePairItem = tuple[EvalSlice, str, Scorer, int, int, Literal["raise", "record"]]
730
+ _ScoreOnePairResult = tuple[str, str, dict[str, object], np.ndarray]
731
+
732
+
733
+ def _score_one_pair(item: _ScoreOnePairItem) -> _ScoreOnePairResult:
734
+ """Picklable step function for ``(slice × scorer)`` parallel dispatch.
735
+
736
+ Module-scope so loky workers can serialize the reference (closures over
737
+ enclosing locals would fail :func:`parallel_map`'s pickle sniff). All
738
+ inputs flow through the ``item`` tuple — no captured state.
739
+
740
+ Returns ``(slice_name, scorer_name, result_dict, scores_array)`` so the
741
+ caller can reassemble ``by_slice`` + ``score_cache`` in the original
742
+ iteration order.
743
+ """
744
+ slice_, sname, scorer, n_resamples, seed, on_scorer_error = item
745
+ result = evaluate_scorer_on_slice(
746
+ scorer,
747
+ slice_,
748
+ n_resamples=n_resamples,
749
+ seed=seed,
750
+ on_scorer_error=on_scorer_error,
751
+ )
752
+ scores = np.asarray(result["scores"], dtype=np.float64)
753
+ return slice_.name, sname, result, scores
754
+
755
+
699
756
  def _score_all_slices(
700
757
  scorers: dict[str, Scorer],
701
758
  slices: Sequence[EvalSlice],
@@ -704,6 +761,7 @@ def _score_all_slices(
704
761
  seed: int,
705
762
  paired_diffs: list[tuple[str, str]] | None,
706
763
  on_scorer_error: Literal["raise", "record"],
764
+ n_jobs: int = 1,
707
765
  ) -> tuple[dict[str, dict[str, object]], dict[tuple[str, str], np.ndarray]]:
708
766
  """Score every ``(slice, scorer)`` pair; return ``(by_slice, score_cache)``.
709
767
 
@@ -714,10 +772,17 @@ def _score_all_slices(
714
772
  ``score_cache`` is keyed ``(slice.name, scorer.name)`` and carries the
715
773
  raw score arrays so :func:`_attach_transferred_operating_points` can
716
774
  re-use them without re-calling scorers.
717
- """
718
- by_slice: dict[str, dict[str, object]] = {}
719
- score_cache: dict[tuple[str, str], np.ndarray] = {}
720
775
 
776
+ v0.36 added ``n_jobs``: a flat ``(slice × scorer)`` parallel dispatch
777
+ via :func:`eval_toolkit._parallel.parallel_map`. Default ``1`` preserves
778
+ bit-identical sequential behavior. ``n_jobs != 1`` requires picklable
779
+ scorers per the v0.35 ADR
780
+ (``docs/source/methodology/parallelism.md#scorer-picklability``).
781
+ """
782
+ # Pre-filter skipped pairs (allow-list miss) before dispatching parallel
783
+ # work-units. Logs the same skip messages as the pre-parallel version.
784
+ work_units: list[_ScoreOnePairItem] = []
785
+ skipped: dict[tuple[str, str], dict[str, object]] = {}
721
786
  for slice_ in slices:
722
787
  _logger.info(
723
788
  "[slice %s] n=%d, positives=%d",
@@ -725,32 +790,61 @@ def _score_all_slices(
725
790
  len(slice_.df),
726
791
  int(slice_.y_true.sum()),
727
792
  )
728
- slice_data: dict[str, dict[str, object]] = {}
729
- scores_by_scorer: dict[str, np.ndarray] = {}
730
793
  for sname, scorer in scorers.items():
731
794
  if not _should_score_slice(scorer, slice_.name):
732
795
  reason = f"slice {slice_.name!r} not in scorer allow-list"
733
- slice_data[sname] = _skipped_scorer_result(slice_, reason)
796
+ skipped[(slice_.name, sname)] = _skipped_scorer_result(slice_, reason)
734
797
  _logger.info(" skipped %s: %s", sname, reason)
735
798
  continue
736
- t0 = time.time()
737
- slice_data[sname] = evaluate_scorer_on_slice(
738
- scorer,
739
- slice_,
740
- n_resamples=n_resamples,
741
- seed=seed,
742
- on_scorer_error=on_scorer_error,
743
- )
744
- # If the scorer raised under on_scorer_error="record", scores is [].
745
- # Subsequent paired-diff machinery sees the empty array and will
746
- # short-circuit on the same len-check it already does for skipped
747
- # scorers; no special-case needed.
748
- scores_by_scorer[sname] = np.asarray(slice_data[sname]["scores"], dtype=np.float64)
749
- score_cache[(slice_.name, sname)] = scores_by_scorer[sname]
750
- elapsed = time.time() - t0
751
- pr = slice_data[sname].get("pr_auc")
799
+ work_units.append((slice_, sname, scorer, n_resamples, seed, on_scorer_error))
800
+
801
+ # Parallel scoring. parallel_map at n_jobs=1 is a pure-Python for-loop
802
+ # (Principle #4) — bit-identical to the pre-v0.36 sequential code.
803
+ if work_units:
804
+ t0_total = time.time()
805
+ results = parallel_map(
806
+ _score_one_pair,
807
+ work_units,
808
+ n_jobs=n_jobs,
809
+ description="harness _score_all_slices",
810
+ )
811
+ elapsed_total = time.time() - t0_total
812
+ _logger.info(
813
+ " scored %d (slice, scorer) pairs in %.1fs (n_jobs=%d)",
814
+ len(work_units),
815
+ elapsed_total,
816
+ n_jobs,
817
+ )
818
+ else:
819
+ results = []
820
+
821
+ # Index results for O(1) lookup during reassembly.
822
+ results_by_key: dict[tuple[str, str], _ScoreOnePairResult] = {
823
+ (slice_name, sname): (slice_name, sname, result_dict, scores_arr)
824
+ for slice_name, sname, result_dict, scores_arr in results
825
+ }
826
+
827
+ # Reassemble in the original (slices × scorers.items()) iteration order.
828
+ by_slice: dict[str, dict[str, object]] = {}
829
+ score_cache: dict[tuple[str, str], np.ndarray] = {}
830
+ for slice_ in slices:
831
+ slice_data: dict[str, dict[str, object]] = {}
832
+ scores_by_scorer: dict[str, np.ndarray] = {}
833
+ for sname in scorers:
834
+ key = (slice_.name, sname)
835
+ if key in skipped:
836
+ slice_data[sname] = skipped[key]
837
+ continue
838
+ _, _, result_dict, scores_arr = results_by_key[key]
839
+ slice_data[sname] = result_dict
840
+ # If the scorer raised under on_scorer_error="record", scores_arr is [].
841
+ # Paired-diff machinery short-circuits on the same len-check it uses
842
+ # for skipped scorers; no special-case needed.
843
+ scores_by_scorer[sname] = scores_arr
844
+ score_cache[key] = scores_arr
845
+ pr = result_dict.get("pr_auc")
752
846
  pr_display = f"{pr:.4f}" if isinstance(pr, float) else "N/A"
753
- _logger.info(" %s: PR-AUC=%s (%.1fs)", sname, pr_display, elapsed)
847
+ _logger.info(" %s: PR-AUC=%s", sname, pr_display)
754
848
 
755
849
  diffs = (
756
850
  _compute_paired_diffs(
@@ -789,6 +883,7 @@ def evaluate(
789
883
  on_leakage: Literal["raise", "record", "skip"] = "raise",
790
884
  on_scorer_error: Literal["raise", "record"] = "raise",
791
885
  operating_point_specs: Sequence[OperatingPointSpec] = (),
886
+ n_jobs: int = 1,
792
887
  ) -> RunResult:
793
888
  """Run every scorer on every slice; return a pure :class:`RunResult` (no IO).
794
889
 
@@ -830,6 +925,15 @@ def evaluate(
830
925
  Fit thresholds on one mixed-class slice and apply them to named target
831
926
  slices. Results are attached under each scorer's
832
927
  ``"transferred_operating_points"`` block. Default empty (skip).
928
+ n_jobs : int, optional
929
+ Parallel workers (default 1 — sequential). ``n_jobs > 1`` uses
930
+ joblib loky to parallelize the flat ``(slice × scorer)`` work-unit
931
+ loop in :func:`_score_all_slices` (and the operating-point fit
932
+ phase when ``operating_point_specs`` is non-empty). ``n_jobs=-1``
933
+ uses all cores; ``n_jobs=0`` is rejected. Scorers must be picklable
934
+ when ``n_jobs != 1`` — see
935
+ :doc:`methodology/parallelism` § Scorer picklability for the
936
+ contract + worked examples.
833
937
 
834
938
  Returns
835
939
  -------
@@ -850,6 +954,9 @@ def evaluate(
850
954
  if not slices:
851
955
  raise ValueError("at least one slice required")
852
956
 
957
+ if n_jobs != 1:
958
+ _assert_scorers_picklable(scorers)
959
+
853
960
  config: dict[str, object] = {
854
961
  "n_resamples": n_resamples,
855
962
  "seed": seed,
@@ -872,6 +979,7 @@ def evaluate(
872
979
  seed=seed,
873
980
  paired_diffs=paired_diffs,
874
981
  on_scorer_error=on_scorer_error,
982
+ n_jobs=n_jobs,
875
983
  )
876
984
 
877
985
  if operating_point_specs:
@@ -882,11 +990,45 @@ def evaluate(
882
990
  score_cache=score_cache,
883
991
  scorer_names=list(scorers.keys()),
884
992
  specs=operating_point_specs,
993
+ n_jobs=n_jobs,
885
994
  )
886
995
 
887
996
  return RunResult(run_id=run_id, git_sha=git_sha, config=config, by_slice=by_slice)
888
997
 
889
998
 
999
+ _OpPointFitItem = tuple[
1000
+ str, # spec_name (for reassembly key)
1001
+ str, # fit_slice_name (passed through to fit_operating_points)
1002
+ str, # scorer_name
1003
+ np.ndarray, # fit_y_true
1004
+ np.ndarray, # fit_scores
1005
+ Sequence[ThresholdSelector], # spec.selectors (passed through to fit_operating_points)
1006
+ ]
1007
+ _OpPointFitResult = tuple[str, str, object] # (spec_name, scorer_name, fitted | error_dict)
1008
+
1009
+
1010
+ def _fit_one_op_point_pair(item: _OpPointFitItem) -> _OpPointFitResult:
1011
+ """Picklable step function for ``(spec × scorer)`` operating-point fitting.
1012
+
1013
+ Module-scope so loky workers can serialize the reference. All inputs flow
1014
+ through the ``item`` tuple. Returns ``(spec_name, scorer_name, fitted)``
1015
+ where ``fitted`` is either the :func:`fit_operating_points` result or a
1016
+ ``{"error": str}`` dict matching the sequential code path.
1017
+ """
1018
+ spec_name, fit_slice_name, scorer_name, y_true, fit_scores, selectors = item
1019
+ try:
1020
+ fitted = fit_operating_points(
1021
+ y_true,
1022
+ fit_scores,
1023
+ selectors,
1024
+ fitted_on_slice=fit_slice_name,
1025
+ scorer_name=scorer_name,
1026
+ )
1027
+ except (ValueError, RuntimeError) as exc:
1028
+ return spec_name, scorer_name, {"error": str(exc)}
1029
+ return spec_name, scorer_name, fitted
1030
+
1031
+
890
1032
  def _attach_transferred_operating_points(
891
1033
  *,
892
1034
  by_slice: dict[str, dict[str, object]],
@@ -894,34 +1036,73 @@ def _attach_transferred_operating_points(
894
1036
  score_cache: Mapping[tuple[str, str], np.ndarray],
895
1037
  scorer_names: Sequence[str],
896
1038
  specs: Sequence[OperatingPointSpec],
1039
+ n_jobs: int = 1,
897
1040
  ) -> None:
898
- """Mutate ``by_slice`` to attach opt-in cross-slice operating-point metrics."""
1041
+ """Mutate ``by_slice`` to attach opt-in cross-slice operating-point metrics.
1042
+
1043
+ v0.36 added ``n_jobs``: parallelizes the ``(spec × scorer)`` fit phase
1044
+ via :func:`eval_toolkit._parallel.parallel_map`. The apply phase
1045
+ (writing into ``by_slice``) stays sequential — fitting dominates runtime.
1046
+ Default ``n_jobs=1`` preserves bit-identical sequential behavior.
1047
+ """
1048
+ # Pre-flight: handle "fit slice not found" errors (these short-circuit the
1049
+ # entire spec) + collect valid fit work-units. Tracks pre-conditions
1050
+ # ("fit scorer skipped") as separate state so the parallel dispatch only
1051
+ # carries actual work.
1052
+ fit_work: list[_OpPointFitItem] = []
1053
+ fit_skip_reasons: dict[tuple[str, str], dict[str, object]] = {}
1054
+ specs_with_valid_fit: list[OperatingPointSpec] = []
1055
+ names_per_spec: dict[str, list[str]] = {}
1056
+
899
1057
  for spec in specs:
900
1058
  names = list(spec.scorer_names) if spec.scorer_names else list(scorer_names)
1059
+ names_per_spec[spec.name] = names
901
1060
  if spec.fit_slice not in slices_by_name:
902
1061
  _record_spec_error(by_slice, spec, names, f"fit slice {spec.fit_slice!r} not found")
903
1062
  continue
904
-
1063
+ specs_with_valid_fit.append(spec)
905
1064
  fit_slice = slices_by_name[spec.fit_slice]
906
- fitted_by_scorer: dict[str, object] = {}
907
1065
  for scorer_name in names:
908
1066
  fit_scores = score_cache.get((spec.fit_slice, scorer_name))
909
1067
  if fit_scores is None or len(fit_scores) != len(fit_slice.y_true):
910
- fitted_by_scorer[scorer_name] = {
1068
+ fit_skip_reasons[(spec.name, scorer_name)] = {
911
1069
  "error": "fit scorer skipped, errored, or produced no scores"
912
1070
  }
913
1071
  continue
914
- try:
915
- fitted_by_scorer[scorer_name] = fit_operating_points(
1072
+ fit_work.append(
1073
+ (
1074
+ spec.name,
1075
+ spec.fit_slice,
1076
+ scorer_name,
916
1077
  fit_slice.y_true,
917
1078
  fit_scores,
918
1079
  spec.selectors,
919
- fitted_on_slice=spec.fit_slice,
920
- scorer_name=scorer_name,
921
1080
  )
922
- except (ValueError, RuntimeError) as exc:
923
- fitted_by_scorer[scorer_name] = {"error": str(exc)}
1081
+ )
1082
+
1083
+ # Parallel fit phase. parallel_map at n_jobs=1 is a pure-Python for-loop
1084
+ # (Principle #4) — bit-identical to the pre-v0.36 sequential code.
1085
+ fit_results: list[_OpPointFitResult] = (
1086
+ parallel_map(
1087
+ _fit_one_op_point_pair,
1088
+ fit_work,
1089
+ n_jobs=n_jobs,
1090
+ description="harness _attach_transferred_operating_points (fit)",
1091
+ )
1092
+ if fit_work
1093
+ else []
1094
+ )
1095
+
1096
+ # Index by (spec_name, scorer_name) for O(1) lookup in the apply phase.
1097
+ fitted_by_pair: dict[tuple[str, str], object] = {
1098
+ (spec_name, scorer_name): fitted for spec_name, scorer_name, fitted in fit_results
1099
+ }
1100
+ fitted_by_pair.update(fit_skip_reasons)
924
1101
 
1102
+ # Sequential apply phase — preserves the original by_slice mutation order
1103
+ # and the schema of error / skipped markers.
1104
+ for spec in specs_with_valid_fit:
1105
+ names = names_per_spec[spec.name]
925
1106
  for target_name in spec.apply_slices:
926
1107
  if target_name not in slices_by_name:
927
1108
  _record_spec_error(
@@ -939,7 +1120,7 @@ def _attach_transferred_operating_points(
939
1120
  spec_block: dict[str, object] = {}
940
1121
  transfer_block[spec.name] = spec_block
941
1122
 
942
- fitted = fitted_by_scorer.get(scorer_name)
1123
+ fitted = fitted_by_pair.get((spec.name, scorer_name))
943
1124
  if not isinstance(fitted, dict) or "error" in fitted:
944
1125
  spec_block["error"] = (
945
1126
  str(fitted.get("error", "threshold fitting failed"))
@@ -1099,6 +1280,7 @@ def evaluate_folded(
1099
1280
  on_scorer_error: Literal["raise", "record"] = "raise",
1100
1281
  eval_split_names: Sequence[str] = ("test",),
1101
1282
  summary_metrics: Sequence[str] = ("pr_auc", "roc_auc"),
1283
+ n_jobs: int = 1,
1102
1284
  ) -> RunResult:
1103
1285
  """Run a fold aggregator: ``Splitter × seeds → RunResult`` with CV-CI summary.
1104
1286
 
@@ -1128,6 +1310,15 @@ def evaluate_folded(
1128
1310
  RNG seeds for multi-seed × CV. Default ``(42,)`` (single seed).
1129
1311
  n_resamples, paired_diffs, leakage_checks, on_leakage, on_scorer_error :
1130
1312
  Forwarded to :func:`evaluate` per fold.
1313
+ n_jobs : int, optional
1314
+ Parallel workers (default 1 — sequential). Forwarded to
1315
+ :func:`evaluate` per fold; parallelizes the inner
1316
+ ``(slice × scorer)`` work-unit loop within each fold. Folds
1317
+ themselves run sequentially to keep determinism + traceback
1318
+ fidelity simple; for fold-level parallelism, consider an external
1319
+ ``joblib.Parallel`` wrapper at the call site. See
1320
+ :doc:`methodology/parallelism` § Scorer picklability for the
1321
+ Scorer picklability contract when ``n_jobs != 1``.
1131
1322
  eval_split_names : sequence of str, optional
1132
1323
  Subset of each fold-dict's keys to actually evaluate. Default
1133
1324
  ``("test",)`` — train sets are skipped (eval-only K-fold). Pass
@@ -1183,6 +1374,7 @@ def evaluate_folded(
1183
1374
  leakage_checks=leakage_checks,
1184
1375
  on_leakage=on_leakage,
1185
1376
  on_scorer_error=on_scorer_error,
1377
+ n_jobs=n_jobs,
1186
1378
  )
1187
1379
  by_fold[fold_id] = fold_result
1188
1380