eval-toolkit 0.34.0__tar.gz → 0.37.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/.gitignore +1 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/CHANGELOG.md +140 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/PKG-INFO +3 -1
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/pyproject.toml +7 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/__init__.py +2 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/_version.py +1 -1
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/calibration.py +97 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/embeddings.py +7 -4
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/harness.py +227 -35
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/leakage.py +142 -2
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/protocols.py +10 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/seeds.py +11 -7
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/golden/public_api/snapshot.json +18 -3
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_calibration_unit.py +126 -0
- eval_toolkit-0.37.0/tests/test_harness_parallelism.py +266 -0
- eval_toolkit-0.37.0/tests/test_tokenization_leakage_check.py +194 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/LICENSE +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/README.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/STYLE.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/docs/archive/README.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/docs/research/README.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/conftest.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/strategies.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_analysis.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_claims.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_cli.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_config.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_embeddings.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_loaders.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_logging.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_manifest.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_parallel.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_paths.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_public_api.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_schemas.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_splits.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_thresholds.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-0.34.0 → eval_toolkit-0.37.0}/tests/test_v09_contracts.py +0 -0
|
@@ -7,6 +7,146 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.37.0] — 2026-05-18 — TokenizationLeakageCheck + per-module coverage floors
|
|
11
|
+
|
|
12
|
+
Two-issue bundle (#35 + #37) plus housekeeping closure of stale items
|
|
13
|
+
(PR #27, #38) that turned out to have been resolved in v0.33.x without
|
|
14
|
+
being checked off. Roadmap refresh in `3d40796` (this minor's
|
|
15
|
+
predecessor commit) replaced the version-keyed candidate list with
|
|
16
|
+
issue-keyed tracking, so this class of stale-roadmap bug shouldn't
|
|
17
|
+
recur.
|
|
18
|
+
|
|
19
|
+
### Added
|
|
20
|
+
|
|
21
|
+
- **`eval_toolkit.leakage.TokenizationLeakageCheck`** — new within-split
|
|
22
|
+
`LeakageCheck` that dedups on tokenizer output rather than raw text.
|
|
23
|
+
Catches encoding-obfuscated dupes that survive
|
|
24
|
+
`NormalizedFormLeakageCheck` but collapse to identical `input_ids`
|
|
25
|
+
under a transformer's BPE / SentencePiece / WordPiece tokenizer.
|
|
26
|
+
Accepts any `Callable[[str], Mapping[str, object]]` returning HF-style
|
|
27
|
+
output with an `"input_ids"` key — does **not** import `transformers`
|
|
28
|
+
itself; consumers pass an already-instantiated tokenizer. Default
|
|
29
|
+
severity `"error"` (mirrors `NormalizedFormLeakageCheck`). Closes #35.
|
|
30
|
+
- New optional install extra **`[transformers]`** (`transformers>=4.0`).
|
|
31
|
+
Intentionally **not** in `[all]` / `[dev]` — mirrors the `[embeddings]`
|
|
32
|
+
precedent from v0.33.1 to keep contributor setup small (transformers
|
|
33
|
+
transitively pulls torch ~700MB).
|
|
34
|
+
|
|
35
|
+
### Tests
|
|
36
|
+
|
|
37
|
+
- **Per-module coverage floors restored.** `scripts/check_module_floors.py`
|
|
38
|
+
enforces an 85 % per-file floor (coverage.py natively only ships
|
|
39
|
+
global `--fail-under`). Hooked into `make coverage` via a post-pytest
|
|
40
|
+
invocation. Closes #37.
|
|
41
|
+
- **`# pragma: no cover` on optional-dep-active paths** in `seeds.py`
|
|
42
|
+
(torch) and `embeddings.py` (sentence-transformers). Reflects the
|
|
43
|
+
reality that these branches execute in user code, not CI. Both
|
|
44
|
+
modules now report 100 % coverage; previously sat at ~70 % which
|
|
45
|
+
obscured per-module floor enforcement.
|
|
46
|
+
|
|
47
|
+
### Fixed
|
|
48
|
+
|
|
49
|
+
- **`make coverage` Makefile parity with PR CI.** PR #27 (external
|
|
50
|
+
contributor @leno23, draft) proposed adding `-m "not monte_carlo and
|
|
51
|
+
not benchmark"` to the `coverage` target. Audit found the same fix
|
|
52
|
+
had landed in v0.33.0 commit `9e375a8` ahead of the PR being filed;
|
|
53
|
+
closed PR #27 as superseded with thanks. No change in this release.
|
|
54
|
+
|
|
55
|
+
### Closed (already-resolved)
|
|
56
|
+
|
|
57
|
+
- **#38 — CI doctests for `paths.py` / `provenance.py` / `seeds.py` /
|
|
58
|
+
`docs.py`.** All four modules were added to `.doctest-modules` in
|
|
59
|
+
`a26fd44` (2026-05-14, v0.32.x era); 7 doctests collected across the
|
|
60
|
+
named modules in current CI. Closed as already-resolved.
|
|
61
|
+
|
|
62
|
+
### Test coverage
|
|
63
|
+
|
|
64
|
+
Test count 1376 → 1387 (+11). Aggregate 95.65 % → 95.69 %. All 28
|
|
65
|
+
modules ≥ 90 % individually post-pragma.
|
|
66
|
+
|
|
67
|
+
## [0.36.0] — 2026-05-18 — harness parallelization (#29, #30) + Node 24 actions
|
|
68
|
+
|
|
69
|
+
Wires the v0.34.0 unified parallelism pattern into the harness evaluation
|
|
70
|
+
loop. `evaluate()` and `evaluate_folded()` now accept an `n_jobs` kwarg
|
|
71
|
+
(default `1` preserves bit-identical sequential behavior); under
|
|
72
|
+
`n_jobs != 1`, the `(slice × scorer)` work-unit loop in
|
|
73
|
+
`_score_all_slices` and the `(spec × scorer)` fit phase in
|
|
74
|
+
`_attach_transferred_operating_points` dispatch through joblib loky via
|
|
75
|
+
the existing `_parallel.parallel_map` helper.
|
|
76
|
+
|
|
77
|
+
### Added
|
|
78
|
+
|
|
79
|
+
- `evaluate(..., n_jobs: int = 1)` and `evaluate_folded(..., n_jobs: int = 1)`
|
|
80
|
+
— keyword-only kwarg per Principle #3 of `methodology/parallelism.md`.
|
|
81
|
+
`n_jobs=1` (default) runs the existing pure-Python sequential loop
|
|
82
|
+
(Principle #4 — bit-identical to v0.35). `n_jobs > 1` uses joblib loky;
|
|
83
|
+
`n_jobs=-1` uses all cores; `n_jobs=0` is rejected. Closes #29, #30.
|
|
84
|
+
- Strict-pickle Scorer sniff at `evaluate()` entry when `n_jobs != 1`:
|
|
85
|
+
raises a clean `TypeError` referencing
|
|
86
|
+
`methodology/parallelism.md#scorer-picklability` with the underlying
|
|
87
|
+
pickle error attached. Reuses the v0.35 ADR contract; no new exception
|
|
88
|
+
class. Catches non-picklable scorers up front rather than relying on
|
|
89
|
+
joblib's more permissive cloudpickle path (which would silently absorb
|
|
90
|
+
closures and obscure the contract documented in v0.35).
|
|
91
|
+
|
|
92
|
+
### Internal
|
|
93
|
+
|
|
94
|
+
- New module-scope step functions `_score_one_pair` and
|
|
95
|
+
`_fit_one_op_point_pair` in `harness.py` (picklable; required by loky).
|
|
96
|
+
- `_score_all_slices` and `_attach_transferred_operating_points`
|
|
97
|
+
refactored to use flat work-unit dispatch via `parallel_map`.
|
|
98
|
+
|
|
99
|
+
### Tests
|
|
100
|
+
|
|
101
|
+
- New `tests/test_harness_parallelism.py` (7 tests): bit-identical
|
|
102
|
+
reproducibility across `n_jobs=1` vs `n_jobs=2` for `evaluate`
|
|
103
|
+
(basic, paired-diffs, operating-points), `evaluate_folded`,
|
|
104
|
+
picklability rejection (closure scorer), `n_jobs=0` rejection,
|
|
105
|
+
`n_jobs=-1` smoke. All 66 harness tests pass (7 new + 59 existing).
|
|
106
|
+
|
|
107
|
+
### Infrastructure
|
|
108
|
+
|
|
109
|
+
- Bumped `actions/upload-artifact` and `actions/download-artifact` from
|
|
110
|
+
`@v5` → `@v6` across `publish.yml` / `nightly-mc.yml` /
|
|
111
|
+
`nightly-benchmarks.yml`. The v6 majors run on Node.js 24
|
|
112
|
+
(GitHub deprecates Node 20 actions from 2026-06-02). Other pinned
|
|
113
|
+
actions (`checkout@v6`, `setup-uv@v8.1.0`, `codeql-action@v3`,
|
|
114
|
+
`deploy-pages@v4`, `upload-pages-artifact@v3`) were not flagged in
|
|
115
|
+
the v0.35 publish annotation and are deferred to a separate audit.
|
|
116
|
+
|
|
117
|
+
## [0.35.0] — 2026-05-18 — `fit_temperature_binary` + Scorer picklability ADR
|
|
118
|
+
|
|
119
|
+
Small, additive release. Adds a binary-classification calibration helper
|
|
120
|
+
that lets consumers drop the ~50 LOC scalar-proba adapter many were
|
|
121
|
+
carrying, plus a design ADR that unblocks the v0.36 harness / operating-
|
|
122
|
+
point parallelization work (#29, #30) without re-litigating picklability.
|
|
123
|
+
|
|
124
|
+
### Added
|
|
125
|
+
|
|
126
|
+
- `eval_toolkit.fit_temperature_binary(y_true, y_score)` — scalar-proba
|
|
127
|
+
adapter for the multi-class `fit_temperature` fitter. Converts `(n,)`
|
|
128
|
+
probabilities of class 1 to a 2-column logit array via clipped logit
|
|
129
|
+
(`[0, logit(p)]` so softmax column 1 reproduces `p`), delegates to the
|
|
130
|
+
deployment-quality fitter, and returns `(T_opt, apply)` where
|
|
131
|
+
`apply: (n,) -> (n,)` does scalar-in / scalar-out T-scaling. Unlike
|
|
132
|
+
`fit_temperature_oracle`, no warning — the contract assumes val / test
|
|
133
|
+
separation (deployment-quality calibration, not fit-on-test). Closes
|
|
134
|
+
#28.
|
|
135
|
+
|
|
136
|
+
### Documentation
|
|
137
|
+
|
|
138
|
+
- `docs/source/methodology/parallelism.md` — new `## Scorer picklability`
|
|
139
|
+
sub-section documenting the Scorer protocol's picklability contract
|
|
140
|
+
for `n_jobs > 1` usage. Includes worked picklable / broken-closure /
|
|
141
|
+
fix examples plus a list of common non-picklable patterns to watch for
|
|
142
|
+
in user-supplied Scorers (closures, lambdas on instances, local-scope
|
|
143
|
+
classes, attributes holding live sockets / file handles). Anchors on
|
|
144
|
+
the existing v0.34.0 `parallel_map` pickle sniff + `TypeError`
|
|
145
|
+
channel — no new exception class. Unblocks v0.36 implementation of
|
|
146
|
+
#29 and #30.
|
|
147
|
+
- `eval_toolkit.protocols.Scorer` docstring — Notes block pointing at
|
|
148
|
+
the new methodology section.
|
|
149
|
+
|
|
10
150
|
## [0.34.0] — 2026-05-17 — Phase 4 stats unblockers + unified parallelism + cookbook (BREAKING)
|
|
11
151
|
|
|
12
152
|
Closes all 7 open backlog issues in one consumer-closing release. Also
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 0.34.0
|
|
3
|
+
Version: 0.37.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -69,6 +69,8 @@ Requires-Dist: matplotlib>=3.8; extra == 'plotting'
|
|
|
69
69
|
Requires-Dist: pillow>=10.0; extra == 'plotting'
|
|
70
70
|
Provides-Extra: property
|
|
71
71
|
Requires-Dist: hypothesis>=6.100; extra == 'property'
|
|
72
|
+
Provides-Extra: transformers
|
|
73
|
+
Requires-Dist: transformers>=4.0; extra == 'transformers'
|
|
72
74
|
Provides-Extra: validation
|
|
73
75
|
Provides-Extra: yaml
|
|
74
76
|
Requires-Dist: pyyaml>=6.0; extra == 'yaml'
|
|
@@ -56,6 +56,13 @@ parquet = ["pyarrow>=15.0"]
|
|
|
56
56
|
# setup small. The canonical semantic-dedup recipe (all-MiniLM-L6-v2 +
|
|
57
57
|
# cosine@0.80) is what this factory pre-wires for callers.
|
|
58
58
|
embeddings = ["sentence-transformers>=3.0"]
|
|
59
|
+
# v0.37.0: TokenizationLeakageCheck — HF-tokenizer-aware dedup.
|
|
60
|
+
# transformers transitively pulls torch + tokenizers (~700MB) so we
|
|
61
|
+
# follow the [embeddings] precedent: opt-in only, NOT in [all] / [dev].
|
|
62
|
+
# Consumers pass an already-instantiated tokenizer callable; the check
|
|
63
|
+
# itself does not import transformers, so the optional install is
|
|
64
|
+
# strictly for callers wanting AutoTokenizer.from_pretrained(...).
|
|
65
|
+
transformers = ["transformers>=4.0"]
|
|
59
66
|
# DEPRECATED (announced v0.30.1, removal v0.33.0).
|
|
60
67
|
#
|
|
61
68
|
# Retained as a transitive no-op so `pip install eval-toolkit[validation]`
|
|
@@ -87,6 +87,7 @@ _EXPORTS: dict[str, str] = {
|
|
|
87
87
|
"fit_isotonic_calibrator": "eval_toolkit.calibration",
|
|
88
88
|
"fit_platt_calibrator": "eval_toolkit.calibration",
|
|
89
89
|
"fit_temperature": "eval_toolkit.calibration",
|
|
90
|
+
"fit_temperature_binary": "eval_toolkit.calibration",
|
|
90
91
|
"fit_temperature_oracle": "eval_toolkit.calibration",
|
|
91
92
|
"reliability_curve": "eval_toolkit.calibration",
|
|
92
93
|
"reliability_diagram_data": "eval_toolkit.calibration",
|
|
@@ -146,6 +147,7 @@ _EXPORTS: dict[str, str] = {
|
|
|
146
147
|
"NearDuplicateCheck": "eval_toolkit.leakage",
|
|
147
148
|
"NormalizedFormLeakageCheck": "eval_toolkit.leakage",
|
|
148
149
|
"TemporalLeakageCheck": "eval_toolkit.leakage",
|
|
150
|
+
"TokenizationLeakageCheck": "eval_toolkit.leakage",
|
|
149
151
|
"run_leakage_checks": "eval_toolkit.leakage",
|
|
150
152
|
# --- loaders ---
|
|
151
153
|
"DataFrameLoader": "eval_toolkit.loaders",
|
|
@@ -57,6 +57,7 @@ __all__ = [
|
|
|
57
57
|
"fit_isotonic_calibrator",
|
|
58
58
|
"fit_platt_calibrator",
|
|
59
59
|
"fit_temperature",
|
|
60
|
+
"fit_temperature_binary",
|
|
60
61
|
"fit_temperature_oracle",
|
|
61
62
|
"maximum_calibration_error",
|
|
62
63
|
"reliability_curve",
|
|
@@ -1038,6 +1039,102 @@ def _negative_log_likelihood(t: float, logits: np.ndarray, labels: np.ndarray) -
|
|
|
1038
1039
|
return float(-log_probs[np.arange(len(labels)), labels].mean())
|
|
1039
1040
|
|
|
1040
1041
|
|
|
1042
|
+
def fit_temperature_binary(
|
|
1043
|
+
y_true: np.ndarray,
|
|
1044
|
+
y_score: np.ndarray,
|
|
1045
|
+
*,
|
|
1046
|
+
bounds: tuple[float, float] = (0.05, 20.0),
|
|
1047
|
+
) -> tuple[float, Callable[[np.ndarray], np.ndarray]]:
|
|
1048
|
+
r"""Binary-probability adapter for :func:`fit_temperature` (Guo et al. 2017 [#guo]_).
|
|
1049
|
+
|
|
1050
|
+
Fits a scalar T > 0 on *validation* probabilities of class 1 and returns
|
|
1051
|
+
both T and a callable that applies the same T-scaling to test
|
|
1052
|
+
probabilities. Internally:
|
|
1053
|
+
|
|
1054
|
+
1. Clips ``y_score`` to ``[1e-7, 1-1e-7]`` for finite logit inversion.
|
|
1055
|
+
2. Builds a 2-column logit array ``[0, logit(p)]`` so softmax column 1
|
|
1056
|
+
reproduces ``p`` exactly.
|
|
1057
|
+
3. Delegates to :func:`fit_temperature` for the bounded NLL minimization.
|
|
1058
|
+
4. Returns ``(T, apply)`` where ``apply(p_test) = sigmoid(logit(p_test)/T)``.
|
|
1059
|
+
|
|
1060
|
+
Unlike :func:`fit_temperature_oracle`, this does NOT emit a warning — the
|
|
1061
|
+
contract is that ``y_true`` / ``y_score`` come from a held-out validation
|
|
1062
|
+
set and ``apply`` is invoked on a separate test set (deployment-quality
|
|
1063
|
+
calibration, not fit-on-test).
|
|
1064
|
+
|
|
1065
|
+
Parameters
|
|
1066
|
+
----------
|
|
1067
|
+
y_true : np.ndarray, shape (n,)
|
|
1068
|
+
Binary validation labels in {0, 1}.
|
|
1069
|
+
y_score : np.ndarray, shape (n,)
|
|
1070
|
+
Validation predicted probabilities of class 1, in [0, 1]. Values at
|
|
1071
|
+
the extremes are clipped to ``[1e-7, 1 - 1e-7]``.
|
|
1072
|
+
bounds : tuple of float, optional
|
|
1073
|
+
``(lo, hi)`` bracket for T. Default ``(0.05, 20.0)``, matches
|
|
1074
|
+
:func:`fit_temperature`.
|
|
1075
|
+
|
|
1076
|
+
Returns
|
|
1077
|
+
-------
|
|
1078
|
+
tuple
|
|
1079
|
+
``(T_optimal, apply)`` where ``apply: (n,) -> (n,)`` maps any input
|
|
1080
|
+
probability array through :math:`\sigma(\mathrm{logit}(p) / T)`.
|
|
1081
|
+
|
|
1082
|
+
Raises
|
|
1083
|
+
------
|
|
1084
|
+
ValueError
|
|
1085
|
+
On shape mismatch, empty input, non-finite scores, or single-class
|
|
1086
|
+
``y_true``.
|
|
1087
|
+
RuntimeError
|
|
1088
|
+
If the bounded scalar optimizer fails to converge.
|
|
1089
|
+
|
|
1090
|
+
Examples
|
|
1091
|
+
--------
|
|
1092
|
+
>>> import numpy as np
|
|
1093
|
+
>>> rng = np.random.default_rng(0)
|
|
1094
|
+
>>> n = 500
|
|
1095
|
+
>>> y_val = rng.binomial(1, 0.3, size=n).astype(int)
|
|
1096
|
+
>>> p_val = np.clip(y_val * 0.6 + rng.normal(0, 0.2, n), 0.01, 0.99)
|
|
1097
|
+
>>> T, apply = fit_temperature_binary(y_val, p_val)
|
|
1098
|
+
>>> T > 0
|
|
1099
|
+
True
|
|
1100
|
+
>>> p_test = np.array([0.1, 0.5, 0.9])
|
|
1101
|
+
>>> apply(p_test).shape == (3,)
|
|
1102
|
+
True
|
|
1103
|
+
|
|
1104
|
+
See Also
|
|
1105
|
+
--------
|
|
1106
|
+
fit_temperature : underlying multi-class fitter (operates on 2-col logits)
|
|
1107
|
+
fit_temperature_oracle : diagnostic-only variant that fits T on the same
|
|
1108
|
+
probabilities it scores
|
|
1109
|
+
|
|
1110
|
+
References
|
|
1111
|
+
----------
|
|
1112
|
+
.. [#guo] Guo, C., Pleiss, G., Sun, Y., & Weinberger, K. Q. "On
|
|
1113
|
+
calibration of modern neural networks." ICML 2017. arXiv:1706.04599.
|
|
1114
|
+
"""
|
|
1115
|
+
y_true_arr, y_score_arr = _validate_calibrator_inputs(y_true, y_score)
|
|
1116
|
+
|
|
1117
|
+
# Build 2-col logits [0, logit(p)] so softmax([0, logit(p)])[1] == p exactly.
|
|
1118
|
+
s_clipped = np.clip(y_score_arr, _SCORE_CLIP_LO, _SCORE_CLIP_HI)
|
|
1119
|
+
logit_pos = np.log(s_clipped / (1.0 - s_clipped))
|
|
1120
|
+
val_logits_2col = np.column_stack([np.zeros_like(logit_pos), logit_pos])
|
|
1121
|
+
|
|
1122
|
+
result = fit_temperature(val_logits_2col, y_true_arr, bounds=bounds)
|
|
1123
|
+
t_optimal = float(result["temperature"])
|
|
1124
|
+
|
|
1125
|
+
def apply(scores: np.ndarray) -> np.ndarray:
|
|
1126
|
+
arr = np.asarray(scores, dtype=float).ravel()
|
|
1127
|
+
if not np.isfinite(arr).all():
|
|
1128
|
+
raise ValueError("scores contains NaN or inf")
|
|
1129
|
+
clipped = np.clip(arr, _SCORE_CLIP_LO, _SCORE_CLIP_HI)
|
|
1130
|
+
logit = np.log(clipped / (1.0 - clipped))
|
|
1131
|
+
scaled = logit / t_optimal
|
|
1132
|
+
out: np.ndarray = (1.0 / (1.0 + np.exp(-scaled))).astype(float)
|
|
1133
|
+
return out
|
|
1134
|
+
|
|
1135
|
+
return t_optimal, apply
|
|
1136
|
+
|
|
1137
|
+
|
|
1041
1138
|
def fit_temperature_oracle(
|
|
1042
1139
|
y_true: np.ndarray, y_score: np.ndarray
|
|
1043
1140
|
) -> tuple[float, Callable[[np.ndarray], np.ndarray]]:
|
|
@@ -88,15 +88,18 @@ def make_minilm_embedder(
|
|
|
88
88
|
"Install via: pip install eval-toolkit[embeddings]"
|
|
89
89
|
) from e
|
|
90
90
|
|
|
91
|
-
_logger.debug(
|
|
91
|
+
# sentence-transformers-active path: excluded from CI coverage
|
|
92
|
+
# because [embeddings] is intentionally kept out of [dev]/[all]
|
|
93
|
+
# (transitive torch cost ~700MB per the v0.33.1 design note).
|
|
94
|
+
_logger.debug( # pragma: no cover
|
|
92
95
|
"loading SentenceTransformer model_id=%s device=%s batch_size=%d",
|
|
93
96
|
model_id,
|
|
94
97
|
device,
|
|
95
98
|
batch_size,
|
|
96
99
|
)
|
|
97
|
-
model = SentenceTransformer(model_id, device=device)
|
|
100
|
+
model = SentenceTransformer(model_id, device=device) # pragma: no cover
|
|
98
101
|
|
|
99
|
-
def embedder(texts: Sequence[str]) -> np.ndarray:
|
|
102
|
+
def embedder(texts: Sequence[str]) -> np.ndarray: # pragma: no cover
|
|
100
103
|
result = model.encode(
|
|
101
104
|
list(texts),
|
|
102
105
|
convert_to_numpy=True,
|
|
@@ -105,4 +108,4 @@ def make_minilm_embedder(
|
|
|
105
108
|
)
|
|
106
109
|
return np.asarray(result, dtype=np.float64)
|
|
107
110
|
|
|
108
|
-
return embedder
|
|
111
|
+
return embedder # pragma: no cover
|