eval-toolkit 0.33.0__tar.gz → 0.34.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/CHANGELOG.md +140 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/PKG-INFO +3 -1
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/pyproject.toml +7 -1
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/__init__.py +5 -0
- eval_toolkit-0.34.0/src/eval_toolkit/_parallel.py +129 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/_version.py +1 -1
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/bootstrap.py +406 -120
- eval_toolkit-0.34.0/src/eval_toolkit/embeddings.py +108 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/manifest.py +32 -1
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/thresholds.py +129 -0
- eval_toolkit-0.34.0/tests/golden/data/dedup_holdout.jsonl +51 -0
- eval_toolkit-0.34.0/tests/golden/data/dedup_holdout_expected.json +44 -0
- eval_toolkit-0.34.0/tests/golden/data/dedup_holdout_provenance.md +89 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/golden/public_api/snapshot.json +36 -9
- eval_toolkit-0.34.0/tests/golden/test_dedup_holdout_calibration.py +212 -0
- eval_toolkit-0.34.0/tests/test_block_bootstrap_on_folds.py +114 -0
- eval_toolkit-0.34.0/tests/test_bootstrap_njobs.py +165 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_bootstrap_unit.py +67 -0
- eval_toolkit-0.34.0/tests/test_embeddings.py +87 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_manifest.py +50 -0
- eval_toolkit-0.34.0/tests/test_parallel.py +150 -0
- eval_toolkit-0.34.0/tests/test_recall_at_fpr.py +97 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/.gitignore +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/LICENSE +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/README.md +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/STYLE.md +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/docs/archive/README.md +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/docs/research/README.md +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/conftest.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/strategies.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_analysis.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_claims.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_cli.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_config.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_loaders.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_logging.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_paths.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_public_api.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_schemas.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_splits.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_thresholds.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-0.33.0 → eval_toolkit-0.34.0}/tests/test_v09_contracts.py +0 -0
|
@@ -7,6 +7,146 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.34.0] — 2026-05-17 — Phase 4 stats unblockers + unified parallelism + cookbook (BREAKING)
|
|
11
|
+
|
|
12
|
+
Closes all 7 open backlog issues in one consumer-closing release. Also
|
|
13
|
+
lands the toolkit's first unified parallelism story: a shared internal
|
|
14
|
+
`parallel_map` helper + `n_jobs` kwarg on all 5 public bootstrap
|
|
15
|
+
functions. Future iterations will mechanically extend the same helper to
|
|
16
|
+
harness + operating-points (follow-up issues filed).
|
|
17
|
+
|
|
18
|
+
### Breaking changes
|
|
19
|
+
|
|
20
|
+
- **`eval_toolkit.bootstrap.mde_from_ci`**: parameter renamed from
|
|
21
|
+
`paired` to `ci` and type widened to `BootstrapCI | PairedBootstrapCI`
|
|
22
|
+
(was `PairedBootstrapCI`-only). Positional callers unaffected; keyword
|
|
23
|
+
callers must update:
|
|
24
|
+
```python
|
|
25
|
+
mde_from_ci(paired=x) # v0.33.x and earlier
|
|
26
|
+
mde_from_ci(ci=x) # v0.34.0+
|
|
27
|
+
mde_from_ci(x) # positional form, unchanged
|
|
28
|
+
```
|
|
29
|
+
This is a one-time exception to the repo's 2-minor-version deprecation
|
|
30
|
+
warning policy (justification + criteria recorded in
|
|
31
|
+
[`docs/source/DEPRECATION.md`](docs/source/DEPRECATION.md#one-time-exceptions-to-the-2-minor-version-warning-policy)).
|
|
32
|
+
Notification issues filed on the 2 known toolkit consumers
|
|
33
|
+
(`prompt-injection-detection-submission`, `post-transformers`). Audit
|
|
34
|
+
confirms both use positional form — zero actual breakage in practice.
|
|
35
|
+
|
|
36
|
+
### Added
|
|
37
|
+
|
|
38
|
+
- `eval_toolkit.bootstrap.block_bootstrap_on_folds` — CV-aware sibling
|
|
39
|
+
to `cv_clt_ci`; resamples K folds with replacement; returns
|
|
40
|
+
`BootstrapCI(method="block_bootstrap")`. The A-008 sensitivity-check
|
|
41
|
+
pattern (block-bootstrap halfwidth / cv_clt halfwidth > 1.5 flags
|
|
42
|
+
LODO non-exchangeability) is the prototypical use. Closes #21.
|
|
43
|
+
- `eval_toolkit.RecallAtFprResult` (frozen dataclass) +
|
|
44
|
+
`eval_toolkit.recall_at_fpr(y_true, y_score, target_fpr)` — one-shot
|
|
45
|
+
recall + actual_fpr + FP/TN at the smallest threshold meeting FPR ≤
|
|
46
|
+
target. Use `.to_dict()` for JSON / pandas-row integration. Closes #9.
|
|
47
|
+
- New optional `n_jobs: int = 1` kwarg on 5 bootstrap functions:
|
|
48
|
+
`bootstrap_ci`, `paired_bootstrap_diff`, `paired_bootstrap_ece_diff`,
|
|
49
|
+
`paired_bootstrap_op_point_diff`, `paired_mde` (via `_bootstrap_t_ci`
|
|
50
|
+
internal helper). Backed by a new internal `_parallel.parallel_map`
|
|
51
|
+
helper (joblib loky; not exported). `n_jobs > 1` reproduces `n_jobs=1`
|
|
52
|
+
result bit-for-bit for the same seed (via
|
|
53
|
+
`np.random.SeedSequence.spawn`). Lambda metrics rejected at call time
|
|
54
|
+
with helpful `TypeError`. `n_jobs > os.cpu_count()` is auto-capped
|
|
55
|
+
with WARNING log; `n_jobs=0` raises `ValueError`. Closes #17.
|
|
56
|
+
- 6 new pages in `docs/source/examples/`:
|
|
57
|
+
- **Cookbook** (closes #19): `nested_seed_split.md`,
|
|
58
|
+
`callable_embedder_dedup.md`, `cross_corpus_contamination_scan.md`.
|
|
59
|
+
- **Plotting walkthroughs**: `plot_roc_curve_walkthrough.md`,
|
|
60
|
+
`plot_pareto_frontier_walkthrough.md`,
|
|
61
|
+
`plot_slice_metric_heatmap_walkthrough.md` (backfills the v0.33.0
|
|
62
|
+
docs gap).
|
|
63
|
+
- `docs/source/methodology/parallelism.md` — design rationale + caller
|
|
64
|
+
contract for the toolkit-wide parallelism story. Documents the 6
|
|
65
|
+
design principles (single backend, single helper, opt-in per-fn,
|
|
66
|
+
default sequential, reproducibility via SeedSequence, picklability
|
|
67
|
+
surface) and the checklist for adding `n_jobs` to a new function.
|
|
68
|
+
|
|
69
|
+
### Changed
|
|
70
|
+
|
|
71
|
+
- `eval_toolkit.bootstrap.mde_from_ci` now accepts
|
|
72
|
+
`BootstrapCI | PairedBootstrapCI` (was paired-only). See **Breaking
|
|
73
|
+
changes** above. Closes #20.
|
|
74
|
+
- `eval_toolkit.build_manifest` gains `config_path: Path | str | None`
|
|
75
|
+
kwarg; when supplied, `config_hash` is computed as
|
|
76
|
+
`sha256(Path(config_path).read_bytes()).hexdigest()` — capturing the
|
|
77
|
+
exact YAML file bytes including comments + key ordering (which the
|
|
78
|
+
default canonical-JSON path strips during parse). Default behavior
|
|
79
|
+
preserved when `config_path is None`. Closes #10.
|
|
80
|
+
|
|
81
|
+
### Internal
|
|
82
|
+
|
|
83
|
+
- New `src/eval_toolkit/_parallel.py` (internal; not exported) — single
|
|
84
|
+
source of truth for parallelism. Future per-function `n_jobs`
|
|
85
|
+
additions will reuse this helper. The toolkit's first INFO-level
|
|
86
|
+
log site is here (once-per-process guidance log when `n_jobs=1` AND
|
|
87
|
+
iteration count ≥ 1000). New `tests/test_parallel.py` covers smart-
|
|
88
|
+
default semantics + reproducibility contract.
|
|
89
|
+
- New golden test `tests/golden/test_dedup_holdout_calibration.py`
|
|
90
|
+
exercising 3 deterministic `SimilarityStrategy` variants against a
|
|
91
|
+
migrated 50-pair adversarial fixture at thresholds {0.75, 0.80, 0.85}
|
|
92
|
+
(strict snapshot at `tests/golden/data/dedup_holdout_expected.json`)
|
|
93
|
+
plus an `EmbeddingCosineStrategy` soft-bound check (FPR < 0.5,
|
|
94
|
+
FNR < 0.5 at threshold 0.80) gated by `pytest.importorskip` +
|
|
95
|
+
`@pytest.mark.slow`. Refresh helper at
|
|
96
|
+
`scripts/refresh_dedup_holdout.py`. Closes #18.
|
|
97
|
+
- `CONTRIBUTING.md` + `docs/source/repo-strategy.md` updated with
|
|
98
|
+
explicit "Parallelism" section codifying the new pattern (was an
|
|
99
|
+
implicit anti-pattern before; v0.34.0 codifies the new opt-in design).
|
|
100
|
+
- `docs/source/DEPRECATION.md` extended with a "One-time exceptions"
|
|
101
|
+
section documenting the `mde_from_ci` rename + criteria future
|
|
102
|
+
exceptions must satisfy.
|
|
103
|
+
- RNG-stream note: the 5 wired bootstrap fns now derive per-resample
|
|
104
|
+
seeds via `np.random.SeedSequence(seed).spawn(n_resamples)` instead
|
|
105
|
+
of sequential calls on a single `Generator`. The bootstrap output is
|
|
106
|
+
*statistically equivalent* (both are valid bootstraps) but the exact
|
|
107
|
+
numerical CI bounds for the same caller-supplied `seed` will differ
|
|
108
|
+
slightly from v0.33.x. Existing tests use behavioral assertions
|
|
109
|
+
(`overlaps_zero`, `delta`, etc.) that are robust to the RNG-stream
|
|
110
|
+
change; tests that pin exact CI bounds (e.g., consumer golden tests)
|
|
111
|
+
may need regen on upgrade.
|
|
112
|
+
|
|
113
|
+
## [0.33.1] — 2026-05-17 — MiniLM convenience embedder
|
|
114
|
+
|
|
115
|
+
Closes the last open item in the v0.33 milestone (deferred from v0.33.0
|
|
116
|
+
per the planned split). Ships the canonical semantic-dedup recipe
|
|
117
|
+
(`sentence-transformers/all-MiniLM-L6-v2` at cosine ≥ 0.80, per ADR-027)
|
|
118
|
+
pre-wired for `EmbeddingCosineStrategy` so consumers stop reinventing the
|
|
119
|
+
embedder-wrapping boilerplate.
|
|
120
|
+
|
|
121
|
+
No breaking changes. Public API gains 1 new export
|
|
122
|
+
(`make_minilm_embedder`) and 1 new optional dependency extra
|
|
123
|
+
(`[embeddings]`). Existing `EmbeddingCosineStrategy` callers that already
|
|
124
|
+
ship their own embedder are unaffected.
|
|
125
|
+
|
|
126
|
+
### Added
|
|
127
|
+
|
|
128
|
+
- `eval_toolkit.embeddings.make_minilm_embedder` — factory returning a
|
|
129
|
+
`Callable[[Sequence[str]], np.ndarray]` that loads
|
|
130
|
+
`sentence-transformers/all-MiniLM-L6-v2` (configurable), memoises model
|
|
131
|
+
loads via `functools.lru_cache(maxsize=8)`, and emits `(n, 384)`
|
|
132
|
+
`float64` embeddings ready for `EmbeddingCosineStrategy`. Raises a
|
|
133
|
+
helpful `ImportError` with the install hint when the optional dep is
|
|
134
|
+
absent. Closes #3.
|
|
135
|
+
- New optional dependency extra `[embeddings]` →
|
|
136
|
+
`sentence-transformers>=3.0`. Intentionally **not** in `[all]` / `[dev]`
|
|
137
|
+
because the transitive `torch` install (~700MB) would balloon
|
|
138
|
+
contributor setup.
|
|
139
|
+
|
|
140
|
+
### Internal
|
|
141
|
+
|
|
142
|
+
- `docs/source/api/embeddings.md` Sphinx page added (autosummary stub);
|
|
143
|
+
wired into the API toctree alongside the other module pages.
|
|
144
|
+
- `docs/source/api/plotting.md` autosummary backfilled with
|
|
145
|
+
`plot_roc_curve`, `plot_pareto_frontier`, `plot_slice_metric_heatmap`
|
|
146
|
+
(missed in v0.33.0).
|
|
147
|
+
- `tool.mypy.overrides` extended with `sentence_transformers.*` (matches
|
|
148
|
+
the existing pattern for untyped third-party libs).
|
|
149
|
+
|
|
10
150
|
## [0.33.0] — 2026-05-17 — Plotting batch + ax= parity + CI quality-of-life
|
|
11
151
|
|
|
12
152
|
Consumer-unblocking release: closes the four upstream-gap TODOs in
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 0.33.0
|
|
3
|
+
Version: 0.34.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -60,6 +60,8 @@ Requires-Dist: sphinx-autodoc-typehints>=2.0; extra == 'docs'
|
|
|
60
60
|
Requires-Dist: sphinx-copybutton>=0.5; extra == 'docs'
|
|
61
61
|
Requires-Dist: sphinx-design>=0.6; extra == 'docs'
|
|
62
62
|
Requires-Dist: sphinx>=7.3; extra == 'docs'
|
|
63
|
+
Provides-Extra: embeddings
|
|
64
|
+
Requires-Dist: sentence-transformers>=3.0; extra == 'embeddings'
|
|
63
65
|
Provides-Extra: parquet
|
|
64
66
|
Requires-Dist: pyarrow>=15.0; extra == 'parquet'
|
|
65
67
|
Provides-Extra: plotting
|
|
@@ -50,6 +50,12 @@ plotting = ["matplotlib>=3.8", "pillow>=10.0"]
|
|
|
50
50
|
property = ["hypothesis>=6.100"]
|
|
51
51
|
yaml = ["pyyaml>=6.0"]
|
|
52
52
|
parquet = ["pyarrow>=15.0"]
|
|
53
|
+
# v0.33.1: MiniLM convenience embedder for EmbeddingCosineStrategy.
|
|
54
|
+
# sentence-transformers transitively pulls torch + tokenizers (~700MB).
|
|
55
|
+
# Intentionally NOT in [all] / [dev] — opt-in only to keep contributor
|
|
56
|
+
# setup small. The canonical semantic-dedup recipe (all-MiniLM-L6-v2 +
|
|
57
|
+
# cosine@0.80) is what this factory pre-wires for callers.
|
|
58
|
+
embeddings = ["sentence-transformers>=3.0"]
|
|
53
59
|
# DEPRECATED (announced v0.30.1, removal v0.33.0).
|
|
54
60
|
#
|
|
55
61
|
# Retained as a transitive no-op so `pip install eval-toolkit[validation]`
|
|
@@ -158,7 +164,7 @@ warn_no_return = true
|
|
|
158
164
|
strict_equality = true
|
|
159
165
|
|
|
160
166
|
[[tool.mypy.overrides]]
|
|
161
|
-
module = ["scipy.*", "sklearn.*", "matplotlib.*", "pandas.*", "yaml.*"]
|
|
167
|
+
module = ["scipy.*", "sklearn.*", "matplotlib.*", "pandas.*", "yaml.*", "sentence_transformers.*", "joblib.*"]
|
|
162
168
|
ignore_missing_imports = true
|
|
163
169
|
|
|
164
170
|
[tool.pytest.ini_options]
|
|
@@ -62,6 +62,7 @@ _EXPORTS: dict[str, str] = {
|
|
|
62
62
|
"PairedBootstrapCI": "eval_toolkit.bootstrap",
|
|
63
63
|
"ThresholdedMetricFn": "eval_toolkit.bootstrap",
|
|
64
64
|
"ThresholdFn": "eval_toolkit.bootstrap",
|
|
65
|
+
"block_bootstrap_on_folds": "eval_toolkit.bootstrap",
|
|
65
66
|
"bonferroni_correct": "eval_toolkit.bootstrap",
|
|
66
67
|
"bootstrap_ci": "eval_toolkit.bootstrap",
|
|
67
68
|
"correct_p_values": "eval_toolkit.bootstrap",
|
|
@@ -117,6 +118,8 @@ _EXPORTS: dict[str, str] = {
|
|
|
117
118
|
"render_files": "eval_toolkit.docs",
|
|
118
119
|
"render_text": "eval_toolkit.docs",
|
|
119
120
|
"walk_path": "eval_toolkit.docs",
|
|
121
|
+
# --- embeddings ---
|
|
122
|
+
"make_minilm_embedder": "eval_toolkit.embeddings",
|
|
120
123
|
# --- evidence ---
|
|
121
124
|
"AggregateEvidence": "eval_toolkit.evidence",
|
|
122
125
|
"EvidenceAxis": "eval_toolkit.evidence",
|
|
@@ -250,6 +253,7 @@ _EXPORTS: dict[str, str] = {
|
|
|
250
253
|
"CISafeThresholdSelector": "eval_toolkit.thresholds",
|
|
251
254
|
"CostSensitiveSelector": "eval_toolkit.thresholds",
|
|
252
255
|
"MaxF1Selector": "eval_toolkit.thresholds",
|
|
256
|
+
"RecallAtFprResult": "eval_toolkit.thresholds",
|
|
253
257
|
"TargetFPRSelector": "eval_toolkit.thresholds",
|
|
254
258
|
"TargetPrecisionSelector": "eval_toolkit.thresholds",
|
|
255
259
|
"TargetRecallSelector": "eval_toolkit.thresholds",
|
|
@@ -257,6 +261,7 @@ _EXPORTS: dict[str, str] = {
|
|
|
257
261
|
"ThresholdSelector": "eval_toolkit.thresholds",
|
|
258
262
|
"WilsonInterval": "eval_toolkit.thresholds",
|
|
259
263
|
"YoudenJSelector": "eval_toolkit.thresholds",
|
|
264
|
+
"recall_at_fpr": "eval_toolkit.thresholds",
|
|
260
265
|
"select_threshold": "eval_toolkit.thresholds",
|
|
261
266
|
"wilson_interval": "eval_toolkit.thresholds",
|
|
262
267
|
}
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""Toolkit-internal parallel-map helper. Single source of truth for parallelism.
|
|
2
|
+
|
|
3
|
+
This module is *internal* (not exported via ``__all__`` or
|
|
4
|
+
``__init__._EXPORTS``); future per-function ``n_jobs`` additions across the
|
|
5
|
+
toolkit call into this helper rather than each inventing their own
|
|
6
|
+
parallelism backend.
|
|
7
|
+
|
|
8
|
+
See ``docs/source/methodology/parallelism.md`` for the design rationale +
|
|
9
|
+
caller contract (reproducibility, picklability, smart defaults).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
import os
|
|
16
|
+
import pickle
|
|
17
|
+
from collections.abc import Callable, Iterable, Sized
|
|
18
|
+
|
|
19
|
+
_logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
_GUIDANCE_THRESHOLD = 1000
|
|
22
|
+
|
|
23
|
+
_GUIDANCE_EMITTED = False
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def parallel_map[T, R](
|
|
27
|
+
fn: Callable[[T], R],
|
|
28
|
+
items: Iterable[T],
|
|
29
|
+
*,
|
|
30
|
+
n_jobs: int = 1,
|
|
31
|
+
description: str = "work",
|
|
32
|
+
) -> list[R]:
|
|
33
|
+
"""Map ``fn`` over ``items``; parallel when ``n_jobs != 1``.
|
|
34
|
+
|
|
35
|
+
Design contract (see ``docs/source/methodology/parallelism.md``):
|
|
36
|
+
|
|
37
|
+
- ``n_jobs == 1`` (default) — pure-Python serial; preserves tracebacks.
|
|
38
|
+
If ``len(items) >= 1000``, emits an INFO log on the first qualifying
|
|
39
|
+
call per Python process suggesting ``n_jobs > 1`` (silent thereafter).
|
|
40
|
+
- ``n_jobs == -1`` — joblib loky with all cores.
|
|
41
|
+
- ``n_jobs > 1`` — joblib loky; values exceeding ``os.cpu_count()``
|
|
42
|
+
are silently capped (with a WARNING log) to avoid CPU-frying.
|
|
43
|
+
- ``n_jobs == 0`` — raises ``ValueError`` (likely a typo for 1 or -1).
|
|
44
|
+
- ``fn`` MUST be picklable when ``n_jobs != 1`` (lambdas and closures
|
|
45
|
+
over local state are rejected at call time with a helpful
|
|
46
|
+
``TypeError``).
|
|
47
|
+
- Reproducibility: caller is responsible for deterministic per-item
|
|
48
|
+
state (use ``np.random.SeedSequence(seed).spawn(n)`` for resample
|
|
49
|
+
loops so ``n_jobs > 1`` produces identical results to ``n_jobs == 1``
|
|
50
|
+
for the same seed).
|
|
51
|
+
|
|
52
|
+
Parameters
|
|
53
|
+
----------
|
|
54
|
+
fn : Callable[[T], R]
|
|
55
|
+
Picklable callable to apply to each item. Lambdas and closures
|
|
56
|
+
over local state are rejected when ``n_jobs != 1``.
|
|
57
|
+
items : Iterable[T]
|
|
58
|
+
Work items. Materialised internally; pass any iterable.
|
|
59
|
+
n_jobs : int, optional
|
|
60
|
+
Default 1 (sequential). Set to -1 for all cores, or a positive int.
|
|
61
|
+
``n_jobs=0`` is rejected (use 1 or -1).
|
|
62
|
+
description : str, optional
|
|
63
|
+
Used in log messages and error messages for context (e.g.,
|
|
64
|
+
``"paired bootstrap"``).
|
|
65
|
+
|
|
66
|
+
Returns
|
|
67
|
+
-------
|
|
68
|
+
list[R]
|
|
69
|
+
Results in the order of ``items``.
|
|
70
|
+
|
|
71
|
+
Raises
|
|
72
|
+
------
|
|
73
|
+
ValueError
|
|
74
|
+
If ``n_jobs == 0``.
|
|
75
|
+
TypeError
|
|
76
|
+
If ``n_jobs != 1`` and ``fn`` is not picklable.
|
|
77
|
+
|
|
78
|
+
Examples
|
|
79
|
+
--------
|
|
80
|
+
>>> def square(x): return x * x
|
|
81
|
+
>>> parallel_map(square, [1, 2, 3], n_jobs=1)
|
|
82
|
+
[1, 4, 9]
|
|
83
|
+
"""
|
|
84
|
+
if n_jobs == 0:
|
|
85
|
+
raise ValueError(
|
|
86
|
+
f"n_jobs=0 is not allowed for {description}; use 1 (sequential), "
|
|
87
|
+
"-1 (all cores), or a positive integer."
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
items_list = items if isinstance(items, list) else list(items)
|
|
91
|
+
n_items = len(items_list) if isinstance(items_list, Sized) else 0
|
|
92
|
+
|
|
93
|
+
if n_jobs == 1:
|
|
94
|
+
global _GUIDANCE_EMITTED
|
|
95
|
+
if n_items >= _GUIDANCE_THRESHOLD and not _GUIDANCE_EMITTED:
|
|
96
|
+
_logger.info(
|
|
97
|
+
"%s: running %d items sequentially (n_jobs=1). For parallel "
|
|
98
|
+
"speedup set n_jobs > 1 (typical wall-clock 3-5x on 8 cores). "
|
|
99
|
+
"(Shown once per process.)",
|
|
100
|
+
description,
|
|
101
|
+
n_items,
|
|
102
|
+
)
|
|
103
|
+
_GUIDANCE_EMITTED = True
|
|
104
|
+
return [fn(item) for item in items_list]
|
|
105
|
+
|
|
106
|
+
if n_jobs > 0:
|
|
107
|
+
cpu_count = os.cpu_count() or 1
|
|
108
|
+
if n_jobs > cpu_count:
|
|
109
|
+
_logger.warning(
|
|
110
|
+
"%s: capping n_jobs from %d to %d (os.cpu_count()).",
|
|
111
|
+
description,
|
|
112
|
+
n_jobs,
|
|
113
|
+
cpu_count,
|
|
114
|
+
)
|
|
115
|
+
n_jobs = cpu_count
|
|
116
|
+
|
|
117
|
+
try:
|
|
118
|
+
pickle.dumps(fn)
|
|
119
|
+
except (pickle.PicklingError, AttributeError, TypeError) as e:
|
|
120
|
+
raise TypeError(
|
|
121
|
+
f"parallel_map of {description}: callable is not picklable "
|
|
122
|
+
f"(lambdas and closures over local state are not supported "
|
|
123
|
+
f"with n_jobs != 1). Define a named top-level function. "
|
|
124
|
+
f"Underlying error: {e}"
|
|
125
|
+
) from e
|
|
126
|
+
|
|
127
|
+
from joblib import Parallel, delayed # noqa: PLC0415
|
|
128
|
+
|
|
129
|
+
return list(Parallel(n_jobs=n_jobs, backend="loky")(delayed(fn)(item) for item in items_list))
|