eval-toolkit 0.35.0__tar.gz → 0.36.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/CHANGELOG.md +50 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/PKG-INFO +1 -1
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/_version.py +1 -1
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/harness.py +227 -35
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/golden/public_api/snapshot.json +3 -3
- eval_toolkit-0.36.0/tests/test_harness_parallelism.py +266 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/.gitignore +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/LICENSE +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/README.md +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/STYLE.md +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/docs/archive/README.md +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/docs/research/README.md +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/pyproject.toml +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/__init__.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/_parallel.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/embeddings.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/plotting.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/conftest.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/golden/data/dedup_holdout.jsonl +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/golden/data/dedup_holdout_expected.json +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/golden/data/dedup_holdout_provenance.md +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/golden/test_dedup_holdout_calibration.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/strategies.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_analysis.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_block_bootstrap_on_folds.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_bootstrap_njobs.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_claims.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_cli.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_config.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_embeddings.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_loaders.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_logging.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_manifest.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_parallel.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_paths.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_plotting_edge.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_plotting_visual.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_public_api.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_recall_at_fpr.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_schemas.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_splits.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_thresholds.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-0.35.0 → eval_toolkit-0.36.0}/tests/test_v09_contracts.py +0 -0
|
@@ -7,6 +7,56 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.36.0] — 2026-05-18 — harness parallelization (#29, #30) + Node 24 actions
|
|
11
|
+
|
|
12
|
+
Wires the v0.34.0 unified parallelism pattern into the harness evaluation
|
|
13
|
+
loop. `evaluate()` and `evaluate_folded()` now accept an `n_jobs` kwarg
|
|
14
|
+
(default `1` preserves bit-identical sequential behavior); under
|
|
15
|
+
`n_jobs != 1`, the `(slice × scorer)` work-unit loop in
|
|
16
|
+
`_score_all_slices` and the `(spec × scorer)` fit phase in
|
|
17
|
+
`_attach_transferred_operating_points` dispatch through joblib loky via
|
|
18
|
+
the existing `_parallel.parallel_map` helper.
|
|
19
|
+
|
|
20
|
+
### Added
|
|
21
|
+
|
|
22
|
+
- `evaluate(..., n_jobs: int = 1)` and `evaluate_folded(..., n_jobs: int = 1)`
|
|
23
|
+
— keyword-only kwarg per Principle #3 of `methodology/parallelism.md`.
|
|
24
|
+
`n_jobs=1` (default) runs the existing pure-Python sequential loop
|
|
25
|
+
(Principle #4 — bit-identical to v0.35). `n_jobs > 1` uses joblib loky;
|
|
26
|
+
`n_jobs=-1` uses all cores; `n_jobs=0` is rejected. Closes #29, #30.
|
|
27
|
+
- Strict-pickle Scorer sniff at `evaluate()` entry when `n_jobs != 1`:
|
|
28
|
+
raises a clean `TypeError` referencing
|
|
29
|
+
`methodology/parallelism.md#scorer-picklability` with the underlying
|
|
30
|
+
pickle error attached. Reuses the v0.35 ADR contract; no new exception
|
|
31
|
+
class. Catches non-picklable scorers up front rather than relying on
|
|
32
|
+
joblib's more permissive cloudpickle path (which would silently absorb
|
|
33
|
+
closures and obscure the contract documented in v0.35).
|
|
34
|
+
|
|
35
|
+
### Internal
|
|
36
|
+
|
|
37
|
+
- New module-scope step functions `_score_one_pair` and
|
|
38
|
+
`_fit_one_op_point_pair` in `harness.py` (picklable; required by loky).
|
|
39
|
+
- `_score_all_slices` and `_attach_transferred_operating_points`
|
|
40
|
+
refactored to use flat work-unit dispatch via `parallel_map`.
|
|
41
|
+
|
|
42
|
+
### Tests
|
|
43
|
+
|
|
44
|
+
- New `tests/test_harness_parallelism.py` (7 tests): bit-identical
|
|
45
|
+
reproducibility across `n_jobs=1` vs `n_jobs=2` for `evaluate`
|
|
46
|
+
(basic, paired-diffs, operating-points), `evaluate_folded`,
|
|
47
|
+
picklability rejection (closure scorer), `n_jobs=0` rejection,
|
|
48
|
+
`n_jobs=-1` smoke. All 66 harness tests pass (7 new + 59 existing).
|
|
49
|
+
|
|
50
|
+
### Infrastructure
|
|
51
|
+
|
|
52
|
+
- Bumped `actions/upload-artifact` and `actions/download-artifact` from
|
|
53
|
+
`@v5` → `@v6` across `publish.yml` / `nightly-mc.yml` /
|
|
54
|
+
`nightly-benchmarks.yml`. The v6 majors run on Node.js 24
|
|
55
|
+
(GitHub deprecates Node 20 actions from 2026-06-02). Other pinned
|
|
56
|
+
actions (`checkout@v6`, `setup-uv@v8.1.0`, `codeql-action@v3`,
|
|
57
|
+
`deploy-pages@v4`, `upload-pages-artifact@v3`) were not flagged in
|
|
58
|
+
the v0.35 publish annotation and are deferred to a separate audit.
|
|
59
|
+
|
|
10
60
|
## [0.35.0] — 2026-05-18 — `fit_temperature_binary` + Scorer picklability ADR
|
|
11
61
|
|
|
12
62
|
Small, additive release. Adds a binary-classification calibration helper
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 0.35.0
|
|
3
|
+
Version: 0.36.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -32,6 +32,7 @@ v0.7.0 additions:
|
|
|
32
32
|
from __future__ import annotations
|
|
33
33
|
|
|
34
34
|
import logging
|
|
35
|
+
import pickle
|
|
35
36
|
import time
|
|
36
37
|
import traceback
|
|
37
38
|
from collections.abc import Mapping, Sequence
|
|
@@ -41,6 +42,7 @@ from typing import TYPE_CHECKING, Final, Literal, cast
|
|
|
41
42
|
|
|
42
43
|
import numpy as np
|
|
43
44
|
|
|
45
|
+
from eval_toolkit._parallel import parallel_map
|
|
44
46
|
from eval_toolkit.artifacts import (
|
|
45
47
|
error_metric,
|
|
46
48
|
sanitize_for_json,
|
|
@@ -62,7 +64,7 @@ from eval_toolkit.operating_points import (
|
|
|
62
64
|
fit_operating_points,
|
|
63
65
|
)
|
|
64
66
|
from eval_toolkit.protocols import Scorer, SliceAwareScorer
|
|
65
|
-
from eval_toolkit.thresholds import TargetFPRSelector
|
|
67
|
+
from eval_toolkit.thresholds import TargetFPRSelector, ThresholdSelector
|
|
66
68
|
|
|
67
69
|
if TYPE_CHECKING:
|
|
68
70
|
import pandas as pd
|
|
@@ -278,6 +280,31 @@ def _object_to_dict(obj: object, *, what: str) -> dict[str, object]:
|
|
|
278
280
|
raise TypeError(f"expected {what} mapping or object with to_dict(), got {type(obj).__name__}")
|
|
279
281
|
|
|
280
282
|
|
|
283
|
+
def _assert_scorers_picklable(scorers: Mapping[str, Scorer]) -> None:
|
|
284
|
+
"""Strict-pickle sniff for Scorer args when ``n_jobs != 1``.
|
|
285
|
+
|
|
286
|
+
joblib's loky backend uses cloudpickle (which absorbs closures + local
|
|
287
|
+
classes), but the v0.35 Scorer picklability ADR
|
|
288
|
+
(``methodology/parallelism.md#scorer-picklability``) is a *strict* pickle
|
|
289
|
+
contract — cloudpickle behavior is platform-dependent and the more
|
|
290
|
+
permissive failure modes are harder to debug. Fail fast at
|
|
291
|
+
:func:`evaluate` entry with the same ``TypeError`` style as
|
|
292
|
+
:func:`eval_toolkit._parallel.parallel_map`'s fn-sniff (no new exception
|
|
293
|
+
class — single channel for the picklability contract).
|
|
294
|
+
"""
|
|
295
|
+
for sname, scorer in scorers.items():
|
|
296
|
+
try:
|
|
297
|
+
pickle.dumps(scorer)
|
|
298
|
+
except (pickle.PicklingError, AttributeError, TypeError) as exc:
|
|
299
|
+
raise TypeError(
|
|
300
|
+
f"evaluate(n_jobs != 1): scorer {sname!r} "
|
|
301
|
+
f"({type(scorer).__name__}) is not picklable. See "
|
|
302
|
+
f"methodology/parallelism.md#scorer-picklability for the "
|
|
303
|
+
f"contract and worked picklable / broken / fix examples. "
|
|
304
|
+
f"Underlying error: {exc}"
|
|
305
|
+
) from exc
|
|
306
|
+
|
|
307
|
+
|
|
281
308
|
def _should_score_slice(scorer: Scorer, slice_name: str) -> bool:
|
|
282
309
|
"""Honor optional slice-aware scorer hooks without widening the base Protocol."""
|
|
283
310
|
should_score = getattr(scorer, "should_score_slice", None)
|
|
@@ -696,6 +723,36 @@ def _run_leakage_phase(
|
|
|
696
723
|
)
|
|
697
724
|
|
|
698
725
|
|
|
726
|
+
# Tuple shape for the flat `(slice × scorer)` work-unit dispatched to
|
|
727
|
+
# parallel_map by `_score_all_slices`. Defined at module scope so workers
|
|
728
|
+
# can pickle the function reference.
|
|
729
|
+
_ScoreOnePairItem = tuple[EvalSlice, str, Scorer, int, int, Literal["raise", "record"]]
|
|
730
|
+
_ScoreOnePairResult = tuple[str, str, dict[str, object], np.ndarray]
|
|
731
|
+
|
|
732
|
+
|
|
733
|
+
def _score_one_pair(item: _ScoreOnePairItem) -> _ScoreOnePairResult:
|
|
734
|
+
"""Picklable step function for ``(slice × scorer)`` parallel dispatch.
|
|
735
|
+
|
|
736
|
+
Module-scope so loky workers can serialize the reference (closures over
|
|
737
|
+
enclosing locals would fail :func:`parallel_map`'s pickle sniff). All
|
|
738
|
+
inputs flow through the ``item`` tuple — no captured state.
|
|
739
|
+
|
|
740
|
+
Returns ``(slice_name, scorer_name, result_dict, scores_array)`` so the
|
|
741
|
+
caller can reassemble ``by_slice`` + ``score_cache`` in the original
|
|
742
|
+
iteration order.
|
|
743
|
+
"""
|
|
744
|
+
slice_, sname, scorer, n_resamples, seed, on_scorer_error = item
|
|
745
|
+
result = evaluate_scorer_on_slice(
|
|
746
|
+
scorer,
|
|
747
|
+
slice_,
|
|
748
|
+
n_resamples=n_resamples,
|
|
749
|
+
seed=seed,
|
|
750
|
+
on_scorer_error=on_scorer_error,
|
|
751
|
+
)
|
|
752
|
+
scores = np.asarray(result["scores"], dtype=np.float64)
|
|
753
|
+
return slice_.name, sname, result, scores
|
|
754
|
+
|
|
755
|
+
|
|
699
756
|
def _score_all_slices(
|
|
700
757
|
scorers: dict[str, Scorer],
|
|
701
758
|
slices: Sequence[EvalSlice],
|
|
@@ -704,6 +761,7 @@ def _score_all_slices(
|
|
|
704
761
|
seed: int,
|
|
705
762
|
paired_diffs: list[tuple[str, str]] | None,
|
|
706
763
|
on_scorer_error: Literal["raise", "record"],
|
|
764
|
+
n_jobs: int = 1,
|
|
707
765
|
) -> tuple[dict[str, dict[str, object]], dict[tuple[str, str], np.ndarray]]:
|
|
708
766
|
"""Score every ``(slice, scorer)`` pair; return ``(by_slice, score_cache)``.
|
|
709
767
|
|
|
@@ -714,10 +772,17 @@ def _score_all_slices(
|
|
|
714
772
|
``score_cache`` is keyed ``(slice.name, scorer.name)`` and carries the
|
|
715
773
|
raw score arrays so :func:`_attach_transferred_operating_points` can
|
|
716
774
|
re-use them without re-calling scorers.
|
|
717
|
-
"""
|
|
718
|
-
by_slice: dict[str, dict[str, object]] = {}
|
|
719
|
-
score_cache: dict[tuple[str, str], np.ndarray] = {}
|
|
720
775
|
|
|
776
|
+
v0.36 added ``n_jobs``: a flat ``(slice × scorer)`` parallel dispatch
|
|
777
|
+
via :func:`eval_toolkit._parallel.parallel_map`. Default ``1`` preserves
|
|
778
|
+
bit-identical sequential behavior. ``n_jobs != 1`` requires picklable
|
|
779
|
+
scorers per the v0.35 ADR
|
|
780
|
+
(``docs/source/methodology/parallelism.md#scorer-picklability``).
|
|
781
|
+
"""
|
|
782
|
+
# Pre-filter skipped pairs (allow-list miss) before dispatching parallel
|
|
783
|
+
# work-units. Logs the same skip messages as the pre-parallel version.
|
|
784
|
+
work_units: list[_ScoreOnePairItem] = []
|
|
785
|
+
skipped: dict[tuple[str, str], dict[str, object]] = {}
|
|
721
786
|
for slice_ in slices:
|
|
722
787
|
_logger.info(
|
|
723
788
|
"[slice %s] n=%d, positives=%d",
|
|
@@ -725,32 +790,61 @@ def _score_all_slices(
|
|
|
725
790
|
len(slice_.df),
|
|
726
791
|
int(slice_.y_true.sum()),
|
|
727
792
|
)
|
|
728
|
-
slice_data: dict[str, dict[str, object]] = {}
|
|
729
|
-
scores_by_scorer: dict[str, np.ndarray] = {}
|
|
730
793
|
for sname, scorer in scorers.items():
|
|
731
794
|
if not _should_score_slice(scorer, slice_.name):
|
|
732
795
|
reason = f"slice {slice_.name!r} not in scorer allow-list"
|
|
733
|
-
|
|
796
|
+
skipped[(slice_.name, sname)] = _skipped_scorer_result(slice_, reason)
|
|
734
797
|
_logger.info(" skipped %s: %s", sname, reason)
|
|
735
798
|
continue
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
799
|
+
work_units.append((slice_, sname, scorer, n_resamples, seed, on_scorer_error))
|
|
800
|
+
|
|
801
|
+
# Parallel scoring. parallel_map at n_jobs=1 is a pure-Python for-loop
|
|
802
|
+
# (Principle #4) — bit-identical to the pre-v0.36 sequential code.
|
|
803
|
+
if work_units:
|
|
804
|
+
t0_total = time.time()
|
|
805
|
+
results = parallel_map(
|
|
806
|
+
_score_one_pair,
|
|
807
|
+
work_units,
|
|
808
|
+
n_jobs=n_jobs,
|
|
809
|
+
description="harness _score_all_slices",
|
|
810
|
+
)
|
|
811
|
+
elapsed_total = time.time() - t0_total
|
|
812
|
+
_logger.info(
|
|
813
|
+
" scored %d (slice, scorer) pairs in %.1fs (n_jobs=%d)",
|
|
814
|
+
len(work_units),
|
|
815
|
+
elapsed_total,
|
|
816
|
+
n_jobs,
|
|
817
|
+
)
|
|
818
|
+
else:
|
|
819
|
+
results = []
|
|
820
|
+
|
|
821
|
+
# Index results for O(1) lookup during reassembly.
|
|
822
|
+
results_by_key: dict[tuple[str, str], _ScoreOnePairResult] = {
|
|
823
|
+
(slice_name, sname): (slice_name, sname, result_dict, scores_arr)
|
|
824
|
+
for slice_name, sname, result_dict, scores_arr in results
|
|
825
|
+
}
|
|
826
|
+
|
|
827
|
+
# Reassemble in the original (slices × scorers.items()) iteration order.
|
|
828
|
+
by_slice: dict[str, dict[str, object]] = {}
|
|
829
|
+
score_cache: dict[tuple[str, str], np.ndarray] = {}
|
|
830
|
+
for slice_ in slices:
|
|
831
|
+
slice_data: dict[str, dict[str, object]] = {}
|
|
832
|
+
scores_by_scorer: dict[str, np.ndarray] = {}
|
|
833
|
+
for sname in scorers:
|
|
834
|
+
key = (slice_.name, sname)
|
|
835
|
+
if key in skipped:
|
|
836
|
+
slice_data[sname] = skipped[key]
|
|
837
|
+
continue
|
|
838
|
+
_, _, result_dict, scores_arr = results_by_key[key]
|
|
839
|
+
slice_data[sname] = result_dict
|
|
840
|
+
# If the scorer raised under on_scorer_error="record", scores_arr is [].
|
|
841
|
+
# Paired-diff machinery short-circuits on the same len-check it uses
|
|
842
|
+
# for skipped scorers; no special-case needed.
|
|
843
|
+
scores_by_scorer[sname] = scores_arr
|
|
844
|
+
score_cache[key] = scores_arr
|
|
845
|
+
pr = result_dict.get("pr_auc")
|
|
752
846
|
pr_display = f"{pr:.4f}" if isinstance(pr, float) else "N/A"
|
|
753
|
-
_logger.info(" %s: PR-AUC=%s
|
|
847
|
+
_logger.info(" %s: PR-AUC=%s", sname, pr_display)
|
|
754
848
|
|
|
755
849
|
diffs = (
|
|
756
850
|
_compute_paired_diffs(
|
|
@@ -789,6 +883,7 @@ def evaluate(
|
|
|
789
883
|
on_leakage: Literal["raise", "record", "skip"] = "raise",
|
|
790
884
|
on_scorer_error: Literal["raise", "record"] = "raise",
|
|
791
885
|
operating_point_specs: Sequence[OperatingPointSpec] = (),
|
|
886
|
+
n_jobs: int = 1,
|
|
792
887
|
) -> RunResult:
|
|
793
888
|
"""Run every scorer on every slice; return a pure :class:`RunResult` (no IO).
|
|
794
889
|
|
|
@@ -830,6 +925,15 @@ def evaluate(
|
|
|
830
925
|
Fit thresholds on one mixed-class slice and apply them to named target
|
|
831
926
|
slices. Results are attached under each scorer's
|
|
832
927
|
``"transferred_operating_points"`` block. Default empty (skip).
|
|
928
|
+
n_jobs : int, optional
|
|
929
|
+
Parallel workers (default 1 — sequential). ``n_jobs > 1`` uses
|
|
930
|
+
joblib loky to parallelize the flat ``(slice × scorer)`` work-unit
|
|
931
|
+
loop in :func:`_score_all_slices` (and the operating-point fit
|
|
932
|
+
phase when ``operating_point_specs`` is non-empty). ``n_jobs=-1``
|
|
933
|
+
uses all cores; ``n_jobs=0`` is rejected. Scorers must be picklable
|
|
934
|
+
when ``n_jobs != 1`` — see
|
|
935
|
+
:doc:`methodology/parallelism` § Scorer picklability for the
|
|
936
|
+
contract + worked examples.
|
|
833
937
|
|
|
834
938
|
Returns
|
|
835
939
|
-------
|
|
@@ -850,6 +954,9 @@ def evaluate(
|
|
|
850
954
|
if not slices:
|
|
851
955
|
raise ValueError("at least one slice required")
|
|
852
956
|
|
|
957
|
+
if n_jobs != 1:
|
|
958
|
+
_assert_scorers_picklable(scorers)
|
|
959
|
+
|
|
853
960
|
config: dict[str, object] = {
|
|
854
961
|
"n_resamples": n_resamples,
|
|
855
962
|
"seed": seed,
|
|
@@ -872,6 +979,7 @@ def evaluate(
|
|
|
872
979
|
seed=seed,
|
|
873
980
|
paired_diffs=paired_diffs,
|
|
874
981
|
on_scorer_error=on_scorer_error,
|
|
982
|
+
n_jobs=n_jobs,
|
|
875
983
|
)
|
|
876
984
|
|
|
877
985
|
if operating_point_specs:
|
|
@@ -882,11 +990,45 @@ def evaluate(
|
|
|
882
990
|
score_cache=score_cache,
|
|
883
991
|
scorer_names=list(scorers.keys()),
|
|
884
992
|
specs=operating_point_specs,
|
|
993
|
+
n_jobs=n_jobs,
|
|
885
994
|
)
|
|
886
995
|
|
|
887
996
|
return RunResult(run_id=run_id, git_sha=git_sha, config=config, by_slice=by_slice)
|
|
888
997
|
|
|
889
998
|
|
|
999
|
+
_OpPointFitItem = tuple[
|
|
1000
|
+
str, # spec_name (for reassembly key)
|
|
1001
|
+
str, # fit_slice_name (passed through to fit_operating_points)
|
|
1002
|
+
str, # scorer_name
|
|
1003
|
+
np.ndarray, # fit_y_true
|
|
1004
|
+
np.ndarray, # fit_scores
|
|
1005
|
+
Sequence[ThresholdSelector], # spec.selectors (passed through to fit_operating_points)
|
|
1006
|
+
]
|
|
1007
|
+
_OpPointFitResult = tuple[str, str, object] # (spec_name, scorer_name, fitted | error_dict)
|
|
1008
|
+
|
|
1009
|
+
|
|
1010
|
+
def _fit_one_op_point_pair(item: _OpPointFitItem) -> _OpPointFitResult:
|
|
1011
|
+
"""Picklable step function for ``(spec × scorer)`` operating-point fitting.
|
|
1012
|
+
|
|
1013
|
+
Module-scope so loky workers can serialize the reference. All inputs flow
|
|
1014
|
+
through the ``item`` tuple. Returns ``(spec_name, scorer_name, fitted)``
|
|
1015
|
+
where ``fitted`` is either the :func:`fit_operating_points` result or a
|
|
1016
|
+
``{"error": str}`` dict matching the sequential code path.
|
|
1017
|
+
"""
|
|
1018
|
+
spec_name, fit_slice_name, scorer_name, y_true, fit_scores, selectors = item
|
|
1019
|
+
try:
|
|
1020
|
+
fitted = fit_operating_points(
|
|
1021
|
+
y_true,
|
|
1022
|
+
fit_scores,
|
|
1023
|
+
selectors,
|
|
1024
|
+
fitted_on_slice=fit_slice_name,
|
|
1025
|
+
scorer_name=scorer_name,
|
|
1026
|
+
)
|
|
1027
|
+
except (ValueError, RuntimeError) as exc:
|
|
1028
|
+
return spec_name, scorer_name, {"error": str(exc)}
|
|
1029
|
+
return spec_name, scorer_name, fitted
|
|
1030
|
+
|
|
1031
|
+
|
|
890
1032
|
def _attach_transferred_operating_points(
|
|
891
1033
|
*,
|
|
892
1034
|
by_slice: dict[str, dict[str, object]],
|
|
@@ -894,34 +1036,73 @@ def _attach_transferred_operating_points(
|
|
|
894
1036
|
score_cache: Mapping[tuple[str, str], np.ndarray],
|
|
895
1037
|
scorer_names: Sequence[str],
|
|
896
1038
|
specs: Sequence[OperatingPointSpec],
|
|
1039
|
+
n_jobs: int = 1,
|
|
897
1040
|
) -> None:
|
|
898
|
-
"""Mutate ``by_slice`` to attach opt-in cross-slice operating-point metrics.
|
|
1041
|
+
"""Mutate ``by_slice`` to attach opt-in cross-slice operating-point metrics.
|
|
1042
|
+
|
|
1043
|
+
v0.36 added ``n_jobs``: parallelizes the ``(spec × scorer)`` fit phase
|
|
1044
|
+
via :func:`eval_toolkit._parallel.parallel_map`. The apply phase
|
|
1045
|
+
(writing into ``by_slice``) stays sequential — fitting dominates runtime.
|
|
1046
|
+
Default ``n_jobs=1`` preserves bit-identical sequential behavior.
|
|
1047
|
+
"""
|
|
1048
|
+
# Pre-flight: handle "fit slice not found" errors (these short-circuit the
|
|
1049
|
+
# entire spec) + collect valid fit work-units. Tracks pre-conditions
|
|
1050
|
+
# ("fit scorer skipped") as separate state so the parallel dispatch only
|
|
1051
|
+
# carries actual work.
|
|
1052
|
+
fit_work: list[_OpPointFitItem] = []
|
|
1053
|
+
fit_skip_reasons: dict[tuple[str, str], dict[str, object]] = {}
|
|
1054
|
+
specs_with_valid_fit: list[OperatingPointSpec] = []
|
|
1055
|
+
names_per_spec: dict[str, list[str]] = {}
|
|
1056
|
+
|
|
899
1057
|
for spec in specs:
|
|
900
1058
|
names = list(spec.scorer_names) if spec.scorer_names else list(scorer_names)
|
|
1059
|
+
names_per_spec[spec.name] = names
|
|
901
1060
|
if spec.fit_slice not in slices_by_name:
|
|
902
1061
|
_record_spec_error(by_slice, spec, names, f"fit slice {spec.fit_slice!r} not found")
|
|
903
1062
|
continue
|
|
904
|
-
|
|
1063
|
+
specs_with_valid_fit.append(spec)
|
|
905
1064
|
fit_slice = slices_by_name[spec.fit_slice]
|
|
906
|
-
fitted_by_scorer: dict[str, object] = {}
|
|
907
1065
|
for scorer_name in names:
|
|
908
1066
|
fit_scores = score_cache.get((spec.fit_slice, scorer_name))
|
|
909
1067
|
if fit_scores is None or len(fit_scores) != len(fit_slice.y_true):
|
|
910
|
-
|
|
1068
|
+
fit_skip_reasons[(spec.name, scorer_name)] = {
|
|
911
1069
|
"error": "fit scorer skipped, errored, or produced no scores"
|
|
912
1070
|
}
|
|
913
1071
|
continue
|
|
914
|
-
|
|
915
|
-
|
|
1072
|
+
fit_work.append(
|
|
1073
|
+
(
|
|
1074
|
+
spec.name,
|
|
1075
|
+
spec.fit_slice,
|
|
1076
|
+
scorer_name,
|
|
916
1077
|
fit_slice.y_true,
|
|
917
1078
|
fit_scores,
|
|
918
1079
|
spec.selectors,
|
|
919
|
-
fitted_on_slice=spec.fit_slice,
|
|
920
|
-
scorer_name=scorer_name,
|
|
921
1080
|
)
|
|
922
|
-
|
|
923
|
-
|
|
1081
|
+
)
|
|
1082
|
+
|
|
1083
|
+
# Parallel fit phase. parallel_map at n_jobs=1 is a pure-Python for-loop
|
|
1084
|
+
# (Principle #4) — bit-identical to the pre-v0.36 sequential code.
|
|
1085
|
+
fit_results: list[_OpPointFitResult] = (
|
|
1086
|
+
parallel_map(
|
|
1087
|
+
_fit_one_op_point_pair,
|
|
1088
|
+
fit_work,
|
|
1089
|
+
n_jobs=n_jobs,
|
|
1090
|
+
description="harness _attach_transferred_operating_points (fit)",
|
|
1091
|
+
)
|
|
1092
|
+
if fit_work
|
|
1093
|
+
else []
|
|
1094
|
+
)
|
|
1095
|
+
|
|
1096
|
+
# Index by (spec_name, scorer_name) for O(1) lookup in the apply phase.
|
|
1097
|
+
fitted_by_pair: dict[tuple[str, str], object] = {
|
|
1098
|
+
(spec_name, scorer_name): fitted for spec_name, scorer_name, fitted in fit_results
|
|
1099
|
+
}
|
|
1100
|
+
fitted_by_pair.update(fit_skip_reasons)
|
|
924
1101
|
|
|
1102
|
+
# Sequential apply phase — preserves the original by_slice mutation order
|
|
1103
|
+
# and the schema of error / skipped markers.
|
|
1104
|
+
for spec in specs_with_valid_fit:
|
|
1105
|
+
names = names_per_spec[spec.name]
|
|
925
1106
|
for target_name in spec.apply_slices:
|
|
926
1107
|
if target_name not in slices_by_name:
|
|
927
1108
|
_record_spec_error(
|
|
@@ -939,7 +1120,7 @@ def _attach_transferred_operating_points(
|
|
|
939
1120
|
spec_block: dict[str, object] = {}
|
|
940
1121
|
transfer_block[spec.name] = spec_block
|
|
941
1122
|
|
|
942
|
-
fitted = fitted_by_scorer.get(scorer_name)
|
|
1123
|
+
fitted = fitted_by_pair.get((spec.name, scorer_name))
|
|
943
1124
|
if not isinstance(fitted, dict) or "error" in fitted:
|
|
944
1125
|
spec_block["error"] = (
|
|
945
1126
|
str(fitted.get("error", "threshold fitting failed"))
|
|
@@ -1099,6 +1280,7 @@ def evaluate_folded(
|
|
|
1099
1280
|
on_scorer_error: Literal["raise", "record"] = "raise",
|
|
1100
1281
|
eval_split_names: Sequence[str] = ("test",),
|
|
1101
1282
|
summary_metrics: Sequence[str] = ("pr_auc", "roc_auc"),
|
|
1283
|
+
n_jobs: int = 1,
|
|
1102
1284
|
) -> RunResult:
|
|
1103
1285
|
"""Run a fold aggregator: ``Splitter × seeds → RunResult`` with CV-CI summary.
|
|
1104
1286
|
|
|
@@ -1128,6 +1310,15 @@ def evaluate_folded(
|
|
|
1128
1310
|
RNG seeds for multi-seed × CV. Default ``(42,)`` (single seed).
|
|
1129
1311
|
n_resamples, paired_diffs, leakage_checks, on_leakage, on_scorer_error :
|
|
1130
1312
|
Forwarded to :func:`evaluate` per fold.
|
|
1313
|
+
n_jobs : int, optional
|
|
1314
|
+
Parallel workers (default 1 — sequential). Forwarded to
|
|
1315
|
+
:func:`evaluate` per fold; parallelizes the inner
|
|
1316
|
+
``(slice × scorer)`` work-unit loop within each fold. Folds
|
|
1317
|
+
themselves run sequentially to keep determinism + traceback
|
|
1318
|
+
fidelity simple; for fold-level parallelism, consider an external
|
|
1319
|
+
``joblib.Parallel`` wrapper at the call site. See
|
|
1320
|
+
:doc:`methodology/parallelism` § Scorer picklability for the
|
|
1321
|
+
Scorer picklability contract when ``n_jobs != 1``.
|
|
1131
1322
|
eval_split_names : sequence of str, optional
|
|
1132
1323
|
Subset of each fold-dict's keys to actually evaluate. Default
|
|
1133
1324
|
``("test",)`` — train sets are skipped (eval-only K-fold). Pass
|
|
@@ -1183,6 +1374,7 @@ def evaluate_folded(
|
|
|
1183
1374
|
leakage_checks=leakage_checks,
|
|
1184
1375
|
on_leakage=on_leakage,
|
|
1185
1376
|
on_scorer_error=on_scorer_error,
|
|
1377
|
+
n_jobs=n_jobs,
|
|
1186
1378
|
)
|
|
1187
1379
|
by_fold[fold_id] = fold_result
|
|
1188
1380
|
|
|
@@ -1017,7 +1017,7 @@
|
|
|
1017
1017
|
"doc_first_line": "str(object='') -> str",
|
|
1018
1018
|
"kind": "value",
|
|
1019
1019
|
"type": "str",
|
|
1020
|
-
"value": "'0.
|
|
1020
|
+
"value": "'0.36.0'"
|
|
1021
1021
|
},
|
|
1022
1022
|
"apply_operating_points": {
|
|
1023
1023
|
"doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
|
|
@@ -1117,7 +1117,7 @@
|
|
|
1117
1117
|
"evaluate": {
|
|
1118
1118
|
"doc_first_line": "Run every scorer on every slice; return a pure :class:`RunResult` (no IO).",
|
|
1119
1119
|
"kind": "function",
|
|
1120
|
-
"signature": "(scorers: 'dict[str, Scorer]', slices: 'Sequence[EvalSlice]', *, run_id: 'str', git_sha: 'str | None' = None, n_resamples: 'int' = 1000, paired_diffs: 'list[tuple[str, str]] | None' = None, seed: 'int' = 42, extra_config: 'Mapping[str, object] | None' = None, leakage_checks: 'Sequence[LeakageCheck]' = (), on_leakage: \"Literal['raise', 'record', 'skip']\" = 'raise', on_scorer_error: \"Literal['raise', 'record']\" = 'raise', operating_point_specs: 'Sequence[OperatingPointSpec]' = ()) -> 'RunResult'"
|
|
1120
|
+
"signature": "(scorers: 'dict[str, Scorer]', slices: 'Sequence[EvalSlice]', *, run_id: 'str', git_sha: 'str | None' = None, n_resamples: 'int' = 1000, paired_diffs: 'list[tuple[str, str]] | None' = None, seed: 'int' = 42, extra_config: 'Mapping[str, object] | None' = None, leakage_checks: 'Sequence[LeakageCheck]' = (), on_leakage: \"Literal['raise', 'record', 'skip']\" = 'raise', on_scorer_error: \"Literal['raise', 'record']\" = 'raise', operating_point_specs: 'Sequence[OperatingPointSpec]' = (), n_jobs: 'int' = 1) -> 'RunResult'"
|
|
1121
1121
|
},
|
|
1122
1122
|
"evaluate_claims": {
|
|
1123
1123
|
"doc_first_line": "Evaluate claim specs against a result payload and optional manifest.",
|
|
@@ -1127,7 +1127,7 @@
|
|
|
1127
1127
|
"evaluate_folded": {
|
|
1128
1128
|
"doc_first_line": "Run a fold aggregator: ``Splitter \u00d7 seeds \u2192 RunResult`` with CV-CI summary.",
|
|
1129
1129
|
"kind": "function",
|
|
1130
|
-
"signature": "(scorers: 'dict[str, Scorer]', splitter: 'Splitter', slice_: 'EvalSlice', *, run_id: 'str', git_sha: 'str | None' = None, seeds: 'Sequence[int]' = (42,), n_resamples: 'int' = 1000, paired_diffs: 'list[tuple[str, str]] | None' = None, leakage_checks: 'Sequence[LeakageCheck]' = (), on_leakage: \"Literal['raise', 'record', 'skip']\" = 'raise', on_scorer_error: \"Literal['raise', 'record']\" = 'raise', eval_split_names: 'Sequence[str]' = ('test',), summary_metrics: 'Sequence[str]' = ('pr_auc', 'roc_auc')) -> 'RunResult'"
|
|
1130
|
+
"signature": "(scorers: 'dict[str, Scorer]', splitter: 'Splitter', slice_: 'EvalSlice', *, run_id: 'str', git_sha: 'str | None' = None, seeds: 'Sequence[int]' = (42,), n_resamples: 'int' = 1000, paired_diffs: 'list[tuple[str, str]] | None' = None, leakage_checks: 'Sequence[LeakageCheck]' = (), on_leakage: \"Literal['raise', 'record', 'skip']\" = 'raise', on_scorer_error: \"Literal['raise', 'record']\" = 'raise', eval_split_names: 'Sequence[str]' = ('test',), summary_metrics: 'Sequence[str]' = ('pr_auc', 'roc_auc'), n_jobs: 'int' = 1) -> 'RunResult'"
|
|
1131
1131
|
},
|
|
1132
1132
|
"evaluate_scorer_on_slice": {
|
|
1133
1133
|
"doc_first_line": "Score one scorer on one slice; return headline + bootstrap CI on PR-AUC.",
|