eval-toolkit 0.32.0__tar.gz → 0.33.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/CHANGELOG.md +52 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/PKG-INFO +1 -1
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/__init__.py +3 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/_version.py +1 -1
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/plotting.py +364 -4
- eval_toolkit-0.33.0/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
- eval_toolkit-0.33.0/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
- eval_toolkit-0.33.0/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/golden/public_api/snapshot.json +21 -3
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_plotting_edge.py +201 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_plotting_visual.py +56 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/.gitignore +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/LICENSE +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/README.md +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/STYLE.md +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/docs/archive/README.md +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/docs/research/README.md +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/docs/research/datasets/README.md +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/docs/research/papers/data-integrity/README.md +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/docs/research/papers/inference/README.md +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/docs/research/papers/prompt-injection/README.md +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/docs/source/methodology/README.md +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/pyproject.toml +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/__main__.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/_deprecated.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/analysis.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/artifacts.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/bootstrap.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/calibration.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/claims.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/config.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/docs.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/evidence.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/harness.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/leakage.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/loaders.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/manifest.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/metrics.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/operating_points.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/paths.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/protocols.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/provenance.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/py.typed +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/seeds.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/splits.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/text_dedup.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/thresholds.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/benchmarks/__init__.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/conftest.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/golden/bootstrap_ci/cases.json +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/golden/docs/expected.md +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/golden/docs/input.md +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/golden/docs/metrics.json +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/strategies.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_analysis.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_artifacts.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_bootstrap_calibration_mc.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_bootstrap_edge_cases.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_bootstrap_golden.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_bootstrap_props.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_bootstrap_research_grounded.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_bootstrap_unit.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_calibration_bootstrap_chain.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_calibration_determinism.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_calibration_optimization_failures.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_calibration_props.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_calibration_research_grounded.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_calibration_unit.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_claims.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_claims_coverage.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_claims_props.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_cli.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_config.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_coverage_bootstrap.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_coverage_calibration.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_coverage_harness.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_coverage_metrics.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_coverage_plotting.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_dedup_split_leakage_chain.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_deprecations.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_docs_golden.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_docs_props.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_evidence_validators.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_harness_edge_cases.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_harness_fault_injection.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_harness_folded.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_harness_internals.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_harness_metric_options.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_harness_smoke.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_import_boundaries.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_leakage.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_leakage_error_paths.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_leakage_props.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_loaders.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_loaders_coverage.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_loaders_props.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_logging.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_manifest.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_manifest_contamination_round_trip.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_manifest_props.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_manifest_validation.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_metrics_props.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_metrics_stratified_subsets.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_metrics_unit.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_misc_coverage.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_numeric_edge_cases.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_operating_points.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_operating_points_props.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_paths.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_pipeline_e2e.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_plotting_smoke.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_protocol_conformance.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_provenance.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_public_api.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_reference_equivalence.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_reproducibility_integration.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_schemas.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_seeds.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_splits.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_splits_leakage_integration.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_splits_props.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_text_dedup.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_text_dedup_coverage.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_text_dedup_props.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_text_dedup_strategies.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_thresholds.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_thresholds_constant_score.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_thresholds_coverage.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_thresholds_props.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_thresholds_research_grounded.py +0 -0
- {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_v09_contracts.py +0 -0
|
@@ -7,6 +7,58 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.33.0] — 2026-05-17 — Plotting batch + ax= parity + CI quality-of-life
|
|
11
|
+
|
|
12
|
+
Consumer-unblocking release: closes the four upstream-gap TODOs in
|
|
13
|
+
`prompt-injection-detection-submission`'s Phase 4 figures (F1, F2, F5,
|
|
14
|
+
F6-left) which had been carrying hand-rolled prototypes pending these
|
|
15
|
+
primitives. Also bundles two CI/maintenance fixes that were quality-of-life
|
|
16
|
+
pain points during v0.32 ship.
|
|
17
|
+
|
|
18
|
+
**Note**: The `v0.33` milestone's #3 (`make_minilm_embedder`) is deferred
|
|
19
|
+
to the next iteration (likely v0.33.1 or v0.34) so this release stays
|
|
20
|
+
focused on the plotting batch + `ax=` parity. MiniLM adds a new optional
|
|
21
|
+
dep + new module; ships better as its own bite.
|
|
22
|
+
|
|
23
|
+
No breaking changes. Public API gains 3 new plotting exports
|
|
24
|
+
(`plot_roc_curve`, `plot_pareto_frontier`, `plot_slice_metric_heatmap`)
|
|
25
|
+
and adds an `ax=` kwarg to 2 existing plotting fns (`plot_metric_bars`,
|
|
26
|
+
`plot_score_histograms`) — all additive.
|
|
27
|
+
|
|
28
|
+
### Added
|
|
29
|
+
|
|
30
|
+
- `eval_toolkit.plotting.plot_roc_curve` — sibling to `plot_pr_curve`;
|
|
31
|
+
accepts `ax=`, optional baseline overlay, threshold marker. Includes
|
|
32
|
+
a diagonal chance line. Closes #14.
|
|
33
|
+
- `eval_toolkit.plotting.plot_pareto_frontier` — cost-vs-performance
|
|
34
|
+
scatter with running-best frontier overlay (O(n log n) sweep). Supports
|
|
35
|
+
both higher-is-better and lower-is-better metric directions, optional
|
|
36
|
+
per-point labels. Closes #15.
|
|
37
|
+
- `eval_toolkit.plotting.plot_slice_metric_heatmap` — (rows × cols × metric)
|
|
38
|
+
heatmap with colorbar + optional cell annotations + NaN-cell masking.
|
|
39
|
+
Closes #16.
|
|
40
|
+
|
|
41
|
+
### Changed
|
|
42
|
+
|
|
43
|
+
- `plot_metric_bars` and `plot_score_histograms` now accept an `ax=` kwarg,
|
|
44
|
+
bringing the count of `ax=`-accepting plotting fns to 6 of 7
|
|
45
|
+
(`plot_confusion_matrix_grid` remains figure-creating since it's
|
|
46
|
+
intrinsically a grid-of-axes). Closes #24.
|
|
47
|
+
- `Makefile`'s `coverage` target now filters `monte_carlo` and `benchmark`
|
|
48
|
+
markers, matching what `.github/workflows/ci.yml` actually runs. `make ci`
|
|
49
|
+
drops from ~45 min to ~3 min locally. Closes #25.
|
|
50
|
+
|
|
51
|
+
### Internal
|
|
52
|
+
|
|
53
|
+
- 16 new edge tests covering input validation + `ax=` branches for the
|
|
54
|
+
3 new plotting fns and the 2 backfilled ones.
|
|
55
|
+
- 3 new `@pytest.mark.mpl_image_compare` baseline tests + checked-in
|
|
56
|
+
baseline PNGs for the new plotting fns.
|
|
57
|
+
- `.github/workflows/*.yml` audited for Node.js 20 deprecation; bumped
|
|
58
|
+
`actions/upload-artifact@v4 → v5` (3 workflows) and
|
|
59
|
+
`actions/download-artifact@v4 → v5` (publish.yml) ahead of the
|
|
60
|
+
2026-09-16 Node-20 removal deadline. Closes #26.
|
|
61
|
+
|
|
10
62
|
## [0.32.0] — 2026-05-16 — Multiple-comparisons correction + EvidenceGate discoverability
|
|
11
63
|
|
|
12
64
|
Bundled close-outs from the `v0.32` milestone triage (4 issues). Adds
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-toolkit
|
|
3
|
-
Version: 0.32.0
|
|
3
|
+
Version: 0.33.0
|
|
4
4
|
Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
|
|
5
5
|
Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
|
|
6
6
|
Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
|
|
@@ -195,9 +195,12 @@ _EXPORTS: dict[str, str] = {
|
|
|
195
195
|
"plot_confusion_matrix_grid": "eval_toolkit.plotting",
|
|
196
196
|
"plot_lift_ci": "eval_toolkit.plotting",
|
|
197
197
|
"plot_metric_bars": "eval_toolkit.plotting",
|
|
198
|
+
"plot_pareto_frontier": "eval_toolkit.plotting",
|
|
198
199
|
"plot_pr_curve": "eval_toolkit.plotting",
|
|
199
200
|
"plot_reliability_diagram": "eval_toolkit.plotting",
|
|
201
|
+
"plot_roc_curve": "eval_toolkit.plotting",
|
|
200
202
|
"plot_score_histograms": "eval_toolkit.plotting",
|
|
203
|
+
"plot_slice_metric_heatmap": "eval_toolkit.plotting",
|
|
201
204
|
"save_figure": "eval_toolkit.plotting",
|
|
202
205
|
"set_plot_style": "eval_toolkit.plotting",
|
|
203
206
|
# --- provenance ---
|
|
@@ -21,7 +21,7 @@ from __future__ import annotations
|
|
|
21
21
|
|
|
22
22
|
import json
|
|
23
23
|
import os
|
|
24
|
-
from collections.abc import Callable, Container, Iterable, Mapping
|
|
24
|
+
from collections.abc import Callable, Container, Iterable, Mapping, Sequence
|
|
25
25
|
from pathlib import Path
|
|
26
26
|
from types import MappingProxyType
|
|
27
27
|
from typing import TYPE_CHECKING, Any, cast
|
|
@@ -34,7 +34,7 @@ from matplotlib.colors import LinearSegmentedColormap
|
|
|
34
34
|
from matplotlib.figure import Figure
|
|
35
35
|
from matplotlib.patches import Rectangle
|
|
36
36
|
from sklearn.calibration import calibration_curve
|
|
37
|
-
from sklearn.metrics import precision_recall_curve
|
|
37
|
+
from sklearn.metrics import precision_recall_curve, roc_curve
|
|
38
38
|
|
|
39
39
|
if TYPE_CHECKING:
|
|
40
40
|
from eval_toolkit.bootstrap import BootstrapCI
|
|
@@ -48,9 +48,12 @@ __all__ = [
|
|
|
48
48
|
"plot_confusion_matrix_grid",
|
|
49
49
|
"plot_lift_ci",
|
|
50
50
|
"plot_metric_bars",
|
|
51
|
+
"plot_pareto_frontier",
|
|
51
52
|
"plot_pr_curve",
|
|
52
53
|
"plot_reliability_diagram",
|
|
54
|
+
"plot_roc_curve",
|
|
53
55
|
"plot_score_histograms",
|
|
56
|
+
"plot_slice_metric_heatmap",
|
|
54
57
|
"save_figure",
|
|
55
58
|
"set_plot_style",
|
|
56
59
|
]
|
|
@@ -451,6 +454,129 @@ def plot_pr_curve(
|
|
|
451
454
|
return fig
|
|
452
455
|
|
|
453
456
|
|
|
457
|
+
def plot_roc_curve(
    y_true: np.ndarray,
    y_score: np.ndarray,
    *,
    label: str | None = None,
    threshold: float | None = None,
    baseline_curve: tuple[np.ndarray, np.ndarray] | None = None,
    baseline_label: str = "baseline",
    title: str | None = None,
    figsize: tuple[float, float] | None = None,
    ax: Axes | None = None,
) -> Figure:
    """Receiver-operating-characteristic curve.

    Sibling of :func:`plot_pr_curve`. Plots the (FPR, TPR) curve with a
    diagonal chance line. ROC is invariant to class prior, so unlike
    `plot_pr_curve` there is no ``prevalence`` parameter.

    All input validation and curve computation happen before any Axes is
    acquired, so a failing call raises without first drawing on (or
    requesting) an Axes.

    Parameters
    ----------
    y_true, y_score : np.ndarray
        Labels and scores.
    label : str or None, optional
        Legend label for the main curve.
    threshold : float or None, optional
        Draw a star marker at the (fpr, tpr) point closest to this
        threshold. Must lie in [0, 1].
    baseline_curve : tuple of (fpr, tpr) np.ndarrays, optional
        Optional baseline curve to overlay (e.g., a simpler reference model).
    baseline_label : str, optional
        Legend label for the baseline overlay (default ``"baseline"``).
    title : str or None, optional
        Axes title; no title is set when None.
    figsize : tuple of float or None, optional
        Forwarded to the axes resolver together with ``ax``.
    ax : matplotlib Axes or None, optional
        Render onto this Axes (reuses its parent Figure); otherwise creates a
        fresh figure.

    Returns
    -------
    matplotlib.figure.Figure

    Raises
    ------
    ValueError
        If ``y_true``/``y_score`` fail shape/dtype/value-range checks
        (re-raised from validators); if ``threshold`` is outside [0, 1];
        if ``baseline_curve`` is not a length-2 tuple with
        matching-shape ``(fpr, tpr)`` arrays; or if ``threshold`` is given
        and ``roc_curve`` yields no finite thresholds.
    """
    y_true = _ensure_ndarray("y_true", y_true)
    y_score = _ensure_ndarray("y_score", y_score)
    _validate_pair(y_true, y_score, other_name="y_score")

    if threshold is not None and not 0.0 <= threshold <= 1.0:
        raise ValueError(f"threshold must be in [0, 1], got {threshold}")

    # Keep the validated baseline arrays so the overlay plots exactly what
    # was shape-checked instead of re-deriving them from the raw tuple.
    bl_fpr: np.ndarray | None = None
    bl_tpr: np.ndarray | None = None
    if baseline_curve is not None:
        if not (isinstance(baseline_curve, tuple) and len(baseline_curve) == 2):
            raise ValueError("baseline_curve must be a (fpr, tpr) tuple")
        bl_fpr = _ensure_ndarray("baseline_curve[0]", baseline_curve[0])
        bl_tpr = _ensure_ndarray("baseline_curve[1]", baseline_curve[1])
        if bl_fpr.shape != bl_tpr.shape:
            raise ValueError(
                f"baseline_curve fpr and tpr must have same shape, "
                f"got {bl_fpr.shape} vs {bl_tpr.shape}"
            )

    # Everything that can still raise runs before the Axes is resolved.
    fpr, tpr, thresholds = roc_curve(y_true, y_score)

    marker_idx: int | None = None
    if threshold is not None:
        # roc_curve prepends a sentinel threshold of np.inf; finite-mask it
        # before picking the closest match so the marker lands on a real point.
        finite = np.isfinite(thresholds)
        if not finite.any():
            raise ValueError("roc_curve returned no finite thresholds")
        rel_idx = int(np.argmin(np.abs(thresholds[finite] - threshold)))
        # Translate the position within the finite subset back to an index
        # into the full fpr/tpr arrays.
        marker_idx = int(np.flatnonzero(finite)[rel_idx])

    fig, axes = _resolve_axes(ax, figsize)

    axes.plot(fpr, tpr, color=PALETTE["negative"], label=label, linewidth=1.5)

    # Diagonal chance line (AUC = 0.5 reference).
    axes.plot(
        [0.0, 1.0],
        [0.0, 1.0],
        color=PALETTE["baseline"],
        linestyle="--",
        linewidth=0.8,
        alpha=0.7,
        label="chance",
    )

    if bl_fpr is not None and bl_tpr is not None:
        axes.plot(
            bl_fpr,
            bl_tpr,
            color=PALETTE["baseline"],
            linewidth=1.0,
            linestyle="--",
            label=baseline_label,
            zorder=1,
        )
    if marker_idx is not None:
        axes.scatter(
            fpr[marker_idx],
            tpr[marker_idx],
            color=PALETTE["accent"],
            marker="*",
            s=120,
            zorder=5,
            label=f"τ={threshold:.3f}",
            edgecolor="black",
            linewidth=0.5,
        )

    axes.set_xlabel("False Positive Rate")
    axes.set_ylabel("True Positive Rate")
    axes.set_xlim(0.0, 1.0)
    # Slight headroom above TPR = 1 so the top of the curve is not clipped.
    axes.set_ylim(0.0, 1.05)
    if title is not None:
        axes.set_title(title)
    _maybe_add_legend(axes)
    fig.tight_layout()
    return fig
|
|
578
|
+
|
|
579
|
+
|
|
454
580
|
def plot_reliability_diagram(
|
|
455
581
|
y_true: np.ndarray,
|
|
456
582
|
y_prob: np.ndarray,
|
|
@@ -670,6 +796,7 @@ def plot_metric_bars(
|
|
|
670
796
|
figsize: tuple[float, float] | None = None,
|
|
671
797
|
label_formatter: Callable[[str], str] | None = None,
|
|
672
798
|
sort_key: Callable[[str], Any] | None = None,
|
|
799
|
+
ax: Axes | None = None,
|
|
673
800
|
) -> Figure:
|
|
674
801
|
"""Bar chart for a ``{label: metric}`` mapping.
|
|
675
802
|
|
|
@@ -684,6 +811,9 @@ def plot_metric_bars(
|
|
|
684
811
|
Maps raw key → display label. Default is identity.
|
|
685
812
|
sort_key : callable or None, optional
|
|
686
813
|
Maps raw key → sort key. Default is alphabetical.
|
|
814
|
+
ax : matplotlib Axes or None, optional
|
|
815
|
+
Render onto this Axes (reuses its parent Figure); otherwise creates a
|
|
816
|
+
fresh figure.
|
|
687
817
|
|
|
688
818
|
Returns
|
|
689
819
|
-------
|
|
@@ -703,7 +833,7 @@ def plot_metric_bars(
|
|
|
703
833
|
labels = [fmt(k) for k, _ in sorted_items]
|
|
704
834
|
bar_values = [v for _, v in sorted_items]
|
|
705
835
|
|
|
706
|
-
fig, axes =
|
|
836
|
+
fig, axes = _resolve_axes(ax, figsize)
|
|
707
837
|
bar_color = color or PALETTE["negative"]
|
|
708
838
|
axes.bar(labels, bar_values, color=bar_color, edgecolor="black", linewidth=0.5)
|
|
709
839
|
upper = max(bar_values)
|
|
@@ -728,6 +858,7 @@ def plot_score_histograms(
|
|
|
728
858
|
figsize: tuple[float, float] | None = None,
|
|
729
859
|
label_formatter: Callable[[str], str] | None = None,
|
|
730
860
|
sort_key: Callable[[str], Any] | None = None,
|
|
861
|
+
ax: Axes | None = None,
|
|
731
862
|
) -> Figure:
|
|
732
863
|
"""Overlaid score-distribution histograms, one per slice.
|
|
733
864
|
|
|
@@ -742,6 +873,9 @@ def plot_score_histograms(
|
|
|
742
873
|
title, figsize : optional
|
|
743
874
|
label_formatter, sort_key : callable or None, optional
|
|
744
875
|
See :func:`plot_metric_bars`.
|
|
876
|
+
ax : matplotlib Axes or None, optional
|
|
877
|
+
Render onto this Axes (reuses its parent Figure); otherwise creates a
|
|
878
|
+
fresh figure.
|
|
745
879
|
|
|
746
880
|
Returns
|
|
747
881
|
-------
|
|
@@ -778,7 +912,7 @@ def plot_score_histograms(
|
|
|
778
912
|
PALETTE["baseline"],
|
|
779
913
|
]
|
|
780
914
|
|
|
781
|
-
fig, axes =
|
|
915
|
+
fig, axes = _resolve_axes(ax, figsize)
|
|
782
916
|
for i, (key, arr) in enumerate(sorted_items):
|
|
783
917
|
color = palette_cycle[i % len(palette_cycle)]
|
|
784
918
|
axes.hist(
|
|
@@ -989,3 +1123,229 @@ def plot_bootstrap_distribution(
|
|
|
989
1123
|
axes.set_title(title)
|
|
990
1124
|
fig.tight_layout()
|
|
991
1125
|
return fig
|
|
1126
|
+
|
|
1127
|
+
|
|
1128
|
+
def plot_pareto_frontier(
    cost: np.ndarray,
    metric: np.ndarray,
    *,
    point_labels: Sequence[str] | None = None,
    higher_metric_is_better: bool = True,
    xlabel: str = "cost",
    ylabel: str = "metric",
    title: str | None = None,
    figsize: tuple[float, float] | None = None,
    ax: Axes | None = None,
) -> Figure:
    """Cost-vs-performance scatter with Pareto frontier overlay.

    Points on the frontier (the running-best metric as cost increases) are
    drawn in accent color and connected by a dashed polyline; dominated
    points are drawn in muted baseline color. Cost is always assumed
    lower-is-better; ``higher_metric_is_better`` controls the metric
    direction.

    Parameters
    ----------
    cost : np.ndarray, shape (n,)
        Cost values (training/inference/compute proxy; lower-is-better).
    metric : np.ndarray, shape (n,)
        Metric values aligned with ``cost``.
    point_labels : Sequence[str] or None, optional
        Per-point annotations (e.g., rung names). If provided, must have
        length ``n``.
    higher_metric_is_better : bool, optional
        If True (default), frontier maximises metric at minimum cost. If
        False, frontier minimises both (e.g., metric is an error/loss).
    xlabel, ylabel : str, optional
        Axis labels. Defaults ``"cost"`` and ``"metric"``.
    title : str or None, optional
        Axes title; no title is set when None.
    figsize : tuple of float or None, optional
        Forwarded to the axes resolver together with ``ax``.
    ax : matplotlib Axes or None, optional
        Axes to draw on; forwarded to the axes resolver.

    Returns
    -------
    matplotlib.figure.Figure

    Raises
    ------
    ValueError
        If ``cost`` and ``metric`` shapes don't match, are not 1-D, are
        empty, contain NaN/inf, or if ``point_labels`` length disagrees.
    """
    cost = _ensure_ndarray("cost", cost)
    metric = _ensure_ndarray("metric", metric)
    if cost.ndim != 1 or metric.ndim != 1:
        raise ValueError(
            f"cost and metric must be 1-D, got shapes {cost.shape} and {metric.shape}"
        )
    if cost.shape != metric.shape:
        raise ValueError(
            f"cost and metric must have same shape, got {cost.shape} vs {metric.shape}"
        )
    if cost.size == 0:
        raise ValueError("cost and metric must be non-empty")
    if not (np.isfinite(cost).all() and np.isfinite(metric).all()):
        raise ValueError("cost and metric must contain finite values only")
    if point_labels is not None and len(point_labels) != cost.size:
        raise ValueError(f"point_labels length {len(point_labels)} != n={cost.size}")

    fig, axes = _resolve_axes(ax, figsize)

    # Sweep frontier: sort by cost ascending; a point is on the frontier iff
    # it strictly improves on the best metric seen at any cheaper-or-equal
    # earlier position. ``np.lexsort`` uses its LAST key as the primary key,
    # so cost is primary and, within equal cost, the better metric sorts
    # first. ``sign`` folds the metric direction into a uniform "larger is
    # better" comparison.
    sign = 1.0 if higher_metric_is_better else -1.0
    order = np.lexsort((-sign * metric, cost))
    cost_s = cost[order]
    metric_s = metric[order]

    # Vectorised running-best: compare each signed metric with the maximum
    # of everything before it (-inf ahead of the first point, which is
    # therefore always a frontier member).
    signed_s = sign * metric_s.astype(np.float64)
    best_before = np.concatenate(([-np.inf], np.maximum.accumulate(signed_s)[:-1]))
    on_frontier = signed_s > best_before

    # Map back to original indices so masks line up with the input order.
    frontier_mask = np.zeros(cost.size, dtype=bool)
    frontier_mask[order[on_frontier]] = True
    dominated_mask = ~frontier_mask

    axes.scatter(
        cost[dominated_mask],
        metric[dominated_mask],
        color=PALETTE["baseline"],
        s=40,
        alpha=0.7,
        zorder=2,
        # Only contribute a legend entry when there is something to show.
        label="dominated" if dominated_mask.any() else None,
    )
    axes.scatter(
        cost[frontier_mask],
        metric[frontier_mask],
        color=PALETTE["accent"],
        edgecolor="black",
        linewidth=0.5,
        s=70,
        zorder=4,
        label="frontier",
    )
    # The frontier is never empty here (inputs are non-empty and finite), so
    # the connecting polyline is drawn unconditionally, in cost order.
    axes.plot(
        cost_s[on_frontier],
        metric_s[on_frontier],
        color=PALETTE["accent"],
        linestyle="--",
        linewidth=1.0,
        alpha=0.8,
        zorder=3,
    )
    if point_labels is not None:
        for label, x, y in zip(point_labels, cost, metric, strict=True):
            axes.annotate(
                label,
                (float(x), float(y)),
                textcoords="offset points",
                xytext=(6, 4),
                fontsize=9,
            )

    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    if title is not None:
        axes.set_title(title)
    _maybe_add_legend(axes)
    fig.tight_layout()
    return fig
|
|
1258
|
+
|
|
1259
|
+
|
|
1260
|
+
def plot_slice_metric_heatmap(
    grid: np.ndarray,
    *,
    row_labels: Sequence[str],
    col_labels: Sequence[str],
    metric_name: str = "metric",
    cmap: str = "viridis",
    annotate: bool = True,
    annot_fmt: str = "{:.3f}",
    title: str | None = None,
    figsize: tuple[float, float] | None = None,
    ax: Axes | None = None,
) -> Figure:
    """Heatmap of a (row × col × metric) grid with colorbar.

    Parameters
    ----------
    grid : np.ndarray, shape (n_rows, n_cols)
        Metric values, one per (row, col) cell. Non-finite cells (NaN and
        ±inf) are masked out of the image and are skipped from annotations.
    row_labels, col_labels : Sequence[str]
        Tick labels for the two axes; lengths must match ``grid.shape``.
    metric_name : str, optional
        Used as the colorbar label. Default ``"metric"``.
    cmap : str, optional
        Matplotlib colormap name. Default ``"viridis"``.
    annotate : bool, optional
        If True (default), write each finite cell's value on the heatmap.
        Text is white for values below the midpoint of the finite value
        range and black otherwise.
    annot_fmt : str, optional
        Format string for cell annotations. Default ``"{:.3f}"``.
    title : str or None, optional
        Axes title; no title is set when ``None``.
    figsize, ax : optional
        Forwarded unchanged to the module's ``_resolve_axes`` helper, which
        supplies the figure and axes that are drawn on.

    Returns
    -------
    matplotlib.figure.Figure

    Raises
    ------
    ValueError
        If ``grid`` is not 2-D, if label lengths disagree with the grid
        shape, or if the grid is empty.
    """
    grid_arr = _ensure_ndarray("grid", grid).astype(np.float64, copy=False)
    if grid_arr.ndim != 2:
        raise ValueError(f"grid must be 2-D, got shape {grid_arr.shape}")
    n_rows, n_cols = grid_arr.shape
    if n_rows == 0 or n_cols == 0:
        raise ValueError(f"grid must be non-empty, got shape {grid_arr.shape}")
    if len(row_labels) != n_rows:
        raise ValueError(f"row_labels length {len(row_labels)} != grid.shape[0] {n_rows}")
    if len(col_labels) != n_cols:
        raise ValueError(f"col_labels length {len(col_labels)} != grid.shape[1] {n_cols}")

    fig, axes = _resolve_axes(ax, figsize)

    # masked_invalid hides NaN *and* +/-inf, so the colour scale only spans
    # the finite cells.
    masked = np.ma.masked_invalid(grid_arr)
    im = axes.imshow(masked, cmap=cmap, aspect="auto")
    fig.colorbar(im, ax=axes, label=metric_name)

    axes.set_xticks(np.arange(n_cols))
    axes.set_yticks(np.arange(n_rows))
    axes.set_xticklabels(list(col_labels))
    axes.set_yticklabels(list(row_labels))
    axes.tick_params(axis="x", rotation=30)
    for tick in axes.get_xticklabels():
        tick.set_horizontalalignment("right")

    if annotate:
        finite_mask = np.isfinite(grid_arr)
        if finite_mask.any():
            # Derive the light/dark text switch from finite cells only, so it
            # matches the range the image is drawn over. nanmin/nanmax would
            # let a single +/-inf cell push the midpoint to +/-inf (or NaN)
            # and force one text colour onto every cell.
            finite_vals = grid_arr[finite_mask]
            # Halve before adding so two very large values cannot overflow.
            midpoint = 0.5 * float(finite_vals.min()) + 0.5 * float(finite_vals.max())
            # argwhere walks the mask in row-major order, one (row, col) pair
            # per finite cell.
            for i, j in np.argwhere(finite_mask):
                v = grid_arr[i, j]
                text_color = "white" if v < midpoint else "black"
                axes.text(
                    int(j),
                    int(i),
                    annot_fmt.format(v),
                    ha="center",
                    va="center",
                    color=text_color,
                    fontsize=9,
                )

    if title is not None:
        axes.set_title(title)
    fig.tight_layout()
    return fig
|
|
Binary file
|
|
@@ -164,9 +164,12 @@
|
|
|
164
164
|
"plot_confusion_matrix_grid",
|
|
165
165
|
"plot_lift_ci",
|
|
166
166
|
"plot_metric_bars",
|
|
167
|
+
"plot_pareto_frontier",
|
|
167
168
|
"plot_pr_curve",
|
|
168
169
|
"plot_reliability_diagram",
|
|
170
|
+
"plot_roc_curve",
|
|
169
171
|
"plot_score_histograms",
|
|
172
|
+
"plot_slice_metric_heatmap",
|
|
170
173
|
"pr_auc",
|
|
171
174
|
"precision_at_prior",
|
|
172
175
|
"quantile_stratified_pr_auc",
|
|
@@ -1001,7 +1004,7 @@
|
|
|
1001
1004
|
"doc_first_line": "str(object='') -> str",
|
|
1002
1005
|
"kind": "value",
|
|
1003
1006
|
"type": "str",
|
|
1004
|
-
"value": "'0.32.0'"
|
|
1007
|
+
"value": "'0.33.0'"
|
|
1005
1008
|
},
|
|
1006
1009
|
"apply_operating_points": {
|
|
1007
1010
|
"doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
|
|
@@ -1326,7 +1329,12 @@
|
|
|
1326
1329
|
"plot_metric_bars": {
|
|
1327
1330
|
"doc_first_line": "Bar chart for a ``{label: metric}`` mapping.",
|
|
1328
1331
|
"kind": "function",
|
|
1329
|
-
"signature": "(values: 'dict[str, float]', *, color: 'str | None' = None, ylabel: 'str | None' = None, title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, label_formatter: 'Callable[[str], str] | None' = None, sort_key: 'Callable[[str], Any] | None' = None) -> 'Figure'"
|
|
1332
|
+
"signature": "(values: 'dict[str, float]', *, color: 'str | None' = None, ylabel: 'str | None' = None, title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, label_formatter: 'Callable[[str], str] | None' = None, sort_key: 'Callable[[str], Any] | None' = None, ax: 'Axes | None' = None) -> 'Figure'"
|
|
1333
|
+
},
|
|
1334
|
+
"plot_pareto_frontier": {
|
|
1335
|
+
"doc_first_line": "Cost-vs-performance scatter with Pareto frontier overlay.",
|
|
1336
|
+
"kind": "function",
|
|
1337
|
+
"signature": "(cost: 'np.ndarray', metric: 'np.ndarray', *, point_labels: 'Sequence[str] | None' = None, higher_metric_is_better: 'bool' = True, xlabel: 'str' = 'cost', ylabel: 'str' = 'metric', title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, ax: 'Axes | None' = None) -> 'Figure'"
|
|
1330
1338
|
},
|
|
1331
1339
|
"plot_pr_curve": {
|
|
1332
1340
|
"doc_first_line": "Precision-recall curve.",
|
|
@@ -1338,10 +1346,20 @@
|
|
|
1338
1346
|
"kind": "function",
|
|
1339
1347
|
"signature": "(y_true: 'np.ndarray', y_prob: 'np.ndarray', *, n_bins: 'int' = 10, bin_counts: 'np.ndarray | None' = None, xlabel: 'str' = 'Mean Predicted Probability', ylabel: 'str' = 'Observed Fraction of Positives', title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, ax: 'Axes | None' = None) -> 'Figure'"
|
|
1340
1348
|
},
|
|
1349
|
+
"plot_roc_curve": {
|
|
1350
|
+
"doc_first_line": "Receiver-operating-characteristic curve.",
|
|
1351
|
+
"kind": "function",
|
|
1352
|
+
"signature": "(y_true: 'np.ndarray', y_score: 'np.ndarray', *, label: 'str | None' = None, threshold: 'float | None' = None, baseline_curve: 'tuple[np.ndarray, np.ndarray] | None' = None, baseline_label: 'str' = 'baseline', title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, ax: 'Axes | None' = None) -> 'Figure'"
|
|
1353
|
+
},
|
|
1341
1354
|
"plot_score_histograms": {
|
|
1342
1355
|
"doc_first_line": "Overlaid score-distribution histograms, one per slice.",
|
|
1343
1356
|
"kind": "function",
|
|
1344
|
-
"signature": "(scores_by_slice: 'dict[str, np.ndarray]', *, scorer_name: 'str | None' = None, bins: 'int' = 30, title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, label_formatter: 'Callable[[str], str] | None' = None, sort_key: 'Callable[[str], Any] | None' = None) -> 'Figure'"
|
|
1357
|
+
"signature": "(scores_by_slice: 'dict[str, np.ndarray]', *, scorer_name: 'str | None' = None, bins: 'int' = 30, title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, label_formatter: 'Callable[[str], str] | None' = None, sort_key: 'Callable[[str], Any] | None' = None, ax: 'Axes | None' = None) -> 'Figure'"
|
|
1358
|
+
},
|
|
1359
|
+
"plot_slice_metric_heatmap": {
|
|
1360
|
+
"doc_first_line": "Heatmap of a (row \u00d7 col \u00d7 metric) grid with colorbar.",
|
|
1361
|
+
"kind": "function",
|
|
1362
|
+
"signature": "(grid: 'np.ndarray', *, row_labels: 'Sequence[str]', col_labels: 'Sequence[str]', metric_name: 'str' = 'metric', cmap: 'str' = 'viridis', annotate: 'bool' = True, annot_fmt: 'str' = '{:.3f}', title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, ax: 'Axes | None' = None) -> 'Figure'"
|
|
1345
1363
|
},
|
|
1346
1364
|
"pr_auc": {
|
|
1347
1365
|
"doc_first_line": "Average precision (PR-AUC).",
|