eval-toolkit 0.32.0__tar.gz → 0.33.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/CHANGELOG.md +52 -0
  2. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/PKG-INFO +1 -1
  3. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/__init__.py +3 -0
  4. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/_version.py +1 -1
  5. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/plotting.py +364 -4
  6. eval_toolkit-0.33.0/tests/baseline/test_plotting_visual/plot_pareto_frontier.png +0 -0
  7. eval_toolkit-0.33.0/tests/baseline/test_plotting_visual/plot_roc_curve.png +0 -0
  8. eval_toolkit-0.33.0/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png +0 -0
  9. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/golden/public_api/snapshot.json +21 -3
  10. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_plotting_edge.py +201 -0
  11. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_plotting_visual.py +56 -0
  12. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/.gitignore +0 -0
  13. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/LICENSE +0 -0
  14. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/README.md +0 -0
  15. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/STYLE.md +0 -0
  16. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/docs/archive/README.md +0 -0
  17. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/docs/research/README.md +0 -0
  18. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/docs/research/datasets/README.md +0 -0
  19. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/docs/research/papers/data-integrity/README.md +0 -0
  20. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/docs/research/papers/eval-ecosystem/README.md +0 -0
  21. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/docs/research/papers/inference/README.md +0 -0
  22. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/docs/research/papers/prompt-injection/README.md +0 -0
  23. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/docs/source/methodology/README.md +0 -0
  24. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/pyproject.toml +0 -0
  25. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/__main__.py +0 -0
  26. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/_deprecated.py +0 -0
  27. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/analysis.py +0 -0
  28. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/artifacts.py +0 -0
  29. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/bootstrap.py +0 -0
  30. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/calibration.py +0 -0
  31. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/claims.py +0 -0
  32. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/config.py +0 -0
  33. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/docs.py +0 -0
  34. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/evidence.py +0 -0
  35. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/harness.py +0 -0
  36. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/leakage.py +0 -0
  37. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/loaders.py +0 -0
  38. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/manifest.py +0 -0
  39. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/metrics.py +0 -0
  40. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/operating_points.py +0 -0
  41. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/paths.py +0 -0
  42. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/protocols.py +0 -0
  43. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/provenance.py +0 -0
  44. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/py.typed +0 -0
  45. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/schemas/manifest.v1.json +0 -0
  46. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/schemas/manifest.v2.json +0 -0
  47. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/schemas/manifest.v3.json +0 -0
  48. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/schemas/results.v1.json +0 -0
  49. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/schemas/results_full.v1.json +0 -0
  50. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/seeds.py +0 -0
  51. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/splits.py +0 -0
  52. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/text_dedup.py +0 -0
  53. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/thresholds.py +0 -0
  54. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/baseline/test_plotting_visual/plot_bootstrap_distribution.png +0 -0
  55. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/baseline/test_plotting_visual/plot_confusion_matrix_grid.png +0 -0
  56. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/baseline/test_plotting_visual/plot_lift_ci.png +0 -0
  57. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/baseline/test_plotting_visual/plot_metric_bars.png +0 -0
  58. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/baseline/test_plotting_visual/plot_pr_curve.png +0 -0
  59. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/baseline/test_plotting_visual/plot_reliability_diagram.png +0 -0
  60. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/baseline/test_plotting_visual/plot_score_histograms.png +0 -0
  61. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/benchmarks/__init__.py +0 -0
  62. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/benchmarks/test_kernel_benchmarks.py +0 -0
  63. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/conftest.py +0 -0
  64. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/golden/bootstrap_ci/cases.json +0 -0
  65. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/golden/docs/expected.md +0 -0
  66. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/golden/docs/input.md +0 -0
  67. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/golden/docs/metrics.json +0 -0
  68. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/strategies.py +0 -0
  69. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_analysis.py +0 -0
  70. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_artifacts.py +0 -0
  71. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_bootstrap_calibration_mc.py +0 -0
  72. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_bootstrap_edge_cases.py +0 -0
  73. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_bootstrap_golden.py +0 -0
  74. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_bootstrap_props.py +0 -0
  75. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_bootstrap_research_grounded.py +0 -0
  76. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_bootstrap_unit.py +0 -0
  77. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_calibration_bootstrap_chain.py +0 -0
  78. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_calibration_determinism.py +0 -0
  79. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_calibration_optimization_failures.py +0 -0
  80. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_calibration_props.py +0 -0
  81. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_calibration_research_grounded.py +0 -0
  82. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_calibration_unit.py +0 -0
  83. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_claims.py +0 -0
  84. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_claims_coverage.py +0 -0
  85. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_claims_props.py +0 -0
  86. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_cli.py +0 -0
  87. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_config.py +0 -0
  88. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_coverage_bootstrap.py +0 -0
  89. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_coverage_calibration.py +0 -0
  90. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_coverage_harness.py +0 -0
  91. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_coverage_metrics.py +0 -0
  92. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_coverage_plotting.py +0 -0
  93. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_dedup_split_leakage_chain.py +0 -0
  94. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_deprecations.py +0 -0
  95. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_docs_golden.py +0 -0
  96. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_docs_props.py +0 -0
  97. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_evidence_validators.py +0 -0
  98. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_harness_edge_cases.py +0 -0
  99. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_harness_fault_injection.py +0 -0
  100. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_harness_folded.py +0 -0
  101. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_harness_internals.py +0 -0
  102. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_harness_metric_options.py +0 -0
  103. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_harness_smoke.py +0 -0
  104. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_import_boundaries.py +0 -0
  105. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_leakage.py +0 -0
  106. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_leakage_error_paths.py +0 -0
  107. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_leakage_props.py +0 -0
  108. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_loaders.py +0 -0
  109. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_loaders_coverage.py +0 -0
  110. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_loaders_props.py +0 -0
  111. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_logging.py +0 -0
  112. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_manifest.py +0 -0
  113. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_manifest_contamination_round_trip.py +0 -0
  114. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_manifest_props.py +0 -0
  115. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_manifest_validation.py +0 -0
  116. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_metrics_props.py +0 -0
  117. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_metrics_stratified_subsets.py +0 -0
  118. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_metrics_unit.py +0 -0
  119. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_misc_coverage.py +0 -0
  120. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_numeric_edge_cases.py +0 -0
  121. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_operating_points.py +0 -0
  122. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_operating_points_props.py +0 -0
  123. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_paths.py +0 -0
  124. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_pipeline_e2e.py +0 -0
  125. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_plotting_smoke.py +0 -0
  126. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_protocol_conformance.py +0 -0
  127. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_provenance.py +0 -0
  128. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_public_api.py +0 -0
  129. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_reference_equivalence.py +0 -0
  130. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_reproducibility_integration.py +0 -0
  131. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_schemas.py +0 -0
  132. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_seeds.py +0 -0
  133. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_splits.py +0 -0
  134. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_splits_leakage_integration.py +0 -0
  135. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_splits_props.py +0 -0
  136. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_text_dedup.py +0 -0
  137. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_text_dedup_coverage.py +0 -0
  138. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_text_dedup_props.py +0 -0
  139. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_text_dedup_strategies.py +0 -0
  140. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_thresholds.py +0 -0
  141. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_thresholds_constant_score.py +0 -0
  142. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_thresholds_coverage.py +0 -0
  143. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_thresholds_props.py +0 -0
  144. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_thresholds_research_grounded.py +0 -0
  145. {eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/test_v09_contracts.py +0 -0
@@ -7,6 +7,58 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.33.0] — 2026-05-17 — Plotting batch + ax= parity + CI quality-of-life
11
+
12
+ Consumer-unblocking release: closes the four upstream-gap TODOs in
13
+ `prompt-injection-detection-submission`'s Phase 4 figures (F1, F2, F5,
14
+ F6-left) which had been carrying hand-rolled prototypes pending these
15
+ primitives. Also bundles two CI/maintenance fixes that were quality-of-life
16
+ pain points during v0.32 ship.
17
+
18
+ **Note**: The `v0.33` milestone's #3 (`make_minilm_embedder`) is deferred
19
+ to the next iteration (likely v0.33.1 or v0.34) so this release stays
20
+ focused on the plotting batch + `ax=` parity. MiniLM adds a new optional
21
+ dep + new module; ships better as its own bite.
22
+
23
+ No breaking changes. Public API gains 3 new plotting exports
24
+ (`plot_roc_curve`, `plot_pareto_frontier`, `plot_slice_metric_heatmap`)
25
+ and adds an `ax=` kwarg to 2 existing plotting fns (`plot_metric_bars`,
26
+ `plot_score_histograms`) — all additive.
27
+
28
+ ### Added
29
+
30
+ - `eval_toolkit.plotting.plot_roc_curve` — sibling to `plot_pr_curve`;
31
+ accepts `ax=`, optional baseline overlay, threshold marker. Includes
32
+ a diagonal chance line. Closes #14.
33
+ - `eval_toolkit.plotting.plot_pareto_frontier` — cost-vs-performance
34
+ scatter with running-best frontier overlay (O(n log n) sweep). Supports
35
+ both higher-is-better and lower-is-better metric directions, optional
36
+ per-point labels. Closes #15.
37
+ - `eval_toolkit.plotting.plot_slice_metric_heatmap` — (rows × cols × metric)
38
+ heatmap with colorbar + optional cell annotations + NaN-cell masking.
39
+ Closes #16.
40
+
41
+ ### Changed
42
+
43
+ - `plot_metric_bars` and `plot_score_histograms` now accept an `ax=` kwarg,
44
+ bringing the count of `ax=`-accepting plotting fns to 6 of 7
45
+ (`plot_confusion_matrix_grid` remains figure-creating since it's
46
+ intrinsically a grid-of-axes). Closes #24.
47
+ - `Makefile`'s `coverage` target now filters `monte_carlo` and `benchmark`
48
+ markers, matching what `.github/workflows/ci.yml` actually runs. `make ci`
49
+ drops from ~45 min to ~3 min locally. Closes #25.
50
+
51
+ ### Internal
52
+
53
+ - 16 new edge tests covering input validation + `ax=` branches for the
54
+ 3 new plotting fns and the 2 backfilled ones.
55
+ - 3 new `@pytest.mark.mpl_image_compare` baseline tests + checked-in
56
+ baseline PNGs for the new plotting fns.
57
+ - `.github/workflows/*.yml` audited for Node.js 20 deprecation; bumped
58
+ `actions/upload-artifact@v4 → v5` (3 workflows) and
59
+ `actions/download-artifact@v4 → v5` (publish.yml) ahead of the
60
+ 2026-09-16 Node-20 removal deadline. Closes #26.
61
+
10
62
  ## [0.32.0] — 2026-05-16 — Multiple-comparisons correction + EvidenceGate discoverability
11
63
 
12
64
  Bundled close-outs from the `v0.32` milestone triage (4 issues). Adds
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-toolkit
3
- Version: 0.32.0
3
+ Version: 0.33.0
4
4
  Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
5
5
  Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
6
6
  Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -195,9 +195,12 @@ _EXPORTS: dict[str, str] = {
195
195
  "plot_confusion_matrix_grid": "eval_toolkit.plotting",
196
196
  "plot_lift_ci": "eval_toolkit.plotting",
197
197
  "plot_metric_bars": "eval_toolkit.plotting",
198
+ "plot_pareto_frontier": "eval_toolkit.plotting",
198
199
  "plot_pr_curve": "eval_toolkit.plotting",
199
200
  "plot_reliability_diagram": "eval_toolkit.plotting",
201
+ "plot_roc_curve": "eval_toolkit.plotting",
200
202
  "plot_score_histograms": "eval_toolkit.plotting",
203
+ "plot_slice_metric_heatmap": "eval_toolkit.plotting",
201
204
  "save_figure": "eval_toolkit.plotting",
202
205
  "set_plot_style": "eval_toolkit.plotting",
203
206
  # --- provenance ---
@@ -2,4 +2,4 @@
2
2
 
3
3
  __all__ = ["__version__"]
4
4
 
5
- __version__ = "0.32.0"
5
+ __version__ = "0.33.0"
@@ -21,7 +21,7 @@ from __future__ import annotations
21
21
 
22
22
  import json
23
23
  import os
24
- from collections.abc import Callable, Container, Iterable, Mapping
24
+ from collections.abc import Callable, Container, Iterable, Mapping, Sequence
25
25
  from pathlib import Path
26
26
  from types import MappingProxyType
27
27
  from typing import TYPE_CHECKING, Any, cast
@@ -34,7 +34,7 @@ from matplotlib.colors import LinearSegmentedColormap
34
34
  from matplotlib.figure import Figure
35
35
  from matplotlib.patches import Rectangle
36
36
  from sklearn.calibration import calibration_curve
37
- from sklearn.metrics import precision_recall_curve
37
+ from sklearn.metrics import precision_recall_curve, roc_curve
38
38
 
39
39
  if TYPE_CHECKING:
40
40
  from eval_toolkit.bootstrap import BootstrapCI
@@ -48,9 +48,12 @@ __all__ = [
48
48
  "plot_confusion_matrix_grid",
49
49
  "plot_lift_ci",
50
50
  "plot_metric_bars",
51
+ "plot_pareto_frontier",
51
52
  "plot_pr_curve",
52
53
  "plot_reliability_diagram",
54
+ "plot_roc_curve",
53
55
  "plot_score_histograms",
56
+ "plot_slice_metric_heatmap",
54
57
  "save_figure",
55
58
  "set_plot_style",
56
59
  ]
@@ -451,6 +454,129 @@ def plot_pr_curve(
451
454
  return fig
452
455
 
453
456
 
457
+ def plot_roc_curve(
458
+ y_true: np.ndarray,
459
+ y_score: np.ndarray,
460
+ *,
461
+ label: str | None = None,
462
+ threshold: float | None = None,
463
+ baseline_curve: tuple[np.ndarray, np.ndarray] | None = None,
464
+ baseline_label: str = "baseline",
465
+ title: str | None = None,
466
+ figsize: tuple[float, float] | None = None,
467
+ ax: Axes | None = None,
468
+ ) -> Figure:
469
+ """Receiver-operating-characteristic curve.
470
+
471
+ Sibling of :func:`plot_pr_curve`. Plots the (FPR, TPR) curve with a
472
+ diagonal chance line. ROC is invariant to class prior, so unlike
473
+ `plot_pr_curve` there is no ``prevalence`` parameter.
474
+
475
+ Parameters
476
+ ----------
477
+ y_true, y_score : np.ndarray
478
+ Labels and scores.
479
+ label : str or None, optional
480
+ Legend label for the main curve.
481
+ threshold : float or None, optional
482
+ Draw a star marker at the (fpr, tpr) point closest to this
483
+ threshold.
484
+ baseline_curve : tuple of (fpr, tpr) np.ndarrays, optional
485
+ Optional baseline curve to overlay (e.g., a simpler reference model).
486
+ baseline_label : str, optional
487
+ Legend label for the baseline overlay (default ``"baseline"``).
488
+ title : str or None, optional
489
+ figsize : tuple of float or None, optional
490
+ ax : matplotlib Axes or None, optional
491
+
492
+ Returns
493
+ -------
494
+ matplotlib.figure.Figure
495
+
496
+ Raises
497
+ ------
498
+ ValueError
499
+ If ``y_true``/``y_score`` fail shape/dtype/value-range checks
500
+ (re-raised from validators); if ``threshold`` is outside [0, 1];
501
+ or if ``baseline_curve`` is not a length-2 tuple with
502
+ matching-shape ``(fpr, tpr)`` arrays.
503
+ """
504
+ y_true = _ensure_ndarray("y_true", y_true)
505
+ y_score = _ensure_ndarray("y_score", y_score)
506
+ _validate_pair(y_true, y_score, other_name="y_score")
507
+
508
+ if threshold is not None and not 0.0 <= threshold <= 1.0:
509
+ raise ValueError(f"threshold must be in [0, 1], got {threshold}")
510
+ if baseline_curve is not None:
511
+ if not (isinstance(baseline_curve, tuple) and len(baseline_curve) == 2):
512
+ raise ValueError("baseline_curve must be a (fpr, tpr) tuple")
513
+ bl_fpr = _ensure_ndarray("baseline_curve[0]", baseline_curve[0])
514
+ bl_tpr = _ensure_ndarray("baseline_curve[1]", baseline_curve[1])
515
+ if bl_fpr.shape != bl_tpr.shape:
516
+ raise ValueError(
517
+ f"baseline_curve fpr and tpr must have same shape, "
518
+ f"got {bl_fpr.shape} vs {bl_tpr.shape}"
519
+ )
520
+
521
+ fig, axes = _resolve_axes(ax, figsize)
522
+
523
+ fpr, tpr, thresholds = roc_curve(y_true, y_score)
524
+ axes.plot(fpr, tpr, color=PALETTE["negative"], label=label, linewidth=1.5)
525
+
526
+ # Diagonal chance line (AUC = 0.5 reference).
527
+ axes.plot(
528
+ [0.0, 1.0],
529
+ [0.0, 1.0],
530
+ color=PALETTE["baseline"],
531
+ linestyle="--",
532
+ linewidth=0.8,
533
+ alpha=0.7,
534
+ label="chance",
535
+ )
536
+
537
+ if baseline_curve is not None:
538
+ bl_fpr = np.asarray(baseline_curve[0])
539
+ bl_tpr = np.asarray(baseline_curve[1])
540
+ axes.plot(
541
+ bl_fpr,
542
+ bl_tpr,
543
+ color=PALETTE["baseline"],
544
+ linewidth=1.0,
545
+ linestyle="--",
546
+ label=baseline_label,
547
+ zorder=1,
548
+ )
549
+ if threshold is not None:
550
+ # roc_curve prepends a sentinel threshold of np.inf; finite-mask it
551
+ # before picking the closest match so the marker lands on a real point.
552
+ finite = np.isfinite(thresholds)
553
+ if not finite.any():
554
+ raise ValueError("roc_curve returned no finite thresholds")
555
+ rel_idx = int(np.argmin(np.abs(thresholds[finite] - threshold)))
556
+ idx = int(np.flatnonzero(finite)[rel_idx])
557
+ axes.scatter(
558
+ fpr[idx],
559
+ tpr[idx],
560
+ color=PALETTE["accent"],
561
+ marker="*",
562
+ s=120,
563
+ zorder=5,
564
+ label=f"τ={threshold:.3f}",
565
+ edgecolor="black",
566
+ linewidth=0.5,
567
+ )
568
+
569
+ axes.set_xlabel("False Positive Rate")
570
+ axes.set_ylabel("True Positive Rate")
571
+ axes.set_xlim(0.0, 1.0)
572
+ axes.set_ylim(0.0, 1.05)
573
+ if title is not None:
574
+ axes.set_title(title)
575
+ _maybe_add_legend(axes)
576
+ fig.tight_layout()
577
+ return fig
578
+
579
+
454
580
  def plot_reliability_diagram(
455
581
  y_true: np.ndarray,
456
582
  y_prob: np.ndarray,
@@ -670,6 +796,7 @@ def plot_metric_bars(
670
796
  figsize: tuple[float, float] | None = None,
671
797
  label_formatter: Callable[[str], str] | None = None,
672
798
  sort_key: Callable[[str], Any] | None = None,
799
+ ax: Axes | None = None,
673
800
  ) -> Figure:
674
801
  """Bar chart for a ``{label: metric}`` mapping.
675
802
 
@@ -684,6 +811,9 @@ def plot_metric_bars(
684
811
  Maps raw key → display label. Default is identity.
685
812
  sort_key : callable or None, optional
686
813
  Maps raw key → sort key. Default is alphabetical.
814
+ ax : matplotlib Axes or None, optional
815
+ Render onto this Axes (reuses its parent Figure); otherwise creates a
816
+ fresh figure.
687
817
 
688
818
  Returns
689
819
  -------
@@ -703,7 +833,7 @@ def plot_metric_bars(
703
833
  labels = [fmt(k) for k, _ in sorted_items]
704
834
  bar_values = [v for _, v in sorted_items]
705
835
 
706
- fig, axes = plt.subplots(figsize=figsize or DEFAULT_FIGSIZE)
836
+ fig, axes = _resolve_axes(ax, figsize)
707
837
  bar_color = color or PALETTE["negative"]
708
838
  axes.bar(labels, bar_values, color=bar_color, edgecolor="black", linewidth=0.5)
709
839
  upper = max(bar_values)
@@ -728,6 +858,7 @@ def plot_score_histograms(
728
858
  figsize: tuple[float, float] | None = None,
729
859
  label_formatter: Callable[[str], str] | None = None,
730
860
  sort_key: Callable[[str], Any] | None = None,
861
+ ax: Axes | None = None,
731
862
  ) -> Figure:
732
863
  """Overlaid score-distribution histograms, one per slice.
733
864
 
@@ -742,6 +873,9 @@ def plot_score_histograms(
742
873
  title, figsize : optional
743
874
  label_formatter, sort_key : callable or None, optional
744
875
  See :func:`plot_metric_bars`.
876
+ ax : matplotlib Axes or None, optional
877
+ Render onto this Axes (reuses its parent Figure); otherwise creates a
878
+ fresh figure.
745
879
 
746
880
  Returns
747
881
  -------
@@ -778,7 +912,7 @@ def plot_score_histograms(
778
912
  PALETTE["baseline"],
779
913
  ]
780
914
 
781
- fig, axes = plt.subplots(figsize=figsize or DEFAULT_FIGSIZE)
915
+ fig, axes = _resolve_axes(ax, figsize)
782
916
  for i, (key, arr) in enumerate(sorted_items):
783
917
  color = palette_cycle[i % len(palette_cycle)]
784
918
  axes.hist(
@@ -989,3 +1123,229 @@ def plot_bootstrap_distribution(
989
1123
  axes.set_title(title)
990
1124
  fig.tight_layout()
991
1125
  return fig
1126
+
1127
+
1128
+ def plot_pareto_frontier(
1129
+ cost: np.ndarray,
1130
+ metric: np.ndarray,
1131
+ *,
1132
+ point_labels: Sequence[str] | None = None,
1133
+ higher_metric_is_better: bool = True,
1134
+ xlabel: str = "cost",
1135
+ ylabel: str = "metric",
1136
+ title: str | None = None,
1137
+ figsize: tuple[float, float] | None = None,
1138
+ ax: Axes | None = None,
1139
+ ) -> Figure:
1140
+ """Cost-vs-performance scatter with Pareto frontier overlay.
1141
+
1142
+ Points on the frontier (the running-best metric as cost increases) are
1143
+ drawn in accent color and connected by a dashed polyline; dominated
1144
+ points are drawn in muted baseline color. Cost is always assumed
1145
+ lower-is-better; ``higher_metric_is_better`` controls the metric
1146
+ direction.
1147
+
1148
+ Parameters
1149
+ ----------
1150
+ cost : np.ndarray, shape (n,)
1151
+ Cost values (training/inference/compute proxy; lower-is-better).
1152
+ metric : np.ndarray, shape (n,)
1153
+ Metric values aligned with ``cost``.
1154
+ point_labels : Sequence[str] or None, optional
1155
+ Per-point annotations (e.g., rung names). If provided, must have
1156
+ length ``n``.
1157
+ higher_metric_is_better : bool, optional
1158
+ If True (default), frontier maximises metric at minimum cost. If
1159
+ False, frontier minimises both (e.g., metric is an error/loss).
1160
+ xlabel, ylabel : str, optional
1161
+ title, figsize, ax : optional
1162
+
1163
+ Returns
1164
+ -------
1165
+ matplotlib.figure.Figure
1166
+
1167
+ Raises
1168
+ ------
1169
+ ValueError
1170
+ If ``cost`` and ``metric`` shapes don't match, are not 1-D, are
1171
+ empty, contain NaN/inf, or if ``point_labels`` length disagrees.
1172
+ """
1173
+ cost = _ensure_ndarray("cost", cost)
1174
+ metric = _ensure_ndarray("metric", metric)
1175
+ if cost.ndim != 1 or metric.ndim != 1:
1176
+ raise ValueError(f"cost and metric must be 1-D, got shapes {cost.shape} and {metric.shape}")
1177
+ if cost.shape != metric.shape:
1178
+ raise ValueError(
1179
+ f"cost and metric must have same shape, got {cost.shape} vs {metric.shape}"
1180
+ )
1181
+ if cost.size == 0:
1182
+ raise ValueError("cost and metric must be non-empty")
1183
+ if not (np.isfinite(cost).all() and np.isfinite(metric).all()):
1184
+ raise ValueError("cost and metric must contain finite values only")
1185
+ if point_labels is not None and len(point_labels) != cost.size:
1186
+ raise ValueError(f"point_labels length {len(point_labels)} != n={cost.size}")
1187
+
1188
+ fig, axes = _resolve_axes(ax, figsize)
1189
+
1190
+ # Sweep frontier: sort by cost ascending; a point is on the frontier iff
1191
+ # it improves on the running-best metric. With ties on cost, only the
1192
+ # best metric at that cost can be a frontier member; ``np.lexsort`` keys
1193
+ # so smaller cost wins and within same cost better metric wins. The
1194
+ # ``sign`` multiplier folds the direction (higher/lower-is-better) into
1195
+ # a uniform max-sweep against a -inf baseline.
1196
+ sign = 1.0 if higher_metric_is_better else -1.0
1197
+ order = np.lexsort((-sign * metric, cost))
1198
+ cost_s = cost[order]
1199
+ metric_s = metric[order]
1200
+ on_frontier = np.zeros(cost.size, dtype=bool)
1201
+ best_signed = -np.inf
1202
+ for i in range(cost.size):
1203
+ candidate = sign * float(metric_s[i])
1204
+ if candidate > best_signed:
1205
+ on_frontier[i] = True
1206
+ best_signed = candidate
1207
+
1208
+ # Map back to original indices for plotting labels in input order.
1209
+ frontier_mask = np.zeros(cost.size, dtype=bool)
1210
+ frontier_mask[order[on_frontier]] = True
1211
+
1212
+ axes.scatter(
1213
+ cost[~frontier_mask],
1214
+ metric[~frontier_mask],
1215
+ color=PALETTE["baseline"],
1216
+ s=40,
1217
+ alpha=0.7,
1218
+ zorder=2,
1219
+ label="dominated" if (~frontier_mask).any() else None,
1220
+ )
1221
+ axes.scatter(
1222
+ cost[frontier_mask],
1223
+ metric[frontier_mask],
1224
+ color=PALETTE["accent"],
1225
+ edgecolor="black",
1226
+ linewidth=0.5,
1227
+ s=70,
1228
+ zorder=4,
1229
+ label="frontier",
1230
+ )
1231
+ if frontier_mask.any():
1232
+ axes.plot(
1233
+ cost_s[on_frontier],
1234
+ metric_s[on_frontier],
1235
+ color=PALETTE["accent"],
1236
+ linestyle="--",
1237
+ linewidth=1.0,
1238
+ alpha=0.8,
1239
+ zorder=3,
1240
+ )
1241
+ if point_labels is not None:
1242
+ for label, x, y in zip(point_labels, cost, metric, strict=True):
1243
+ axes.annotate(
1244
+ label,
1245
+ (float(x), float(y)),
1246
+ textcoords="offset points",
1247
+ xytext=(6, 4),
1248
+ fontsize=9,
1249
+ )
1250
+
1251
+ axes.set_xlabel(xlabel)
1252
+ axes.set_ylabel(ylabel)
1253
+ if title is not None:
1254
+ axes.set_title(title)
1255
+ _maybe_add_legend(axes)
1256
+ fig.tight_layout()
1257
+ return fig
1258
+
1259
+
1260
+ def plot_slice_metric_heatmap(
1261
+ grid: np.ndarray,
1262
+ *,
1263
+ row_labels: Sequence[str],
1264
+ col_labels: Sequence[str],
1265
+ metric_name: str = "metric",
1266
+ cmap: str = "viridis",
1267
+ annotate: bool = True,
1268
+ annot_fmt: str = "{:.3f}",
1269
+ title: str | None = None,
1270
+ figsize: tuple[float, float] | None = None,
1271
+ ax: Axes | None = None,
1272
+ ) -> Figure:
1273
+ """Heatmap of a (row × col × metric) grid with colorbar.
1274
+
1275
+ Parameters
1276
+ ----------
1277
+ grid : np.ndarray, shape (n_rows, n_cols)
1278
+ Metric values, one per (row, col) cell. NaN cells render as blank
1279
+ (white) in the heatmap and are skipped from annotations.
1280
+ row_labels, col_labels : Sequence[str]
1281
+ Tick labels for the two axes; lengths must match ``grid.shape``.
1282
+ metric_name : str, optional
1283
+ Used as the colorbar label. Default ``"metric"``.
1284
+ cmap : str, optional
1285
+ Matplotlib colormap name. Default ``"viridis"``.
1286
+ annotate : bool, optional
1287
+ If True (default), write each cell's value on the heatmap.
1288
+ annot_fmt : str, optional
1289
+ Format string for cell annotations. Default ``"{:.3f}"``.
1290
+ title, figsize, ax : optional
1291
+
1292
+ Returns
1293
+ -------
1294
+ matplotlib.figure.Figure
1295
+
1296
+ Raises
1297
+ ------
1298
+ ValueError
1299
+ If ``grid`` is not 2-D, if label lengths disagree with the grid
1300
+ shape, or if the grid is empty.
1301
+ """
1302
+ grid_arr = _ensure_ndarray("grid", grid).astype(np.float64, copy=False)
1303
+ if grid_arr.ndim != 2:
1304
+ raise ValueError(f"grid must be 2-D, got shape {grid_arr.shape}")
1305
+ n_rows, n_cols = grid_arr.shape
1306
+ if n_rows == 0 or n_cols == 0:
1307
+ raise ValueError(f"grid must be non-empty, got shape {grid_arr.shape}")
1308
+ if len(row_labels) != n_rows:
1309
+ raise ValueError(f"row_labels length {len(row_labels)} != grid.shape[0] {n_rows}")
1310
+ if len(col_labels) != n_cols:
1311
+ raise ValueError(f"col_labels length {len(col_labels)} != grid.shape[1] {n_cols}")
1312
+
1313
+ fig, axes = _resolve_axes(ax, figsize)
1314
+
1315
+ masked = np.ma.masked_invalid(grid_arr)
1316
+ im = axes.imshow(masked, cmap=cmap, aspect="auto")
1317
+ fig.colorbar(im, ax=axes, label=metric_name)
1318
+
1319
+ axes.set_xticks(np.arange(n_cols))
1320
+ axes.set_yticks(np.arange(n_rows))
1321
+ axes.set_xticklabels(list(col_labels))
1322
+ axes.set_yticklabels(list(row_labels))
1323
+ axes.tick_params(axis="x", rotation=30)
1324
+ for tick in axes.get_xticklabels():
1325
+ tick.set_horizontalalignment("right")
1326
+
1327
+ if annotate:
1328
+ # Choose text color per cell from luminance midpoint to stay readable.
1329
+ vmin = float(np.nanmin(grid_arr)) if np.isfinite(grid_arr).any() else 0.0
1330
+ vmax = float(np.nanmax(grid_arr)) if np.isfinite(grid_arr).any() else 1.0
1331
+ midpoint = 0.5 * (vmin + vmax)
1332
+ for i in range(n_rows):
1333
+ for j in range(n_cols):
1334
+ v = grid_arr[i, j]
1335
+ if not np.isfinite(v):
1336
+ continue
1337
+ text_color = "white" if v < midpoint else "black"
1338
+ axes.text(
1339
+ j,
1340
+ i,
1341
+ annot_fmt.format(v),
1342
+ ha="center",
1343
+ va="center",
1344
+ color=text_color,
1345
+ fontsize=9,
1346
+ )
1347
+
1348
+ if title is not None:
1349
+ axes.set_title(title)
1350
+ fig.tight_layout()
1351
+ return fig
@@ -164,9 +164,12 @@
164
164
  "plot_confusion_matrix_grid",
165
165
  "plot_lift_ci",
166
166
  "plot_metric_bars",
167
+ "plot_pareto_frontier",
167
168
  "plot_pr_curve",
168
169
  "plot_reliability_diagram",
170
+ "plot_roc_curve",
169
171
  "plot_score_histograms",
172
+ "plot_slice_metric_heatmap",
170
173
  "pr_auc",
171
174
  "precision_at_prior",
172
175
  "quantile_stratified_pr_auc",
@@ -1001,7 +1004,7 @@
1001
1004
  "doc_first_line": "str(object='') -> str",
1002
1005
  "kind": "value",
1003
1006
  "type": "str",
1004
- "value": "'0.32.0'"
1007
+ "value": "'0.33.0'"
1005
1008
  },
1006
1009
  "apply_operating_points": {
1007
1010
  "doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
@@ -1326,7 +1329,12 @@
1326
1329
  "plot_metric_bars": {
1327
1330
  "doc_first_line": "Bar chart for a ``{label: metric}`` mapping.",
1328
1331
  "kind": "function",
1329
- "signature": "(values: 'dict[str, float]', *, color: 'str | None' = None, ylabel: 'str | None' = None, title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, label_formatter: 'Callable[[str], str] | None' = None, sort_key: 'Callable[[str], Any] | None' = None) -> 'Figure'"
1332
+ "signature": "(values: 'dict[str, float]', *, color: 'str | None' = None, ylabel: 'str | None' = None, title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, label_formatter: 'Callable[[str], str] | None' = None, sort_key: 'Callable[[str], Any] | None' = None, ax: 'Axes | None' = None) -> 'Figure'"
1333
+ },
1334
+ "plot_pareto_frontier": {
1335
+ "doc_first_line": "Cost-vs-performance scatter with Pareto frontier overlay.",
1336
+ "kind": "function",
1337
+ "signature": "(cost: 'np.ndarray', metric: 'np.ndarray', *, point_labels: 'Sequence[str] | None' = None, higher_metric_is_better: 'bool' = True, xlabel: 'str' = 'cost', ylabel: 'str' = 'metric', title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, ax: 'Axes | None' = None) -> 'Figure'"
1330
1338
  },
1331
1339
  "plot_pr_curve": {
1332
1340
  "doc_first_line": "Precision-recall curve.",
@@ -1338,10 +1346,20 @@
1338
1346
  "kind": "function",
1339
1347
  "signature": "(y_true: 'np.ndarray', y_prob: 'np.ndarray', *, n_bins: 'int' = 10, bin_counts: 'np.ndarray | None' = None, xlabel: 'str' = 'Mean Predicted Probability', ylabel: 'str' = 'Observed Fraction of Positives', title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, ax: 'Axes | None' = None) -> 'Figure'"
1340
1348
  },
1349
+ "plot_roc_curve": {
1350
+ "doc_first_line": "Receiver-operating-characteristic curve.",
1351
+ "kind": "function",
1352
+ "signature": "(y_true: 'np.ndarray', y_score: 'np.ndarray', *, label: 'str | None' = None, threshold: 'float | None' = None, baseline_curve: 'tuple[np.ndarray, np.ndarray] | None' = None, baseline_label: 'str' = 'baseline', title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, ax: 'Axes | None' = None) -> 'Figure'"
1353
+ },
1341
1354
  "plot_score_histograms": {
1342
1355
  "doc_first_line": "Overlaid score-distribution histograms, one per slice.",
1343
1356
  "kind": "function",
1344
- "signature": "(scores_by_slice: 'dict[str, np.ndarray]', *, scorer_name: 'str | None' = None, bins: 'int' = 30, title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, label_formatter: 'Callable[[str], str] | None' = None, sort_key: 'Callable[[str], Any] | None' = None) -> 'Figure'"
1357
+ "signature": "(scores_by_slice: 'dict[str, np.ndarray]', *, scorer_name: 'str | None' = None, bins: 'int' = 30, title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, label_formatter: 'Callable[[str], str] | None' = None, sort_key: 'Callable[[str], Any] | None' = None, ax: 'Axes | None' = None) -> 'Figure'"
1358
+ },
1359
+ "plot_slice_metric_heatmap": {
1360
+ "doc_first_line": "Heatmap of a (row \u00d7 col \u00d7 metric) grid with colorbar.",
1361
+ "kind": "function",
1362
+ "signature": "(grid: 'np.ndarray', *, row_labels: 'Sequence[str]', col_labels: 'Sequence[str]', metric_name: 'str' = 'metric', cmap: 'str' = 'viridis', annotate: 'bool' = True, annot_fmt: 'str' = '{:.3f}', title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, ax: 'Axes | None' = None) -> 'Figure'"
1345
1363
  },
1346
1364
  "pr_auc": {
1347
1365
  "doc_first_line": "Average precision (PR-AUC).",