PyPI - eval-toolkit - Versions diffs - 0.32.0__tar.gz → 0.33.0__tar.gz - Mend

eval-toolkit 0.32.0__tar.gz → 0.33.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (145) hide show

{eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/CHANGELOG.md RENAMED Viewed

@@ -7,6 +7,58 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
+## [0.33.0] — 2026-05-17 — Plotting batch + ax= parity + CI quality-of-life
+Consumer-unblocking release: closes the four upstream-gap TODOs in
+`prompt-injection-detection-submission`'s Phase 4 figures (F1, F2, F5,
+F6-left) which had been carrying hand-rolled prototypes pending these
+primitives. Also bundles two CI/maintenance fixes for quality-of-life
+pain points encountered while shipping v0.32.
+**Note**: The `v0.33` milestone's #3 (`make_minilm_embedder`) is deferred
+to the next iteration (likely v0.33.1 or v0.34) so this release stays
+focused on the plotting batch + `ax=` parity. MiniLM adds a new optional
+dependency and a new module, so it ships better as its own release.
+No breaking changes. Public API gains 3 new plotting exports
+(`plot_roc_curve`, `plot_pareto_frontier`, `plot_slice_metric_heatmap`)
+and adds an `ax=` kwarg to 2 existing plotting fns (`plot_metric_bars`,
+`plot_score_histograms`) — all additive.
+### Added
+- `eval_toolkit.plotting.plot_roc_curve` — sibling to `plot_pr_curve`;
+  accepts `ax=`, optional baseline overlay, threshold marker. Includes
+  a diagonal chance line. Closes #14.
+- `eval_toolkit.plotting.plot_pareto_frontier` — cost-vs-performance
+  scatter with running-best frontier overlay (O(n log n) sweep). Supports
+  both higher-is-better and lower-is-better metric directions, optional
+  per-point labels. Closes #15.
+- `eval_toolkit.plotting.plot_slice_metric_heatmap` — (rows × cols × metric)
+  heatmap with colorbar + optional cell annotations + NaN-cell masking.
+  Closes #16.
+### Changed
+- `plot_metric_bars` and `plot_score_histograms` now accept an `ax=` kwarg,
+  bringing the count of `ax=`-accepting plotting fns to 6 of 7
+  (`plot_confusion_matrix_grid` remains figure-creating since it's
+  intrinsically a grid-of-axes). Closes #24.
+- `Makefile`'s `coverage` target now filters `monte_carlo` and `benchmark`
+  markers, matching what `.github/workflows/ci.yml` actually runs. `make ci`
+  drops from ~45 min to ~3 min locally. Closes #25.
+### Internal
+- 16 new edge tests covering input validation + `ax=` branches for the
+  3 new plotting fns and the 2 backfilled ones.
+- 3 new `@pytest.mark.mpl_image_compare` baseline tests + checked-in
+  baseline PNGs for the new plotting fns.
+- `.github/workflows/*.yml` audited for Node.js 20 deprecation; bumped
+  `actions/upload-artifact@v4 → v5` (3 workflows) and
+  `actions/download-artifact@v4 → v5` (publish.yml) ahead of the
+  2026-09-16 Node-20 removal deadline. Closes #26.
 ## [0.32.0] — 2026-05-16 — Multiple-comparisons correction + EvidenceGate discoverability
 Bundled close-outs from the `v0.32` milestone triage (4 issues). Adds

{eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: eval-toolkit
-Version: 0.32.0
+Version: 0.33.0
 Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
 Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
 Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/

{eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/__init__.py RENAMED Viewed

@@ -195,9 +195,12 @@ _EXPORTS: dict[str, str] = {
     "plot_confusion_matrix_grid": "eval_toolkit.plotting",
     "plot_lift_ci": "eval_toolkit.plotting",
     "plot_metric_bars": "eval_toolkit.plotting",
+    "plot_pareto_frontier": "eval_toolkit.plotting",
     "plot_pr_curve": "eval_toolkit.plotting",
     "plot_reliability_diagram": "eval_toolkit.plotting",
+    "plot_roc_curve": "eval_toolkit.plotting",
     "plot_score_histograms": "eval_toolkit.plotting",
+    "plot_slice_metric_heatmap": "eval_toolkit.plotting",
     "save_figure": "eval_toolkit.plotting",
     "set_plot_style": "eval_toolkit.plotting",
     # --- provenance ---

{eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/_version.py RENAMED Viewed

@@ -2,4 +2,4 @@
 __all__ = ["__version__"]
-__version__ = "0.32.0"
+__version__ = "0.33.0"

{eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/src/eval_toolkit/plotting.py RENAMED Viewed

@@ -21,7 +21,7 @@ from __future__ import annotations
 import json
 import os
-from collections.abc import Callable, Container, Iterable, Mapping
+from collections.abc import Callable, Container, Iterable, Mapping, Sequence
 from pathlib import Path
 from types import MappingProxyType
 from typing import TYPE_CHECKING, Any, cast
@@ -34,7 +34,7 @@ from matplotlib.colors import LinearSegmentedColormap
 from matplotlib.figure import Figure
 from matplotlib.patches import Rectangle
 from sklearn.calibration import calibration_curve
-from sklearn.metrics import precision_recall_curve
+from sklearn.metrics import precision_recall_curve, roc_curve
 if TYPE_CHECKING:
     from eval_toolkit.bootstrap import BootstrapCI
@@ -48,9 +48,12 @@ __all__ = [
     "plot_confusion_matrix_grid",
     "plot_lift_ci",
     "plot_metric_bars",
+    "plot_pareto_frontier",
     "plot_pr_curve",
     "plot_reliability_diagram",
+    "plot_roc_curve",
     "plot_score_histograms",
+    "plot_slice_metric_heatmap",
     "save_figure",
     "set_plot_style",
 ]
@@ -451,6 +454,129 @@ def plot_pr_curve(
     return fig
+def plot_roc_curve(
+    y_true: np.ndarray,
+    y_score: np.ndarray,
+    *,
+    label: str | None = None,
+    threshold: float | None = None,
+    baseline_curve: tuple[np.ndarray, np.ndarray] | None = None,
+    baseline_label: str = "baseline",
+    title: str | None = None,
+    figsize: tuple[float, float] | None = None,
+    ax: Axes | None = None,
+) -> Figure:
+    """Receiver-operating-characteristic curve.
+    Sibling of :func:`plot_pr_curve`. Plots the (FPR, TPR) curve with a
+    diagonal chance line. ROC is invariant to class prior, so unlike
+    `plot_pr_curve` there is no ``prevalence`` parameter.
+    Parameters
+    ----------
+    y_true, y_score : np.ndarray
+        Labels and scores.
+    label : str or None, optional
+        Legend label for the main curve.
+    threshold : float or None, optional
+        Draw a star marker at the (fpr, tpr) point closest to this
+        threshold.
+    baseline_curve : tuple of (fpr, tpr) np.ndarrays, optional
+        Optional baseline curve to overlay (e.g., a simpler reference model).
+    baseline_label : str, optional
+        Legend label for the baseline overlay (default ``"baseline"``).
+    title : str or None, optional
+    figsize : tuple of float or None, optional
+    ax : matplotlib Axes or None, optional
+    Returns
+    -------
+    matplotlib.figure.Figure
+    Raises
+    ------
+    ValueError
+        If ``y_true``/``y_score`` fail shape/dtype/value-range checks
+        (re-raised from validators); if ``threshold`` is outside [0, 1];
+        or if ``baseline_curve`` is not a length-2 tuple with
+        matching-shape ``(fpr, tpr)`` arrays.
+    """
+    y_true = _ensure_ndarray("y_true", y_true)
+    y_score = _ensure_ndarray("y_score", y_score)
+    _validate_pair(y_true, y_score, other_name="y_score")
+    if threshold is not None and not 0.0 <= threshold <= 1.0:
+        raise ValueError(f"threshold must be in [0, 1], got {threshold}")
+    if baseline_curve is not None:
+        if not (isinstance(baseline_curve, tuple) and len(baseline_curve) == 2):
+            raise ValueError("baseline_curve must be a (fpr, tpr) tuple")
+        bl_fpr = _ensure_ndarray("baseline_curve[0]", baseline_curve[0])
+        bl_tpr = _ensure_ndarray("baseline_curve[1]", baseline_curve[1])
+        if bl_fpr.shape != bl_tpr.shape:
+            raise ValueError(
+                f"baseline_curve fpr and tpr must have same shape, "
+                f"got {bl_fpr.shape} vs {bl_tpr.shape}"
+            )
+    fig, axes = _resolve_axes(ax, figsize)
+    fpr, tpr, thresholds = roc_curve(y_true, y_score)
+    axes.plot(fpr, tpr, color=PALETTE["negative"], label=label, linewidth=1.5)
+    # Diagonal chance line (AUC = 0.5 reference).
+    axes.plot(
+        [0.0, 1.0],
+        [0.0, 1.0],
+        color=PALETTE["baseline"],
+        linestyle="--",
+        linewidth=0.8,
+        alpha=0.7,
+        label="chance",
+    )
+    if baseline_curve is not None:
+        bl_fpr = np.asarray(baseline_curve[0])
+        bl_tpr = np.asarray(baseline_curve[1])
+        axes.plot(
+            bl_fpr,
+            bl_tpr,
+            color=PALETTE["baseline"],
+            linewidth=1.0,
+            linestyle="--",
+            label=baseline_label,
+            zorder=1,
+        )
+    if threshold is not None:
+        # roc_curve prepends a sentinel threshold of np.inf; finite-mask it
+        # before picking the closest match so the marker lands on a real point.
+        finite = np.isfinite(thresholds)
+        if not finite.any():
+            raise ValueError("roc_curve returned no finite thresholds")
+        rel_idx = int(np.argmin(np.abs(thresholds[finite] - threshold)))
+        idx = int(np.flatnonzero(finite)[rel_idx])
+        axes.scatter(
+            fpr[idx],
+            tpr[idx],
+            color=PALETTE["accent"],
+            marker="*",
+            s=120,
+            zorder=5,
+            label=f"τ={threshold:.3f}",
+            edgecolor="black",
+            linewidth=0.5,
+        )
+    axes.set_xlabel("False Positive Rate")
+    axes.set_ylabel("True Positive Rate")
+    axes.set_xlim(0.0, 1.0)
+    axes.set_ylim(0.0, 1.05)
+    if title is not None:
+        axes.set_title(title)
+    _maybe_add_legend(axes)
+    fig.tight_layout()
+    return fig
 def plot_reliability_diagram(
     y_true: np.ndarray,
     y_prob: np.ndarray,
@@ -670,6 +796,7 @@ def plot_metric_bars(
     figsize: tuple[float, float] | None = None,
     label_formatter: Callable[[str], str] | None = None,
     sort_key: Callable[[str], Any] | None = None,
+    ax: Axes | None = None,
 ) -> Figure:
     """Bar chart for a ``{label: metric}`` mapping.
@@ -684,6 +811,9 @@ def plot_metric_bars(
         Maps raw key → display label. Default is identity.
     sort_key : callable or None, optional
         Maps raw key → sort key. Default is alphabetical.
+    ax : matplotlib Axes or None, optional
+        Render onto this Axes (reuses its parent Figure); otherwise creates a
+        fresh figure.
     Returns
     -------
@@ -703,7 +833,7 @@ def plot_metric_bars(
     labels = [fmt(k) for k, _ in sorted_items]
     bar_values = [v for _, v in sorted_items]
-    fig, axes = plt.subplots(figsize=figsize or DEFAULT_FIGSIZE)
+    fig, axes = _resolve_axes(ax, figsize)
     bar_color = color or PALETTE["negative"]
     axes.bar(labels, bar_values, color=bar_color, edgecolor="black", linewidth=0.5)
     upper = max(bar_values)
@@ -728,6 +858,7 @@ def plot_score_histograms(
     figsize: tuple[float, float] | None = None,
     label_formatter: Callable[[str], str] | None = None,
     sort_key: Callable[[str], Any] | None = None,
+    ax: Axes | None = None,
 ) -> Figure:
     """Overlaid score-distribution histograms, one per slice.
@@ -742,6 +873,9 @@ def plot_score_histograms(
     title, figsize : optional
     label_formatter, sort_key : callable or None, optional
         See :func:`plot_metric_bars`.
+    ax : matplotlib Axes or None, optional
+        Render onto this Axes (reuses its parent Figure); otherwise creates a
+        fresh figure.
     Returns
     -------
@@ -778,7 +912,7 @@ def plot_score_histograms(
         PALETTE["baseline"],
     ]
-    fig, axes = plt.subplots(figsize=figsize or DEFAULT_FIGSIZE)
+    fig, axes = _resolve_axes(ax, figsize)
     for i, (key, arr) in enumerate(sorted_items):
         color = palette_cycle[i % len(palette_cycle)]
         axes.hist(
@@ -989,3 +1123,229 @@ def plot_bootstrap_distribution(
         axes.set_title(title)
     fig.tight_layout()
     return fig
+def plot_pareto_frontier(
+    cost: np.ndarray,
+    metric: np.ndarray,
+    *,
+    point_labels: Sequence[str] | None = None,
+    higher_metric_is_better: bool = True,
+    xlabel: str = "cost",
+    ylabel: str = "metric",
+    title: str | None = None,
+    figsize: tuple[float, float] | None = None,
+    ax: Axes | None = None,
+) -> Figure:
+    """Cost-vs-performance scatter with Pareto frontier overlay.
+    Points on the frontier (the running-best metric as cost increases) are
+    drawn in accent color and connected by a dashed polyline; dominated
+    points are drawn in muted baseline color. Cost is always assumed
+    lower-is-better; ``higher_metric_is_better`` controls the metric
+    direction.
+    Parameters
+    ----------
+    cost : np.ndarray, shape (n,)
+        Cost values (training/inference/compute proxy; lower-is-better).
+    metric : np.ndarray, shape (n,)
+        Metric values aligned with ``cost``.
+    point_labels : Sequence[str] or None, optional
+        Per-point annotations (e.g., rung names). If provided, must have
+        length ``n``.
+    higher_metric_is_better : bool, optional
+        If True (default), frontier maximises metric at minimum cost. If
+        False, frontier minimises both (e.g., metric is an error/loss).
+    xlabel, ylabel : str, optional
+    title, figsize, ax : optional
+    Returns
+    -------
+    matplotlib.figure.Figure
+    Raises
+    ------
+    ValueError
+        If ``cost`` and ``metric`` shapes don't match, are not 1-D, are
+        empty, contain NaN/inf, or if ``point_labels`` length disagrees.
+    """
+    cost = _ensure_ndarray("cost", cost)
+    metric = _ensure_ndarray("metric", metric)
+    if cost.ndim != 1 or metric.ndim != 1:
+        raise ValueError(f"cost and metric must be 1-D, got shapes {cost.shape} and {metric.shape}")
+    if cost.shape != metric.shape:
+        raise ValueError(
+            f"cost and metric must have same shape, got {cost.shape} vs {metric.shape}"
+        )
+    if cost.size == 0:
+        raise ValueError("cost and metric must be non-empty")
+    if not (np.isfinite(cost).all() and np.isfinite(metric).all()):
+        raise ValueError("cost and metric must contain finite values only")
+    if point_labels is not None and len(point_labels) != cost.size:
+        raise ValueError(f"point_labels length {len(point_labels)} != n={cost.size}")
+    fig, axes = _resolve_axes(ax, figsize)
+    # Sweep frontier: sort by cost ascending; a point is on the frontier iff
+    # it improves on the running-best metric. With ties on cost, only the
+    # best metric at that cost can be a frontier member; ``np.lexsort`` keys
+    # so smaller cost wins and within same cost better metric wins. The
+    # ``sign`` multiplier folds the direction (higher/lower-is-better) into
+    # a uniform max-sweep against a -inf baseline.
+    sign = 1.0 if higher_metric_is_better else -1.0
+    order = np.lexsort((-sign * metric, cost))
+    cost_s = cost[order]
+    metric_s = metric[order]
+    on_frontier = np.zeros(cost.size, dtype=bool)
+    best_signed = -np.inf
+    for i in range(cost.size):
+        candidate = sign * float(metric_s[i])
+        if candidate > best_signed:
+            on_frontier[i] = True
+            best_signed = candidate
+    # Map back to original indices for plotting labels in input order.
+    frontier_mask = np.zeros(cost.size, dtype=bool)
+    frontier_mask[order[on_frontier]] = True
+    axes.scatter(
+        cost[~frontier_mask],
+        metric[~frontier_mask],
+        color=PALETTE["baseline"],
+        s=40,
+        alpha=0.7,
+        zorder=2,
+        label="dominated" if (~frontier_mask).any() else None,
+    )
+    axes.scatter(
+        cost[frontier_mask],
+        metric[frontier_mask],
+        color=PALETTE["accent"],
+        edgecolor="black",
+        linewidth=0.5,
+        s=70,
+        zorder=4,
+        label="frontier",
+    )
+    if frontier_mask.any():
+        axes.plot(
+            cost_s[on_frontier],
+            metric_s[on_frontier],
+            color=PALETTE["accent"],
+            linestyle="--",
+            linewidth=1.0,
+            alpha=0.8,
+            zorder=3,
+        )
+    if point_labels is not None:
+        for label, x, y in zip(point_labels, cost, metric, strict=True):
+            axes.annotate(
+                label,
+                (float(x), float(y)),
+                textcoords="offset points",
+                xytext=(6, 4),
+                fontsize=9,
+            )
+    axes.set_xlabel(xlabel)
+    axes.set_ylabel(ylabel)
+    if title is not None:
+        axes.set_title(title)
+    _maybe_add_legend(axes)
+    fig.tight_layout()
+    return fig
+def plot_slice_metric_heatmap(
+    grid: np.ndarray,
+    *,
+    row_labels: Sequence[str],
+    col_labels: Sequence[str],
+    metric_name: str = "metric",
+    cmap: str = "viridis",
+    annotate: bool = True,
+    annot_fmt: str = "{:.3f}",
+    title: str | None = None,
+    figsize: tuple[float, float] | None = None,
+    ax: Axes | None = None,
+) -> Figure:
+    """Heatmap of a (row × col × metric) grid with colorbar.
+    Parameters
+    ----------
+    grid : np.ndarray, shape (n_rows, n_cols)
+        Metric values, one per (row, col) cell. NaN cells render as blank
+        (white) in the heatmap and are skipped from annotations.
+    row_labels, col_labels : Sequence[str]
+        Tick labels for the two axes; lengths must match ``grid.shape``.
+    metric_name : str, optional
+        Used as the colorbar label. Default ``"metric"``.
+    cmap : str, optional
+        Matplotlib colormap name. Default ``"viridis"``.
+    annotate : bool, optional
+        If True (default), write each cell's value on the heatmap.
+    annot_fmt : str, optional
+        Format string for cell annotations. Default ``"{:.3f}"``.
+    title, figsize, ax : optional
+    Returns
+    -------
+    matplotlib.figure.Figure
+    Raises
+    ------
+    ValueError
+        If ``grid`` is not 2-D, if label lengths disagree with the grid
+        shape, or if the grid is empty.
+    """
+    grid_arr = _ensure_ndarray("grid", grid).astype(np.float64, copy=False)
+    if grid_arr.ndim != 2:
+        raise ValueError(f"grid must be 2-D, got shape {grid_arr.shape}")
+    n_rows, n_cols = grid_arr.shape
+    if n_rows == 0 or n_cols == 0:
+        raise ValueError(f"grid must be non-empty, got shape {grid_arr.shape}")
+    if len(row_labels) != n_rows:
+        raise ValueError(f"row_labels length {len(row_labels)} != grid.shape[0] {n_rows}")
+    if len(col_labels) != n_cols:
+        raise ValueError(f"col_labels length {len(col_labels)} != grid.shape[1] {n_cols}")
+    fig, axes = _resolve_axes(ax, figsize)
+    masked = np.ma.masked_invalid(grid_arr)
+    im = axes.imshow(masked, cmap=cmap, aspect="auto")
+    fig.colorbar(im, ax=axes, label=metric_name)
+    axes.set_xticks(np.arange(n_cols))
+    axes.set_yticks(np.arange(n_rows))
+    axes.set_xticklabels(list(col_labels))
+    axes.set_yticklabels(list(row_labels))
+    axes.tick_params(axis="x", rotation=30)
+    for tick in axes.get_xticklabels():
+        tick.set_horizontalalignment("right")
+    if annotate:
+        # Choose text color per cell from luminance midpoint to stay readable.
+        vmin = float(np.nanmin(grid_arr)) if np.isfinite(grid_arr).any() else 0.0
+        vmax = float(np.nanmax(grid_arr)) if np.isfinite(grid_arr).any() else 1.0
+        midpoint = 0.5 * (vmin + vmax)
+        for i in range(n_rows):
+            for j in range(n_cols):
+                v = grid_arr[i, j]
+                if not np.isfinite(v):
+                    continue
+                text_color = "white" if v < midpoint else "black"
+                axes.text(
+                    j,
+                    i,
+                    annot_fmt.format(v),
+                    ha="center",
+                    va="center",
+                    color=text_color,
+                    fontsize=9,
+                )
+    if title is not None:
+        axes.set_title(title)
+    fig.tight_layout()
+    return fig

eval_toolkit-0.33.0/tests/baseline/test_plotting_visual/plot_pareto_frontier.png ADDED Viewed

Binary file

eval_toolkit-0.33.0/tests/baseline/test_plotting_visual/plot_roc_curve.png ADDED Viewed

Binary file

eval_toolkit-0.33.0/tests/baseline/test_plotting_visual/plot_slice_metric_heatmap.png ADDED Viewed

Binary file

{eval_toolkit-0.32.0 → eval_toolkit-0.33.0}/tests/golden/public_api/snapshot.json RENAMED Viewed

@@ -164,9 +164,12 @@
     "plot_confusion_matrix_grid",
     "plot_lift_ci",
     "plot_metric_bars",
+    "plot_pareto_frontier",
     "plot_pr_curve",
     "plot_reliability_diagram",
+    "plot_roc_curve",
     "plot_score_histograms",
+    "plot_slice_metric_heatmap",
     "pr_auc",
     "precision_at_prior",
     "quantile_stratified_pr_auc",
@@ -1001,7 +1004,7 @@
       "doc_first_line": "str(object='') -> str",
       "kind": "value",
       "type": "str",
-      "value": "'0.32.0'"
+      "value": "'0.33.0'"
     },
     "apply_operating_points": {
       "doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
@@ -1326,7 +1329,12 @@
     "plot_metric_bars": {
       "doc_first_line": "Bar chart for a ``{label: metric}`` mapping.",
       "kind": "function",
-      "signature": "(values: 'dict[str, float]', *, color: 'str | None' = None, ylabel: 'str | None' = None, title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, label_formatter: 'Callable[[str], str] | None' = None, sort_key: 'Callable[[str], Any] | None' = None) -> 'Figure'"
+      "signature": "(values: 'dict[str, float]', *, color: 'str | None' = None, ylabel: 'str | None' = None, title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, label_formatter: 'Callable[[str], str] | None' = None, sort_key: 'Callable[[str], Any] | None' = None, ax: 'Axes | None' = None) -> 'Figure'"
+    },
+    "plot_pareto_frontier": {
+      "doc_first_line": "Cost-vs-performance scatter with Pareto frontier overlay.",
+      "kind": "function",
+      "signature": "(cost: 'np.ndarray', metric: 'np.ndarray', *, point_labels: 'Sequence[str] | None' = None, higher_metric_is_better: 'bool' = True, xlabel: 'str' = 'cost', ylabel: 'str' = 'metric', title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, ax: 'Axes | None' = None) -> 'Figure'"
     },
     "plot_pr_curve": {
       "doc_first_line": "Precision-recall curve.",
@@ -1338,10 +1346,20 @@
       "kind": "function",
       "signature": "(y_true: 'np.ndarray', y_prob: 'np.ndarray', *, n_bins: 'int' = 10, bin_counts: 'np.ndarray | None' = None, xlabel: 'str' = 'Mean Predicted Probability', ylabel: 'str' = 'Observed Fraction of Positives', title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, ax: 'Axes | None' = None) -> 'Figure'"
     },
+    "plot_roc_curve": {
+      "doc_first_line": "Receiver-operating-characteristic curve.",
+      "kind": "function",
+      "signature": "(y_true: 'np.ndarray', y_score: 'np.ndarray', *, label: 'str | None' = None, threshold: 'float | None' = None, baseline_curve: 'tuple[np.ndarray, np.ndarray] | None' = None, baseline_label: 'str' = 'baseline', title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, ax: 'Axes | None' = None) -> 'Figure'"
+    },
     "plot_score_histograms": {
       "doc_first_line": "Overlaid score-distribution histograms, one per slice.",
       "kind": "function",
-      "signature": "(scores_by_slice: 'dict[str, np.ndarray]', *, scorer_name: 'str | None' = None, bins: 'int' = 30, title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, label_formatter: 'Callable[[str], str] | None' = None, sort_key: 'Callable[[str], Any] | None' = None) -> 'Figure'"
+      "signature": "(scores_by_slice: 'dict[str, np.ndarray]', *, scorer_name: 'str | None' = None, bins: 'int' = 30, title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, label_formatter: 'Callable[[str], str] | None' = None, sort_key: 'Callable[[str], Any] | None' = None, ax: 'Axes | None' = None) -> 'Figure'"
+    },
+    "plot_slice_metric_heatmap": {
+      "doc_first_line": "Heatmap of a (row \u00d7 col \u00d7 metric) grid with colorbar.",
+      "kind": "function",
+      "signature": "(grid: 'np.ndarray', *, row_labels: 'Sequence[str]', col_labels: 'Sequence[str]', metric_name: 'str' = 'metric', cmap: 'str' = 'viridis', annotate: 'bool' = True, annot_fmt: 'str' = '{:.3f}', title: 'str | None' = None, figsize: 'tuple[float, float] | None' = None, ax: 'Axes | None' = None) -> 'Figure'"
     },
     "pr_auc": {
       "doc_first_line": "Average precision (PR-AUC).",

eval-toolkit 0.32.0__tar.gz → 0.33.0__tar.gz

eval-toolkit 0.32.0__tar.gz → 0.33.0__tar.gz