PyPI - eval-toolkit - Versions diffs - 0.43.0__tar.gz → 0.44.0__tar.gz - Mend

eval-toolkit 0.43.0tar.gz → 0.44.0tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (171) hide show

{eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/CHANGELOG.md RENAMED Viewed

@@ -5,6 +5,31 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [0.44.0] — 2026-05-19 — Defenses + losses: Spotlighting variants + RecallAtLowFPR (closes #50, #51)
+### Added
+- `eval_toolkit.preprocessing` — new module with 3 Spotlighting
+  structural-defense variants from Hines et al. 2024
+  (arXiv 2403.14720): `delimit(text, delimiter='<<')`,
+  `datamark(text, marker='^')`, `encode(text, encoding='base64')`,
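+  plus a `sweep(texts, variants=..., delimit_kwargs=..., datamark_kwargs=...,
+  encode_kwargs=...)` batch wrapper that returns one DataFrame row per
+  `(variant, text)` pair (`N*3` rows with the default variants). Includes a `spotlighting`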
+  SimpleNamespace exposing the upstream issue's function-style API
+  (`spotlighting.delimit(text)`, etc.). Base-install safe (pure
+  stdlib). Closes #51.
+- `eval_toolkit.losses` — new module with `RecallAtLowFPR` — the
+  Meta Prompt Guard 2 (PG2) training recipe: a differentiable
+  approximation of recall-at-fixed-FPR via soft-rank, returning a
+  scalar `torch.nn.Module` loss for use in standard training loops.
+  Optimizes detector ranking at a constrained operating point
+  (e.g. `fpr_target=0.01` → "maximize recall while keeping FPR ≤ 1%").
+  Closes #50.
+- New optional extra `[losses] = torch>=2.0`. Granular per the v0.43
+  plan Decision 4 — separated from `[probes]` so callers wanting only
+  the loss don't have to install the larger transformers stack.
+  Shares the torch version pin with `[probes]`.
 ## [0.43.0] — 2026-05-19 — P1 batch: OOD manifest loader + character_injection sweep + ActivationDeltaProbe (closes #48, #49, #53)
 ### Added

{eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: eval-toolkit
-Version: 0.43.0
+Version: 0.44.0
 Summary: Reusable evaluation contracts for binary classification: metrics, bootstrap CIs, calibration, artifacts, and evidence gates.
 Project-URL: Homepage, https://github.com/brandon-behring/eval-toolkit
 Project-URL: Documentation, https://brandon-behring.github.io/eval-toolkit/
@@ -62,6 +62,8 @@ Requires-Dist: sphinx-design>=0.6; extra == 'docs'
 Requires-Dist: sphinx>=7.3; extra == 'docs'
 Provides-Extra: embeddings
 Requires-Dist: sentence-transformers>=3.0; extra == 'embeddings'
+Provides-Extra: losses
+Requires-Dist: torch>=2.0; extra == 'losses'
 Provides-Extra: parquet
 Requires-Dist: pyarrow>=15.0; extra == 'parquet'
 Provides-Extra: plotting

{eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/pyproject.toml RENAMED Viewed

@@ -69,6 +69,11 @@ transformers = ["transformers>=4.0"]
 # is base-install-safe (lazy imports inside ActivationDeltaProbe methods);
 # the extra is strictly for callers wanting to actually fit / predict.
 probes = ["torch>=2.0", "transformers>=4.40"]
+# v0.44.0: RecallAtLowFPR loss (Meta Prompt Guard 2 recipe; closes #50).
+# torch-only (no transformers); separated from [probes] per Decision 4
+# (granular extras — losses callers should not have to install the larger
+# transformers stack). Shares the torch version pin with [probes].
+losses = ["torch>=2.0"]
 # DEPRECATED (announced v0.30.1, removal v0.33.0).
 #
 # Retained as a transitive no-op so `pip install eval-toolkit[validation]`
@@ -177,7 +182,7 @@ warn_no_return = true
 strict_equality = true
 [[tool.mypy.overrides]]
-module = ["scipy.*", "sklearn.*", "matplotlib.*", "pandas.*", "yaml.*", "sentence_transformers.*", "joblib.*"]
+module = ["scipy.*", "sklearn.*", "matplotlib.*", "pandas.*", "yaml.*", "sentence_transformers.*", "joblib.*", "torch.*", "transformers.*"]
 ignore_missing_imports = true
 [tool.pytest.ini_options]

{eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/__init__.py RENAMED Viewed

@@ -40,6 +40,13 @@ _EXPORTS: dict[str, str] = {
     "WhitespaceInjection": "eval_toolkit.adversarial",
     "ZeroWidthSpaceInjection": "eval_toolkit.adversarial",
     "character_injection": "eval_toolkit.adversarial",
+    # --- losses ---
+    "RecallAtLowFPR": "eval_toolkit.losses",
+    # --- preprocessing ---
+    "datamark": "eval_toolkit.preprocessing",
+    "delimit": "eval_toolkit.preprocessing",
+    "encode": "eval_toolkit.preprocessing",
+    "spotlighting": "eval_toolkit.preprocessing",
     # --- probes ---
     "ActivationDeltaProbe": "eval_toolkit.probes",
     "ActivationExtractor": "eval_toolkit.probes",

{eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/_version.py RENAMED Viewed

@@ -2,4 +2,4 @@
 __all__ = ["__version__"]
-__version__ = "0.43.0"
+__version__ = "0.44.0"

eval_toolkit-0.44.0/src/eval_toolkit/losses.py ADDED Viewed

@@ -0,0 +1,225 @@
+"""Differentiable losses for prompt-injection detector training.
+Implements :class:`RecallAtLowFPR` — the Meta Prompt Guard 2 (PG2) training
+recipe, a differentiable approximation of recall-at-fixed-FPR. Optimizes
+detector ranking at a constrained operating point (e.g. FPR ≤ 0.01)
+rather than the implicit FPR-agnostic posture of cross-entropy.
+This module is base-install safe: ``torch`` is imported lazily, the first
+time :func:`RecallAtLowFPR` is called. ``pip install eval-toolkit[losses]``
+installs torch.
+The lazy-import pattern matches the ``[probes]`` precedent (separate
+extra so callers wanting only the loss don't have to install
+transformers).
+The formulation follows the soft-rank approximation described in
+Meta's PG2 release notes and similar metric-learning losses (Liu et al.
+NeurIPS 2020 family):
+1. Compute the empirical FPR-target threshold from the negative-class
+   scores in the batch as their ``(1 - fpr_target)`` quantile.
+2. Smooth the indicator ``I(s_i >= threshold)`` with
+   ``sigmoid(beta * (s_i - threshold))`` so gradients flow.
+3. Recall@FPR ≈ ``Σ approx_indicator * y / Σ y``; the loss returned is
+   ``1 - Recall@FPR``.
+References
+----------
+.. [1] Meta. 2024. "Prompt Guard 2 — release notes & training recipe."
+.. [2] Liu, X., et al. 2020. "Black-box ranking under FPR constraints."
+       NeurIPS 2020.
+"""
+from __future__ import annotations
+from typing import Any, Literal
+__all__ = [
+    "RecallAtLowFPR",
+]
+ReductionMode = Literal["mean", "sum", "none"]
+def _require_torch() -> Any:
+    """Import torch with a copy-paste install hint if [losses] is missing."""
+    try:
+        import torch
+    except ImportError as exc:
+        raise ImportError(
+            "RecallAtLowFPR requires torch. Install with: pip install eval-toolkit[losses]"
+        ) from exc
+    return torch
+def _build_module_class() -> Any:
+    """Build the :class:`RecallAtLowFPR` ``nn.Module`` lazily.
+    Defined as a factory so importing :mod:`eval_toolkit.losses` does not
+    pull torch at module-import time. The class is built the first time
+    :func:`RecallAtLowFPR` is called; that function stores it in
+    ``_CLASS_CACHE`` so later constructions reuse the same class.
+    """
+    torch = _require_torch()
+    nn = torch.nn
+    # ``nn.Module`` is a runtime-constructed base; mypy can't follow the dynamic
+    # class creation. The runtime behavior is correct (nn.Module API + autograd).
+    class _RecallAtLowFPR(nn.Module):  # type: ignore[misc, name-defined]
+        def __init__(
+            self,
+            fpr_target: float = 0.01,
+            fpr_smoothing_beta: float = 10.0,
+            pos_weight: float = 1.0,
+            reduction: ReductionMode = "mean",
+        ) -> None:
+            super().__init__()
+            if not 0.0 < fpr_target <= 1.0:
+                raise ValueError(f"RecallAtLowFPR: fpr_target must be in (0, 1]; got {fpr_target}")
+            if fpr_smoothing_beta <= 0:
+                raise ValueError(
+                    f"RecallAtLowFPR: fpr_smoothing_beta must be > 0; got {fpr_smoothing_beta}"
+                )
+            if reduction not in ("mean", "sum", "none"):
+                raise ValueError(
+                    f"RecallAtLowFPR: reduction must be 'mean'|'sum'|'none'; got {reduction!r}"
+                )
+            self.fpr_target = float(fpr_target)
+            self.fpr_smoothing_beta = float(fpr_smoothing_beta)
+            self.pos_weight = float(pos_weight)
+            self.reduction = reduction
+        def forward(
+            self,
+            logits: Any,
+            labels: Any,
+        ) -> Any:
+            """Compute the (differentiable) 1 - Recall@FPR loss.
+            Parameters
+            ----------
+            logits : torch.Tensor
+                Predicted scores, shape ``(B,)`` or ``(B, 1)``. Higher
+                value → higher probability of positive class.
+            labels : torch.Tensor
+                Binary labels in ``{0, 1}``, shape ``(B,)``.
+            Returns
+            -------
+            torch.Tensor
+                Scalar (``reduction="mean"`` or ``"sum"``) or
+                per-positive-sample loss (``reduction="none"``).
+            """
+            scores = logits.squeeze(-1) if logits.dim() == 2 else logits
+            if scores.shape != labels.shape:
+                raise ValueError(
+                    f"RecallAtLowFPR: logits shape {tuple(scores.shape)} != "
+                    f"labels shape {tuple(labels.shape)}"
+                )
+            labels_f = labels.float()
+            neg_mask = labels_f < 0.5
+            pos_mask = labels_f >= 0.5
+            if not torch.any(pos_mask):
+                # No positives → recall is undefined; return zero loss with grad.
+                return scores.sum() * 0.0
+            # Threshold = (1 - fpr_target)-th quantile of negative scores.
+            # torch.quantile supports autograd, so gradients flow back into neg_scores.
+            neg_scores = scores[neg_mask]
+            if neg_scores.numel() == 0:
+                # No negatives → no FPR constraint binds; place the threshold one
+                # unit below the lowest score so every sample ranks above it
+                # (soft recall ≈ 1 → loss ≈ 0).
+                threshold = scores.min().detach() - 1.0
+            else:
+                # quantile q = 1 - fpr_target means we want the score above which
+                # exactly fpr_target fraction of negatives sit.
+                q = 1.0 - self.fpr_target
+                threshold = torch.quantile(neg_scores, q)
+            # Soft indicator: sigmoid(beta * (s - t)) → near-step function as beta → ∞.
+            approx_above = torch.sigmoid(self.fpr_smoothing_beta * (scores - threshold))
+            # Recall@FPR = (Σ I(s_i ≥ t) * y_i * pos_weight) / (Σ y_i * pos_weight)
+            tp_weighted = approx_above * labels_f * self.pos_weight
+            denom = labels_f.sum() * self.pos_weight
+            recall_at_fpr = tp_weighted.sum() / denom.clamp(min=1e-9)
+            per_pos = 1.0 - approx_above[pos_mask]  # per-positive contribution
+            if self.reduction == "mean":
+                return torch.tensor(1.0, device=scores.device) - recall_at_fpr
+            if self.reduction == "sum":
+                return per_pos.sum()
+            return per_pos  # "none"
+    return _RecallAtLowFPR
+_CLASS_CACHE: dict[str, Any] = {}
+def RecallAtLowFPR(  # noqa: N802 — matches issue spec PascalCase class-like name
+    fpr_target: float = 0.01,
+    fpr_smoothing_beta: float = 10.0,
+    pos_weight: float = 1.0,
+    reduction: ReductionMode = "mean",
+) -> Any:
+    """Construct a Recall@LowFPR loss module.
+    Differentiable approximation of recall at a constrained false-positive
+    rate, per the Meta Prompt Guard 2 training recipe. Optimizes
+    detector ranking at a specific operating point (e.g. ``fpr_target=0.01``
+    → "maximize recall while keeping FPR ≤ 1%").
+    Parameters
+    ----------
+    fpr_target : float, optional
+        Target false-positive rate (operating point constraint).
+        Must be in ``(0, 1]``. Default ``0.01`` (1% FPR).
+    fpr_smoothing_beta : float, optional
+        Temperature of the soft-indicator approximation; higher values
+        make the loss sharper (closer to the hard step function) but
+        produce smaller gradients away from the threshold. Default ``10.0``.
+        Increase toward training convergence; start low for stable
+        gradient flow.
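+    pos_weight : float, optional
+        Scalar weight applied to both the recall numerator and
+        denominator under ``reduction="mean"``. Because the same scalar
+        multiplies both, it cancels for ordinary positive values and does
+        not change the loss; it is not applied for ``"sum"`` or ``"none"``.
+        Default ``1.0`` (unweighted).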
+    reduction : {"mean", "sum", "none"}, optional
+        How to reduce the per-positive loss. Default ``"mean"``.
+        ``"mean"`` returns the scalar ``1 - Recall@FPR`` (the canonical
+        training objective). ``"sum"`` returns the sum of per-positive
+        ``1 - approx_indicator``. ``"none"`` returns the per-positive
+        ``1 - approx_indicator`` tensor for custom downstream weighting.
+    Returns
+    -------
+    torch.nn.Module
+        The constructed loss module. Drop into any standard PyTorch
+        training loop.
+    Raises
+    ------
+    ImportError
+        If the ``[losses]`` extra is not installed.
+    ValueError
+        On invalid ``fpr_target`` / ``fpr_smoothing_beta`` / ``reduction``.
+    Examples
+    --------
+    >>> # Requires the [losses] extra.
+    >>> # import torch
+    >>> # loss = RecallAtLowFPR(fpr_target=0.01)
+    >>> # logits = torch.randn(32, requires_grad=True)
+    >>> # labels = torch.randint(0, 2, (32,))
+    >>> # loss(logits, labels).backward()
+    """
+    if "cls" not in _CLASS_CACHE:
+        _CLASS_CACHE["cls"] = _build_module_class()
+    cls = _CLASS_CACHE["cls"]
+    return cls(
+        fpr_target=fpr_target,
+        fpr_smoothing_beta=fpr_smoothing_beta,
+        pos_weight=pos_weight,
+        reduction=reduction,
+    )

eval_toolkit-0.44.0/src/eval_toolkit/preprocessing.py ADDED Viewed

@@ -0,0 +1,259 @@
+"""Structural-defense preprocessing — Spotlighting variants for prompt injection.
+Implements the 3 Spotlighting transforms from Hines et al. 2024 ([1]_) for
+defending LLMs against indirect prompt injection by *structurally marking*
+untrusted input so the model can distinguish it from system instructions.
+The three variants:
+- :func:`delimit` — wrap text in unusual delimiters (default ``<<...>>``)
+- :func:`datamark` — insert a marker before each whitespace run that follows
+  a non-whitespace character (default ``^``)
+- :func:`encode` — encode the text (default ``base64``); the LLM is told to
+  decode but treat the result as data, not instructions
+A :data:`spotlighting` namespace (``SimpleNamespace``) exposes the
+function-style API verbatim from the upstream issue spec:
+>>> from eval_toolkit.preprocessing import spotlighting
+>>> spotlighting.delimit("hello")  # doctest: +SKIP
+'<<hello>>'
+:func:`sweep` applies all 3 variants to a batch of texts and returns a
+``(N*3)``-row DataFrame for downstream evaluation.
+All three variants are deterministic, side-effect-free, and base-install
+safe — only stdlib used.
+References
+----------
+.. [1] Hines, K., et al. 2024. "Defending Against Indirect Prompt Injection
+       Attacks With Spotlighting." arXiv:2403.14720.
+"""
+from __future__ import annotations
+import base64
+import re
+from collections.abc import Sequence
+from types import SimpleNamespace
+from typing import TYPE_CHECKING, Literal
+if TYPE_CHECKING:
+    import pandas as pd
+__all__ = [
+    "datamark",
+    "delimit",
+    "encode",
+    "spotlighting",
+    "sweep",
+]
+# Default constants per the Hines et al. paper § 3
+_DEFAULT_DELIMITER = "<<"
+_DEFAULT_DELIMITER_END = ">>"
+_DEFAULT_MARKER = "^"
+_DEFAULT_ENCODING: Literal["base64"] = "base64"
+def delimit(text: str, *, delimiter: str = _DEFAULT_DELIMITER, end: str | None = None) -> str:
+    """Wrap ``text`` in unusual delimiters so the LLM can spot the boundary.
+    Recoverable via simple slicing (caller knows the delimiter pair).
+    Deterministic: same input + delimiter → same output.
+    Parameters
+    ----------
+    text : str
+        Input to wrap.
+    delimiter : str, optional
+        Opening delimiter. Default ``"<<"``. Choose something unlikely to
+        appear in user-generated content.
+    end : str or None, optional
+        Closing delimiter. If ``None`` (default), the opening delimiter
+        is reversed character-by-character and each opening bracket
+        (``<``, ``[``, ``(``, ``{``) is swapped for its closing partner
+        (``"<<"`` → ``">>"``, ``"[["`` → ``"]]"``, ``"BEGIN"`` → ``"NIGEB"``).
+        Pass an explicit ``end`` for asymmetric pairs.
+    Returns
+    -------
+    str
+        ``delimiter + text + end``.
+    Examples
+    --------
+    >>> delimit("hello")
+    '<<hello>>'
+    >>> delimit("hello", delimiter="[", end="]")
+    '[hello]'
+    """
+    if end is None:
+        end = _mirror_delimiter(delimiter)
+    return f"{delimiter}{text}{end}"
+def _mirror_delimiter(d: str) -> str:
+    """Return the mirrored closing form of an opening delimiter.
+    Examples: ``"<<"`` → ``">>"``, ``"[["`` → ``"]]"``, ``"BEGIN"`` → ``"NIGEB"``.
+    """
+    mirror = {"<": ">", "[": "]", "(": ")", "{": "}"}
+    return "".join(mirror.get(ch, ch) for ch in reversed(d))
+def datamark(text: str, *, marker: str = _DEFAULT_MARKER) -> str:
+    """Prepend ``marker`` before each non-leading whitespace run.
+    The LLM sees a textual signal that every word boundary belongs to
+    untrusted data. Recoverable by stripping the marker before each
+    whitespace run.
+    Parameters
+    ----------
+    text : str
+        Input to datamark.
+    marker : str, optional
+        Character (or short string) to inject. Default ``"^"``.
+    Returns
+    -------
+    str
+        Text with ``marker`` inserted before every whitespace run except a
+        leading one (leading whitespace is left unmarked).
+    Examples
+    --------
+    >>> datamark("hello world")
+    'hello^ world'
+    >>> datamark("a b c", marker="*")
+    'a* b* c'
+    """
+    if not text:
+        return text
+    # Insert marker before any internal whitespace run (one or more spaces /
+    # tabs / newlines). Leading whitespace is preserved as-is.
+    return re.sub(r"(\S)(\s+)", lambda m: m.group(1) + marker + m.group(2), text)
+def encode(text: str, *, encoding: Literal["base64"] = _DEFAULT_ENCODING) -> str:
+    """Encode ``text`` so the LLM treats the result as data, not instructions.
+    Only ``base64`` is supported in v0.44.0 — the paper's default + the
+    most LLM-friendly encoding (most foundation models can decode base64
+    on demand).
+    Recoverable via ``base64.b64decode``.
+    Parameters
+    ----------
+    text : str
+        Input to encode.
+    encoding : {"base64"}, optional
+        Encoding scheme. Default ``"base64"``.
+    Returns
+    -------
+    str
+        Encoded text as an ASCII string.
+    Raises
+    ------
+    ValueError
+        On unknown ``encoding``.
+    Examples
+    --------
+    >>> encode("hello")
+    'aGVsbG8='
+    """
+    if encoding == "base64":
+        return base64.b64encode(text.encode("utf-8")).decode("ascii")
+    raise ValueError(f"encode: unsupported encoding {encoding!r}; supported: 'base64'")
+def sweep(
+    texts: Sequence[str],
+    *,
+    variants: Sequence[str] = ("delimit", "datamark", "encode"),
+    delimit_kwargs: dict[str, object] | None = None,
+    datamark_kwargs: dict[str, object] | None = None,
+    encode_kwargs: dict[str, object] | None = None,
+) -> pd.DataFrame:
+    """Apply one or more Spotlighting variants to each text in ``texts``.
+    For each ``(text, variant)`` pair, runs the corresponding transform
+    and emits a row in the result DataFrame. Useful for batch evaluation
+    of detector accuracy under each defense variant.
+    Parameters
+    ----------
+    texts : sequence of str
+        Input texts. Each is identified by its 0-based ``text_id``.
+    variants : sequence of str, optional
+        Which variants to apply. Default ``("delimit", "datamark", "encode")``
+        (all 3). Unknown variant names raise :class:`ValueError`.
+    delimit_kwargs, datamark_kwargs, encode_kwargs : dict or None, optional
+        Per-variant kwargs forwarded to the underlying transform
+        function. Default ``None`` (use each variant's defaults).
+    Returns
+    -------
+    pandas.DataFrame
+        Columns: ``text_id`` (int), ``variant`` (str), ``transformed_text`` (str).
+        Row order: variant-major — all texts (in input order) for the first
+        variant, then all texts for the next variant, and so on.
+    Raises
+    ------
+    ValueError
+        On any unknown variant name in ``variants``.
+    Examples
+    --------
+    >>> # Synthetic 2-text sweep — see docs/source/examples/spotlighting.md
+    >>> # for a runnable end-to-end demo.
+    >>> # df = sweep(["hello", "world"])
+    >>> # df.shape  # (6, 3)
+    """
+    import pandas as pd
+    delimit_kw = delimit_kwargs or {}
+    datamark_kw = datamark_kwargs or {}
+    encode_kw = encode_kwargs or {}
+    def _apply(variant: str, t: str) -> str:
+        if variant == "delimit":
+            return delimit(t, **delimit_kw)  # type: ignore[arg-type]
+        if variant == "datamark":
+            return datamark(t, **datamark_kw)  # type: ignore[arg-type]
+        if variant == "encode":
+            return encode(t, **encode_kw)  # type: ignore[arg-type]
+        raise ValueError(
+            f"sweep: unknown variant {variant!r}; " f"supported: 'delimit', 'datamark', 'encode'"
+        )
+    rows: list[dict[str, object]] = []
+    for variant in variants:
+        if variant not in {"delimit", "datamark", "encode"}:
+            raise ValueError(
+                f"sweep: unknown variant {variant!r}; "
+                f"supported: 'delimit', 'datamark', 'encode'"
+            )
+        for i, text in enumerate(texts):
+            rows.append(
+                {
+                    "text_id": int(i),
+                    "variant": variant,
+                    "transformed_text": _apply(variant, text),
+                }
+            )
+    return pd.DataFrame(rows, columns=["text_id", "variant", "transformed_text"])
+# Module-level function namespace (matches issue spec API)
+spotlighting = SimpleNamespace(
+    delimit=delimit,
+    datamark=datamark,
+    encode=encode,
+    sweep=sweep,
+)

{eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/probes.py RENAMED Viewed

@@ -236,8 +236,8 @@ class ActivationDeltaProbe:
     def _build_default_extractor(self) -> ActivationExtractor:
         """Build the default HF-backed extractor (lazy import)."""
         try:
-            import torch  # type: ignore[import-not-found]
-            from transformers import AutoModel, AutoTokenizer  # type: ignore[import-not-found]
+            import torch
+            from transformers import AutoModel, AutoTokenizer
         except ImportError as exc:
             raise ImportError(
                 "ActivationDeltaProbe requires torch + transformers. "

{eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/src/eval_toolkit/seeds.py RENAMED Viewed

@@ -109,7 +109,7 @@ def set_global_seeds(seed: int, *, strict_torch_determinism: bool = False) -> No
     np.random.seed(seed)
     try:
-        import torch  # type: ignore[import-not-found]  # noqa: PLC0415
+        import torch  # noqa: PLC0415
     except ImportError:
         if strict_torch_determinism:
             raise RuntimeError(

{eval_toolkit-0.43.0 → eval_toolkit-0.44.0}/tests/golden/public_api/snapshot.json RENAMED Viewed

@@ -83,6 +83,7 @@
     "RECOMMENDED_SOURCE_ROLES",
     "RUN_RESULT_SCHEMA_VERSION",
     "RecallAtFprResult",
+    "RecallAtLowFPR",
     "RunManifest",
     "RunResult",
     "SINGLE_CLASS_INCOMPATIBLE_METRICS",
@@ -132,7 +133,10 @@
     "cross_dedup",
     "cross_validate_metric",
     "cv_clt_ci",
+    "datamark",
+    "delimit",
     "delong_roc_variance",
+    "encode",
     "error_metric",
     "evaluate",
     "evaluate_claims",
@@ -220,6 +224,7 @@
     "skipped_metric",
     "source_role_gate",
     "split_provenance_config",
+    "spotlighting",
     "stratified_recall",
     "strict_artifact_gate",
     "validate_manifest",
@@ -907,6 +912,11 @@
       "kind": "class",
       "signature": "(threshold: 'float', recall: 'float', actual_fpr: 'float', n_val_neg: 'int', fp: 'int', tn: 'int') -> None"
     },
+    "RecallAtLowFPR": {
+      "doc_first_line": "Construct a Recall@LowFPR loss module.",
+      "kind": "function",
+      "signature": "(fpr_target: 'float' = 0.01, fpr_smoothing_beta: 'float' = 10.0, pos_weight: 'float' = 1.0, reduction: 'ReductionMode' = 'mean') -> 'Any'"
+    },
     "RunManifest": {
       "bases": [
         "object"
@@ -1144,7 +1154,7 @@
       "doc_first_line": "str(object='') -> str",
       "kind": "value",
       "type": "str",
-      "value": "'0.43.0'"
+      "value": "'0.44.0'"
     },
     "apply_operating_points": {
       "doc_first_line": "Apply fitted thresholds to a mixed-class or single-class target slice.",
@@ -1236,11 +1246,26 @@
       "kind": "function",
       "signature": "(fold_metrics: 'np.ndarray', *, confidence: 'float' = 0.95) -> 'BootstrapCI'"
     },
+    "datamark": {
+      "doc_first_line": "Prepend ``marker`` before each non-leading whitespace run.",
+      "kind": "function",
+      "signature": "(text: 'str', *, marker: 'str' = '^') -> 'str'"
+    },
+    "delimit": {
+      "doc_first_line": "Wrap ``text`` in unusual delimiters so the LLM can spot the boundary.",
+      "kind": "function",
+      "signature": "(text: 'str', *, delimiter: 'str' = '<<', end: 'str | None' = None) -> 'str'"
+    },
     "delong_roc_variance": {
       "doc_first_line": "DeLong's variance of the paired ROC-AUC difference.",
       "kind": "function",
       "signature": "(y_true: 'np.ndarray', y_score_a: 'np.ndarray', y_score_b: 'np.ndarray') -> 'DeLongResult'"
     },
+    "encode": {
+      "doc_first_line": "Encode ``text`` so the LLM treats the result as data, not instructions.",
+      "kind": "function",
+      "signature": "(text: 'str', *, encoding: \"Literal['base64']\" = 'base64') -> 'str'"
+    },
     "error_metric": {
       "doc_first_line": "Return a structured errored-metric payload.",
       "kind": "function",
@@ -1676,6 +1701,11 @@
       "kind": "function",
       "signature": "(config: 'Mapping[str, Any]', repo_root: 'Path | str | None' = None, *, path_keys: 'tuple[str, ...]' = ('path', 'dir', 'file', 'splits_dir', 'model_path')) -> 'dict[str, Any]'"
     },
+    "spotlighting": {
+      "doc_first_line": "A simple attribute-based namespace.",
+      "kind": "value",
+      "type": "types.SimpleNamespace"
+    },
     "stratified_recall": {
       "doc_first_line": "Recall (TPR) per categorical stratum.",
       "kind": "function",

eval-toolkit 0.43.0__tar.gz → 0.44.0__tar.gz

eval-toolkit 0.43.0tar.gz → 0.44.0tar.gz