dataeval 0.86.8__py3-none-any.whl → 0.87.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. dataeval/__init__.py +1 -1
  2. dataeval/_version.py +2 -2
  3. dataeval/config.py +4 -19
  4. dataeval/data/_metadata.py +56 -27
  5. dataeval/data/_split.py +1 -1
  6. dataeval/data/selections/_classbalance.py +4 -3
  7. dataeval/data/selections/_classfilter.py +5 -5
  8. dataeval/data/selections/_indices.py +2 -2
  9. dataeval/data/selections/_prioritize.py +249 -29
  10. dataeval/data/selections/_reverse.py +1 -1
  11. dataeval/data/selections/_shuffle.py +2 -2
  12. dataeval/detectors/ood/__init__.py +2 -1
  13. dataeval/detectors/ood/base.py +38 -1
  14. dataeval/detectors/ood/knn.py +95 -0
  15. dataeval/metrics/bias/_balance.py +28 -21
  16. dataeval/metrics/bias/_diversity.py +4 -4
  17. dataeval/metrics/bias/_parity.py +2 -2
  18. dataeval/metrics/stats/_hashstats.py +19 -2
  19. dataeval/outputs/_workflows.py +20 -7
  20. dataeval/typing.py +14 -2
  21. dataeval/utils/__init__.py +2 -2
  22. dataeval/utils/_bin.py +7 -6
  23. dataeval/utils/data/__init__.py +2 -0
  24. dataeval/utils/data/_dataset.py +13 -6
  25. dataeval/utils/data/_validate.py +169 -0
  26. dataeval/workflows/sufficiency.py +53 -10
  27. {dataeval-0.86.8.dist-info → dataeval-0.87.0.dist-info}/METADATA +5 -17
  28. {dataeval-0.86.8.dist-info → dataeval-0.87.0.dist-info}/RECORD +30 -39
  29. dataeval/utils/datasets/__init__.py +0 -19
  30. dataeval/utils/datasets/_antiuav.py +0 -189
  31. dataeval/utils/datasets/_base.py +0 -262
  32. dataeval/utils/datasets/_cifar10.py +0 -201
  33. dataeval/utils/datasets/_fileio.py +0 -142
  34. dataeval/utils/datasets/_milco.py +0 -197
  35. dataeval/utils/datasets/_mixin.py +0 -54
  36. dataeval/utils/datasets/_mnist.py +0 -202
  37. dataeval/utils/datasets/_ships.py +0 -144
  38. dataeval/utils/datasets/_types.py +0 -48
  39. dataeval/utils/datasets/_voc.py +0 -583
  40. {dataeval-0.86.8.dist-info → dataeval-0.87.0.dist-info}/WHEEL +0 -0
  41. /dataeval-0.86.8.dist-info/licenses/LICENSE.txt → /dataeval-0.87.0.dist-info/licenses/LICENSE +0 -0
dataeval/data/selections/_prioritize.py
@@ -32,8 +32,8 @@ class _Clusters:
         self.cluster_centers = cluster_centers
         self.unique_labels = np.unique(labels)
 
-    def _dist2center(self, X: NDArray[np.float64]) -> NDArray[np.float64]:
-        dist = np.zeros(self.labels.shape)
+    def _dist2center(self, X: NDArray[np.floating[Any]]) -> NDArray[np.float32]:
+        dist = np.zeros(self.labels.shape, dtype=np.float32)
         for lab in self.unique_labels:
             dist[self.labels == lab] = np.linalg.norm(X[self.labels == lab, :] - self.cluster_centers[lab, :], axis=1)
         return dist
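
For context, the change above only alters the dtype of the returned distances; the computation is unchanged. A minimal self-contained sketch of what _dist2center computes (the array names here are illustrative, not from the package):

    import numpy as np

    X = np.random.rand(6, 2).astype(np.float32)   # sample embeddings
    labels = np.array([0, 0, 1, 1, 2, 2])         # cluster assignment per sample
    centers = np.stack([X[labels == lab].mean(axis=0) for lab in np.unique(labels)])

    # distance from each sample to the center of its own cluster, as float32
    dist = np.zeros(labels.shape, dtype=np.float32)
    for lab in np.unique(labels):
        dist[labels == lab] = np.linalg.norm(X[labels == lab, :] - centers[lab, :], axis=1)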
@@ -75,6 +75,8 @@ class _Clusters:
 
 
 class _Sorter(ABC):
+    scores: NDArray[np.float32] | None = None
+
     @abstractmethod
     def _sort(self, embeddings: NDArray[Any], reference: NDArray[Any] | None = None) -> NDArray[np.intp]: ...
 
@@ -95,11 +97,12 @@ class _KNNSorter(_Sorter):
 
     def _sort(self, embeddings: NDArray[Any], reference: NDArray[Any] | None = None) -> NDArray[np.intp]:
         if reference is None:
-            dists = pairwise_distances(embeddings, embeddings)
+            dists = pairwise_distances(embeddings, embeddings).astype(np.float32)
             np.fill_diagonal(dists, np.inf)
         else:
-            dists = pairwise_distances(embeddings, reference)
-        return np.argsort(np.sort(dists, axis=1)[:, self._k])
+            dists = pairwise_distances(embeddings, reference).astype(np.float32)
+        self.scores = np.sort(dists, axis=1)[:, self._k]
+        return np.argsort(self.scores)
 
 
 class _KMeansSorter(_Sorter):
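
The new scores attribute persists the k-th-nearest-neighbor distances so the rerank policies added later in this file can reuse them. A standalone sketch of the scoring, using sklearn's pairwise_distances as this module does (array names and the value of k are illustrative):

    import numpy as np
    from sklearn.metrics import pairwise_distances

    embeddings = np.random.rand(100, 16)
    k = 10  # illustrative; the sorter derives a default when k is None

    dists = pairwise_distances(embeddings, embeddings).astype(np.float32)
    np.fill_diagonal(dists, np.inf)        # a sample cannot be its own neighbor
    scores = np.sort(dists, axis=1)[:, k]  # distance to the k-th nearest neighbor
    order = np.argsort(scores)             # ascending: prototypical ("easy") samples first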
@@ -123,7 +126,8 @@ class _KMeansSorter(_Sorter):
 class _KMeansDistanceSorter(_KMeansSorter):
     def _sort(self, embeddings: NDArray[Any], reference: NDArray[Any] | None = None) -> NDArray[np.intp]:
         clst = self._get_clusters(embeddings if reference is None else reference)
-        return np.argsort(clst._dist2center(embeddings))
+        self.scores = clst._dist2center(embeddings)
+        return np.argsort(self.scores)
 
 
 class _KMeansComplexitySorter(_KMeansSorter):
@@ -134,11 +138,11 @@ class _KMeansComplexitySorter(_KMeansSorter):
 
 class Prioritize(Selection[Any]):
     """
-    Prioritizes the dataset by sort order in the embedding space.
+    Sort the dataset indices in order of highest priority data in the embedding space.
 
     Parameters
     ----------
-    model : torch.nn.Module
+    model : torch.nn.Module | None
         Model to use for encoding images
     batch_size : int
         Batch size to use when encoding images
@@ -146,10 +150,23 @@ class Prioritize(Selection[Any]):
         Device to use for encoding images
     method : Literal["knn", "kmeans_distance", "kmeans_complexity"]
         Method to use for prioritization
-    k : int | None, default None
-        Number of nearest neighbors to use for prioritization (knn only)
-    c : int | None, default None
-        Number of clusters to use for prioritization (kmeans only)
+    k : int or None, default None
+        Number of nearest neighbors to use for prioritization.
+        If None, uses the square root of the number of samples. Only used for method="knn", ignored otherwise.
+    c : int or None, default None
+        Number of clusters to use for prioritization. If None, uses the square root of the number of samples.
+        Only used for method="kmeans_*", ignored otherwise.
+
+    Notes
+    -----
+    1. `k` is only used for method ["knn"].
+    2. `c` is only used for methods ["kmeans_distance", "kmeans_complexity"].
+
+    Raises
+    ------
+    ValueError
+        If method not in supported methods
+
     """
 
     stage = SelectionStage.ORDER
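
A usage sketch of the expanded constructor, assuming the public import paths implied by the module layout; model, train_dataset, and the parameter values are placeholders, and Select is the driver from dataeval.data that applies selections:

    from dataeval.data import Select
    from dataeval.data.selections import Prioritize

    prioritize = Prioritize(
        model,                # torch.nn.Module used to embed images
        batch_size=64,
        device=None,
        method="knn",
        policy="stratified",  # or "hard_first", "easy_first", "class_balance"
        k=16,                 # consulted only when method="knn"
    )
    prioritized_dataset = Select(train_dataset, selections=[prioritize])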
@@ -157,55 +174,95 @@ class Prioritize(Selection[Any]):
     @overload
     def __init__(
         self,
-        model: torch.nn.Module,
+        model: torch.nn.Module | None,
         batch_size: int,
         device: DeviceLike | None,
         method: Literal["knn"],
+        policy: Literal["hard_first", "easy_first", "stratified", "class_balance"],
         *,
         k: int | None = None,
+        class_label: NDArray[np.integer[Any]] | None = None,
     ) -> None: ...
 
     @overload
     def __init__(
         self,
-        model: torch.nn.Module,
+        model: torch.nn.Module | None,
         batch_size: int,
         device: DeviceLike | None,
         method: Literal["kmeans_distance", "kmeans_complexity"],
+        policy: Literal["hard_first", "easy_first", "stratified", "class_balance"],
+        *,
+        c: int | None = None,
+        class_label: NDArray[np.integer[Any]] | None = None,
+    ) -> None: ...
+
+    @overload
+    def __init__(
+        self,
+        model: torch.nn.Module | None,
+        batch_size: int,
+        device: DeviceLike | None,
+        method: Literal["knn", "kmeans_distance", "kmeans_complexity"],
+        policy: Literal["class_balance"],
+        *,
+        k: int | None = None,
+        c: int | None = None,
+        class_label: NDArray[np.integer[Any]] | None,
+    ) -> None: ...
+
+    @overload
+    def __init__(
+        self,
+        model: torch.nn.Module | None,
+        batch_size: int,
+        device: DeviceLike | None,
+        method: Literal["knn", "kmeans_distance", "kmeans_complexity"],
+        policy: Literal["hard_first", "easy_first", "stratified"],
         *,
+        k: int | None = None,
         c: int | None = None,
+        class_label: NDArray[np.integer[Any]] | None = None,
     ) -> None: ...
 
     def __init__(
         self,
-        model: torch.nn.Module,
+        model: torch.nn.Module | None,
         batch_size: int,
         device: DeviceLike | None,
         method: Literal["knn", "kmeans_distance", "kmeans_complexity"],
+        policy: Literal["hard_first", "easy_first", "stratified", "class_balance"],
         *,
         k: int | None = None,
         c: int | None = None,
+        class_label: NDArray[np.integer[Any]] | None = None,
     ) -> None:
-        if method not in ("knn", "kmeans_distance", "kmeans_complexity"):
+        if method not in {"knn", "kmeans_distance", "kmeans_complexity"}:
             raise ValueError(f"Invalid prioritization method: {method}")
+        if policy not in ("hard_first", "easy_first", "stratified", "class_balance"):
+            raise ValueError(f"Invalid selection policy: {policy}")
         self._model = model
         self._batch_size = batch_size
         self._device = device
         self._method = method
+        self._policy = policy
         self._embeddings: Embeddings | None = None
         self._reference: Embeddings | None = None
         self._k = k
         self._c = c
+        self.class_label = class_label
 
     @overload
     @classmethod
     def using(
         cls,
         method: Literal["knn"],
+        policy: Literal["hard_first", "easy_first", "stratified", "class_balance"],
         *,
         k: int | None = None,
         embeddings: Embeddings | None = None,
         reference: Embeddings | None = None,
+        class_label: NDArray[np.integer[Any]] | None = None,
     ) -> Prioritize: ...
 
     @overload
@@ -213,49 +270,72 @@ class Prioritize(Selection[Any]):
     def using(
         cls,
         method: Literal["kmeans_distance", "kmeans_complexity"],
+        policy: Literal["hard_first", "easy_first", "stratified", "class_balance"],
         *,
         c: int | None = None,
         embeddings: Embeddings | None = None,
         reference: Embeddings | None = None,
+        class_label: NDArray[np.integer[Any]] | None = None,
     ) -> Prioritize: ...
 
     @classmethod
     def using(
         cls,
         method: Literal["knn", "kmeans_distance", "kmeans_complexity"],
+        policy: Literal["hard_first", "easy_first", "stratified", "class_balance"],
         *,
         k: int | None = None,
         c: int | None = None,
         embeddings: Embeddings | None = None,
         reference: Embeddings | None = None,
+        class_label: NDArray[np.integer[Any]] | None = None,
     ) -> Prioritize:
         """
-        Prioritizes the dataset by sort order in the embedding space using existing
-        embeddings and/or reference dataset embeddings.
+        Use precalculated embeddings to sort the dataset indices in order of
+        highest priority data in the embedding space.
 
         Parameters
         ----------
         method : Literal["knn", "kmeans_distance", "kmeans_complexity"]
-            Method to use for prioritization
+            Method to use for sample scoring during prioritization.
+        policy : Literal["hard_first", "easy_first", "stratified", "class_balance"]
+            Selection policy for prioritizing scored samples.
         embeddings : Embeddings or None, default None
-            Embeddings to use for prioritization
+            Embeddings to use during prioritization. If None, `reference` must be set.
         reference : Embeddings or None, default None
-            Reference embeddings to prioritize relative to
+            Reference embeddings used to prioritize the calculated dataset embeddings relative to them.
+            If `embeddings` is None, this will be used instead.
         k : int or None, default None
-            Number of nearest neighbors to use for prioritization (knn only)
+            Number of nearest neighbors to use for prioritization.
+            If None, uses the square root of the number of samples. Only used for method="knn", ignored otherwise.
         c : int or None, default None
-            Number of clusters to use for prioritization (kmeans, cluster only)
+            Number of clusters to use for prioritization. If None, uses the square root of the number of samples.
+            Only used for method="kmeans_*", ignored otherwise.
 
         Notes
         -----
-        At least one of `embeddings` or `reference` must be provided.
+        1. `k` is only used for method ["knn"].
+        2. `c` is only used for methods ["kmeans_distance", "kmeans_complexity"].
+
+        Raises
+        ------
+        ValueError
+            If both `embeddings` and `reference` are None
+
         """
         emb_params: Embeddings | None = embeddings if embeddings is not None else reference
         if emb_params is None:
             raise ValueError("Must provide at least embeddings or reference embeddings.")
-        prioritize = Prioritize(emb_params._model, emb_params.batch_size, emb_params.device, method)
-        prioritize._k = k
-        prioritize._c = c
+        prioritize = Prioritize(
+            emb_params._model,
+            emb_params.batch_size,
+            emb_params.device,
+            method,
+            policy,
+            k=k,
+            c=c,
+            class_label=class_label,
+        )
        prioritize._embeddings = embeddings
        prioritize._reference = reference
        return prioritize
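
A corresponding sketch of the classmethod path, assuming train_embeddings is a precomputed dataeval.data.Embeddings and train_labels is an integer array aligned with it (both names are placeholders):

    prioritize = Prioritize.using(
        "kmeans_distance",
        "class_balance",
        c=8,
        embeddings=train_embeddings,
        class_label=train_labels,  # required by the "class_balance" policy
    )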
@@ -265,9 +345,148 @@ class Prioritize(Selection[Any]):
             return _KNNSorter(samples, self._k)
         if self._method == "kmeans_distance":
             return _KMeansDistanceSorter(samples, self._c)
-        # self._method == "kmeans_complexity"
         return _KMeansComplexitySorter(samples, self._c)
 
+    def _compute_bin_extents(self, scores: NDArray[np.floating[Any]]) -> tuple[np.float64, np.float64]:
+        """
+        Compute min/max bin extents for `scores`, padding outward by epsilon.
+
+        Parameters
+        ----------
+        scores : NDArray[np.float64]
+            Array of floats to bin
+
+        Returns
+        -------
+        tuple[np.float64, np.float64]
+            (min, max) scores padded outward by epsilon = 1e-6 * range(scores).
+        """
+        # ensure binning captures all samples in range
+        scores = scores.astype(np.float64)
+        min_score = np.min(scores)
+        max_score = np.max(scores)
+        rng = max_score - min_score
+        eps = rng * 1e-6
+        return min_score - eps, max_score + eps
+
+    def _select_ordered_by_label(self, labels: NDArray[np.integer[Any]]) -> NDArray[np.intp]:
+        """
+        Given labels (class, group, bin, etc.) sorted with decreasing priority,
+        rerank so that we have approximate class/group balance. This function
+        is used for both the stratified and class-balance rerank methods.
+
+        We could require and return prioritization scores and re-sorted class
+        labels, but it is more compact to return indices. This allows us to
+        resort other quantities, as well, outside the function.
+
+        Parameters
+        ----------
+        labels : NDArray[np.integer[Any]]
+            Class label or group ID per instance in order of decreasing priority
+
+        Returns
+        -------
+        NDArray[np.intp]
+            Indices that sort samples according to uniform class balance or
+            group membership while respecting priority of the initial ordering.
+        """
+        labels = np.array(labels)
+        num_samp = labels.shape[0]
+        selected = np.zeros(num_samp, dtype=bool)
+        # preserve ordering
+        _, index = np.unique(labels, return_index=True)
+        u_lab = labels[np.sort(index)]
+        n_cls = len(u_lab)
+
+        resort_inds = []
+        cls_idx = 0
+        n = 0
+        while len(resort_inds) < num_samp:
+            c0 = u_lab[cls_idx % n_cls]
+            samples_available = (~selected) * (labels == c0)
+            if any(samples_available):
+                i0 = np.argmax(samples_available)  # selects first occurrence
+                resort_inds.append(i0)
+                selected[i0] = True
+            cls_idx += 1
+            n += 1
+        return np.array(resort_inds).astype(np.intp)
+
+    def _stratified_rerank(
+        self,
+        scores: NDArray[np.floating[Any]],
+        indices: NDArray[np.integer[Any]],
+        num_bins: int = 50,
+    ) -> NDArray[np.intp]:
+        """
+        Re-rank samples by sampling uniformly over binned scores. This
+        de-weights selection of samples with similar scores and encourages both
+        prototypical and challenging samples near the decision boundary.
+
+        Parameters
+        ----------
+        scores : NDArray[float]
+            Prioritization scores sorted in order of decreasing priority
+        indices : NDArray[int]
+            Indices to be re-sorted according to stratified sampling of scores.
+            Indices are ordered by decreasing priority.
+        num_bins : int, default 50
+            Number of bins over which scores are stratified
+
+        Returns
+        -------
+        NDArray[np.intp]
+            Re-ranked indices
+        """
+        mn, mx = self._compute_bin_extents(scores)
+        bin_edges = np.linspace(mn, mx, num=num_bins + 1, endpoint=True)
+        bin_label = np.digitize(scores, bin_edges)
+        srt_inds = self._select_ordered_by_label(bin_label)
+        return indices[srt_inds].astype(np.intp)
+
+    def _rerank(
+        self,
+        indices: NDArray[np.integer[Any]],
+    ) -> NDArray[np.intp]:
+        """
+        Re-rank samples according to the re-rank policy, self._policy. Values
+        from the 'indices' and optional 'scores' and 'class_label' variables are
+        assumed to correspond by index, i.e. indices[i], scores[i], and
+        class_label[i] should all refer to the same instance in the dataset.
+
+        Note: indices are assumed to be sorted with easy/prototypical samples
+        first, which is increasing order for most prioritization scoring methods.
+
+        Parameters
+        ----------
+        indices : NDArray[np.intp]
+            Indices that sort samples by increasing prioritization score, where
+            low scores indicate high prototypicality ('easy') and high scores
+            indicate challenging samples near the decision boundary ('hard').
+        """
+
+        if self._policy == "easy_first":
+            return indices.astype(np.intp)
+        if self._policy == "stratified":
+            if self._sorter.scores is None:
+                raise ValueError(
+                    "Prioritization scores are necessary in order to use "
+                    "stratified re-rank. Use 'knn' or 'kmeans_distance' "
+                    "methods to populate scores."
+                )
+            return self._stratified_rerank(self._sorter.scores[::-1], indices[::-1])
+        if self._policy == "class_balance":
+            if self.class_label is None:
+                raise ValueError("Class labels are necessary in order to use class_balance re-rank")
+            indices_reversed = self._select_ordered_by_label(self.class_label[indices[::-1]]).astype(np.int32)
+            n = len(indices_reversed)
+            return (n - 1 - indices_reversed).astype(np.intp)
+        # elif self._policy == "hard_first" (default)
+        return indices[::-1].astype(np.intp)
+
     def _to_normalized_ndarray(self, embeddings: Embeddings, selection: list[int] | None = None) -> NDArray[Any]:
         emb: NDArray[Any] = embeddings.to_numpy(selection)
         emb /= max(np.max(np.linalg.norm(emb, axis=1)), EPSILON)
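
A worked, self-contained illustration of the round-robin behavior _select_ordered_by_label implements; the logic is transcribed from the hunk above, while the standalone function name and the sample labels are illustrative:

    import numpy as np

    def select_ordered_by_label(labels: np.ndarray) -> np.ndarray:
        selected = np.zeros(labels.shape[0], dtype=bool)
        _, index = np.unique(labels, return_index=True)
        u_lab = labels[np.sort(index)]  # classes in order of first appearance
        resort_inds = []
        cls_idx = 0
        while len(resort_inds) < labels.shape[0]:
            c0 = u_lab[cls_idx % len(u_lab)]
            available = (~selected) & (labels == c0)
            if available.any():
                i0 = int(np.argmax(available))  # first remaining sample of class c0
                resort_inds.append(i0)
                selected[i0] = True
            cls_idx += 1
        return np.array(resort_inds, dtype=np.intp)

    # labels sorted by decreasing priority; classes first appear in order 0, 1, 2
    print(select_ordered_by_label(np.array([0, 0, 1, 0, 1, 2])))
    # [0 2 5 1 4 3]: one sample per class per round, priority kept within each class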
@@ -290,4 +509,5 @@ class Prioritize(Selection[Any]):
         emb = self._to_normalized_ndarray(embeddings, dataset._selection)
         ref = None if self._reference is None else self._to_normalized_ndarray(self._reference)
         # Sort indices
-        dataset._selection = self._sorter._sort(emb, ref).tolist()
+        indices = self._sorter._sort(emb, ref)
+        dataset._selection = indices[self._rerank(indices)].astype(int).tolist()
dataeval/data/selections/_reverse.py
@@ -9,7 +9,7 @@ from dataeval.data._selection import Select, Selection, SelectionStage
 
 class Reverse(Selection[Any]):
     """
-    Reverse the selection order of the dataset.
+    Select dataset indices in reverse order.
     """
 
     stage = SelectionStage.ORDER
dataeval/data/selections/_shuffle.py
@@ -15,12 +15,12 @@ from dataeval.utils._array import as_numpy
 
 class Shuffle(Selection[Any]):
     """
-    Shuffle the dataset using a seed.
+    Select dataset indices in a random order.
 
     Parameters
     ----------
     seed : int, ArrayLike, SeedSequence, BitGenerator, Generator or None, default None
-        Seed for the random number generator.
+        Seed for the random number generator. If None, results are not reproducible.
 
     See Also
     --------
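
A one-line usage sketch, assuming the Select driver from dataeval.data applies selections as in the Prioritize example above (dataset is a placeholder):

    from dataeval.data import Select
    from dataeval.data.selections import Shuffle

    shuffled = Select(dataset, selections=[Shuffle(seed=42)])  # seeded, so reproducible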
dataeval/detectors/ood/__init__.py
@@ -2,7 +2,8 @@
 Out-of-distribution (OOD) detectors identify data that is different from the data used to train a particular model.
 """
 
-__all__ = ["OODOutput", "OODScoreOutput", "OOD_AE"]
+__all__ = ["OODOutput", "OODScoreOutput", "OOD_AE", "OOD_KNN"]
 
 from dataeval.detectors.ood.ae import OOD_AE
+from dataeval.detectors.ood.knn import OOD_KNN
 from dataeval.outputs._ood import OODOutput, OODScoreOutput
dataeval/detectors/ood/base.py
@@ -10,11 +10,15 @@ from __future__ import annotations
 
 __all__ = []
 
-from typing import Callable, cast
+from abc import ABC, abstractmethod
+from typing import Any, Callable, cast
 
+import numpy as np
 import torch
+from numpy.typing import NDArray
 
 from dataeval.config import DeviceLike, get_device
+from dataeval.data import Embeddings
 from dataeval.detectors.ood.mixin import OODBaseMixin, OODFitMixin, OODGMMMixin
 from dataeval.typing import ArrayLike
 from dataeval.utils._array import to_numpy
@@ -93,3 +97,36 @@ class OODBaseGMM(OODBase, OODGMMMixin[GaussianMixtureModelParams]):
         # Calculate the GMM parameters
         _, z, gamma = cast(tuple[torch.Tensor, torch.Tensor, torch.Tensor], self.model(x_ref))
         self._gmm_params = gmm_params(z, gamma)
+
+
+class EmbeddingBasedOODBase(OODBaseMixin[Callable[[Any], Any]], ABC):
+    """
+    Base class for embedding-based OOD detection methods.
+
+    These methods work directly on embedding representations,
+    using distance metrics or density estimation in embedding space.
+    Inherits from OODBaseMixin to get automatic thresholding.
+    """
+
+    def __init__(self) -> None:
+        """Initialize embedding-based OOD detector."""
+        # Pass a dummy callable as model since we don't use it
+        super().__init__(lambda x: x)
+
+    def _get_data_info(self, X: NDArray) -> tuple[tuple, type]:
+        """Override to skip [0-1] validation for embeddings."""
+        if not isinstance(X, np.ndarray):
+            raise TypeError("Dataset should be of type: `NDArray`.")
+        # Skip the [0-1] range check for embeddings
+        return X.shape[1:], X.dtype.type
+
+    @abstractmethod
+    def fit_embeddings(self, embeddings: Embeddings, threshold_perc: float = 95.0) -> None:
+        """
+        Fit using reference embeddings.
+
+        Args:
+            embeddings: Reference (in-distribution) embeddings
+            threshold_perc: Percentage of reference data considered normal
+        """
+        pass
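
A hypothetical minimal subclass, sketched only to show the contract the new base class defines: fit_embeddings builds whatever state _score needs and populates the reference scores used for automatic thresholding, mirroring the pattern OOD_KNN follows below. The OOD_Centroid name and its scoring rule are invented for illustration:

    import numpy as np

    from dataeval.data import Embeddings
    from dataeval.detectors.ood.base import EmbeddingBasedOODBase
    from dataeval.outputs._ood import OODScoreOutput

    class OOD_Centroid(EmbeddingBasedOODBase):
        """Toy detector: score = distance to the mean reference embedding."""

        def fit_embeddings(self, embeddings: Embeddings, threshold_perc: float = 95.0) -> None:
            ref = embeddings.to_numpy()
            self._centroid = ref.mean(axis=0)
            self._ref_score = OODScoreOutput(instance_score=self._score(ref).instance_score)
            self._threshold_perc = threshold_perc
            self._data_info = self._get_data_info(ref)

        def _score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScoreOutput:
            return OODScoreOutput(instance_score=np.linalg.norm(X - self._centroid, axis=1))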
dataeval/detectors/ood/knn.py (new file)
@@ -0,0 +1,95 @@
+from typing import Literal
+
+import numpy as np
+from sklearn.neighbors import NearestNeighbors
+
+from dataeval.data import Embeddings
+from dataeval.detectors.ood.base import EmbeddingBasedOODBase
+from dataeval.outputs._ood import OODScoreOutput
+from dataeval.typing import ArrayLike
+
+
+class OOD_KNN(EmbeddingBasedOODBase):
+    """
+    K-Nearest Neighbors Out-of-Distribution detector.
+
+    Uses average cosine distance to k nearest neighbors in embedding space to detect OOD samples.
+    Samples with larger average distances to their k nearest neighbors in the
+    reference (in-distribution) set are considered more likely to be OOD.
+
+    Based on the methodology from:
+    "Back to the Basics: Revisiting Out-of-Distribution Detection Baselines"
+    (Kuan & Mueller, 2022)
+
+    As referenced in:
+    "Safe AI for coral reefs: Benchmarking out-of-distribution detection
+    algorithms for coral reef image surveys"
+    """
+
+    def __init__(self, k: int = 10, distance_metric: Literal["cosine", "euclidean"] = "cosine") -> None:
+        """
+        Initialize KNN OOD detector.
+
+        Args:
+            k: Number of nearest neighbors to consider (default: 10)
+            distance_metric: Distance metric to use ('cosine' or 'euclidean')
+        """
+        super().__init__()
+        self.k = k
+        self.distance_metric = distance_metric
+        self._nn_model: NearestNeighbors
+        self.reference_embeddings: ArrayLike
+
+    def fit_embeddings(self, embeddings: Embeddings, threshold_perc: float = 95.0) -> None:
+        """
+        Fit the detector using reference (in-distribution) embeddings.
+
+        Builds a k-NN index for efficient nearest neighbor search and
+        computes reference scores for automatic thresholding.
+
+        Args:
+            embeddings: Reference embeddings from in-distribution data
+            threshold_perc: Percentage of reference data considered normal
+        """
+        self.reference_embeddings = embeddings.to_numpy()
+
+        if self.k >= len(self.reference_embeddings):
+            raise ValueError(
+                f"k ({self.k}) must be less than number of reference embeddings ({len(self.reference_embeddings)})"
+            )
+
+        # Build k-NN index using sklearn
+        self._nn_model = NearestNeighbors(
+            n_neighbors=self.k,
+            metric=self.distance_metric,
+            algorithm="auto",  # Let sklearn choose the best algorithm
+        )
+        self._nn_model.fit(self.reference_embeddings)
+
+        # Efficiently compute reference scores for automatic thresholding
+        ref_scores = self._compute_reference_scores()
+        self._ref_score = OODScoreOutput(instance_score=ref_scores)
+        self._threshold_perc = threshold_perc
+        self._data_info = self._get_data_info(self.reference_embeddings)
+
+    def _compute_reference_scores(self) -> np.ndarray:
+        """Efficiently compute reference scores by excluding self-matches."""
+        # Find k+1 neighbors (including self) for reference points
+        distances, _ = self._nn_model.kneighbors(self.reference_embeddings, n_neighbors=self.k + 1)
+        # Skip first neighbor (self with distance 0) and average the rest
+        return np.mean(distances[:, 1:], axis=1)
+
+    def _score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScoreOutput:
+        """
+        Compute OOD scores for input embeddings.
+
+        Args:
+            X: Input embeddings to score
+            batch_size: Batch size (not used, kept for interface compatibility)
+
+        Returns:
+            OODScoreOutput containing instance-level scores
+        """
+        # Compute OOD scores using sklearn's efficient k-NN search
+        distances, _ = self._nn_model.kneighbors(X)
+        return OODScoreOutput(instance_score=np.mean(distances, axis=1))
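
A usage sketch for the new detector. Hedges: the Embeddings constructor arguments and the score/predict calls are assumed to follow the pattern of the existing dataeval OOD detectors, and encoder, train_dataset, and test_dataset are placeholders:

    from dataeval.data import Embeddings
    from dataeval.detectors.ood import OOD_KNN

    ref_emb = Embeddings(train_dataset, batch_size=64, model=encoder)

    detector = OOD_KNN(k=10, distance_metric="cosine")
    detector.fit_embeddings(ref_emb, threshold_perc=95.0)

    test_emb = Embeddings(test_dataset, batch_size=64, model=encoder).to_numpy()
    scores = detector.score(test_emb)    # per-instance OOD scores
    preds = detector.predict(test_emb)   # thresholded OOD flags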