dataeval 0.86.9__py3-none-any.whl → 0.88.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. dataeval/__init__.py +1 -1
  2. dataeval/_log.py +1 -1
  3. dataeval/_version.py +2 -2
  4. dataeval/config.py +4 -19
  5. dataeval/data/_embeddings.py +78 -35
  6. dataeval/data/_images.py +41 -8
  7. dataeval/data/_metadata.py +348 -66
  8. dataeval/data/_selection.py +22 -7
  9. dataeval/data/_split.py +3 -2
  10. dataeval/data/selections/_classbalance.py +4 -3
  11. dataeval/data/selections/_classfilter.py +9 -8
  12. dataeval/data/selections/_indices.py +4 -3
  13. dataeval/data/selections/_prioritize.py +249 -29
  14. dataeval/data/selections/_reverse.py +1 -1
  15. dataeval/data/selections/_shuffle.py +5 -4
  16. dataeval/detectors/drift/_base.py +2 -1
  17. dataeval/detectors/drift/_mmd.py +2 -1
  18. dataeval/detectors/drift/_nml/_base.py +1 -1
  19. dataeval/detectors/drift/_nml/_chunk.py +2 -1
  20. dataeval/detectors/drift/_nml/_result.py +3 -2
  21. dataeval/detectors/drift/_nml/_thresholds.py +6 -5
  22. dataeval/detectors/drift/_uncertainty.py +2 -1
  23. dataeval/detectors/linters/duplicates.py +2 -1
  24. dataeval/detectors/linters/outliers.py +4 -3
  25. dataeval/detectors/ood/__init__.py +2 -1
  26. dataeval/detectors/ood/ae.py +1 -1
  27. dataeval/detectors/ood/base.py +39 -1
  28. dataeval/detectors/ood/knn.py +95 -0
  29. dataeval/detectors/ood/mixin.py +2 -1
  30. dataeval/metadata/_utils.py +1 -1
  31. dataeval/metrics/bias/_balance.py +29 -22
  32. dataeval/metrics/bias/_diversity.py +4 -4
  33. dataeval/metrics/bias/_parity.py +2 -2
  34. dataeval/metrics/stats/_base.py +3 -29
  35. dataeval/metrics/stats/_boxratiostats.py +2 -1
  36. dataeval/metrics/stats/_dimensionstats.py +2 -1
  37. dataeval/metrics/stats/_hashstats.py +21 -3
  38. dataeval/metrics/stats/_pixelstats.py +2 -1
  39. dataeval/metrics/stats/_visualstats.py +2 -1
  40. dataeval/outputs/_base.py +2 -3
  41. dataeval/outputs/_bias.py +2 -1
  42. dataeval/outputs/_estimators.py +1 -1
  43. dataeval/outputs/_linters.py +3 -3
  44. dataeval/outputs/_stats.py +3 -3
  45. dataeval/outputs/_utils.py +1 -1
  46. dataeval/outputs/_workflows.py +49 -31
  47. dataeval/typing.py +23 -9
  48. dataeval/utils/__init__.py +2 -2
  49. dataeval/utils/_array.py +3 -2
  50. dataeval/utils/_bin.py +9 -7
  51. dataeval/utils/_method.py +2 -3
  52. dataeval/utils/_multiprocessing.py +34 -0
  53. dataeval/utils/_plot.py +2 -1
  54. dataeval/utils/data/__init__.py +6 -5
  55. dataeval/utils/data/{metadata.py → _merge.py} +3 -2
  56. dataeval/utils/data/_validate.py +170 -0
  57. dataeval/utils/data/collate.py +2 -1
  58. dataeval/utils/torch/_internal.py +2 -1
  59. dataeval/utils/torch/trainer.py +1 -1
  60. dataeval/workflows/sufficiency.py +13 -9
  61. {dataeval-0.86.9.dist-info → dataeval-0.88.0.dist-info}/METADATA +8 -21
  62. dataeval-0.88.0.dist-info/RECORD +105 -0
  63. dataeval/utils/data/_dataset.py +0 -246
  64. dataeval/utils/datasets/__init__.py +0 -21
  65. dataeval/utils/datasets/_antiuav.py +0 -189
  66. dataeval/utils/datasets/_base.py +0 -266
  67. dataeval/utils/datasets/_cifar10.py +0 -201
  68. dataeval/utils/datasets/_fileio.py +0 -142
  69. dataeval/utils/datasets/_milco.py +0 -197
  70. dataeval/utils/datasets/_mixin.py +0 -54
  71. dataeval/utils/datasets/_mnist.py +0 -202
  72. dataeval/utils/datasets/_seadrone.py +0 -512
  73. dataeval/utils/datasets/_ships.py +0 -144
  74. dataeval/utils/datasets/_types.py +0 -48
  75. dataeval/utils/datasets/_voc.py +0 -583
  76. dataeval-0.86.9.dist-info/RECORD +0 -115
  77. {dataeval-0.86.9.dist-info → dataeval-0.88.0.dist-info}/WHEEL +0 -0
  78. /dataeval-0.86.9.dist-info/licenses/LICENSE.txt → /dataeval-0.88.0.dist-info/licenses/LICENSE +0 -0
dataeval/data/_selection.py CHANGED
@@ -2,8 +2,9 @@ from __future__ import annotations
 
 __all__ = []
 
+from collections.abc import Iterator, Sequence
 from enum import IntEnum
-from typing import Generic, Iterator, Sequence, TypeVar
+from typing import Generic, TypeVar
 
 from dataeval.typing import AnnotatedDataset, DatasetMetadata
 
@@ -31,14 +32,21 @@ class Subselection(Generic[_TDatum]):
 
 class Select(AnnotatedDataset[_TDatum]):
     """
-    Wraps a dataset and applies selection criteria to it.
+    Dataset wrapper that applies selection criteria for filtering.
+
+    Wraps an existing dataset and applies one or more selection filters to
+    create a subset view without modifying the original dataset. Supports
+    chaining multiple selection criteria for complex filtering operations.
 
     Parameters
    ----------
-    dataset : Dataset
-        The dataset to wrap.
-    selections : Selection or list[Selection], optional
-        The selection criteria to apply to the dataset.
+    dataset : AnnotatedDataset[_TDatum]
+        Source dataset to wrap and filter. Must implement the AnnotatedDataset
+        interface with indexed access to data tuples.
+    selections : Selection or Sequence[Selection] or None, default None
+        Selection criteria to apply for filtering the dataset. When None,
+        returns all items from the source dataset. Default None creates an
+        unfiltered view for a consistent interface.
 
     Examples
     --------
@@ -49,7 +57,7 @@ class Select(AnnotatedDataset[_TDatum]):
     >>> # - f"data_{idx}", one_hot_encoded(idx % class_count), {"id": idx}
     >>> dataset = SampleDataset(size=100, class_count=10)
 
-    >>> # Apply a selection criteria to the dataset
+    >>> # Apply selection criteria to the dataset
     >>> selections = [Limit(size=5), ClassFilter(classes=[0, 2])]
     >>> selected_dataset = Select(dataset, selections=selections)
 
@@ -61,6 +69,12 @@ class Select(AnnotatedDataset[_TDatum]):
     (data_10, 0, {'id': 10})
     (data_12, 2, {'id': 12})
     (data_20, 0, {'id': 20})
+
+    Notes
+    -----
+    Selection criteria are applied in the order provided, allowing for
+    efficient sequential filtering. The wrapper maintains all metadata
+    and interface compatibility with the original dataset.
     """
 
     _dataset: AnnotatedDataset[_TDatum]
@@ -91,6 +105,7 @@ class Select(AnnotatedDataset[_TDatum]):
 
     @property
     def metadata(self) -> DatasetMetadata:
+        """Dataset metadata information including identifier and configuration."""
         return self._metadata
 
     def __str__(self) -> str:
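A minimal sketch of the selection flow described in the updated docstring, assuming the public exports Select, Limit, and ClassFilter (SampleDataset is the hypothetical dataset from the docstring example):

    from dataeval.data import Select
    from dataeval.data.selections import ClassFilter, Limit

    # SampleDataset stands in for any AnnotatedDataset implementation.
    dataset = SampleDataset(size=100, class_count=10)

    # Produce a view of at most 5 items drawn from classes 0 and 2,
    # mirroring the docstring example; the original dataset is unmodified.
    selected = Select(dataset, selections=[Limit(size=5), ClassFilter(classes=[0, 2])])

    for datum, target, metadata in selected:
        print(datum, target, metadata)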
dataeval/data/_split.py CHANGED
@@ -4,7 +4,8 @@ __all__ = []
 
 import logging
 import warnings
-from typing import Any, Iterator, Protocol, Sequence
+from collections.abc import Iterator, Sequence
+from typing import Any, Protocol
 
 import numpy as np
 from numpy.typing import NDArray
@@ -208,7 +209,7 @@ def get_groups(metadata: Metadata, split_on: Sequence[str] | None) -> NDArray[np
 
     split_set = set(split_on)
     indices = [i for i, name in enumerate(metadata.factor_names) if name in split_set]
-    binned_features = metadata.discretized_data[:, indices]
+    binned_features = metadata.binned_data[:, indices]
     return np.unique(binned_features, axis=0, return_inverse=True)[1]
 
 
dataeval/data/selections/_classbalance.py CHANGED
@@ -11,12 +11,13 @@ from dataeval.utils._array import as_numpy
 
 class ClassBalance(Selection[ImageClassificationDatum]):
     """
-    Balance the dataset by class.
+    Select indices of a dataset that will equalize the occurrences of all classes.
 
     Note
     ----
-    The total number of instances of each class will be equalized which may result
+    1. The total number of instances of each class will be equalized which may result
    in a lower total number of instances than specified by the selection limit.
+    2. This selection currently only supports classification tasks
    """
 
    stage = SelectionStage.FILTER
@@ -29,7 +30,7 @@ class ClassBalance(Selection[ImageClassificationDatum]):
                label = int(np.argmax(as_numpy(target)))
            else:
                # ObjectDetectionTarget and SegmentationTarget not supported yet
-                raise TypeError("ClassFilter only supports classification targets as an array of confidence scores.")
+                raise TypeError("ClassBalance only supports classification targets as an array of class probabilities.")
            class_indices.setdefault(label, []).append(i)
 
        per_class_limit = min(min(len(c) for c in class_indices.values()), dataset._size_limit // len(class_indices))
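The per_class_limit expression above takes the smaller of the rarest class count and an even share of the selection limit, which is why the new Note warns that the result can fall below the limit. A worked example with assumed tallies:

    # Hypothetical tallies: class label -> collected indices.
    class_indices = {0: list(range(40)), 1: list(range(25)), 2: list(range(60))}
    size_limit = 90  # stands in for dataset._size_limit

    per_class_limit = min(min(len(c) for c in class_indices.values()), size_limit // len(class_indices))
    print(per_class_limit)  # 25: the rarest class beats 90 // 3 = 30,
                            # so only 75 of the allowed 90 instances are kept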
dataeval/data/selections/_classfilter.py CHANGED
@@ -2,7 +2,8 @@ from __future__ import annotations
 
 __all__ = []
 
-from typing import Any, Generic, Iterable, Mapping, Sequence, Sized, TypeVar, cast
+from collections.abc import Iterable, Mapping, Sequence, Sized
+from typing import Any, Generic, TypeVar, cast
 
 import numpy as np
 from numpy.typing import NDArray
@@ -14,12 +15,12 @@ from dataeval.utils._array import as_numpy
 
 class ClassFilter(Selection[Any]):
     """
-    Filter the dataset by class.
+    Select dataset indices based on class labels, keeping only those present in `classes`.
 
     Parameters
     ----------
     classes : Sequence[int]
-        The classes to filter by.
+        The sequence of classes to keep.
     filter_detections : bool, default True
         Whether to filter detections from targets for object detection and segmentation datasets.
     """
@@ -41,16 +42,16 @@ class ClassFilter(Selection[Any]):
             if isinstance(target, Array):
                 # Get the label for the image
                 label = int(np.argmax(as_numpy(target)))
-                # Check to see if the label is in the classes to filter for
+                # Check to see if the label is in the classes to keep
                 if label in self.classes:
-                    # Include the image
+                    # Include the image index
                     selection.append(idx)
-            elif isinstance(target, (ObjectDetectionTarget, SegmentationTarget)):
+            elif isinstance(target, ObjectDetectionTarget | SegmentationTarget):
                 # Get the set of labels from the target
                 labels = set(target.labels if isinstance(target.labels, Iterable) else [target.labels])
                 # Check to see if any labels are in the classes to filter for
                 if labels.intersection(self.classes):
-                    # Include the image
+                    # Include the image index
                     selection.append(idx)
                 # If we are filtering out other labels and there are other labels, add a subselection filter
                 if self.filter_detections and labels.difference(self.classes):
@@ -68,7 +69,7 @@ _TTarget = TypeVar("_TTarget", ObjectDetectionTarget, SegmentationTarget)
 
 
 def _try_mask_object(obj: _T, mask: NDArray[np.bool_]) -> _T:
-    if not isinstance(obj, (str, bytes, bytearray)) and isinstance(obj, (Sequence, Array)) and len(obj) == len(mask):
+    if not isinstance(obj, str | bytes | bytearray) and isinstance(obj, Sequence | Array) and len(obj) == len(mask):
        return obj[mask] if isinstance(obj, Array) else cast(_T, [item for i, item in enumerate(obj) if mask[i]])
    return obj
 
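A hedged usage sketch of the keep-by-class behavior documented above; detection_dataset is a placeholder for any object detection dataset:

    from dataeval.data import Select
    from dataeval.data.selections import ClassFilter

    # Keep only images containing class 1 or 3. With filter_detections=True
    # (the default), detections of other classes are also removed from each
    # kept image's target via the subselection filter shown above.
    selected = Select(detection_dataset, selections=[ClassFilter(classes=[1, 3], filter_detections=True)])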
dataeval/data/selections/_indices.py CHANGED
@@ -2,19 +2,20 @@ from __future__ import annotations
 
 __all__ = []
 
-from typing import Any, Sequence
+from collections.abc import Sequence
+from typing import Any
 
 from dataeval.data._selection import Select, Selection, SelectionStage
 
 
 class Indices(Selection[Any]):
     """
-    Selects specific indices from the dataset.
+    Selects only the given indices from the dataset.
 
     Parameters
     ----------
     indices : Sequence[int]
-        The indices to select from the dataset.
+        The specific indices to select.
     """
 
     stage = SelectionStage.FILTER
dataeval/data/selections/_prioritize.py CHANGED
@@ -32,8 +32,8 @@ class _Clusters:
         self.cluster_centers = cluster_centers
         self.unique_labels = np.unique(labels)
 
-    def _dist2center(self, X: NDArray[np.float64]) -> NDArray[np.float64]:
-        dist = np.zeros(self.labels.shape)
+    def _dist2center(self, X: NDArray[np.floating[Any]]) -> NDArray[np.float32]:
+        dist = np.zeros(self.labels.shape, dtype=np.float32)
         for lab in self.unique_labels:
             dist[self.labels == lab] = np.linalg.norm(X[self.labels == lab, :] - self.cluster_centers[lab, :], axis=1)
         return dist
@@ -75,6 +75,8 @@
 
 
 class _Sorter(ABC):
+    scores: NDArray[np.float32] | None = None
+
     @abstractmethod
     def _sort(self, embeddings: NDArray[Any], reference: NDArray[Any] | None = None) -> NDArray[np.intp]: ...
 
@@ -95,11 +97,12 @@ class _KNNSorter(_Sorter):
 
     def _sort(self, embeddings: NDArray[Any], reference: NDArray[Any] | None = None) -> NDArray[np.intp]:
         if reference is None:
-            dists = pairwise_distances(embeddings, embeddings)
+            dists = pairwise_distances(embeddings, embeddings).astype(np.float32)
             np.fill_diagonal(dists, np.inf)
         else:
-            dists = pairwise_distances(embeddings, reference)
-        return np.argsort(np.sort(dists, axis=1)[:, self._k])
+            dists = pairwise_distances(embeddings, reference).astype(np.float32)
+        self.scores = np.sort(dists, axis=1)[:, self._k]
+        return np.argsort(self.scores)
 
 
 class _KMeansSorter(_Sorter):
@@ -123,7 +126,8 @@ class _KMeansSorter(_Sorter):
 class _KMeansDistanceSorter(_KMeansSorter):
     def _sort(self, embeddings: NDArray[Any], reference: NDArray[Any] | None = None) -> NDArray[np.intp]:
         clst = self._get_clusters(embeddings if reference is None else reference)
-        return np.argsort(clst._dist2center(embeddings))
+        self.scores = clst._dist2center(embeddings)
+        return np.argsort(self.scores)
 
 
 class _KMeansComplexitySorter(_KMeansSorter):
@@ -134,11 +138,11 @@ class _KMeansComplexitySorter(_KMeansSorter):
 
 class Prioritize(Selection[Any]):
     """
-    Prioritizes the dataset by sort order in the embedding space.
+    Sort the dataset indices in order of highest priority data in the embedding space.
 
     Parameters
     ----------
-    model : torch.nn.Module
+    model : torch.nn.Module | None
         Model to use for encoding images
     batch_size : int
         Batch size to use when encoding images
@@ -146,10 +150,23 @@
         Device to use for encoding images
     method : Literal["knn", "kmeans_distance", "kmeans_complexity"]
         Method to use for prioritization
-    k : int | None, default None
-        Number of nearest neighbors to use for prioritization (knn only)
-    c : int | None, default None
-        Number of clusters to use for prioritization (kmeans only)
+    k : int or None, default None
+        Number of nearest neighbors to use for prioritization.
+        If None, uses the square root of the number of samples. Only used for method="knn", ignored otherwise.
+    c : int or None, default None
+        Number of clusters to use for prioritization. If None, uses the square root of the number of samples.
+        Only used for method="kmeans_*", ignored otherwise.
+
+    Notes
+    -----
+    1. `k` is only used for method ["knn"].
+    2. `c` is only used for methods ["kmeans_distance", "kmeans_complexity"].
+
+    Raises
+    ------
+    ValueError
+        If method not in supported methods
+
     """
 
     stage = SelectionStage.ORDER
@@ -157,55 +174,95 @@
     @overload
     def __init__(
         self,
-        model: torch.nn.Module,
+        model: torch.nn.Module | None,
         batch_size: int,
         device: DeviceLike | None,
         method: Literal["knn"],
+        policy: Literal["hard_first", "easy_first", "stratified", "class_balance"],
         *,
         k: int | None = None,
+        class_label: NDArray[np.integer[Any]] | None = None,
     ) -> None: ...
 
     @overload
     def __init__(
         self,
-        model: torch.nn.Module,
+        model: torch.nn.Module | None,
         batch_size: int,
         device: DeviceLike | None,
         method: Literal["kmeans_distance", "kmeans_complexity"],
+        policy: Literal["hard_first", "easy_first", "stratified", "class_balance"],
+        *,
+        c: int | None = None,
+        class_label: NDArray[np.integer[Any]] | None = None,
+    ) -> None: ...
+
+    @overload
+    def __init__(
+        self,
+        model: torch.nn.Module | None,
+        batch_size: int,
+        device: DeviceLike | None,
+        method: Literal["knn", "kmeans_distance", "kmeans_complexity"],
+        policy: Literal["class_balance"],
+        *,
+        k: int | None = None,
+        c: int | None = None,
+        class_label: NDArray[np.integer[Any]] | None,
+    ) -> None: ...
+
+    @overload
+    def __init__(
+        self,
+        model: torch.nn.Module | None,
+        batch_size: int,
+        device: DeviceLike | None,
+        method: Literal["knn", "kmeans_distance", "kmeans_complexity"],
+        policy: Literal["hard_first", "easy_first", "stratified"],
         *,
+        k: int | None = None,
         c: int | None = None,
+        class_label: NDArray[np.integer[Any]] | None = None,
     ) -> None: ...
 
     def __init__(
         self,
-        model: torch.nn.Module,
+        model: torch.nn.Module | None,
         batch_size: int,
         device: DeviceLike | None,
         method: Literal["knn", "kmeans_distance", "kmeans_complexity"],
+        policy: Literal["hard_first", "easy_first", "stratified", "class_balance"],
         *,
         k: int | None = None,
         c: int | None = None,
+        class_label: NDArray[np.integer[Any]] | None = None,
     ) -> None:
-        if method not in ("knn", "kmeans_distance", "kmeans_complexity"):
+        if method not in {"knn", "kmeans_distance", "kmeans_complexity"}:
             raise ValueError(f"Invalid prioritization method: {method}")
+        if policy not in ("hard_first", "easy_first", "stratified", "class_balance"):
+            raise ValueError(f"Invalid selection policy: {policy}")
         self._model = model
         self._batch_size = batch_size
         self._device = device
         self._method = method
+        self._policy = policy
         self._embeddings: Embeddings | None = None
         self._reference: Embeddings | None = None
         self._k = k
         self._c = c
+        self.class_label = class_label
 
     @overload
     @classmethod
     def using(
         cls,
         method: Literal["knn"],
+        policy: Literal["hard_first", "easy_first", "stratified", "class_balance"],
         *,
         k: int | None = None,
         embeddings: Embeddings | None = None,
         reference: Embeddings | None = None,
+        class_label: NDArray[np.integer[Any]] | None = None,
     ) -> Prioritize: ...
 
     @overload
@@ -213,49 +270,72 @@
     def using(
         cls,
         method: Literal["kmeans_distance", "kmeans_complexity"],
+        policy: Literal["hard_first", "easy_first", "stratified", "class_balance"],
         *,
         c: int | None = None,
         embeddings: Embeddings | None = None,
         reference: Embeddings | None = None,
+        class_label: NDArray[np.integer[Any]] | None = None,
    ) -> Prioritize: ...
 
     @classmethod
     def using(
         cls,
         method: Literal["knn", "kmeans_distance", "kmeans_complexity"],
+        policy: Literal["hard_first", "easy_first", "stratified", "class_balance"],
         *,
         k: int | None = None,
         c: int | None = None,
         embeddings: Embeddings | None = None,
         reference: Embeddings | None = None,
+        class_label: NDArray[np.integer[Any]] | None = None,
     ) -> Prioritize:
         """
-        Prioritizes the dataset by sort order in the embedding space using existing
-        embeddings and/or reference dataset embeddings.
+        Use precalculated embeddings to sort the dataset indices in order of
+        highest priority data in the embedding space.
 
         Parameters
         ----------
         method : Literal["knn", "kmeans_distance", "kmeans_complexity"]
-            Method to use for prioritization
+            Method to use for sample scoring during prioritization.
+        policy : Literal["hard_first", "easy_first", "stratified", "class_balance"]
+            Selection policy for prioritizing scored samples.
         embeddings : Embeddings or None, default None
-            Embeddings to use for prioritization
+            Embeddings to use during prioritization. If None, `reference` must be set.
         reference : Embeddings or None, default None
-            Reference embeddings to prioritize relative to
+            Reference embeddings used to prioritize the calculated dataset embeddings relative to them.
+            If `embeddings` is None, this will be used instead.
         k : int or None, default None
-            Number of nearest neighbors to use for prioritization (knn only)
+            Number of nearest neighbors to use for prioritization.
+            If None, uses the square root of the number of samples. Only used for method="knn", ignored otherwise.
         c : int or None, default None
-            Number of clusters to use for prioritization (kmeans, cluster only)
+            Number of clusters to use for prioritization. If None, uses the square root of the number of samples.
+            Only used for method="kmeans_*", ignored otherwise.
 
         Notes
         -----
-        At least one of `embeddings` or `reference` must be provided.
+        1. `k` is only used for method ["knn"].
+        2. `c` is only used for methods ["kmeans_distance", "kmeans_complexity"].
+
+        Raises
+        ------
+        ValueError
+            If both `embeddings` and `reference` are None
+
         """
         emb_params: Embeddings | None = embeddings if embeddings is not None else reference
         if emb_params is None:
             raise ValueError("Must provide at least embeddings or reference embeddings.")
-        prioritize = Prioritize(emb_params._model, emb_params.batch_size, emb_params.device, method)
-        prioritize._k = k
-        prioritize._c = c
+        prioritize = Prioritize(
+            emb_params._model,
+            emb_params.batch_size,
+            emb_params.device,
+            method,
+            policy,
+            k=k,
+            c=c,
+            class_label=class_label,
+        )
         prioritize._embeddings = embeddings
         prioritize._reference = reference
         return prioritize
@@ -265,9 +345,148 @@
             return _KNNSorter(samples, self._k)
         if self._method == "kmeans_distance":
             return _KMeansDistanceSorter(samples, self._c)
-        # self._method == "kmeans_complexity"
         return _KMeansComplexitySorter(samples, self._c)
 
+    def _compute_bin_extents(self, scores: NDArray[np.floating[Any]]) -> tuple[np.float64, np.float64]:
+        """
+        Compute min/max bin extents for `scores`, padding outward by epsilon.
+
+        Parameters
+        ----------
+        scores : NDArray[np.float64]
+            Array of floats to bin
+
+        Returns
+        -------
+        tuple[np.float64, np.float64]
+            (min, max) scores padded outward by epsilon = 1e-6 * range(scores).
+        """
+        # ensure binning captures all samples in range
+        scores = scores.astype(np.float64)
+        min_score = np.min(scores)
+        max_score = np.max(scores)
+        rng = max_score - min_score
+        eps = rng * 1e-6
+        return min_score - eps, max_score + eps
+
+    def _select_ordered_by_label(self, labels: NDArray[np.integer[Any]]) -> NDArray[np.intp]:
+        """
+        Given labels (class, group, bin, etc.) sorted with decreasing priority,
+        rerank so that we have approximate class/group balance. This function
+        is used for both stratified and class-balance rerank methods.
+
+        We could require and return prioritization scores and re-sorted class
+        labels, but it is more compact to return indices. This allows us to
+        resort other quantities, as well, outside the function.
+
+        Parameters
+        ----------
+        labels : NDArray[np.integer[Any]]
+            Class label or group ID per instance in order of decreasing priority
+
+        Returns
+        -------
+        NDArray[np.intp]
+            Indices that sort samples according to uniform class balance or
+            group membership while respecting priority of the initial ordering.
+        """
+        labels = np.array(labels)
+        num_samp = labels.shape[0]
+        selected = np.zeros(num_samp, dtype=bool)
+        # preserve ordering
+        _, index = np.unique(labels, return_index=True)
+        u_lab = labels[np.sort(index)]
+        n_cls = len(u_lab)
+
+        resort_inds = []
+        cls_idx = 0
+        n = 0
+        while len(resort_inds) < num_samp:
+            c0 = u_lab[cls_idx % n_cls]
+            samples_available = (~selected) * (labels == c0)
+            if any(samples_available):
+                i0 = np.argmax(samples_available)  # selects first occurrence
+                resort_inds.append(i0)
+                selected[i0] = True
+            cls_idx += 1
+            n += 1
+        return np.array(resort_inds).astype(np.intp)
+
+    def _stratified_rerank(
+        self,
+        scores: NDArray[np.floating[Any]],
+        indices: NDArray[np.integer[Any]],
+        num_bins: int = 50,
+    ) -> NDArray[np.intp]:
+        """
+        Re-rank samples by sampling uniformly over binned scores. This
+        de-weights selection of samples with similar scores and encourages both
+        prototypical and challenging samples near the decision boundary.
+
+        Parameters
+        ----------
+        scores : NDArray[float]
+            Prioritization scores sorted in order of decreasing priority
+        indices : NDArray[int]
+            Indices to be re-sorted according to stratified sampling of scores.
+            Indices are ordered by decreasing priority.
+        num_bins : int
+            Number of bins over which to stratify the scores.
+
+        Returns
+        -------
+        NDArray[int]
+            Re-ranked indices
+        """
+        mn, mx = self._compute_bin_extents(scores)
+        bin_edges = np.linspace(mn, mx, num=num_bins + 1, endpoint=True)
+        bin_label = np.digitize(scores, bin_edges)
+        srt_inds = self._select_ordered_by_label(bin_label)
+        return indices[srt_inds].astype(np.intp)
+
+    def _rerank(
+        self,
+        indices: NDArray[np.integer[Any]],
+    ) -> NDArray[np.intp]:
+        """
+        Re-rank samples according to the re-rank policy, self._policy. Values
+        from the 'indices' and optional 'scores' and 'class_label' variables are
+        assumed to correspond by index, i.e. indices[i], scores[i], and
+        class_label[i] should all refer to the same instance in the dataset.
+
+        Note: indices are assumed to be sorted with easy/prototypical samples
+        first, which is increasing order for most prioritization scoring methods.
+
+        Parameters
+        ----------
+        indices : NDArray[np.intp]
+            Indices that sort samples by increasing prioritization score, where
+            low scores indicate high prototypicality ('easy') and high scores
+            indicate challenging samples near the decision boundary ('hard').
+        """
+
+        if self._policy == "easy_first":
+            return indices.astype(np.intp)
+        if self._policy == "stratified":
+            if self._sorter.scores is None:
+                raise ValueError(
+                    "Prioritization scores are necessary in order to use "
+                    "stratified re-rank. Use 'knn' or 'kmeans_distance' "
+                    "methods to populate scores."
+                )
+            return self._stratified_rerank(self._sorter.scores[::-1], indices[::-1])
+        if self._policy == "class_balance":
+            if self.class_label is None:
+                raise ValueError("Class labels are necessary in order to use class_balance re-rank")
+            indices_reversed = self._select_ordered_by_label(self.class_label[indices[::-1]]).astype(np.int32)
+            n = len(indices_reversed)
+            return (n - 1 - indices_reversed).astype(np.intp)
+        # self._policy == "hard_first" (default)
+        return indices[::-1].astype(np.intp)
+
     def _to_normalized_ndarray(self, embeddings: Embeddings, selection: list[int] | None = None) -> NDArray[Any]:
         emb: NDArray[Any] = embeddings.to_numpy(selection)
         emb /= max(np.max(np.linalg.norm(emb, axis=1)), EPSILON)
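The round-robin behavior of _select_ordered_by_label is easiest to see on a toy input. A standalone re-implementation sketch (not the package API) that interleaves labels while preserving each label's internal priority order:

    import numpy as np

    # Labels arrive sorted by decreasing priority; cycle through the labels,
    # taking each label's highest-priority remaining sample in turn.
    labels = np.array([0, 0, 1, 0, 2, 1])
    selected = np.zeros(len(labels), dtype=bool)
    u_lab = labels[np.sort(np.unique(labels, return_index=True)[1])]  # [0, 1, 2]

    order = []
    cls_idx = 0
    while len(order) < len(labels):
        available = (~selected) & (labels == u_lab[cls_idx % len(u_lab)])
        if available.any():
            i0 = int(np.argmax(available))  # first remaining sample of this label
            order.append(i0)
            selected[i0] = True
        cls_idx += 1

    print(order)  # [0, 2, 4, 1, 5, 3] -> labels interleave as 0, 1, 2, 0, 1, 0

The stratified policy reuses the same routine with np.digitize bin IDs in place of class labels.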
@@ -290,4 +509,5 @@
         emb = self._to_normalized_ndarray(embeddings, dataset._selection)
         ref = None if self._reference is None else self._to_normalized_ndarray(self._reference)
         # Sort indices
-        dataset._selection = self._sorter._sort(emb, ref).tolist()
+        indices = self._sorter._sort(emb, ref)
+        dataset._selection = indices[self._rerank(indices)].astype(int).tolist()
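Putting the new policy argument and rerank stage together, a hedged usage sketch (embeddings and dataset are placeholders for a precomputed dataeval Embeddings object and its source dataset):

    from dataeval.data import Select
    from dataeval.data.selections import Prioritize

    # Score with k-nearest-neighbor distances, then place hard samples
    # (those near the decision boundary) first.
    hard_first = Prioritize.using("knn", "hard_first", k=16, embeddings=embeddings)

    # The stratified policy samples uniformly over binned scores and needs a
    # score-producing method ("knn" or "kmeans_distance").
    stratified = Prioritize.using("kmeans_distance", "stratified", c=32, embeddings=embeddings)

    prioritized = Select(dataset, selections=[hard_first])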
dataeval/data/selections/_reverse.py CHANGED
@@ -9,7 +9,7 @@ from dataeval.data._selection import Select, Selection, SelectionStage
 
 class Reverse(Selection[Any]):
     """
-    Reverse the selection order of the dataset.
+    Select dataset indices in reverse order.
     """
 
     stage = SelectionStage.ORDER
dataeval/data/selections/_shuffle.py CHANGED
@@ -2,7 +2,8 @@ from __future__ import annotations
 
 __all__ = []
 
-from typing import Any, Sequence
+from collections.abc import Sequence
+from typing import Any
 
 import numpy as np
 from numpy.random import BitGenerator, Generator, SeedSequence
@@ -15,12 +16,12 @@ from dataeval.utils._array import as_numpy
 
 class Shuffle(Selection[Any]):
     """
-    Shuffle the dataset using a seed.
+    Select dataset indices in a random order.
 
     Parameters
     ----------
     seed : int, ArrayLike, SeedSequence, BitGenerator, Generator or None, default None
-        Seed for the random number generator.
+        Seed for the random number generator. If None, results are not reproducible.
 
     See Also
     --------
@@ -33,7 +34,7 @@ class Shuffle(Selection[Any]):
     def __init__(
         self, seed: int | Sequence[int] | Array | SeedSequence | BitGenerator | Generator | None = None
     ) -> None:
-        self.seed = as_numpy(seed) if isinstance(seed, (Sequence, Array)) else seed
+        self.seed = as_numpy(seed) if isinstance(seed, Sequence | Array) else seed
 
     def __call__(self, dataset: Select[Any]) -> None:
         rng = np.random.default_rng(self.seed)
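The updated seed documentation mirrors np.random.default_rng semantics; a brief sketch of the reproducibility trade-off it calls out (dataset is a placeholder):

    from dataeval.data import Select
    from dataeval.data.selections import Shuffle

    shuffled = Select(dataset, selections=[Shuffle(seed=42)])  # same order every run
    unseeded = Select(dataset, selections=[Shuffle()])         # order varies run to run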
dataeval/detectors/drift/_base.py CHANGED
@@ -12,8 +12,9 @@ __all__ = []
 
 import math
 from abc import abstractmethod
+from collections.abc import Callable
 from functools import wraps
-from typing import Any, Callable, Literal, Protocol, TypeVar, runtime_checkable
+from typing import Any, Literal, Protocol, TypeVar, runtime_checkable
 
 import numpy as np
 from numpy.typing import NDArray