PyPI - dataeval - Versions diffs - 0.84.1__py3-none-any.whl → 0.86.0__py3-none-any.whl - Mend

dataeval 0.84.1__py3-none-any.whl → 0.86.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64)

dataeval/__init__.py +1 -1
dataeval/data/__init__.py +19 -0
dataeval/{utils/data → data}/_embeddings.py +137 -17
dataeval/{utils/data → data}/_metadata.py +20 -8
dataeval/{utils/data → data}/_selection.py +22 -9
dataeval/{utils/data → data}/_split.py +1 -1
dataeval/data/selections/__init__.py +19 -0
dataeval/{utils/data → data}/selections/_classbalance.py +1 -2
dataeval/data/selections/_classfilter.py +110 -0
dataeval/{utils/data → data}/selections/_indices.py +1 -1
dataeval/{utils/data → data}/selections/_limit.py +1 -1
dataeval/{utils/data → data}/selections/_prioritize.py +2 -2
dataeval/{utils/data → data}/selections/_reverse.py +1 -1
dataeval/{utils/data → data}/selections/_shuffle.py +1 -1
dataeval/detectors/drift/__init__.py +4 -1
dataeval/detectors/drift/_base.py +1 -1
dataeval/detectors/drift/_cvm.py +2 -2
dataeval/detectors/drift/_ks.py +2 -2
dataeval/detectors/drift/_mmd.py +2 -2
dataeval/detectors/drift/_mvdc.py +92 -0
dataeval/detectors/drift/_nml/__init__.py +6 -0
dataeval/detectors/drift/_nml/_base.py +68 -0
dataeval/detectors/drift/_nml/_chunk.py +404 -0
dataeval/detectors/drift/_nml/_domainclassifier.py +192 -0
dataeval/detectors/drift/_nml/_result.py +98 -0
dataeval/detectors/drift/_nml/_thresholds.py +280 -0
dataeval/detectors/linters/duplicates.py +1 -1
dataeval/detectors/linters/outliers.py +1 -1
dataeval/metadata/_distance.py +1 -1
dataeval/metadata/_ood.py +4 -4
dataeval/metrics/bias/_balance.py +1 -1
dataeval/metrics/bias/_diversity.py +1 -1
dataeval/metrics/bias/_parity.py +1 -1
dataeval/metrics/stats/_labelstats.py +2 -2
dataeval/outputs/__init__.py +2 -1
dataeval/outputs/_bias.py +2 -4
dataeval/outputs/_drift.py +68 -0
dataeval/outputs/_linters.py +1 -6
dataeval/outputs/_stats.py +1 -6
dataeval/typing.py +31 -0
dataeval/utils/__init__.py +2 -2
dataeval/utils/data/__init__.py +5 -20
dataeval/utils/data/collate.py +2 -0
dataeval/utils/datasets/__init__.py +17 -0
dataeval/utils/{data/datasets → datasets}/_base.py +3 -3
dataeval/utils/{data/datasets → datasets}/_cifar10.py +2 -2
dataeval/utils/{data/datasets → datasets}/_milco.py +2 -2
dataeval/utils/{data/datasets → datasets}/_mnist.py +2 -2
dataeval/utils/{data/datasets → datasets}/_ships.py +2 -2
dataeval/utils/{data/datasets → datasets}/_voc.py +3 -3
{dataeval-0.84.1.dist-info → dataeval-0.86.0.dist-info}/METADATA +3 -2
dataeval-0.86.0.dist-info/RECORD +114 -0
dataeval/utils/data/datasets/__init__.py +0 -17
dataeval/utils/data/selections/__init__.py +0 -19
dataeval/utils/data/selections/_classfilter.py +0 -44
dataeval-0.84.1.dist-info/RECORD +0 -106
dataeval/{utils/data → data}/_images.py +0 -0
dataeval/{utils/data → data}/_targets.py +0 -0
dataeval/utils/{metadata.py → data/metadata.py} +0 -0
dataeval/utils/{data/datasets → datasets}/_fileio.py +0 -0
dataeval/utils/{data/datasets → datasets}/_mixin.py +0 -0
dataeval/utils/{data/datasets → datasets}/_types.py +0 -0
{dataeval-0.84.1.dist-info → dataeval-0.86.0.dist-info}/LICENSE.txt +0 -0
{dataeval-0.84.1.dist-info → dataeval-0.86.0.dist-info}/WHEEL +0 -0

dataeval/__init__.py CHANGED Viewed

@@ -8,7 +8,7 @@ shifts that impact performance of deployed models.
 from __future__ import annotations
 __all__ = ["config", "detectors", "log", "metrics", "typing", "utils", "workflows"]
-__version__ = "0.84.1"
+__version__ = "0.86.0"
 import logging

dataeval/data/__init__.py ADDED Viewed

@@ -0,0 +1,19 @@
+"""Provides utility functions for interacting with Computer Vision datasets."""
+__all__ = [
+    "Embeddings",
+    "Images",
+    "Metadata",
+    "Select",
+    "SplitDatasetOutput",
+    "Targets",
+    "split_dataset",
+]
+from dataeval.data._embeddings import Embeddings
+from dataeval.data._images import Images
+from dataeval.data._metadata import Metadata
+from dataeval.data._selection import Select
+from dataeval.data._split import split_dataset
+from dataeval.data._targets import Targets
+from dataeval.outputs._utils import SplitDatasetOutput

dataeval/{utils/data → data}/_embeddings.py RENAMED Viewed

@@ -2,19 +2,25 @@ from __future__ import annotations
 __all__ = []
+import logging
 import math
+import os
+from pathlib import Path
 from typing import Any, Iterator, Sequence, cast
 import torch
+import xxhash as xxh
 from numpy.typing import NDArray
 from torch.utils.data import DataLoader, Subset
 from tqdm import tqdm
 from dataeval.config import DeviceLike, get_device
-from dataeval.typing import Array, ArrayLike, Dataset, Transform
+from dataeval.typing import AnnotatedDataset, AnnotatedModel, Array, ArrayLike, Dataset, Transform
 from dataeval.utils._array import as_numpy
 from dataeval.utils.torch.models import SupportsEncode
+_logger = logging.getLogger(__name__)
 class Embeddings:
     """
@@ -35,10 +41,23 @@ class Embeddings:
     device : DeviceLike or None, default None
         The hardware device to use if specified, otherwise uses the DataEval
         default or torch default.
-    cache : bool, default False
-        Whether to cache the embeddings in memory.
+    cache : Path, str, or bool, default False
+        Whether to cache the embeddings to a file or in memory.
+        When a Path or string is provided, embeddings will be cached to disk.
     verbose : bool, default False
         Whether to print progress bar when encoding images.
+    Attributes
+    ----------
+    batch_size : int
+        Batch size to use when encoding images.
+    cache : Path or bool
+        The path to cache embeddings to file, or True if caching to memory.
+    device : torch.device
+        The hardware device to use if specified, otherwise uses the DataEval
+        default or torch default.
+    verbose : bool
+        Whether to print progress bar when encoding images.
     """
     device: torch.device
@@ -52,24 +71,59 @@ class Embeddings:
         transforms: Transform[torch.Tensor] | Sequence[Transform[torch.Tensor]] | None = None,
         model: torch.nn.Module | None = None,
         device: DeviceLike | None = None,
-        cache: bool = False,
+        cache: Path | str | bool = False,
         verbose: bool = False,
     ) -> None:
         self.device = get_device(device)
-        self.cache = cache
         self.batch_size = batch_size if batch_size > 0 else 1
         self.verbose = verbose
+        self._embeddings_only: bool = False
         self._dataset = dataset
-        self._length = len(dataset)
         model = torch.nn.Flatten() if model is None else model
         self._transforms = [transforms] if isinstance(transforms, Transform) else transforms
         self._model = model.to(self.device).eval() if isinstance(model, torch.nn.Module) else model
         self._encoder = model.encode if isinstance(model, SupportsEncode) else model
         self._collate_fn = lambda datum: [torch.as_tensor(d[0] if isinstance(d, tuple) else d) for d in datum]
-        self._cached_idx = set()
+        self._cached_idx: set[int] = set()
         self._embeddings: torch.Tensor = torch.empty(())
-        self._shallow: bool = False
+        self._cache = cache if isinstance(cache, bool) else self._resolve_path(cache)
+    def __hash__(self) -> int:
+        if self._embeddings_only:
+            bid = as_numpy(self._embeddings).ravel().tobytes()
+        else:
+            did = self._dataset.metadata["id"] if isinstance(self._dataset, AnnotatedDataset) else str(self._dataset)
+            mid = self._model.metadata["id"] if isinstance(self._model, AnnotatedModel) else str(self._model)
+            tid = str.join("|", [str(t) for t in self._transforms or []])
+            bid = f"{did}{mid}{tid}".encode()
+        return int(xxh.xxh3_64_hexdigest(bid), 16)
+    @property
+    def cache(self) -> Path | bool:
+        return self._cache
+    @cache.setter
+    def cache(self, value: Path | str | bool) -> None:
+        if isinstance(value, bool) and not value:
+            self._cached_idx = set()
+            self._embeddings = torch.empty(())
+        elif isinstance(value, (Path, str)):
+            value = self._resolve_path(value)
+        if isinstance(value, Path) and value != getattr(self, "_cache", None):
+            self._save(value)
+        self._cache = value
+    def _resolve_path(self, path: Path | str) -> Path:
+        if isinstance(path, str):
+            path = Path(os.path.abspath(path))
+        if isinstance(path, Path) and (path.is_dir() or not path.suffix):
+            path = path / f"emb-{hash(self)}.pt"
+        return path
     def to_tensor(self, indices: Sequence[int] | None = None) -> torch.Tensor:
         """
@@ -125,8 +179,10 @@ class Embeddings:
         -------
         Embeddings
         """
+        if self._embeddings_only:
+            raise ValueError("Embeddings object does not have a model.")
         return Embeddings(
-            dataset, self.batch_size, self._transforms, self._model, self.device, self.cache, self.verbose
+            dataset, self.batch_size, self._transforms, self._model, self.device, bool(self.cache), self.verbose
         )
     @classmethod
@@ -149,7 +205,7 @@ class Embeddings:
         Example
         -------
         >>> import numpy as np
-        >>> from dataeval.utils.data._embeddings import Embeddings
+        >>> from dataeval.data import Embeddings
         >>> array = np.random.randn(100, 3, 224, 224)
         >>> embeddings = Embeddings.from_array(array)
         >>> print(embeddings.to_tensor().shape)
@@ -157,12 +213,70 @@ class Embeddings:
         """
         embeddings = Embeddings([], 0, None, None, device, True, False)
         array = array if isinstance(array, Array) else as_numpy(array)
-        embeddings._length = len(array)
         embeddings._cached_idx = set(range(len(array)))
         embeddings._embeddings = torch.as_tensor(array).to(get_device(device))
-        embeddings._shallow = True
+        embeddings._embeddings_only = True
         return embeddings
+    def save(self, path: Path | str) -> None:
+        """
+        Saves the embeddings to disk.
+        Parameters
+        ----------
+        path : Path or str
+            The file path to save the embeddings to.
+        """
+        self._save(self._resolve_path(path), True)
+    def _save(self, path: Path, force: bool = False) -> None:
+        path.parent.mkdir(parents=True, exist_ok=True)
+        if self._embeddings_only or self.cache and not force:
+            embeddings = self._embeddings
+            cached_idx = self._cached_idx
+        else:
+            embeddings = self.to_tensor()
+            cached_idx = list(range(len(self)))
+        try:
+            cache_data = {
+                "embeddings": embeddings,
+                "cached_indices": cached_idx,
+                "device": self.device,
+            }
+            torch.save(cache_data, path)
+            _logger.log(logging.DEBUG, f"Saved embeddings cache from {path}")
+        except Exception as e:
+            _logger.log(logging.ERROR, f"Failed to save embeddings cache: {e}")
+    @classmethod
+    def load(cls, path: Path | str) -> Embeddings:
+        """
+        Loads the embeddings from disk.
+        Parameters
+        ----------
+        path : Path or str
+            The file path to load the embeddings from.
+        """
+        emb = Embeddings([], 0)
+        path = Path(os.path.abspath(path)) if isinstance(path, str) else path
+        if path.exists() and path.is_file():
+            try:
+                cache_data = torch.load(path, weights_only=False)
+                emb._embeddings_only = True
+                emb._embeddings = cache_data["embeddings"]
+                emb._cached_idx = cache_data["cached_indices"]
+                emb.device = cache_data["device"]
+                _logger.log(logging.DEBUG, f"Loaded embeddings cache from {path}")
+            except Exception as e:
+                _logger.log(logging.ERROR, f"Failed to load embeddings cache: {e}")
+                raise e
+        else:
+            raise FileNotFoundError(f"Specified cache file {path} was not found.")
+        return emb
     def _encode(self, images: list[torch.Tensor]) -> torch.Tensor:
         if self._transforms:
             images = [transform(image) for transform in self._transforms for image in images]
@@ -195,31 +309,37 @@ class Embeddings:
                     embeddings = self._encode(images)
                     if not self._embeddings.shape:
-                        full_shape = (len(self._dataset), *embeddings.shape[1:])
+                        full_shape = (len(self), *embeddings.shape[1:])
                         self._embeddings = torch.empty(full_shape, dtype=embeddings.dtype, device=self.device)
                     self._embeddings[uncached] = embeddings
                     self._cached_idx.update(uncached)
+                if isinstance(self.cache, Path):
+                    self._save(self.cache)
             yield self._embeddings[batch]
     def __getitem__(self, key: int | slice, /) -> torch.Tensor:
         if not isinstance(key, slice) and not hasattr(key, "__int__"):
             raise TypeError("Invalid argument type.")
-        if self._shallow:
+        indices = list(range(len(self))[key]) if isinstance(key, slice) else [int(key)]
+        if self._embeddings_only:
             if not self._embeddings.shape:
                 raise ValueError("Embeddings not initialized.")
+            if not set(indices).issubset(self._cached_idx):
+                raise ValueError("Unable to generate new embeddings from a shallow instance.")
             return self._embeddings[key]
-        indices = list(range(len(self._dataset))[key]) if isinstance(key, slice) else [int(key)]
         result = torch.vstack(list(self._batch(indices))).to(self.device)
         return result.squeeze(0) if len(indices) == 1 else result
     def __iter__(self) -> Iterator[torch.Tensor]:
         # process in batches while yielding individual embeddings
-        for batch in self._batch(range(self._length)):
+        for batch in self._batch(range(len(self))):
             yield from batch
     def __len__(self) -> int:
-        return self._length
+        return len(self._embeddings) if self._embeddings_only else len(self._dataset)

dataeval/{utils/data → data}/_metadata.py RENAMED Viewed

@@ -16,12 +16,12 @@ from dataeval.typing import (
 )
 from dataeval.utils._array import as_numpy, to_numpy
 from dataeval.utils._bin import bin_data, digitize_data, is_continuous
-from dataeval.utils.metadata import merge
+from dataeval.utils.data.metadata import merge
 if TYPE_CHECKING:
-    from dataeval.utils.data import Targets
+    from dataeval.data import Targets
 else:
-    from dataeval.utils.data._targets import Targets
+    from dataeval.data._targets import Targets
 class Metadata:
@@ -191,6 +191,11 @@ class Metadata:
         self._process()
         return self._image_indices
+    @property
+    def image_count(self) -> int:
+        self._process()
+        return int(self._image_indices.max() + 1)
     def _collate(self, force: bool = False):
         if self._collated and not force:
             return
@@ -359,12 +364,19 @@ class Metadata:
     def add_factors(self, factors: Mapping[str, ArrayLike]) -> None:
         self._merge()
-        self._processed = False
-        target_len = len(self.targets.source) if self.targets.source is not None else len(self.targets)
-        if any(len(v if isinstance(v, Sized) else as_numpy(v)) != target_len for v in factors.values()):
+        targets = len(self.targets.source) if self.targets.source is not None else len(self.targets)
+        images = self.image_count
+        lengths = {k: len(v if isinstance(v, Sized) else np.atleast_1d(as_numpy(v))) for k, v in factors.items()}
+        targets_match = all(f == targets for f in lengths.values())
+        images_match = targets_match if images == targets else all(f == images for f in lengths.values())
+        if not targets_match and not images_match:
             raise ValueError(
                 "The lists/arrays in the provided factors have a different length than the current metadata factors."
             )
-        merged = cast(tuple[dict[str, ArrayLike], dict[str, list[str]]], self._merged)[0]
+        merged = cast(dict[str, ArrayLike], self._merged[0] if self._merged is not None else {})
         for k, v in factors.items():
-            merged[k] = v
+            v = as_numpy(v)
+            merged[k] = v if (self.targets.source is None or lengths[k] == targets) else v[self.targets.source]
+        self._processed = False

dataeval/{utils/data → data}/_selection.py RENAMED Viewed

@@ -25,6 +25,10 @@ class Selection(Generic[_TDatum]):
         return f"{self.__class__.__name__}({', '.join([f'{k}={v}' for k, v in self.__dict__.items()])})"
+class Subselection(Generic[_TDatum]):
+    def __call__(self, original: _TDatum) -> _TDatum: ...
 class Select(AnnotatedDataset[_TDatum]):
     """
     Wraps a dataset and applies selection criteria to it.
@@ -38,7 +42,7 @@ class Select(AnnotatedDataset[_TDatum]):
     Examples
     --------
-    >>> from dataeval.utils.data.selections import ClassFilter, Limit
+    >>> from dataeval.data.selections import ClassFilter, Limit
     >>> # Construct a sample dataset with size of 100 and class count of 10
     >>> # Elements at index `idx` are returned as tuples:
@@ -63,6 +67,7 @@ class Select(AnnotatedDataset[_TDatum]):
     _selection: list[int]
     _selections: Sequence[Selection[_TDatum]]
     _size_limit: int
+    _subselections: list[tuple[Subselection[_TDatum], set[int]]]
     def __init__(
         self,
@@ -73,7 +78,8 @@ class Select(AnnotatedDataset[_TDatum]):
         self._dataset = dataset
         self._size_limit = len(dataset)
         self._selection = list(range(self._size_limit))
-        self._selections = self._sort(selections)
+        self._selections = self._sort_selections(selections)
+        self._subselections = []
         # Ensure metadata is populated correctly as DatasetMetadata TypedDict
         _metadata = getattr(dataset, "metadata", {})
@@ -81,7 +87,7 @@ class Select(AnnotatedDataset[_TDatum]):
             _metadata["id"] = dataset.__class__.__name__
         self._metadata = DatasetMetadata(**_metadata)
-        self._select()
+        self._apply_selections()
     @property
     def metadata(self) -> DatasetMetadata:
@@ -94,24 +100,31 @@ class Select(AnnotatedDataset[_TDatum]):
         selections = f"Selections: [{', '.join([str(s) for s in self._selections])}]"
         return f"{title}\n{sep}{nt}{selections}{nt}Selected Size: {len(self)}\n\n{self._dataset}"
-    def _sort(self, selections: Selection[_TDatum] | Sequence[Selection[_TDatum]] | None) -> list[Selection]:
+    def _sort_selections(
+        self, selections: Selection[_TDatum] | Sequence[Selection[_TDatum]] | None
+    ) -> list[Selection[_TDatum]]:
         if not selections:
             return []
-        selections = [selections] if isinstance(selections, Selection) else selections
-        grouped: dict[int, list[Selection]] = {}
-        for selection in selections:
+        selections_list = [selections] if isinstance(selections, Selection) else list(selections)
+        grouped: dict[int, list[Selection[_TDatum]]] = {}
+        for selection in selections_list:
             grouped.setdefault(selection.stage, []).append(selection)
         selection_list = [selection for category in sorted(grouped) for selection in grouped[category]]
         return selection_list
-    def _select(self) -> None:
+    def _apply_selections(self) -> None:
         for selection in self._selections:
             selection(self)
         self._selection = self._selection[: self._size_limit]
+    def _apply_subselection(self, datum: _TDatum, index: int) -> _TDatum:
+        for subselection, indices in self._subselections:
+            datum = subselection(datum) if self._selection[index] in indices else datum
+        return datum
     def __getitem__(self, index: int) -> _TDatum:
-        return self._dataset[self._selection[index]]
+        return self._apply_subselection(self._dataset[self._selection[index]], index)
     def __iter__(self) -> Iterator[_TDatum]:
         for i in range(len(self)):

dataeval/{utils/data → data}/_split.py RENAMED Viewed

@@ -12,10 +12,10 @@ from sklearn.model_selection import GroupKFold, KFold, StratifiedGroupKFold, Str
 from sklearn.utils.multiclass import type_of_target
 from dataeval.config import EPSILON
+from dataeval.data._metadata import Metadata
 from dataeval.outputs._base import set_metadata
 from dataeval.outputs._utils import SplitDatasetOutput, TrainValSplit
 from dataeval.typing import AnnotatedDataset
-from dataeval.utils.data._metadata import Metadata
 _logger = logging.getLogger(__name__)

dataeval/data/selections/__init__.py ADDED Viewed

@@ -0,0 +1,19 @@
+"""Provides selection classes for selecting subsets of Computer Vision datasets."""
+__all__ = [
+    "ClassBalance",
+    "ClassFilter",
+    "Indices",
+    "Limit",
+    "Prioritize",
+    "Reverse",
+    "Shuffle",
+]
+from dataeval.data.selections._classbalance import ClassBalance
+from dataeval.data.selections._classfilter import ClassFilter
+from dataeval.data.selections._indices import Indices
+from dataeval.data.selections._limit import Limit
+from dataeval.data.selections._prioritize import Prioritize
+from dataeval.data.selections._reverse import Reverse
+from dataeval.data.selections._shuffle import Shuffle

dataeval/{utils/data → data}/selections/_classbalance.py RENAMED Viewed

@@ -2,12 +2,11 @@ from __future__ import annotations
 __all__ = []
 import numpy as np
+from dataeval.data._selection import Select, Selection, SelectionStage
 from dataeval.typing import Array, ImageClassificationDatum
 from dataeval.utils._array import as_numpy
-from dataeval.utils.data._selection import Select, Selection, SelectionStage
 class ClassBalance(Selection[ImageClassificationDatum]):

dataeval/data/selections/_classfilter.py ADDED Viewed

@@ -0,0 +1,110 @@
+from __future__ import annotations
+__all__ = []
+from typing import Any, Generic, Iterable, Sequence, Sized, TypeVar, cast
+import numpy as np
+from numpy.typing import NDArray
+from dataeval.data._selection import Select, Selection, SelectionStage, Subselection
+from dataeval.typing import Array, ObjectDetectionDatum, ObjectDetectionTarget, SegmentationDatum, SegmentationTarget
+from dataeval.utils._array import as_numpy
+class ClassFilter(Selection[Any]):
+    """
+    Filter the dataset by class.
+    Parameters
+    ----------
+    classes : Sequence[int]
+        The classes to filter by.
+    filter_detections : bool, default True
+        Whether to filter detections from targets for object detection and segmentation datasets.
+    """
+    stage = SelectionStage.FILTER
+    def __init__(self, classes: Sequence[int], filter_detections: bool = True) -> None:
+        self.classes = classes
+        self.filter_detections = filter_detections
+    def __call__(self, dataset: Select[Any]) -> None:
+        if not self.classes:
+            return
+        selection = []
+        subselection = set()
+        for idx in dataset._selection:
+            target = dataset._dataset[idx][1]
+            if isinstance(target, Array):
+                # Get the label for the image
+                label = int(np.argmax(as_numpy(target)))
+                # Check to see if the label is in the classes to filter for
+                if label in self.classes:
+                    # Include the image
+                    selection.append(idx)
+            elif isinstance(target, (ObjectDetectionTarget, SegmentationTarget)):
+                # Get the set of labels from the target
+                labels = set(target.labels if isinstance(target.labels, Iterable) else [target.labels])
+                # Check to see if any labels are in the classes to filter for
+                if labels.intersection(self.classes):
+                    # Include the image
+                    selection.append(idx)
+                    # If we are filtering out other labels and there are other labels, add a subselection filter
+                    if self.filter_detections and labels.difference(self.classes):
+                        subselection.add(idx)
+            else:
+                raise TypeError(f"ClassFilter does not support targets of type {type(target)}.")
+        dataset._selection = selection
+        dataset._subselections.append((ClassFilterSubSelection(self.classes), subselection))
+_T = TypeVar("_T")
+_TDatum = TypeVar("_TDatum", ObjectDetectionDatum, SegmentationDatum)
+_TTarget = TypeVar("_TTarget", ObjectDetectionTarget, SegmentationTarget)
+def _try_mask_object(obj: _T, mask: NDArray[np.bool_]) -> _T:
+    if isinstance(obj, Sized) and not isinstance(obj, (str, bytes, bytearray)) and len(obj) == len(mask):
+        if isinstance(obj, Array):
+            return obj[mask]
+        elif isinstance(obj, Sequence):
+            return cast(_T, [item for i, item in enumerate(obj) if mask[i]])
+    return obj
+class ClassFilterTarget(Generic[_TTarget]):
+    def __init__(self, target: _TTarget, mask: NDArray[np.bool_]) -> None:
+        self.__dict__.update(target.__dict__)
+        self._length = len(target.labels) if isinstance(target.labels, Sized) else int(bool(target.labels))
+        self._mask = mask
+        self._target = target
+    def __getattribute__(self, name: str) -> Any:
+        if name in ("_length", "_mask", "_target") or name.startswith("__") and name.endswith("__"):
+            return super().__getattribute__(name)
+        attr = getattr(self._target, name)
+        return _try_mask_object(attr, self._mask)
+class ClassFilterSubSelection(Subselection[Any]):
+    def __init__(self, classes: Sequence[int]) -> None:
+        self.classes = classes
+    def _filter(self, d: dict[str, Any], mask: NDArray[np.bool_]) -> dict[str, Any]:
+        return {k: self._filter(v, mask) if isinstance(v, dict) else _try_mask_object(v, mask) for k, v in d.items()}
+    def __call__(self, datum: _TDatum) -> _TDatum:
+        # build a mask for any arrays
+        image, target, metadata = datum
+        mask = np.isin(as_numpy(target.labels), self.classes)
+        filtered_metadata = self._filter(metadata, mask)
+        # return a masked datum
+        filtered_datum = image, ClassFilterTarget(target, mask), filtered_metadata
+        return cast(_TDatum, filtered_datum)

dataeval/{utils/data → data}/selections/_indices.py RENAMED Viewed

@@ -4,7 +4,7 @@ __all__ = []
 from typing import Any, Sequence
-from dataeval.utils.data._selection import Select, Selection, SelectionStage
+from dataeval.data._selection import Select, Selection, SelectionStage
 class Indices(Selection[Any]):

dataeval/{utils/data → data}/selections/_limit.py RENAMED Viewed

@@ -4,7 +4,7 @@ __all__ = []
 from typing import Any
-from dataeval.utils.data._selection import Select, Selection, SelectionStage
+from dataeval.data._selection import Select, Selection, SelectionStage
 class Limit(Selection[Any]):

dataeval/{utils/data → data}/selections/_prioritize.py RENAMED Viewed

@@ -14,8 +14,8 @@ from sklearn.cluster import KMeans
 from sklearn.metrics import pairwise_distances
 from dataeval.config import EPSILON, DeviceLike, get_seed
-from dataeval.utils.data import Embeddings, Select
-from dataeval.utils.data._selection import Selection, SelectionStage
+from dataeval.data import Embeddings, Select
+from dataeval.data._selection import Selection, SelectionStage
 _logger = logging.getLogger(__name__)

dataeval/{utils/data → data}/selections/_reverse.py RENAMED Viewed

@@ -4,7 +4,7 @@ __all__ = []
 from typing import Any
-from dataeval.utils.data._selection import Select, Selection, SelectionStage
+from dataeval.data._selection import Select, Selection, SelectionStage
 class Reverse(Selection[Any]):

dataeval/{utils/data → data}/selections/_shuffle.py RENAMED Viewed

@@ -8,9 +8,9 @@ import numpy as np
 from numpy.random import BitGenerator, Generator, SeedSequence
 from numpy.typing import NDArray
+from dataeval.data._selection import Select, Selection, SelectionStage
 from dataeval.typing import Array
 from dataeval.utils._array import as_numpy
-from dataeval.utils.data._selection import Select, Selection, SelectionStage
 class Shuffle(Selection[Any]):

dataeval/detectors/drift/__init__.py CHANGED Viewed

@@ -7,6 +7,8 @@ __all__ = [
     "DriftKS",
     "DriftMMD",
     "DriftMMDOutput",
+    "DriftMVDC",
+    "DriftMVDCOutput",
     "DriftOutput",
     "DriftUncertainty",
     "UpdateStrategy",
@@ -18,5 +20,6 @@ from dataeval.detectors.drift._base import UpdateStrategy
 from dataeval.detectors.drift._cvm import DriftCVM
 from dataeval.detectors.drift._ks import DriftKS
 from dataeval.detectors.drift._mmd import DriftMMD
+from dataeval.detectors.drift._mvdc import DriftMVDC
 from dataeval.detectors.drift._uncertainty import DriftUncertainty
-from dataeval.outputs._drift import DriftMMDOutput, DriftOutput
+from dataeval.outputs._drift import DriftMMDOutput, DriftMVDCOutput, DriftOutput

dataeval/detectors/drift/_base.py CHANGED Viewed

@@ -18,11 +18,11 @@ from typing import Callable, Literal, Protocol, TypeVar, runtime_checkable
 import numpy as np
 from numpy.typing import NDArray
+from dataeval.data import Embeddings
 from dataeval.outputs import DriftOutput
 from dataeval.outputs._base import set_metadata
 from dataeval.typing import Array
 from dataeval.utils._array import as_numpy, flatten
-from dataeval.utils.data import Embeddings
 R = TypeVar("R")

dataeval 0.84.1__py3-none-any.whl → 0.86.0__py3-none-any.whl

dataeval 0.84.1__py3-none-any.whl → 0.86.0__py3-none-any.whl