dataeval 0.84.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +1 -1
- dataeval/data/__init__.py +19 -0
- dataeval/data/_embeddings.py +345 -0
- dataeval/{utils/data → data}/_images.py +2 -2
- dataeval/{utils/data → data}/_metadata.py +8 -7
- dataeval/{utils/data → data}/_selection.py +22 -9
- dataeval/{utils/data → data}/_split.py +1 -1
- dataeval/data/selections/__init__.py +19 -0
- dataeval/data/selections/_classbalance.py +37 -0
- dataeval/data/selections/_classfilter.py +109 -0
- dataeval/{utils/data → data}/selections/_indices.py +1 -1
- dataeval/{utils/data → data}/selections/_limit.py +1 -1
- dataeval/{utils/data → data}/selections/_prioritize.py +3 -3
- dataeval/{utils/data → data}/selections/_reverse.py +1 -1
- dataeval/{utils/data → data}/selections/_shuffle.py +3 -3
- dataeval/detectors/drift/__init__.py +2 -2
- dataeval/detectors/drift/_base.py +55 -203
- dataeval/detectors/drift/_cvm.py +19 -30
- dataeval/detectors/drift/_ks.py +18 -30
- dataeval/detectors/drift/_mmd.py +189 -53
- dataeval/detectors/drift/_uncertainty.py +52 -56
- dataeval/detectors/drift/updates.py +13 -12
- dataeval/detectors/linters/duplicates.py +6 -4
- dataeval/detectors/linters/outliers.py +3 -3
- dataeval/detectors/ood/ae.py +1 -1
- dataeval/metadata/_distance.py +1 -1
- dataeval/metadata/_ood.py +4 -4
- dataeval/metrics/bias/_balance.py +1 -1
- dataeval/metrics/bias/_diversity.py +1 -1
- dataeval/metrics/bias/_parity.py +1 -1
- dataeval/metrics/stats/_base.py +7 -7
- dataeval/metrics/stats/_dimensionstats.py +2 -2
- dataeval/metrics/stats/_hashstats.py +2 -2
- dataeval/metrics/stats/_imagestats.py +4 -4
- dataeval/metrics/stats/_labelstats.py +2 -2
- dataeval/metrics/stats/_pixelstats.py +2 -2
- dataeval/metrics/stats/_visualstats.py +2 -2
- dataeval/outputs/_bias.py +1 -1
- dataeval/typing.py +53 -19
- dataeval/utils/__init__.py +2 -2
- dataeval/utils/_array.py +18 -7
- dataeval/utils/data/__init__.py +5 -20
- dataeval/utils/data/_dataset.py +6 -4
- dataeval/utils/data/collate.py +2 -0
- dataeval/utils/datasets/__init__.py +17 -0
- dataeval/utils/{data/datasets → datasets}/_base.py +10 -7
- dataeval/utils/{data/datasets → datasets}/_cifar10.py +11 -11
- dataeval/utils/{data/datasets → datasets}/_milco.py +44 -16
- dataeval/utils/{data/datasets → datasets}/_mnist.py +11 -7
- dataeval/utils/{data/datasets → datasets}/_ships.py +10 -6
- dataeval/utils/{data/datasets → datasets}/_voc.py +43 -22
- dataeval/utils/torch/_internal.py +12 -35
- {dataeval-0.84.0.dist-info → dataeval-1.0.0.dist-info}/METADATA +2 -3
- dataeval-1.0.0.dist-info/RECORD +107 -0
- dataeval/detectors/drift/_torch.py +0 -222
- dataeval/utils/data/_embeddings.py +0 -186
- dataeval/utils/data/datasets/__init__.py +0 -17
- dataeval/utils/data/selections/__init__.py +0 -17
- dataeval/utils/data/selections/_classfilter.py +0 -59
- dataeval-0.84.0.dist-info/RECORD +0 -106
- /dataeval/{utils/data → data}/_targets.py +0 -0
- /dataeval/utils/{metadata.py → data/metadata.py} +0 -0
- /dataeval/utils/{data/datasets → datasets}/_fileio.py +0 -0
- /dataeval/utils/{data/datasets → datasets}/_mixin.py +0 -0
- /dataeval/utils/{data/datasets → datasets}/_types.py +0 -0
- {dataeval-0.84.0.dist-info → dataeval-1.0.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.84.0.dist-info → dataeval-1.0.0.dist-info}/WHEEL +0 -0
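The headline change in this release is a namespace reorganization: the `dataeval.utils.data` modules move up to `dataeval.data`, the bundled datasets move from `dataeval.utils.data.datasets` to `dataeval.utils.datasets`, and `dataeval/detectors/drift/_torch.py` is removed. A minimal migration sketch based on the rename list above (the old 0.84.0 imports are shown commented out; the `MNIST` re-export from `dataeval.utils.datasets` is an assumption, not confirmed by this diff):

```python
# dataeval 0.84.0
# from dataeval.utils.data import Embeddings, Metadata, Select
# from dataeval.utils.data.datasets import MNIST

# dataeval 1.0.0
from dataeval.data import Embeddings, Metadata, Select
from dataeval.utils.datasets import MNIST  # assumed re-export; the module moved per the list above
```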
dataeval/data/__init__.py
ADDED
@@ -0,0 +1,19 @@
+"""Provides utility functions for interacting with Computer Vision datasets."""
+
+__all__ = [
+    "Embeddings",
+    "Images",
+    "Metadata",
+    "Select",
+    "SplitDatasetOutput",
+    "Targets",
+    "split_dataset",
+]
+
+from dataeval.data._embeddings import Embeddings
+from dataeval.data._images import Images
+from dataeval.data._metadata import Metadata
+from dataeval.data._selection import Select
+from dataeval.data._split import split_dataset
+from dataeval.data._targets import Targets
+from dataeval.outputs._utils import SplitDatasetOutput
dataeval/data/_embeddings.py
ADDED
@@ -0,0 +1,345 @@
+from __future__ import annotations
+
+__all__ = []
+
+import logging
+import math
+import os
+from pathlib import Path
+from typing import Any, Iterator, Sequence, cast
+
+import torch
+import xxhash as xxh
+from numpy.typing import NDArray
+from torch.utils.data import DataLoader, Subset
+from tqdm import tqdm
+
+from dataeval.config import DeviceLike, get_device
+from dataeval.typing import AnnotatedDataset, AnnotatedModel, Array, ArrayLike, Dataset, Transform
+from dataeval.utils._array import as_numpy
+from dataeval.utils.torch.models import SupportsEncode
+
+_logger = logging.getLogger(__name__)
+
+
+class Embeddings:
+    """
+    Collection of image embeddings from a dataset.
+
+    Embeddings are accessed by index or slice and are only loaded on-demand.
+
+    Parameters
+    ----------
+    dataset : ImageClassificationDataset or ObjectDetectionDataset
+        Dataset to access original images from.
+    batch_size : int
+        Batch size to use when encoding images.
+    transforms : Transform or Sequence[Transform] or None, default None
+        Transforms to apply to images before encoding.
+    model : torch.nn.Module or None, default None
+        Model to use for encoding images.
+    device : DeviceLike or None, default None
+        The hardware device to use if specified, otherwise uses the DataEval
+        default or torch default.
+    cache : Path, str, or bool, default False
+        Whether to cache the embeddings to a file or in memory.
+        When a Path or string is provided, embeddings will be cached to disk.
+    verbose : bool, default False
+        Whether to print progress bar when encoding images.
+
+    Attributes
+    ----------
+    batch_size : int
+        Batch size to use when encoding images.
+    cache : Path or bool
+        The path to cache embeddings to file, or True if caching to memory.
+    device : torch.device
+        The hardware device to use if specified, otherwise uses the DataEval
+        default or torch default.
+    verbose : bool
+        Whether to print progress bar when encoding images.
+    """
+
+    device: torch.device
+    batch_size: int
+    verbose: bool
+
+    def __init__(
+        self,
+        dataset: Dataset[tuple[ArrayLike, Any, Any]] | Dataset[ArrayLike],
+        batch_size: int,
+        transforms: Transform[torch.Tensor] | Sequence[Transform[torch.Tensor]] | None = None,
+        model: torch.nn.Module | None = None,
+        device: DeviceLike | None = None,
+        cache: Path | str | bool = False,
+        verbose: bool = False,
+    ) -> None:
+        self.device = get_device(device)
+        self.batch_size = batch_size if batch_size > 0 else 1
+        self.verbose = verbose
+
+        self._embeddings_only: bool = False
+        self._dataset = dataset
+        model = torch.nn.Flatten() if model is None else model
+        self._transforms = [transforms] if isinstance(transforms, Transform) else transforms
+        self._model = model.to(self.device).eval() if isinstance(model, torch.nn.Module) else model
+        self._encoder = model.encode if isinstance(model, SupportsEncode) else model
+        self._collate_fn = lambda datum: [torch.as_tensor(d[0] if isinstance(d, tuple) else d) for d in datum]
+        self._cached_idx: set[int] = set()
+        self._embeddings: torch.Tensor = torch.empty(())
+
+        self._cache = cache if isinstance(cache, bool) else self._resolve_path(cache)
+
+    def __hash__(self) -> int:
+        if self._embeddings_only:
+            bid = as_numpy(self._embeddings).ravel().tobytes()
+        else:
+            did = self._dataset.metadata["id"] if isinstance(self._dataset, AnnotatedDataset) else str(self._dataset)
+            mid = self._model.metadata["id"] if isinstance(self._model, AnnotatedModel) else str(self._model)
+            tid = str.join("|", [str(t) for t in self._transforms or []])
+            bid = f"{did}{mid}{tid}".encode()
+
+        return int(xxh.xxh3_64_hexdigest(bid), 16)
+
+    @property
+    def cache(self) -> Path | bool:
+        return self._cache
+
+    @cache.setter
+    def cache(self, value: Path | str | bool) -> None:
+        if isinstance(value, bool) and not value:
+            self._cached_idx = set()
+            self._embeddings = torch.empty(())
+        elif isinstance(value, (Path, str)):
+            value = self._resolve_path(value)
+
+        if isinstance(value, Path) and value != getattr(self, "_cache", None):
+            self._save(value)
+
+        self._cache = value
+
+    def _resolve_path(self, path: Path | str) -> Path:
+        if isinstance(path, str):
+            path = Path(os.path.abspath(path))
+        if isinstance(path, Path) and (path.is_dir() or not path.suffix):
+            path = path / f"emb-{hash(self)}.pt"
+        return path
+
+    def to_tensor(self, indices: Sequence[int] | None = None) -> torch.Tensor:
+        """
+        Converts dataset to embeddings.
+
+        Parameters
+        ----------
+        indices : Sequence[int] or None, default None
+            The indices to convert to embeddings
+
+        Returns
+        -------
+        torch.Tensor
+
+        Warning
+        -------
+        Processing large quantities of data can be resource intensive.
+        """
+        if indices is not None:
+            return torch.vstack(list(self._batch(indices))).to(self.device)
+        else:
+            return self[:]
+
+    def to_numpy(self, indices: Sequence[int] | None = None) -> NDArray[Any]:
+        """
+        Converts dataset to embeddings as numpy array.
+
+        Parameters
+        ----------
+        indices : Sequence[int] or None, default None
+            The indices to convert to embeddings
+
+        Returns
+        -------
+        NDArray[Any]
+
+        Warning
+        -------
+        Processing large quantities of data can be resource intensive.
+        """
+        return self.to_tensor(indices).cpu().numpy()
+
+    def new(self, dataset: Dataset[tuple[ArrayLike, Any, Any]] | Dataset[ArrayLike]) -> Embeddings:
+        """
+        Creates a new Embeddings object with the same parameters but a different dataset.
+
+        Parameters
+        ----------
+        dataset : ImageClassificationDataset or ObjectDetectionDataset
+            Dataset to access original images from.
+
+        Returns
+        -------
+        Embeddings
+        """
+        if self._embeddings_only:
+            raise ValueError("Embeddings object does not have a model.")
+        return Embeddings(
+            dataset, self.batch_size, self._transforms, self._model, self.device, bool(self.cache), self.verbose
+        )
+
+    @classmethod
+    def from_array(cls, array: ArrayLike, device: DeviceLike | None = None) -> Embeddings:
+        """
+        Instantiates a shallow Embeddings object using an array.
+
+        Parameters
+        ----------
+        array : ArrayLike
+            The array to convert to embeddings.
+        device : DeviceLike or None, default None
+            The hardware device to use if specified, otherwise uses the DataEval
+            default or torch default.
+
+        Returns
+        -------
+        Embeddings
+
+        Example
+        -------
+        >>> import numpy as np
+        >>> from dataeval.data import Embeddings
+        >>> array = np.random.randn(100, 3, 224, 224)
+        >>> embeddings = Embeddings.from_array(array)
+        >>> print(embeddings.to_tensor().shape)
+        torch.Size([100, 3, 224, 224])
+        """
+        embeddings = Embeddings([], 0, None, None, device, True, False)
+        array = array if isinstance(array, Array) else as_numpy(array)
+        embeddings._cached_idx = set(range(len(array)))
+        embeddings._embeddings = torch.as_tensor(array).to(get_device(device))
+        embeddings._embeddings_only = True
+        return embeddings
+
+    def save(self, path: Path | str) -> None:
+        """
+        Saves the embeddings to disk.
+
+        Parameters
+        ----------
+        path : Path or str
+            The file path to save the embeddings to.
+        """
+        self._save(self._resolve_path(path), True)
+
+    def _save(self, path: Path, force: bool = False) -> None:
+        path.parent.mkdir(parents=True, exist_ok=True)
+
+        if self._embeddings_only or self.cache and not force:
+            embeddings = self._embeddings
+            cached_idx = self._cached_idx
+        else:
+            embeddings = self.to_tensor()
+            cached_idx = list(range(len(self)))
+        try:
+            cache_data = {
+                "embeddings": embeddings,
+                "cached_indices": cached_idx,
+                "device": self.device,
+            }
+            torch.save(cache_data, path)
+            _logger.log(logging.DEBUG, f"Saved embeddings cache to {path}")
+        except Exception as e:
+            _logger.log(logging.ERROR, f"Failed to save embeddings cache: {e}")
+
+    @classmethod
+    def load(cls, path: Path | str) -> Embeddings:
+        """
+        Loads the embeddings from disk.
+
+        Parameters
+        ----------
+        path : Path or str
+            The file path to load the embeddings from.
+        """
+        emb = Embeddings([], 0)
+        path = Path(os.path.abspath(path)) if isinstance(path, str) else path
+        if path.exists() and path.is_file():
+            try:
+                cache_data = torch.load(path, weights_only=False)
+                emb._embeddings_only = True
+                emb._embeddings = cache_data["embeddings"]
+                emb._cached_idx = cache_data["cached_indices"]
+                emb.device = cache_data["device"]
+                _logger.log(logging.DEBUG, f"Loaded embeddings cache from {path}")
+            except Exception as e:
+                _logger.log(logging.ERROR, f"Failed to load embeddings cache: {e}")
+                raise e
+        else:
+            raise FileNotFoundError(f"Specified cache file {path} was not found.")
+
+        return emb
+
+    def _encode(self, images: list[torch.Tensor]) -> torch.Tensor:
+        if self._transforms:
+            images = [transform(image) for transform in self._transforms for image in images]
+        return self._encoder(torch.stack(images).to(self.device))
+
+    @torch.no_grad()  # Reduce overhead cost by not tracking tensor gradients
+    def _batch(self, indices: Sequence[int]) -> Iterator[torch.Tensor]:
+        dataset = cast(torch.utils.data.Dataset, self._dataset)
+        total_batches = math.ceil(len(indices) / self.batch_size)
+
+        # If not caching, process all indices normally
+        if not self.cache:
+            for images in tqdm(
+                DataLoader(Subset(dataset, indices), self.batch_size, collate_fn=self._collate_fn),
+                total=total_batches,
+                desc="Batch embedding",
+                disable=not self.verbose,
+            ):
+                yield self._encode(images)
+            return
+
+        # If caching, process each batch of indices at a time, preserving original order
+        for i in tqdm(range(0, len(indices), self.batch_size), desc="Batch embedding", disable=not self.verbose):
+            batch = indices[i : i + self.batch_size]
+            uncached = [idx for idx in batch if idx not in self._cached_idx]
+
+            if uncached:
+                # Process uncached indices as a single batch
+                for images in DataLoader(Subset(dataset, uncached), len(uncached), collate_fn=self._collate_fn):
+                    embeddings = self._encode(images)
+
+                    if not self._embeddings.shape:
+                        full_shape = (len(self), *embeddings.shape[1:])
+                        self._embeddings = torch.empty(full_shape, dtype=embeddings.dtype, device=self.device)
+
+                    self._embeddings[uncached] = embeddings
+                    self._cached_idx.update(uncached)
+
+            if isinstance(self.cache, Path):
+                self._save(self.cache)
+
+            yield self._embeddings[batch]
+
+    def __getitem__(self, key: int | slice, /) -> torch.Tensor:
+        if not isinstance(key, slice) and not hasattr(key, "__int__"):
+            raise TypeError("Invalid argument type.")
+
+        indices = list(range(len(self))[key]) if isinstance(key, slice) else [int(key)]
+
+        if self._embeddings_only:
+            if not self._embeddings.shape:
+                raise ValueError("Embeddings not initialized.")
+            if not set(indices).issubset(self._cached_idx):
+                raise ValueError("Unable to generate new embeddings from a shallow instance.")
+            return self._embeddings[key]
+
+        result = torch.vstack(list(self._batch(indices))).to(self.device)
+        return result.squeeze(0) if len(indices) == 1 else result

+    def __iter__(self) -> Iterator[torch.Tensor]:
+        # process in batches while yielding individual embeddings
+        for batch in self._batch(range(len(self))):
+            yield from batch
+
+    def __len__(self) -> int:
+        return len(self._embeddings) if self._embeddings_only else len(self._dataset)
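The new `Embeddings` class lazily encodes images, caching results either in memory (`cache=True`) or on disk (`cache=<path>`). A short usage sketch distilled from the methods above; `train_ds` stands in for any image classification or object detection dataset:

```python
from dataeval.data import Embeddings

# model=None falls back to torch.nn.Flatten() per __init__ above;
# a directory cache path gets an emb-<hash>.pt filename appended by _resolve_path.
emb = Embeddings(train_ds, batch_size=64, cache="./cache", verbose=True)

first = emb[0]                   # encodes on demand, one batch at a time
all_embs = emb.to_tensor()       # full tensor; results land in the cache
emb.save("./embeddings.pt")      # explicit snapshot to disk
shallow = Embeddings.load("./embeddings.pt")  # embeddings-only instance, no model
```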
dataeval/{utils/data → data}/_images.py
RENAMED
@@ -4,13 +4,13 @@ __all__ = []
 
 from typing import TYPE_CHECKING, Any, Generic, Iterator, Sequence, TypeVar, cast, overload
 
-from dataeval.typing import Array, Dataset
+from dataeval.typing import Array, ArrayLike, Dataset
 from dataeval.utils._array import as_numpy, channels_first_to_last
 
 if TYPE_CHECKING:
     from matplotlib.figure import Figure
 
-T = TypeVar("T",
+T = TypeVar("T", Array, ArrayLike)
 
 
 class Images(Generic[T]):
dataeval/{utils/data → data}/_metadata.py
RENAMED
@@ -3,7 +3,7 @@ from __future__ import annotations
 __all__ = []
 
 import warnings
-from typing import TYPE_CHECKING, Any, Literal, Mapping, Sequence, cast
+from typing import TYPE_CHECKING, Any, Literal, Mapping, Sequence, Sized, cast
 
 import numpy as np
 from numpy.typing import NDArray
@@ -16,12 +16,12 @@ from dataeval.typing import (
 )
 from dataeval.utils._array import as_numpy, to_numpy
 from dataeval.utils._bin import bin_data, digitize_data, is_continuous
-from dataeval.utils.metadata import merge
+from dataeval.utils.data.metadata import merge
 
 if TYPE_CHECKING:
-    from dataeval.
+    from dataeval.data import Targets
 else:
-    from dataeval.
+    from dataeval.data._targets import Targets
 
 
 class Metadata:
@@ -208,8 +208,9 @@ class Metadata:
             raw.append(metadata)
 
             if is_od_target := isinstance(target, ObjectDetectionTarget):
-
-
+                target_labels = as_numpy(target.labels)
+                target_len = len(target_labels)
+                labels.extend(target_labels.tolist())
                 bboxes.extend(as_numpy(target.boxes).tolist())
                 scores.extend(as_numpy(target.scores).tolist())
                 srcidx.extend([i] * target_len)
@@ -360,7 +361,7 @@ class Metadata:
         self._merge()
         self._processed = False
         target_len = len(self.targets.source) if self.targets.source is not None else len(self.targets)
-        if any(len(v) != target_len for v in factors.values()):
+        if any(len(v if isinstance(v, Sized) else as_numpy(v)) != target_len for v in factors.values()):
             raise ValueError(
                 "The lists/arrays in the provided factors have a different length than the current metadata factors."
             )
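The revised length check lets factors be any array-like rather than only `Sized` containers, materializing non-`Sized` values through `as_numpy` before measuring them. A standalone illustration of the pattern (this helper is not dataeval API):

```python
from typing import Any, Sized

import numpy as np

def factor_length(v: Any) -> int:
    # len() directly for lists/arrays; materialize other array-likes first,
    # mirroring the isinstance(v, Sized) guard in the hunk above
    return len(v) if isinstance(v, Sized) else len(np.asarray(v))

assert factor_length([1, 2, 3]) == 3
assert factor_length(np.arange(4)) == 4
```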
dataeval/{utils/data → data}/_selection.py
RENAMED
@@ -25,6 +25,10 @@ class Selection(Generic[_TDatum]):
         return f"{self.__class__.__name__}({', '.join([f'{k}={v}' for k, v in self.__dict__.items()])})"
 
 
+class Subselection(Generic[_TDatum]):
+    def __call__(self, original: _TDatum) -> _TDatum: ...
+
+
 class Select(AnnotatedDataset[_TDatum]):
     """
     Wraps a dataset and applies selection criteria to it.
@@ -38,7 +42,7 @@ class Select(AnnotatedDataset[_TDatum]):
 
     Examples
     --------
-    >>> from dataeval.
+    >>> from dataeval.data.selections import ClassFilter, Limit
 
     >>> # Construct a sample dataset with size of 100 and class count of 10
     >>> # Elements at index `idx` are returned as tuples:
@@ -63,6 +67,7 @@ class Select(AnnotatedDataset[_TDatum]):
     _selection: list[int]
     _selections: Sequence[Selection[_TDatum]]
     _size_limit: int
+    _subselections: list[tuple[Subselection[_TDatum], set[int]]]
 
     def __init__(
         self,
@@ -73,7 +78,8 @@ class Select(AnnotatedDataset[_TDatum]):
         self._dataset = dataset
         self._size_limit = len(dataset)
         self._selection = list(range(self._size_limit))
-        self._selections = self.
+        self._selections = self._sort_selections(selections)
+        self._subselections = []
 
         # Ensure metadata is populated correctly as DatasetMetadata TypedDict
         _metadata = getattr(dataset, "metadata", {})
@@ -81,7 +87,7 @@ class Select(AnnotatedDataset[_TDatum]):
             _metadata["id"] = dataset.__class__.__name__
         self._metadata = DatasetMetadata(**_metadata)
 
-        self.
+        self._apply_selections()
 
     @property
     def metadata(self) -> DatasetMetadata:
@@ -94,24 +100,31 @@ class Select(AnnotatedDataset[_TDatum]):
         selections = f"Selections: [{', '.join([str(s) for s in self._selections])}]"
         return f"{title}\n{sep}{nt}{selections}{nt}Selected Size: {len(self)}\n\n{self._dataset}"
 
-    def 
+    def _sort_selections(
+        self, selections: Selection[_TDatum] | Sequence[Selection[_TDatum]] | None
+    ) -> list[Selection[_TDatum]]:
         if not selections:
             return []
 
-
-        grouped: dict[int, list[Selection]] = {}
-        for selection in 
+        selections_list = [selections] if isinstance(selections, Selection) else list(selections)
+        grouped: dict[int, list[Selection[_TDatum]]] = {}
+        for selection in selections_list:
             grouped.setdefault(selection.stage, []).append(selection)
         selection_list = [selection for category in sorted(grouped) for selection in grouped[category]]
         return selection_list
 
-    def 
+    def _apply_selections(self) -> None:
         for selection in self._selections:
             selection(self)
         self._selection = self._selection[: self._size_limit]
 
+    def _apply_subselection(self, datum: _TDatum, index: int) -> _TDatum:
+        for subselection, indices in self._subselections:
+            datum = subselection(datum) if index in indices else datum
+        return datum
+
     def __getitem__(self, index: int) -> _TDatum:
-        return self._dataset[self._selection[index]]
+        return self._apply_subselection(self._dataset[self._selection[index]], index)
 
     def __iter__(self) -> Iterator[_TDatum]:
         for i in range(len(self)):
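`Select` now also supports `Subselection` hooks that transform individual datum objects after index-level filtering. A hedged sketch of the selection pipeline itself, following the docstring example above (`base_ds` is a placeholder dataset; the `ClassFilter` and `Limit` constructor arguments are assumptions not shown in this diff):

```python
from dataeval.data import Select
from dataeval.data.selections import ClassFilter, Limit

# Selections are grouped and applied in SelectionStage order by _sort_selections,
# then _apply_selections trims the index list to _size_limit.
selected = Select(base_ds, selections=[ClassFilter(classes=[0, 1]), Limit(size=50)])
print(len(selected))  # at most 50 items, restricted to classes 0 and 1
```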
dataeval/{utils/data → data}/_split.py
RENAMED
@@ -12,10 +12,10 @@ from sklearn.model_selection import GroupKFold, KFold, StratifiedGroupKFold, Str
 from sklearn.utils.multiclass import type_of_target
 
 from dataeval.config import EPSILON
+from dataeval.data._metadata import Metadata
 from dataeval.outputs._base import set_metadata
 from dataeval.outputs._utils import SplitDatasetOutput, TrainValSplit
 from dataeval.typing import AnnotatedDataset
-from dataeval.utils.data._metadata import Metadata
 
 _logger = logging.getLogger(__name__)
 
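Only the imports of `split_dataset` change here; its interface appears untouched. For orientation, a hypothetical call using the 1.0.0 paths (only the import path is confirmed by this diff; calling with defaults is an assumption):

```python
from dataeval.data import split_dataset

splits = split_dataset(train_ds)  # returns a SplitDatasetOutput
```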
dataeval/data/selections/__init__.py
ADDED
@@ -0,0 +1,19 @@
+"""Provides selection classes for selecting subsets of Computer Vision datasets."""
+
+__all__ = [
+    "ClassBalance",
+    "ClassFilter",
+    "Indices",
+    "Limit",
+    "Prioritize",
+    "Reverse",
+    "Shuffle",
+]
+
+from dataeval.data.selections._classbalance import ClassBalance
+from dataeval.data.selections._classfilter import ClassFilter
+from dataeval.data.selections._indices import Indices
+from dataeval.data.selections._limit import Limit
+from dataeval.data.selections._prioritize import Prioritize
+from dataeval.data.selections._reverse import Reverse
+from dataeval.data.selections._shuffle import Shuffle
|
@@ -0,0 +1,37 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
__all__ = []
|
4
|
+
|
5
|
+
import numpy as np
|
6
|
+
|
7
|
+
from dataeval.data._selection import Select, Selection, SelectionStage
|
8
|
+
from dataeval.typing import Array, ImageClassificationDatum
|
9
|
+
from dataeval.utils._array import as_numpy
|
10
|
+
|
11
|
+
|
12
|
+
class ClassBalance(Selection[ImageClassificationDatum]):
|
13
|
+
"""
|
14
|
+
Balance the dataset by class.
|
15
|
+
|
16
|
+
Note
|
17
|
+
----
|
18
|
+
The total number of instances of each class will be equalized which may result
|
19
|
+
in a lower total number of instances than specified by the selection limit.
|
20
|
+
"""
|
21
|
+
|
22
|
+
stage = SelectionStage.FILTER
|
23
|
+
|
24
|
+
def __call__(self, dataset: Select[ImageClassificationDatum]) -> None:
|
25
|
+
class_indices: dict[int, list[int]] = {}
|
26
|
+
for i, idx in enumerate(dataset._selection):
|
27
|
+
target = dataset._dataset[idx][1]
|
28
|
+
if isinstance(target, Array):
|
29
|
+
label = int(np.argmax(as_numpy(target)))
|
30
|
+
else:
|
31
|
+
# ObjectDetectionTarget and SegmentationTarget not supported yet
|
32
|
+
raise TypeError("ClassFilter only supports classification targets as an array of confidence scores.")
|
33
|
+
class_indices.setdefault(label, []).append(i)
|
34
|
+
|
35
|
+
per_class_limit = min(min(len(c) for c in class_indices.values()), dataset._size_limit // len(class_indices))
|
36
|
+
subselection = sorted([i for v in class_indices.values() for i in v[:per_class_limit]])
|
37
|
+
dataset._selection = [dataset._selection[i] for i in subselection]
|