PyPI - dataeval - Versions diffs - 0.82.1__py3-none-any.whl → 0.83.0__py3-none-any.whl - Mend

dataeval 0.82.1__py3-none-any.whl → 0.83.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35)

dataeval/__init__.py +7 -2
dataeval/config.py +10 -0
dataeval/metadata/__init__.py +2 -2
dataeval/metadata/_ood.py +144 -27
dataeval/metrics/bias/_balance.py +3 -3
dataeval/metrics/estimators/_ber.py +2 -1
dataeval/metrics/stats/_base.py +17 -18
dataeval/metrics/stats/_dimensionstats.py +2 -2
dataeval/metrics/stats/_hashstats.py +2 -2
dataeval/metrics/stats/_imagestats.py +4 -4
dataeval/metrics/stats/_pixelstats.py +2 -2
dataeval/metrics/stats/_visualstats.py +2 -2
dataeval/outputs/__init__.py +2 -1
dataeval/outputs/_metadata.py +7 -0
dataeval/typing.py +40 -9
dataeval/utils/_mst.py +1 -2
dataeval/utils/data/_embeddings.py +15 -10
dataeval/utils/data/_selection.py +22 -11
dataeval/utils/data/datasets/_base.py +4 -2
dataeval/utils/data/datasets/_cifar10.py +17 -9
dataeval/utils/data/datasets/_milco.py +18 -12
dataeval/utils/data/datasets/_mnist.py +24 -8
dataeval/utils/data/datasets/_ships.py +18 -8
dataeval/utils/data/datasets/_types.py +1 -5
dataeval/utils/data/datasets/_voc.py +47 -24
dataeval/utils/data/selections/__init__.py +2 -0
dataeval/utils/data/selections/_classfilter.py +1 -1
dataeval/utils/data/selections/_prioritize.py +296 -0
dataeval/utils/data/selections/_shuffle.py +13 -4
dataeval/utils/torch/_gmm.py +3 -2
{dataeval-0.82.1.dist-info → dataeval-0.83.0.dist-info}/METADATA +4 -4
{dataeval-0.82.1.dist-info → dataeval-0.83.0.dist-info}/RECORD +34 -34
dataeval/detectors/ood/metadata_ood_mi.py +0 -91
{dataeval-0.82.1.dist-info → dataeval-0.83.0.dist-info}/LICENSE.txt +0 -0
{dataeval-0.82.1.dist-info → dataeval-0.83.0.dist-info}/WHEEL +0 -0

dataeval/typing.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """
-Common type hints used for interoperability with DataEval.
+Common type protocols used for interoperability with DataEval.
 """
 __all__ = [
@@ -16,6 +16,7 @@ __all__ = [
     "SegmentationTarget",
     "SegmentationDatum",
     "SegmentationDataset",
+    "Transform",
 ]
@@ -66,6 +67,7 @@ class Array(Protocol):
     def __len__(self) -> int: ...
+T = TypeVar("T")
 _T_co = TypeVar("_T_co", covariant=True)
 _ScalarType = Union[int, float, bool, str]
 ArrayLike: TypeAlias = Union[Sequence[_ScalarType], Sequence[Sequence[_ScalarType]], Sequence[Array], Array]
@@ -140,7 +142,7 @@ class AnnotatedDataset(Dataset[_T_co], Generic[_T_co], Protocol):
 ImageClassificationDatum: TypeAlias = tuple[Array, Array, dict[str, Any]]
 """
-A type definition for an image classification datum tuple.
+Type alias for an image classification datum tuple.
 - :class:`Array` of shape (C, H, W) - Image data in channel, height, width format.
 - :class:`Array` of shape (N,) - Class label as one-hot encoded ground-truth or prediction confidences.
@@ -150,7 +152,7 @@ A type definition for an image classification datum tuple.
 ImageClassificationDataset: TypeAlias = AnnotatedDataset[ImageClassificationDatum]
 """
-A type definition for an :class:`AnnotatedDataset` of :class:`ImageClassificationDatum` elements.
+Type alias for an :class:`AnnotatedDataset` of :class:`ImageClassificationDatum` elements.
 """
 # ========== OBJECT DETECTION DATASETS ==========
@@ -159,7 +161,7 @@ A type definition for an :class:`AnnotatedDataset` of :class:`ImageClassificatio
 @runtime_checkable
 class ObjectDetectionTarget(Protocol):
     """
-    A protocol for targets in an Object Detection dataset.
+    Protocol for targets in an Object Detection dataset.
     Attributes
     ----------
@@ -180,7 +182,7 @@ class ObjectDetectionTarget(Protocol):
 ObjectDetectionDatum: TypeAlias = tuple[Array, ObjectDetectionTarget, dict[str, Any]]
 """
-A type definition for an object detection datum tuple.
+Type alias for an object detection datum tuple.
 - :class:`Array` of shape (C, H, W) - Image data in channel, height, width format.
 - :class:`ObjectDetectionTarget` - Object detection target information for the image.
@@ -190,7 +192,7 @@ A type definition for an object detection datum tuple.
 ObjectDetectionDataset: TypeAlias = AnnotatedDataset[ObjectDetectionDatum]
 """
-A type definition for an :class:`AnnotatedDataset` of :class:`ObjectDetectionDatum` elements.
+Type alias for an :class:`AnnotatedDataset` of :class:`ObjectDetectionDatum` elements.
 """
@@ -200,7 +202,7 @@ A type definition for an :class:`AnnotatedDataset` of :class:`ObjectDetectionDat
 @runtime_checkable
 class SegmentationTarget(Protocol):
     """
-    A protocol for targets in a Segmentation dataset.
+    Protocol for targets in a Segmentation dataset.
     Attributes
     ----------
@@ -221,7 +223,7 @@ class SegmentationTarget(Protocol):
 SegmentationDatum: TypeAlias = tuple[Array, SegmentationTarget, dict[str, Any]]
 """
-A type definition for an image classification datum tuple.
+Type alias for an image classification datum tuple.
 - :class:`Array` of shape (C, H, W) - Image data in channel, height, width format.
 - :class:`SegmentationTarget` - Segmentation target information for the image.
@@ -230,5 +232,34 @@ A type definition for an image classification datum tuple.
 SegmentationDataset: TypeAlias = AnnotatedDataset[SegmentationDatum]
 """
-A type definition for an :class:`AnnotatedDataset` of :class:`SegmentationDatum` elements.
+Type alias for an :class:`AnnotatedDataset` of :class:`SegmentationDatum` elements.
 """
+@runtime_checkable
+class Transform(Generic[T], Protocol):
+    """
+    Protocol defining a transform function.
+    Requires a `__call__` method that returns transformed data.
+    Example
+    -------
+    >>> from typing import Any
+    >>> from numpy.typing import NDArray
+    >>> class MyTransform:
+    ...     def __init__(self, divisor: float) -> None:
+    ...         self.divisor = divisor
+    ...
+    ...     def __call__(self, data: NDArray[Any], /) -> NDArray[Any]:
+    ...         return data / self.divisor
+    >>> my_transform = MyTransform(divisor=255.0)
+    >>> isinstance(my_transform, Transform)
+    True
+    >>> my_transform(np.array([1, 2, 3]))
+    array([0.004, 0.008, 0.012])
+    """
+    def __call__(self, data: T, /) -> T: ...

dataeval/utils/_mst.py CHANGED Viewed

@@ -10,10 +10,9 @@ from scipy.sparse.csgraph import minimum_spanning_tree as mst
 from scipy.spatial.distance import pdist, squareform
 from sklearn.neighbors import NearestNeighbors
+from dataeval.config import EPSILON
 from dataeval.utils._array import flatten
-EPSILON = 1e-5
 def minimum_spanning_tree(X: NDArray[Any]) -> Any:
     """

dataeval/utils/data/_embeddings.py CHANGED Viewed

@@ -57,20 +57,27 @@ class Embeddings:
         self._encoder = model.encode if isinstance(model, SupportsEncode) else model
         self._collate_fn = lambda datum: [torch.as_tensor(i) for i, _, _ in datum]
-    def to_tensor(self) -> torch.Tensor:
+    def to_tensor(self, indices: Sequence[int] | None = None) -> torch.Tensor:
         """
-        Converts entire dataset to embeddings.
+        Converts dataset to embeddings.
-        Warning
-        -------
-        Will process the entire dataset in batches and return
-        embeddings as a single Tensor in memory.
+        Parameters
+        ----------
+        indices : Sequence[int] or None, default None
+            The indices to convert to embeddings
         Returns
         -------
         torch.Tensor
+        Warning
+        -------
+        Processing large quantities of data can be resource intensive.
         """
-        return self[:]
+        if indices is not None:
+            return torch.vstack(list(self._batch(indices))).to(self.device)
+        else:
+            return self[:]
     # Reduce overhead cost by not tracking tensor gradients
     @torch.no_grad
@@ -85,9 +92,7 @@ class Embeddings:
             embeddings = self._encoder(torch.stack(images).to(self.device))
             yield embeddings
-    def __getitem__(self, key: int | slice | list[int], /) -> torch.Tensor:
-        if isinstance(key, list):
-            return torch.vstack(list(self._batch(key))).to(self.device)
+    def __getitem__(self, key: int | slice, /) -> torch.Tensor:
         if isinstance(key, slice):
             return torch.vstack(list(self._batch(range(len(self._dataset))[key]))).to(self.device)
         elif isinstance(key, int):

dataeval/utils/data/_selection.py CHANGED Viewed

@@ -5,9 +5,9 @@ __all__ = []
 from enum import IntEnum
 from typing import Generic, Iterator, Sequence, TypeVar
-from dataeval.typing import AnnotatedDataset, DatasetMetadata
+from dataeval.typing import AnnotatedDataset, DatasetMetadata, Transform
-_TDatum = TypeVar("_TDatum", covariant=True)
+_TDatum = TypeVar("_TDatum")
 class SelectionStage(IntEnum):
@@ -35,6 +35,8 @@ class Select(AnnotatedDataset[_TDatum]):
         The dataset to wrap.
     selections : Selection or list[Selection], optional
         The selection criteria to apply to the dataset.
+    transforms : Transform or list[Transform], optional
+        The transforms to apply to the dataset.
     Examples
     --------
@@ -67,13 +69,17 @@ class Select(AnnotatedDataset[_TDatum]):
     def __init__(
         self,
         dataset: AnnotatedDataset[_TDatum],
-        selections: Selection[_TDatum] | list[Selection[_TDatum]] | None = None,
+        selections: Selection[_TDatum] | Sequence[Selection[_TDatum]] | None = None,
+        transforms: Transform[_TDatum] | Sequence[Transform[_TDatum]] | None = None,
     ) -> None:
         self.__dict__.update(dataset.__dict__)
         self._dataset = dataset
         self._size_limit = len(dataset)
         self._selection = list(range(self._size_limit))
-        self._selections = self._sort_selections(selections)
+        self._selections = self._sort(selections)
+        self._transforms = (
+            [] if transforms is None else [transforms] if isinstance(transforms, Transform) else transforms
+        )
         # Ensure metadata is populated correctly as DatasetMetadata TypedDict
         _metadata = getattr(dataset, "metadata", {})
@@ -81,8 +87,7 @@ class Select(AnnotatedDataset[_TDatum]):
             _metadata["id"] = dataset.__class__.__name__
         self._metadata = DatasetMetadata(**_metadata)
-        if self._selections:
-            self._apply_selections()
+        self._select()
     @property
     def metadata(self) -> DatasetMetadata:
@@ -92,10 +97,11 @@ class Select(AnnotatedDataset[_TDatum]):
         nt = "\n    "
         title = f"{self.__class__.__name__} Dataset"
         sep = "-" * len(title)
-        selections = f"Selections: [{', '.join([str(s) for s in self._sort_selections(self._selections)])}]"
-        return f"{title}\n{sep}{nt}{selections}{nt}Selected Size: {len(self)}\n\n{self._dataset}"
+        selections = f"Selections: [{', '.join([str(s) for s in self._selections])}]"
+        transforms = f"Transforms: [{', '.join([str(t) for t in self._transforms])}]"
+        return f"{title}\n{sep}{nt}{selections}{nt}{transforms}{nt}Selected Size: {len(self)}\n\n{self._dataset}"
-    def _sort_selections(self, selections: Selection[_TDatum] | Sequence[Selection[_TDatum]] | None) -> list[Selection]:
+    def _sort(self, selections: Selection[_TDatum] | Sequence[Selection[_TDatum]] | None) -> list[Selection]:
         if not selections:
             return []
@@ -106,13 +112,18 @@ class Select(AnnotatedDataset[_TDatum]):
         selection_list = [selection for category in sorted(grouped) for selection in grouped[category]]
         return selection_list
-    def _apply_selections(self) -> None:
+    def _select(self) -> None:
         for selection in self._selections:
             selection(self)
         self._selection = self._selection[: self._size_limit]
+    def _transform(self, datum: _TDatum) -> _TDatum:
+        for t in self._transforms:
+            datum = t(datum)
+        return datum
     def __getitem__(self, index: int) -> _TDatum:
-        return self._dataset[self._selection[index]]
+        return self._transform(self._dataset[self._selection[index]])
     def __iter__(self) -> Iterator[_TDatum]:
         for i in range(len(self)):

dataeval/utils/data/datasets/_base.py CHANGED Viewed

@@ -4,7 +4,7 @@ __all__ = []
 from abc import abstractmethod
 from pathlib import Path
-from typing import Any, Generic, Iterator, Literal, NamedTuple, Sequence, TypeVar
+from typing import TYPE_CHECKING, Any, Generic, Iterator, Literal, NamedTuple, Sequence, TypeVar
 from dataeval.utils.data.datasets._fileio import _ensure_exists
 from dataeval.utils.data.datasets._mixin import BaseDatasetMixin
@@ -16,9 +16,11 @@ from dataeval.utils.data.datasets._types import (
     ObjectDetectionTarget,
     SegmentationDataset,
     SegmentationTarget,
-    Transform,
 )
+if TYPE_CHECKING:
+    from dataeval.typing import Transform
 _TArray = TypeVar("_TArray")
 _TTarget = TypeVar("_TTarget")
 _TRawTarget = TypeVar("_TRawTarget", list[int], list[str])

dataeval/utils/data/datasets/_cifar10.py CHANGED Viewed

@@ -3,7 +3,7 @@ from __future__ import annotations
 __all__ = []
 from pathlib import Path
-from typing import Any, Literal, Sequence, TypeVar
+from typing import TYPE_CHECKING, Any, Literal, Sequence, TypeVar
 import numpy as np
 from numpy.typing import NDArray
@@ -11,7 +11,9 @@ from PIL import Image
 from dataeval.utils.data.datasets._base import BaseICDataset, DataLocation
 from dataeval.utils.data.datasets._mixin import BaseDatasetNumpyMixin
-from dataeval.utils.data.datasets._types import Transform
+if TYPE_CHECKING:
+    from dataeval.typing import Transform
 CIFARClassStringMap = Literal["airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck"]
 TCIFARClassMap = TypeVar("TCIFARClassMap", CIFARClassStringMap, int, list[CIFARClassStringMap], list[int])
@@ -30,21 +32,27 @@ class CIFAR10(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
         Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
     image_set : "train", "test" or "base", default "train"
         If "base", returns all of the data to allow the user to create their own splits.
-    transforms : Transform | Sequence[Transform] | None, default None
+    transforms : Transform, Sequence[Transform] or None, default None
         Transform(s) to apply to the data.
     verbose : bool, default False
         If True, outputs print statements.
     Attributes
     ----------
-    index2label : dict
+    path : pathlib.Path
+        Location of the folder containing the data.
+    image_set : "train", "test" or "base"
+        The selected image set from the dataset.
+    index2label : dict[int, str]
         Dictionary which translates from class integers to the associated class strings.
-    label2index : dict
+    label2index : dict[str, int]
         Dictionary which translates from class strings to the associated class integers.
-    path : Path
-        Location of the folder containing the data.
-    metadata : dict
-        Dictionary containing Dataset metadata, such as `id` which returns the dataset class name.
+    metadata : DatasetMetadata
+        Typed dictionary containing dataset metadata, such as `id` which returns the dataset class name.
+    transforms : Sequence[Transform]
+        The transforms to be applied to the data.
+    size : int
+        The size of the dataset.
     """
     _resources = [

dataeval/utils/data/datasets/_milco.py CHANGED Viewed

@@ -1,23 +1,23 @@
 from __future__ import annotations
-from dataeval.utils.data.datasets._mixin import BaseDatasetNumpyMixin
 __all__ = []
 from pathlib import Path
-from typing import Any, Sequence
+from typing import TYPE_CHECKING, Any, Sequence
 from numpy.typing import NDArray
 from dataeval.utils.data.datasets._base import BaseODDataset, DataLocation
-from dataeval.utils.data.datasets._types import Transform
+from dataeval.utils.data.datasets._mixin import BaseDatasetNumpyMixin
+if TYPE_CHECKING:
+    from dataeval.typing import Transform
 class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     """
     A side-scan sonar dataset focused on mine (object) detection.
     The dataset comes from the paper
     `Side-scan sonar imaging data of underwater vehicles for mine detection <https://doi.org/10.1016/j.dib.2024.110132>`_
     by N.P. Santos et. al. (2024).
@@ -43,21 +43,27 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     download : bool, default False
         If True, downloads the dataset from the internet and puts it in root directory.
         Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
-    transforms : Transform | Sequence[Transform] | None, default None
+    transforms : Transform, Sequence[Transform] or None, default None
         Transform(s) to apply to the data.
     verbose : bool, default False
         If True, outputs print statements.
     Attributes
     ----------
-    index2label : dict
+    path : pathlib.Path
+        Location of the folder containing the data.
+    image_set : "base"
+        The base image set is the only available image set for the MILCO dataset.
+    index2label : dict[int, str]
         Dictionary which translates from class integers to the associated class strings.
-    label2index : dict
+    label2index : dict[str, int]
         Dictionary which translates from class strings to the associated class integers.
-    path : Path
-        Location of the folder containing the data.
-    metadata : dict
-        Dictionary containing Dataset metadata, such as `id` which returns the dataset class name.
+    metadata : DatasetMetadata
+        Typed dictionary containing dataset metadata, such as `id` which returns the dataset class name.
+    transforms : Sequence[Transform]
+        The transforms to be applied to the data.
+    size : int
+        The size of the dataset.
     """
     _resources = [

dataeval/utils/data/datasets/_mnist.py CHANGED Viewed

@@ -3,14 +3,16 @@ from __future__ import annotations
 __all__ = []
 from pathlib import Path
-from typing import Any, Literal, Sequence, TypeVar
+from typing import TYPE_CHECKING, Any, Literal, Sequence, TypeVar
 import numpy as np
 from numpy.typing import NDArray
 from dataeval.utils.data.datasets._base import BaseICDataset, DataLocation
 from dataeval.utils.data.datasets._mixin import BaseDatasetNumpyMixin
-from dataeval.utils.data.datasets._types import Transform
+if TYPE_CHECKING:
+    from dataeval.typing import Transform
 MNISTClassStringMap = Literal["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
 TMNISTClassMap = TypeVar("TMNISTClassMap", MNISTClassStringMap, int, list[MNISTClassStringMap], list[int])
@@ -52,19 +54,33 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
         Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
     image_set : "train", "test" or "base", default "train"
         If "base", returns all of the data to allow the user to create their own splits.
+    corruption : "identity", "shot_noise", "impulse_noise", "glass_blur", "motion_blur", \
+        "shear", "scale", "rotate", "brightness", "translate", "stripe", "fog", "spatter", \
+        "dotted_line", "zigzag", "canny_edges" or None, default None
+        Corruption to apply to the data.
+    transforms : Transform, Sequence[Transform] or None, default None
+        Transform(s) to apply to the data.
     verbose : bool, default False
         If True, outputs print statements.
     Attributes
     ----------
-    index2label : dict
+    path : pathlib.Path
+        Location of the folder containing the data.
+    image_set : "train", "test" or "base"
+        The selected image set from the dataset.
+    index2label : dict[int, str]
         Dictionary which translates from class integers to the associated class strings.
-    label2index : dict
+    label2index : dict[str, int]
         Dictionary which translates from class strings to the associated class integers.
-    path : Path
-        Location of the folder containing the data.
-    metadata : dict
-        Dictionary containing Dataset metadata, such as `id` which returns the dataset class name.
+    metadata : DatasetMetadata
+        Typed dictionary containing dataset metadata, such as `id` which returns the dataset class name.
+    corruption : str or None
+        Corruption applied to the data.
+    transforms : Sequence[Transform]
+        The transforms to be applied to the data.
+    size : int
+        The size of the dataset.
     """
     _resources = [

dataeval/utils/data/datasets/_ships.py CHANGED Viewed

@@ -3,14 +3,16 @@ from __future__ import annotations
 __all__ = []
 from pathlib import Path
-from typing import Any, Sequence
+from typing import TYPE_CHECKING, Any, Sequence
 import numpy as np
 from numpy.typing import NDArray
 from dataeval.utils.data.datasets._base import BaseICDataset, DataLocation
 from dataeval.utils.data.datasets._mixin import BaseDatasetNumpyMixin
-from dataeval.utils.data.datasets._types import Transform
+if TYPE_CHECKING:
+    from dataeval.typing import Transform
 class Ships(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
@@ -32,19 +34,27 @@ class Ships(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     download : bool, default False
         If True, downloads the dataset from the internet and puts it in root directory.
         Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
+    transforms : Transform, Sequence[Transform] or None, default None
+        Transform(s) to apply to the data.
     verbose : bool, default False
         If True, outputs print statements.
     Attributes
     ----------
-    index2label : dict
+    path : pathlib.Path
+        Location of the folder containing the data.
+    image_set : "base"
+        The base image set is the only available image set for the Ships dataset.
+    index2label : dict[int, str]
         Dictionary which translates from class integers to the associated class strings.
-    label2index : dict
+    label2index : dict[str, int]
         Dictionary which translates from class strings to the associated class integers.
-    path : Path
-        Location of the folder containing the data.
-    metadata : dict
-        Dictionary containing Dataset metadata, such as `id` which returns the dataset class name.
+    metadata : DatasetMetadata
+        Typed dictionary containing dataset metadata, such as `id` which returns the dataset class name.
+    transforms : Sequence[Transform]
+        The transforms to be applied to the data.
+    size : int
+        The size of the dataset.
     """
     _resources = [

dataeval/utils/data/datasets/_types.py CHANGED Viewed

@@ -3,7 +3,7 @@ from __future__ import annotations
 __all__ = []
 from dataclasses import dataclass
-from typing import Any, Generic, Protocol, TypedDict, TypeVar
+from typing import Any, Generic, TypedDict, TypeVar
 from torch.utils.data import Dataset
 from typing_extensions import NotRequired, Required
@@ -46,7 +46,3 @@ class SegmentationTarget(Generic[_TArray]):
 class SegmentationDataset(AnnotatedDataset[tuple[_TArray, SegmentationTarget[_TArray], dict[str, Any]]]): ...
-class Transform(Generic[_TArray], Protocol):
-    def __call__(self, data: _TArray, /) -> _TArray: ...

dataeval 0.82.1__py3-none-any.whl → 0.83.0__py3-none-any.whl

dataeval 0.82.1__py3-none-any.whl → 0.83.0__py3-none-any.whl