dataeval 0.84.0__py3-none-any.whl → 0.84.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between the two package versions.
Files changed (40)
  1. dataeval/__init__.py +1 -1
  2. dataeval/detectors/drift/__init__.py +2 -2
  3. dataeval/detectors/drift/_base.py +55 -203
  4. dataeval/detectors/drift/_cvm.py +19 -30
  5. dataeval/detectors/drift/_ks.py +18 -30
  6. dataeval/detectors/drift/_mmd.py +189 -53
  7. dataeval/detectors/drift/_uncertainty.py +52 -56
  8. dataeval/detectors/drift/updates.py +13 -12
  9. dataeval/detectors/linters/duplicates.py +5 -3
  10. dataeval/detectors/linters/outliers.py +2 -2
  11. dataeval/detectors/ood/ae.py +1 -1
  12. dataeval/metrics/stats/_base.py +7 -7
  13. dataeval/metrics/stats/_dimensionstats.py +2 -2
  14. dataeval/metrics/stats/_hashstats.py +2 -2
  15. dataeval/metrics/stats/_imagestats.py +4 -4
  16. dataeval/metrics/stats/_pixelstats.py +2 -2
  17. dataeval/metrics/stats/_visualstats.py +2 -2
  18. dataeval/typing.py +22 -19
  19. dataeval/utils/_array.py +18 -7
  20. dataeval/utils/data/_dataset.py +6 -4
  21. dataeval/utils/data/_embeddings.py +46 -7
  22. dataeval/utils/data/_images.py +2 -2
  23. dataeval/utils/data/_metadata.py +5 -4
  24. dataeval/utils/data/datasets/_base.py +7 -4
  25. dataeval/utils/data/datasets/_cifar10.py +9 -9
  26. dataeval/utils/data/datasets/_milco.py +42 -14
  27. dataeval/utils/data/datasets/_mnist.py +9 -5
  28. dataeval/utils/data/datasets/_ships.py +8 -4
  29. dataeval/utils/data/datasets/_voc.py +40 -19
  30. dataeval/utils/data/selections/__init__.py +2 -0
  31. dataeval/utils/data/selections/_classbalance.py +38 -0
  32. dataeval/utils/data/selections/_classfilter.py +14 -29
  33. dataeval/utils/data/selections/_prioritize.py +1 -1
  34. dataeval/utils/data/selections/_shuffle.py +2 -2
  35. dataeval/utils/torch/_internal.py +12 -35
  36. {dataeval-0.84.0.dist-info → dataeval-0.84.1.dist-info}/METADATA +2 -3
  37. {dataeval-0.84.0.dist-info → dataeval-0.84.1.dist-info}/RECORD +39 -39
  38. dataeval/detectors/drift/_torch.py +0 -222
  39. {dataeval-0.84.0.dist-info → dataeval-0.84.1.dist-info}/LICENSE.txt +0 -0
  40. {dataeval-0.84.0.dist-info → dataeval-0.84.1.dist-info}/WHEEL +0 -0
dataeval/utils/data/datasets/_milco.py

@@ -3,7 +3,7 @@ from __future__ import annotations
  __all__ = []

  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Sequence
+ from typing import TYPE_CHECKING, Any, Literal, Sequence

  from numpy.typing import NDArray

@@ -16,21 +16,20 @@ if TYPE_CHECKING:

  class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  """
- A side-scan sonar dataset focused on mine (object) detection.
+ A side-scan sonar dataset focused on mine-like object detection.

  The dataset comes from the paper
  `Side-scan sonar imaging data of underwater vehicles for mine detection <https://doi.org/10.1016/j.dib.2024.110132>`_
  by N.P. Santos et. al. (2024).

- This class only accesses a portion of the above dataset due to size constraints.
  The full dataset contains 1170 side-scan sonar images collected using a 900-1800 kHz Marine Sonic
  dual frequency side-scan sonar of a Teledyne Marine Gavia Autonomous Underwater Vehicle.
  All the images were carefully analyzed and annotated, including the image coordinates of the
  Bounding Box (BB) of the detected objects divided into NOn-Mine-like BOttom Objects (NOMBO)
  and MIne-Like COntacts (MILCO) classes.

- This dataset is consists of 261 images (120 images from 2015, 93 images from 2017, and 48 images from 2021).
- In these 261 images, there are 315 MILCO objects, and 175 NOMBO objects.
+ This dataset is consists of 345 images from 2010, 120 images from 2015, 93 images from 2017, 564 images from 2018,
+ and 48 images from 2021). In these 1170 images, there are 432 MILCO objects, and 235 NOMBO objects.
  The class “0” corresponds to a MILCO object and the class “1” corresponds to a NOMBO object.
  The raw BB coordinates provided in the downloaded text files are (x, y, w, h),
  given as percentages of the image (x_BB = x/img_width, y_BB = y/img_height, etc.).
@@ -40,11 +39,17 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  ----------
  root : str or pathlib.Path
  Root directory of dataset where the ``milco`` folder exists.
+ image_set: "train", "operational", or "base", default "train"
+ If "train", then the images from 2015, 2017 and 2021 are selected,
+ resulting in 315 MILCO objects and 177 NOMBO objects.
+ If "operational", then the images from 2010 and 2018 are selected,
+ resulting in 117 MILCO objects and 58 NOMBO objects.
+ If "base", then the full dataset is selected.
+ transforms : Transform, Sequence[Transform] or None, default None
+ Transform(s) to apply to the data.
  download : bool, default False
  If True, downloads the dataset from the internet and puts it in root directory.
  Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
- transforms : Transform, Sequence[Transform] or None, default None
- Transform(s) to apply to the data.
  verbose : bool, default False
  If True, outputs print statements.

@@ -52,8 +57,8 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  ----------
  path : pathlib.Path
  Location of the folder containing the data.
- image_set : "base"
- The base image set is the only available image set for the MILCO dataset.
+ image_set : "train", "operational" or "base"
+ The selected image set from the dataset.
  index2label : dict[int, str]
  Dictionary which translates from class integers to the associated class strings.
  label2index : dict[str, int]
@@ -64,6 +69,10 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  The transforms to be applied to the data.
  size : int
  The size of the dataset.
+
+ Note
+ ----
+ Data License: `CC BY 4.0 <https://creativecommons.org/licenses/by/4.0/>`_
  """

  _resources = [
@@ -85,6 +94,18 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  md5=True,
  checksum="b84749b21fa95a4a4c7de3741db78bc7",
  ),
+ DataLocation(
+ url="https://figshare.com/ndownloader/files/43169008",
+ filename="2010.zip",
+ md5=True,
+ checksum="43347a0cc383c0d3dbe0d24ae56f328d",
+ ),
+ DataLocation(
+ url="https://figshare.com/ndownloader/files/43169011",
+ filename="2018.zip",
+ md5=True,
+ checksum="25d091044a10c78674fedad655023e3b",
+ ),
  ]

  index2label: dict[int, str] = {
@@ -95,15 +116,16 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  def __init__(
  self,
  root: str | Path,
- download: bool = False,
+ image_set: Literal["train", "operational", "base"] = "train",
  transforms: Transform[NDArray[Any]] | Sequence[Transform[NDArray[Any]]] | None = None,
+ download: bool = False,
  verbose: bool = False,
  ) -> None:
  super().__init__(
  root,
- download,
- "base",
+ image_set,
  transforms,
+ download,
  verbose,
  )

@@ -112,10 +134,16 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  targets: list[str] = []
  datum_metadata: dict[str, list[Any]] = {}
  metadata_list: list[dict[str, Any]] = []
+ image_sets: dict[str, list[int]] = {
+ "base": list(range(len(self._resources))),
+ "train": list(range(3)),
+ "operational": list(range(3, len(self._resources))),
+ }

  # Load the data
- for resource in self._resources:
- self._resource = resource
+ resource_indices = image_sets[self.image_set]
+ for idx in resource_indices:
+ self._resource = self._resources[idx]
  filepath, target, metadata = super()._load_data()
  filepaths.extend(filepath)
  targets.extend(target)
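The MILCO changes above add an `image_set` split ("train", "operational" or "base") and move `download` after `transforms` in the constructor. A minimal usage sketch, assuming the class is importable from `dataeval.utils.data.datasets` and that `./data` is writable:

    from dataeval.utils.data.datasets import MILCO

    # "train" pulls the 2015/2017/2021 resources; "operational" pulls 2010/2018.
    train_ds = MILCO("./data", image_set="train", download=True)
    operational_ds = MILCO("./data", image_set="operational", download=True)
    print(train_ds.size, operational_ds.size)

Passing `download` by keyword keeps the call valid under both the 0.84.0 and 0.84.1 argument orders.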
dataeval/utils/data/datasets/_mnist.py

@@ -49,9 +49,6 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  ----------
  root : str or pathlib.Path
  Root directory of dataset where the ``mnist`` folder exists.
- download : bool, default False
- If True, downloads the dataset from the internet and puts it in root directory.
- Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
  image_set : "train", "test" or "base", default "train"
  If "base", returns all of the data to allow the user to create their own splits.
  corruption : "identity", "shot_noise", "impulse_noise", "glass_blur", "motion_blur", \
@@ -60,6 +57,9 @@
  Corruption to apply to the data.
  transforms : Transform, Sequence[Transform] or None, default None
  Transform(s) to apply to the data.
+ download : bool, default False
+ If True, downloads the dataset from the internet and puts it in root directory.
+ Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
  verbose : bool, default False
  If True, outputs print statements.

@@ -81,6 +81,10 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  The transforms to be applied to the data.
  size : int
  The size of the dataset.
+
+ Note
+ ----
+ Data License: `CC BY 4.0 <https://creativecommons.org/licenses/by/4.0/>`_ for corruption dataset
  """

  _resources = [
@@ -114,10 +118,10 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  def __init__(
  self,
  root: str | Path,
- download: bool = False,
  image_set: Literal["train", "test", "base"] = "train",
  corruption: CorruptionStringMap | None = None,
  transforms: Transform[NDArray[Any]] | Sequence[Transform[NDArray[Any]]] | None = None,
+ download: bool = False,
  verbose: bool = False,
  ) -> None:
  self.corruption = corruption
@@ -127,9 +131,9 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):

  super().__init__(
  root,
- download,
  image_set,
  transforms,
+ download,
  verbose,
  )

dataeval/utils/data/datasets/_ships.py

@@ -31,11 +31,11 @@ class Ships(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  ----------
  root : str or pathlib.Path
  Root directory of dataset where the ``shipdataset`` folder exists.
+ transforms : Transform, Sequence[Transform] or None, default None
+ Transform(s) to apply to the data.
  download : bool, default False
  If True, downloads the dataset from the internet and puts it in root directory.
  Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
- transforms : Transform, Sequence[Transform] or None, default None
- Transform(s) to apply to the data.
  verbose : bool, default False
  If True, outputs print statements.

@@ -55,6 +55,10 @@ class Ships(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  The transforms to be applied to the data.
  size : int
  The size of the dataset.
+
+ Note
+ ----
+ Data License: `CC BY-SA 4.0 <https://creativecommons.org/licenses/by-sa/4.0/>`_
  """

  _resources = [
@@ -74,15 +78,15 @@ class Ships(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  def __init__(
  self,
  root: str | Path,
- download: bool = False,
  transforms: Transform[NDArray[Any]] | Sequence[Transform[NDArray[Any]]] | None = None,
+ download: bool = False,
  verbose: bool = False,
  ) -> None:
  super().__init__(
  root,
- download,
  "base",
  transforms,
+ download,
  verbose,
  )
  self._scenes: list[str] = self._load_scenes()
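As with MILCO, the MNIST and Ships constructors above (and the VOC classes below) move `download` to the position after `transforms`, so positional calls written against 0.84.0 now bind the wrong parameters. A hedged sketch, assuming `dataeval.utils.data.datasets` is the public import path:

    from dataeval.utils.data.datasets import MNIST

    # 0.84.0 order: MNIST(root, download, image_set, corruption, transforms, verbose)
    # 0.84.1 order: MNIST(root, image_set, corruption, transforms, download, verbose)
    # Keyword arguments are valid under both orders:
    mnist = MNIST("./data", image_set="train", corruption="identity", download=True)
    print(mnist.size)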
dataeval/utils/data/datasets/_voc.py

@@ -14,6 +14,8 @@ from dataeval.utils.data.datasets._base import (
  BaseODDataset,
  BaseSegDataset,
  DataLocation,
+ _TArray,
+ _TTarget,
  )
  from dataeval.utils.data.datasets._mixin import BaseDatasetNumpyMixin, BaseDatasetTorchMixin
  from dataeval.utils.data.datasets._types import ObjectDetectionTarget, SegmentationTarget
@@ -21,9 +23,6 @@ from dataeval.utils.data.datasets._types import ObjectDetectionTarget, Segmentat
  if TYPE_CHECKING:
  from dataeval.typing import Transform

- _TArray = TypeVar("_TArray")
- _TTarget = TypeVar("_TTarget")
-
  VOCClassStringMap = Literal[
  "aeroplane",
  "bicycle",
@@ -121,19 +120,19 @@ class BaseVOCDataset(BaseDataset[_TArray, _TTarget, list[str]]):
  def __init__(
  self,
  root: str | Path,
- year: Literal["2007", "2008", "2009", "2010", "2011", "2012"] = "2012",
  image_set: Literal["train", "val", "test", "base"] = "train",
- download: bool = False,
+ year: Literal["2007", "2008", "2009", "2010", "2011", "2012"] = "2012",
  transforms: Transform[_TArray] | Sequence[Transform[_TArray]] | None = None,
+ download: bool = False,
  verbose: bool = False,
  ) -> None:
  self.year = year
  self._resource_index = self._get_year_image_set_index(year, image_set)
  super().__init__(
  root,
- download,
  image_set,
  transforms,
+ download,
  verbose,
  )

@@ -191,10 +190,14 @@ class BaseVOCDataset(BaseDataset[_TArray, _TTarget, list[str]]):
  for entry in data:
  file_name = Path(entry).name
  file_stem = Path(entry).stem
- # Remove file extension and split by "_"
- parts = file_stem.split("_")
- file_meta["year"].append(parts[0])
- file_meta["image_id"].append(parts[1])
+ if self.year != "2007":
+ # Remove file extension and split by "_"
+ parts = file_stem.split("_")
+ file_meta["year"].append(parts[0])
+ file_meta["image_id"].append(parts[1])
+ else:
+ file_meta["year"].append(self.year)
+ file_meta["image_id"].append(file_stem)
  file_meta["mask_path"].append(str(seg_folder / file_name))
  annotations.append(str(ann_folder / file_stem) + ".xml")

@@ -250,9 +253,6 @@ class VOCDetection(
  ----------
  root : str or pathlib.Path
  Root directory of dataset where the ``vocdataset`` folder exists.
- download : bool, default False
- If True, downloads the dataset from the internet and puts it in root directory.
- Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
  image_set : "train", "val", "test", or "base", default "train"
  If "test", then dataset year must be "2007".
  If "base", then the combined dataset of "train" and "val" is returned.
@@ -260,6 +260,9 @@
  The dataset year.
  transforms : Transform, Sequence[Transform] or None, default None
  Transform(s) to apply to the data.
+ download : bool, default False
+ If True, downloads the dataset from the internet and puts it in root directory.
+ Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
  verbose : bool, default False
  If True, outputs print statements.

@@ -267,6 +270,8 @@
  ----------
  path : pathlib.Path
  Location of the folder containing the data.
+ year : "2007", "2008", "2009", "2010", "2011" or "2012"
+ The selected dataset year.
  image_set : "train", "val", "test" or "base"
  The selected image set from the dataset.
  index2label : dict[int, str]
@@ -279,6 +284,10 @@
  The transforms to be applied to the data.
  size : int
  The size of the dataset.
+
+ Note
+ ----
+ Data License: `Flickr Terms of Use <http://www.flickr.com/terms.gne?legacy=1>`_
  """


@@ -294,9 +303,6 @@ class VOCDetectionTorch(
  ----------
  root : str or pathlib.Path
  Root directory of dataset where the ``vocdataset`` folder exists.
- download : bool, default False
- If True, downloads the dataset from the internet and puts it in root directory.
- Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
  image_set : "train", "val", "test", or "base", default "train"
  If "test", then dataset year must be "2007".
  If "base", then the combined dataset of "train" and "val" is returned.
@@ -304,6 +310,9 @@ class VOCDetectionTorch(
  The dataset year.
  transforms : Transform, Sequence[Transform] or None, default None
  Transform(s) to apply to the data.
+ download : bool, default False
+ If True, downloads the dataset from the internet and puts it in root directory.
+ Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
  verbose : bool, default False
  If True, outputs print statements.

@@ -311,6 +320,8 @@
  ----------
  path : pathlib.Path
  Location of the folder containing the data.
+ year : "2007", "2008", "2009", "2010", "2011" or "2012"
+ The selected dataset year.
  image_set : "train", "val", "test" or "base"
  The selected image set from the dataset.
  index2label : dict[int, str]
@@ -323,6 +334,10 @@
  The transforms to be applied to the data.
  size : int
  The size of the dataset.
+
+ Note
+ ----
+ Data License: `Flickr Terms of Use <http://www.flickr.com/terms.gne?legacy=1>`_
  """


@@ -338,9 +353,6 @@ class VOCSegmentation(
  ----------
  root : str or pathlib.Path
  Root directory of dataset where the ``vocdataset`` folder exists.
- download : bool, default False
- If True, downloads the dataset from the internet and puts it in root directory.
- Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
  image_set : "train", "val", "test", or "base", default "train"
  If "test", then dataset year must be "2007".
  If "base", then the combined dataset of "train" and "val" is returned.
@@ -348,6 +360,9 @@
  The dataset year.
  transforms : Transform, Sequence[Transform] or None, default None
  Transform(s) to apply to the data.
+ download : bool, default False
+ If True, downloads the dataset from the internet and puts it in root directory.
+ Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
  verbose : bool, default False
  If True, outputs print statements.

@@ -355,6 +370,8 @@
  ----------
  path : pathlib.Path
  Location of the folder containing the data.
+ year : "2007", "2008", "2009", "2010", "2011" or "2012"
+ The selected dataset year.
  image_set : "train", "val", "test" or "base"
  The selected image set from the dataset.
  index2label : dict[int, str]
@@ -367,6 +384,10 @@
  The transforms to be applied to the data.
  size : int
  The size of the dataset.
+
+ Note
+ ----
+ Data License: `Flickr Terms of Use <http://www.flickr.com/terms.gne?legacy=1>`_
  """

  def _load_data(self) -> tuple[list[str], list[str], dict[str, list[Any]]]:
dataeval/utils/data/selections/__init__.py

@@ -1,6 +1,7 @@
  """Provides selection classes for selecting subsets of Computer Vision datasets."""

  __all__ = [
+ "ClassBalance",
  "ClassFilter",
  "Indices",
  "Limit",
@@ -9,6 +10,7 @@ __all__ = [
  "Shuffle",
  ]

+ from dataeval.utils.data.selections._classbalance import ClassBalance
  from dataeval.utils.data.selections._classfilter import ClassFilter
  from dataeval.utils.data.selections._indices import Indices
  from dataeval.utils.data.selections._limit import Limit
dataeval/utils/data/selections/_classbalance.py (new file)

@@ -0,0 +1,38 @@
+ from __future__ import annotations
+
+ __all__ = []
+
+
+ import numpy as np
+
+ from dataeval.typing import Array, ImageClassificationDatum
+ from dataeval.utils._array import as_numpy
+ from dataeval.utils.data._selection import Select, Selection, SelectionStage
+
+
+ class ClassBalance(Selection[ImageClassificationDatum]):
+ """
+ Balance the dataset by class.
+
+ Note
+ ----
+ The total number of instances of each class will be equalized which may result
+ in a lower total number of instances than specified by the selection limit.
+ """
+
+ stage = SelectionStage.FILTER
+
+ def __call__(self, dataset: Select[ImageClassificationDatum]) -> None:
+ class_indices: dict[int, list[int]] = {}
+ for i, idx in enumerate(dataset._selection):
+ target = dataset._dataset[idx][1]
+ if isinstance(target, Array):
+ label = int(np.argmax(as_numpy(target)))
+ else:
+ # ObjectDetectionTarget and SegmentationTarget not supported yet
+ raise TypeError("ClassFilter only supports classification targets as an array of confidence scores.")
+ class_indices.setdefault(label, []).append(i)
+
+ per_class_limit = min(min(len(c) for c in class_indices.values()), dataset._size_limit // len(class_indices))
+ subselection = sorted([i for v in class_indices.values() for i in v[:per_class_limit]])
+ dataset._selection = [dataset._selection[i] for i in subselection]
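The new ClassBalance selection keeps `min(smallest class count, size_limit // number of classes)` items per class. A small sketch of that rule with hypothetical counts (the variable names below are illustrative, not part of the DataEval API):

    # Hypothetical per-class index counts after earlier selections have run
    class_counts = {0: 10, 1: 4, 2: 7}
    size_limit = 12  # e.g. imposed by a Limit selection

    per_class_limit = min(min(class_counts.values()), size_limit // len(class_counts))
    assert per_class_limit == 4  # 4 items kept per class, 12 items total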
dataeval/utils/data/selections/_classfilter.py

@@ -2,7 +2,7 @@ from __future__ import annotations

  __all__ = []

- from typing import Sequence, TypeVar
+ from typing import Sequence

  import numpy as np

@@ -10,50 +10,35 @@ from dataeval.typing import Array, ImageClassificationDatum
  from dataeval.utils._array import as_numpy
  from dataeval.utils.data._selection import Select, Selection, SelectionStage

- TImageClassificationDatum = TypeVar("TImageClassificationDatum", bound=ImageClassificationDatum)

-
- class ClassFilter(Selection[TImageClassificationDatum]):
+ class ClassFilter(Selection[ImageClassificationDatum]):
  """
- Filter and balance the dataset by class.
+ Filter the dataset by class.

  Parameters
  ----------
- classes : Sequence[int] or None, default None
- The classes to filter by. If None, all classes are included.
- balance : bool, default False
- Whether to balance the classes.
-
- Note
- ----
- If `balance` is True, the total number of instances of each class will
- be equalized. This may result in a lower total number of instances.
+ classes : Sequence[int]
+ The classes to filter by.
  """

  stage = SelectionStage.FILTER

- def __init__(self, classes: Sequence[int] | None = None, balance: bool = False) -> None:
+ def __init__(self, classes: Sequence[int]) -> None:
  self.classes = classes
- self.balance = balance

- def __call__(self, dataset: Select[TImageClassificationDatum]) -> None:
- if self.classes is None and not self.balance:
+ def __call__(self, dataset: Select[ImageClassificationDatum]) -> None:
+ if not self.classes:
  return

- per_class_limit = dataset._size_limit // len(self.classes) if self.classes and self.balance else 0
- class_indices: dict[int, list[int]] = {} if self.classes is None else {k: [] for k in self.classes}
- for i, idx in enumerate(dataset._selection):
+ selection = []
+ for idx in dataset._selection:
  target = dataset._dataset[idx][1]
  if isinstance(target, Array):
  label = int(np.argmax(as_numpy(target)))
  else:
  # ObjectDetectionTarget and SegmentationTarget not supported yet
  raise TypeError("ClassFilter only supports classification targets as an array of confidence scores.")
- if not self.classes or label in self.classes:
- class_indices.setdefault(label, []).append(i)
- if per_class_limit and all(len(indices) >= per_class_limit for indices in class_indices.values()):
- break
-
- per_class_limit = min(len(c) for c in class_indices.values()) if self.balance else dataset._size_limit
- subselection = sorted([i for v in class_indices.values() for i in v[:per_class_limit]])
- dataset._selection = [dataset._selection[i] for i in subselection]
+ if label in self.classes:
+ selection.append(idx)
+
+ dataset._selection = selection
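With balancing split out of ClassFilter, code written against 0.84.0 that used `balance=True` can compose the two selections instead. A hedged migration sketch, assuming both classes are importable from `dataeval.utils.data.selections`:

    from dataeval.utils.data.selections import ClassBalance, ClassFilter

    # 0.84.0: ClassFilter(classes=[3, 5], balance=True)
    # 0.84.1: filter first, then balance with the new selection
    selections = [ClassFilter([3, 5]), ClassBalance()]

Both are FILTER-stage selections, so they are applied at the same stage when combined in a Select.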
dataeval/utils/data/selections/_prioritize.py

@@ -272,7 +272,7 @@ class Prioritize(Selection[Any]):
  return _KMeansComplexitySorter(samples, self._c)

  def _to_normalized_ndarray(self, embeddings: Embeddings, selection: list[int] | None = None) -> NDArray[Any]:
- emb: NDArray[Any] = embeddings.to_tensor(selection).cpu().numpy()
+ emb: NDArray[Any] = embeddings.to_numpy(selection)
  emb /= max(np.max(np.linalg.norm(emb, axis=1)), EPSILON)
  return emb

dataeval/utils/data/selections/_shuffle.py

@@ -8,7 +8,7 @@ import numpy as np
  from numpy.random import BitGenerator, Generator, SeedSequence
  from numpy.typing import NDArray

- from dataeval.typing import Array, ArrayLike
+ from dataeval.typing import Array
  from dataeval.utils._array import as_numpy
  from dataeval.utils.data._selection import Select, Selection, SelectionStage

@@ -30,7 +30,7 @@ class Shuffle(Selection[Any]):
  seed: int | NDArray[Any] | SeedSequence | BitGenerator | Generator | None
  stage = SelectionStage.ORDER

- def __init__(self, seed: int | ArrayLike | SeedSequence | BitGenerator | Generator | None = None):
+ def __init__(self, seed: int | Sequence[int] | Array | SeedSequence | BitGenerator | Generator | None = None):
  self.seed = as_numpy(seed) if isinstance(seed, (Sequence, Array)) else seed

  def __call__(self, dataset: Select[Any]) -> None:
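Shuffle's `seed` annotation is narrowed from `ArrayLike` to the concrete types it actually handles. A short sketch of seeds that satisfy the 0.84.1 signature (the public import path is assumed to be the selections module):

    import numpy as np
    from dataeval.utils.data.selections import Shuffle

    Shuffle(seed=42)                                # int
    Shuffle(seed=[1, 2, 3])                         # Sequence[int], converted with as_numpy
    Shuffle(seed=np.random.SeedSequence(1234))      # numpy SeedSequence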
dataeval/utils/torch/_internal.py

@@ -2,7 +2,6 @@ from __future__ import annotations

  __all__ = []

- from functools import partial
  from typing import Any, Callable

  import numpy as np
@@ -12,16 +11,16 @@ from torch.utils.data import DataLoader, TensorDataset
  from tqdm import tqdm

  from dataeval.config import DeviceLike, get_device
+ from dataeval.typing import Array


  def predict_batch(
- x: NDArray[Any] | torch.Tensor,
- model: Callable | torch.nn.Module | torch.nn.Sequential,
+ x: Array,
+ model: torch.nn.Module,
  device: DeviceLike | None = None,
  batch_size: int = int(1e10),
  preprocess_fn: Callable[[torch.Tensor], torch.Tensor] | None = None,
- dtype: type[np.generic] | torch.dtype = np.float32,
- ) -> NDArray[Any] | torch.Tensor | tuple[Any, ...]:
+ ) -> torch.Tensor:
  """
  Make batch predictions on a model.

@@ -29,7 +28,7 @@
  ----------
  x : np.ndarray | torch.Tensor
  Batch of instances.
- model : Callable | nn.Module | nn.Sequential
+ model : nn.Module
  PyTorch model.
  device : DeviceLike or None, default None
  The hardware device to use if specified, otherwise uses the DataEval
@@ -38,21 +37,18 @@
  Batch size used during prediction.
  preprocess_fn : Callable | None, default None
  Optional preprocessing function for each batch.
- dtype : np.dtype | torch.dtype, default np.float32
- Model output type, either a :term:`NumPy` or torch dtype, e.g. np.float32 or torch.float32.

  Returns
  -------
- NDArray | torch.Tensor | tuple
- Numpy array, torch tensor or tuples of those with model outputs.
+ torch.Tensor
+ PyTorch tensor with model outputs.
  """
  device = get_device(device)
- if isinstance(x, np.ndarray):
- x = torch.tensor(x, device=device)
+ if isinstance(model, torch.nn.Module):
+ model = model.to(device).eval()
+ x = torch.tensor(x, device=device)
  n = len(x)
  n_minibatch = int(np.ceil(n / batch_size))
- return_np = not isinstance(dtype, torch.dtype)
- preds_tuple = None
  preds_array = []
  with torch.no_grad():
  for i in range(n_minibatch):
@@ -60,28 +56,9 @@
  x_batch = x[istart:istop]
  if isinstance(preprocess_fn, Callable):
  x_batch = preprocess_fn(x_batch)
+ preds_array.append(model(x_batch.to(dtype=torch.float32)).cpu())

- preds_tmp = model(x_batch.to(dtype=torch.float32))
- if isinstance(preds_tmp, (list, tuple)):
- if preds_tuple is None: # init tuple with lists to store predictions
- preds_tuple = tuple([] for _ in range(len(preds_tmp)))
- for j, p in enumerate(preds_tmp):
- p = p.cpu() if isinstance(p, torch.Tensor) else p
- preds_tuple[j].append(p if not return_np or isinstance(p, np.ndarray) else p.numpy())
- elif isinstance(preds_tmp, (np.ndarray, torch.Tensor)):
- preds_tmp = preds_tmp.cpu() if isinstance(preds_tmp, torch.Tensor) else preds_tmp
- preds_array.append(
- preds_tmp if not return_np or isinstance(preds_tmp, np.ndarray) else preds_tmp.numpy()
- )
- else:
- raise TypeError(
- f"Model output type {type(preds_tmp)} not supported. The model \
- output type needs to be one of list, tuple, NDArray or \
- torch.Tensor."
- )
- concat = partial(np.concatenate, axis=0) if return_np else partial(torch.cat, dim=0)
- out = tuple(concat(p) for p in preds_tuple) if preds_tuple is not None else concat(preds_array)
- return out
+ return torch.cat(preds_array, dim=0)


  def trainer(
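The rewritten predict_batch drops the dtype argument and the list/tuple/NumPy output handling: it takes an Array batch and an nn.Module and always returns a single torch.Tensor. A hedged sketch, assuming the function remains importable from the private dataeval.utils.torch._internal module:

    import numpy as np
    import torch
    from dataeval.utils.torch._internal import predict_batch

    model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 10))
    images = np.random.rand(256, 3, 32, 32).astype(np.float32)

    preds = predict_batch(images, model, batch_size=64)  # torch.Tensor of shape (256, 10)
    preds_np = preds.numpy()  # callers that previously relied on dtype=np.float32 now convert explicitly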
{dataeval-0.84.0.dist-info → dataeval-0.84.1.dist-info}/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: dataeval
- Version: 0.84.0
+ Version: 0.84.1
  Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
  Home-page: https://dataeval.ai/
  License: MIT
@@ -82,8 +82,7 @@ using MAITE-compliant datasets and models.

  **Python versions:** 3.9 - 3.12

- **Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*,
- *Gradient*
+ **Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*

  Choose your preferred method of installation below or follow our
  [installation guide](https://dataeval.readthedocs.io/en/v0.74.2/installation.html).