PyPI - dataeval - Versions diffs - 0.82.0__py3-none-any.whl → 0.83.0__py3-none-any.whl - Mend

dataeval 0.82.0__py3-none-any.whl → 0.83.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57)

dataeval/__init__.py +7 -2
dataeval/config.py +78 -11
dataeval/detectors/drift/_mmd.py +9 -9
dataeval/detectors/drift/_torch.py +7 -7
dataeval/detectors/drift/_uncertainty.py +4 -4
dataeval/detectors/linters/duplicates.py +3 -3
dataeval/detectors/linters/outliers.py +3 -3
dataeval/detectors/ood/ae.py +5 -4
dataeval/detectors/ood/base.py +2 -2
dataeval/detectors/ood/mixin.py +1 -1
dataeval/detectors/ood/vae.py +2 -1
dataeval/metadata/__init__.py +2 -2
dataeval/metadata/_distance.py +11 -44
dataeval/metadata/_ood.py +152 -33
dataeval/metrics/bias/_balance.py +9 -5
dataeval/metrics/bias/_diversity.py +3 -0
dataeval/metrics/bias/_parity.py +2 -0
dataeval/metrics/estimators/_ber.py +2 -1
dataeval/metrics/stats/_base.py +20 -21
dataeval/metrics/stats/_boxratiostats.py +1 -1
dataeval/metrics/stats/_dimensionstats.py +2 -2
dataeval/metrics/stats/_hashstats.py +2 -2
dataeval/metrics/stats/_imagestats.py +8 -8
dataeval/metrics/stats/_pixelstats.py +2 -2
dataeval/metrics/stats/_visualstats.py +2 -2
dataeval/outputs/__init__.py +5 -0
dataeval/outputs/_base.py +50 -21
dataeval/outputs/_bias.py +1 -1
dataeval/outputs/_linters.py +4 -2
dataeval/outputs/_metadata.py +61 -0
dataeval/outputs/_stats.py +12 -6
dataeval/typing.py +40 -9
dataeval/utils/_mst.py +1 -2
dataeval/utils/data/_embeddings.py +23 -19
dataeval/utils/data/_metadata.py +16 -7
dataeval/utils/data/_selection.py +22 -15
dataeval/utils/data/_split.py +3 -2
dataeval/utils/data/datasets/_base.py +4 -2
dataeval/utils/data/datasets/_cifar10.py +17 -9
dataeval/utils/data/datasets/_milco.py +18 -12
dataeval/utils/data/datasets/_mnist.py +24 -8
dataeval/utils/data/datasets/_ships.py +18 -8
dataeval/utils/data/datasets/_types.py +1 -5
dataeval/utils/data/datasets/_voc.py +47 -24
dataeval/utils/data/selections/__init__.py +2 -0
dataeval/utils/data/selections/_classfilter.py +5 -3
dataeval/utils/data/selections/_prioritize.py +296 -0
dataeval/utils/data/selections/_shuffle.py +13 -4
dataeval/utils/torch/_gmm.py +3 -2
dataeval/utils/torch/_internal.py +5 -5
dataeval/utils/torch/trainer.py +8 -8
{dataeval-0.82.0.dist-info → dataeval-0.83.0.dist-info}/METADATA +4 -4
dataeval-0.83.0.dist-info/RECORD +105 -0
dataeval/detectors/ood/metadata_ood_mi.py +0 -93
dataeval-0.82.0.dist-info/RECORD +0 -104
{dataeval-0.82.0.dist-info → dataeval-0.83.0.dist-info}/LICENSE.txt +0 -0
{dataeval-0.82.0.dist-info → dataeval-0.83.0.dist-info}/WHEEL +0 -0

dataeval/utils/data/datasets/_cifar10.py CHANGED Viewed

@@ -3,7 +3,7 @@ from __future__ import annotations
 __all__ = []
 from pathlib import Path
-from typing import Any, Literal, Sequence, TypeVar
+from typing import TYPE_CHECKING, Any, Literal, Sequence, TypeVar
 import numpy as np
 from numpy.typing import NDArray
@@ -11,7 +11,9 @@ from PIL import Image
 from dataeval.utils.data.datasets._base import BaseICDataset, DataLocation
 from dataeval.utils.data.datasets._mixin import BaseDatasetNumpyMixin
-from dataeval.utils.data.datasets._types import Transform
+if TYPE_CHECKING:
+    from dataeval.typing import Transform
 CIFARClassStringMap = Literal["airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck"]
 TCIFARClassMap = TypeVar("TCIFARClassMap", CIFARClassStringMap, int, list[CIFARClassStringMap], list[int])
@@ -30,21 +32,27 @@ class CIFAR10(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
         Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
     image_set : "train", "test" or "base", default "train"
         If "base", returns all of the data to allow the user to create their own splits.
-    transforms : Transform | Sequence[Transform] | None, default None
+    transforms : Transform, Sequence[Transform] or None, default None
         Transform(s) to apply to the data.
     verbose : bool, default False
         If True, outputs print statements.
     Attributes
     ----------
-    index2label : dict
+    path : pathlib.Path
+        Location of the folder containing the data.
+    image_set : "train", "test" or "base"
+        The selected image set from the dataset.
+    index2label : dict[int, str]
         Dictionary which translates from class integers to the associated class strings.
-    label2index : dict
+    label2index : dict[str, int]
         Dictionary which translates from class strings to the associated class integers.
-    path : Path
-        Location of the folder containing the data.
-    metadata : dict
-        Dictionary containing Dataset metadata, such as `id` which returns the dataset class name.
+    metadata : DatasetMetadata
+        Typed dictionary containing dataset metadata, such as `id` which returns the dataset class name.
+    transforms : Sequence[Transform]
+        The transforms to be applied to the data.
+    size : int
+        The size of the dataset.
     """
     _resources = [

dataeval/utils/data/datasets/_milco.py CHANGED Viewed

@@ -1,23 +1,23 @@
 from __future__ import annotations
-from dataeval.utils.data.datasets._mixin import BaseDatasetNumpyMixin
 __all__ = []
 from pathlib import Path
-from typing import Any, Sequence
+from typing import TYPE_CHECKING, Any, Sequence
 from numpy.typing import NDArray
 from dataeval.utils.data.datasets._base import BaseODDataset, DataLocation
-from dataeval.utils.data.datasets._types import Transform
+from dataeval.utils.data.datasets._mixin import BaseDatasetNumpyMixin
+if TYPE_CHECKING:
+    from dataeval.typing import Transform
 class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     """
     A side-scan sonar dataset focused on mine (object) detection.
     The dataset comes from the paper
     `Side-scan sonar imaging data of underwater vehicles for mine detection <https://doi.org/10.1016/j.dib.2024.110132>`_
     by N.P. Santos et. al. (2024).
@@ -43,21 +43,27 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     download : bool, default False
         If True, downloads the dataset from the internet and puts it in root directory.
         Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
-    transforms : Transform | Sequence[Transform] | None, default None
+    transforms : Transform, Sequence[Transform] or None, default None
         Transform(s) to apply to the data.
     verbose : bool, default False
         If True, outputs print statements.
     Attributes
     ----------
-    index2label : dict
+    path : pathlib.Path
+        Location of the folder containing the data.
+    image_set : "base"
+        The base image set is the only available image set for the MILCO dataset.
+    index2label : dict[int, str]
         Dictionary which translates from class integers to the associated class strings.
-    label2index : dict
+    label2index : dict[str, int]
         Dictionary which translates from class strings to the associated class integers.
-    path : Path
-        Location of the folder containing the data.
-    metadata : dict
-        Dictionary containing Dataset metadata, such as `id` which returns the dataset class name.
+    metadata : DatasetMetadata
+        Typed dictionary containing dataset metadata, such as `id` which returns the dataset class name.
+    transforms : Sequence[Transform]
+        The transforms to be applied to the data.
+    size : int
+        The size of the dataset.
     """
     _resources = [

dataeval/utils/data/datasets/_mnist.py CHANGED Viewed

@@ -3,14 +3,16 @@ from __future__ import annotations
 __all__ = []
 from pathlib import Path
-from typing import Any, Literal, Sequence, TypeVar
+from typing import TYPE_CHECKING, Any, Literal, Sequence, TypeVar
 import numpy as np
 from numpy.typing import NDArray
 from dataeval.utils.data.datasets._base import BaseICDataset, DataLocation
 from dataeval.utils.data.datasets._mixin import BaseDatasetNumpyMixin
-from dataeval.utils.data.datasets._types import Transform
+if TYPE_CHECKING:
+    from dataeval.typing import Transform
 MNISTClassStringMap = Literal["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
 TMNISTClassMap = TypeVar("TMNISTClassMap", MNISTClassStringMap, int, list[MNISTClassStringMap], list[int])
@@ -52,19 +54,33 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
         Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
     image_set : "train", "test" or "base", default "train"
         If "base", returns all of the data to allow the user to create their own splits.
+    corruption : "identity", "shot_noise", "impulse_noise", "glass_blur", "motion_blur", \
+        "shear", "scale", "rotate", "brightness", "translate", "stripe", "fog", "spatter", \
+        "dotted_line", "zigzag", "canny_edges" or None, default None
+        Corruption to apply to the data.
+    transforms : Transform, Sequence[Transform] or None, default None
+        Transform(s) to apply to the data.
     verbose : bool, default False
         If True, outputs print statements.
     Attributes
     ----------
-    index2label : dict
+    path : pathlib.Path
+        Location of the folder containing the data.
+    image_set : "train", "test" or "base"
+        The selected image set from the dataset.
+    index2label : dict[int, str]
         Dictionary which translates from class integers to the associated class strings.
-    label2index : dict
+    label2index : dict[str, int]
         Dictionary which translates from class strings to the associated class integers.
-    path : Path
-        Location of the folder containing the data.
-    metadata : dict
-        Dictionary containing Dataset metadata, such as `id` which returns the dataset class name.
+    metadata : DatasetMetadata
+        Typed dictionary containing dataset metadata, such as `id` which returns the dataset class name.
+    corruption : str or None
+        Corruption applied to the data.
+    transforms : Sequence[Transform]
+        The transforms to be applied to the data.
+    size : int
+        The size of the dataset.
     """
     _resources = [

dataeval/utils/data/datasets/_ships.py CHANGED Viewed

@@ -3,14 +3,16 @@ from __future__ import annotations
 __all__ = []
 from pathlib import Path
-from typing import Any, Sequence
+from typing import TYPE_CHECKING, Any, Sequence
 import numpy as np
 from numpy.typing import NDArray
 from dataeval.utils.data.datasets._base import BaseICDataset, DataLocation
 from dataeval.utils.data.datasets._mixin import BaseDatasetNumpyMixin
-from dataeval.utils.data.datasets._types import Transform
+if TYPE_CHECKING:
+    from dataeval.typing import Transform
 class Ships(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
@@ -32,19 +34,27 @@ class Ships(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     download : bool, default False
         If True, downloads the dataset from the internet and puts it in root directory.
         Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
+    transforms : Transform, Sequence[Transform] or None, default None
+        Transform(s) to apply to the data.
     verbose : bool, default False
         If True, outputs print statements.
     Attributes
     ----------
-    index2label : dict
+    path : pathlib.Path
+        Location of the folder containing the data.
+    image_set : "base"
+        The base image set is the only available image set for the Ships dataset.
+    index2label : dict[int, str]
         Dictionary which translates from class integers to the associated class strings.
-    label2index : dict
+    label2index : dict[str, int]
         Dictionary which translates from class strings to the associated class integers.
-    path : Path
-        Location of the folder containing the data.
-    metadata : dict
-        Dictionary containing Dataset metadata, such as `id` which returns the dataset class name.
+    metadata : DatasetMetadata
+        Typed dictionary containing dataset metadata, such as `id` which returns the dataset class name.
+    transforms : Sequence[Transform]
+        The transforms to be applied to the data.
+    size : int
+        The size of the dataset.
     """
     _resources = [

dataeval/utils/data/datasets/_types.py CHANGED Viewed

@@ -3,7 +3,7 @@ from __future__ import annotations
 __all__ = []
 from dataclasses import dataclass
-from typing import Any, Generic, Protocol, TypedDict, TypeVar
+from typing import Any, Generic, TypedDict, TypeVar
 from torch.utils.data import Dataset
 from typing_extensions import NotRequired, Required
@@ -46,7 +46,3 @@ class SegmentationTarget(Generic[_TArray]):
 class SegmentationDataset(AnnotatedDataset[tuple[_TArray, SegmentationTarget[_TArray], dict[str, Any]]]): ...
-class Transform(Generic[_TArray], Protocol):
-    def __call__(self, data: _TArray, /) -> _TArray: ...

dataeval/utils/data/datasets/_voc.py CHANGED Viewed

@@ -3,7 +3,7 @@ from __future__ import annotations
 __all__ = []
 from pathlib import Path
-from typing import Any, Literal, Sequence, TypeVar
+from typing import TYPE_CHECKING, Any, Literal, Sequence, TypeVar
 import torch
 from defusedxml.ElementTree import parse
@@ -16,7 +16,10 @@ from dataeval.utils.data.datasets._base import (
     DataLocation,
 )
 from dataeval.utils.data.datasets._mixin import BaseDatasetNumpyMixin, BaseDatasetTorchMixin
-from dataeval.utils.data.datasets._types import ObjectDetectionTarget, SegmentationTarget, Transform
+from dataeval.utils.data.datasets._types import ObjectDetectionTarget, SegmentationTarget
+if TYPE_CHECKING:
+    from dataeval.typing import Transform
 _TArray = TypeVar("_TArray")
 _TTarget = TypeVar("_TTarget")
@@ -201,6 +204,8 @@ class BaseVOCDataset(BaseDataset[_TArray, _TTarget, list[str]]):
         boxes: list[list[float]] = []
         label_str = []
         root = parse(annotation).getroot()
+        if root is None:
+            raise ValueError(f"Unable to parse {annotation}")
         num_objects = len(root.findall("object"))
         additional_meta: dict[str, Any] = {
             "folder": [root.findtext("folder", default="") for _ in range(num_objects)],
@@ -253,21 +258,27 @@ class VOCDetection(
         If "base", then the combined dataset of "train" and "val" is returned.
     year : "2007", "2008", "2009", "2010", "2011" or "2012", default "2012"
         The dataset year.
-    transforms : Transform | Sequence[Transform] | None, default None
+    transforms : Transform, Sequence[Transform] or None, default None
         Transform(s) to apply to the data.
     verbose : bool, default False
         If True, outputs print statements.
     Attributes
     ----------
-    index2label : dict
+    path : pathlib.Path
+        Location of the folder containing the data.
+    image_set : "train", "val", "test" or "base"
+        The selected image set from the dataset.
+    index2label : dict[int, str]
         Dictionary which translates from class integers to the associated class strings.
-    label2index : dict
+    label2index : dict[str, int]
         Dictionary which translates from class strings to the associated class integers.
-    path : Path
-        Location of the folder containing the data.
-    metadata : dict
-        Dictionary containing Dataset metadata, such as `id` which returns the dataset class name.
+    metadata : DatasetMetadata
+        Typed dictionary containing dataset metadata, such as `id` which returns the dataset class name.
+    transforms : Sequence[Transform]
+        The transforms to be applied to the data.
+    size : int
+        The size of the dataset.
     """
@@ -277,7 +288,7 @@ class VOCDetectionTorch(
     BaseDatasetTorchMixin,
 ):
     """
-    `Pascal VOC <http://host.robots.ox.ac.uk/pascal/VOC/>`_ Detection Dataset.
+    `Pascal VOC <http://host.robots.ox.ac.uk/pascal/VOC/>`_ Detection Dataset as PyTorch tensors.
     Parameters
     ----------
@@ -291,21 +302,27 @@ class VOCDetectionTorch(
         If "base", then the combined dataset of "train" and "val" is returned.
     year : "2007", "2008", "2009", "2010", "2011" or "2012", default "2012"
         The dataset year.
-    transforms : Transform | Sequence[Transform] | None, default None
+    transforms : Transform, Sequence[Transform] or None, default None
         Transform(s) to apply to the data.
     verbose : bool, default False
         If True, outputs print statements.
     Attributes
     ----------
-    index2label : dict
+    path : pathlib.Path
+        Location of the folder containing the data.
+    image_set : "train", "val", "test" or "base"
+        The selected image set from the dataset.
+    index2label : dict[int, str]
         Dictionary which translates from class integers to the associated class strings.
-    label2index : dict
+    label2index : dict[str, int]
         Dictionary which translates from class strings to the associated class integers.
-    path : Path
-        Location of the folder containing the data.
-    metadata : dict
-        Dictionary containing Dataset metadata, such as `id` which returns the dataset class name.
+    metadata : DatasetMetadata
+        Typed dictionary containing dataset metadata, such as `id` which returns the dataset class name.
+    transforms : Sequence[Transform]
+        The transforms to be applied to the data.
+    size : int
+        The size of the dataset.
     """
@@ -329,21 +346,27 @@ class VOCSegmentation(
         If "base", then the combined dataset of "train" and "val" is returned.
     year : "2007", "2008", "2009", "2010", "2011" or "2012", default "2012"
         The dataset year.
-    transforms : Transform | Sequence[Transform] | None, default None
+    transforms : Transform, Sequence[Transform] or None, default None
         Transform(s) to apply to the data.
     verbose : bool, default False
         If True, outputs print statements.
     Attributes
     ----------
-    index2label : dict
+    path : pathlib.Path
+        Location of the folder containing the data.
+    image_set : "train", "val", "test" or "base"
+        The selected image set from the dataset.
+    index2label : dict[int, str]
         Dictionary which translates from class integers to the associated class strings.
-    label2index : dict
+    label2index : dict[str, int]
         Dictionary which translates from class strings to the associated class integers.
-    path : Path
-        Location of the folder containing the data.
-    metadata : dict
-        Dictionary containing Dataset metadata, such as `id` which returns the dataset class name.
+    metadata : DatasetMetadata
+        Typed dictionary containing dataset metadata, such as `id` which returns the dataset class name.
+    transforms : Sequence[Transform]
+        The transforms to be applied to the data.
+    size : int
+        The size of the dataset.
     """
     def _load_data(self) -> tuple[list[str], list[str], dict[str, list[Any]]]:

dataeval/utils/data/selections/__init__.py CHANGED Viewed

@@ -4,6 +4,7 @@ __all__ = [
     "ClassFilter",
     "Indices",
     "Limit",
+    "Prioritize",
     "Reverse",
     "Shuffle",
 ]
@@ -11,5 +12,6 @@ __all__ = [
 from dataeval.utils.data.selections._classfilter import ClassFilter
 from dataeval.utils.data.selections._indices import Indices
 from dataeval.utils.data.selections._limit import Limit
+from dataeval.utils.data.selections._prioritize import Prioritize
 from dataeval.utils.data.selections._reverse import Reverse
 from dataeval.utils.data.selections._shuffle import Shuffle

dataeval/utils/data/selections/_classfilter.py CHANGED Viewed

@@ -2,7 +2,7 @@ from __future__ import annotations
 __all__ = []
-from typing import Sequence
+from typing import Sequence, TypeVar
 import numpy as np
@@ -10,8 +10,10 @@ from dataeval.typing import Array, ImageClassificationDatum
 from dataeval.utils._array import as_numpy
 from dataeval.utils.data._selection import Select, Selection, SelectionStage
+TImageClassificationDatum = TypeVar("TImageClassificationDatum", bound=ImageClassificationDatum)
-class ClassFilter(Selection[ImageClassificationDatum]):
+class ClassFilter(Selection[TImageClassificationDatum]):
     """
     Filter and balance the dataset by class.
@@ -34,7 +36,7 @@ class ClassFilter(Selection[ImageClassificationDatum]):
         self.classes = classes
         self.balance = balance
-    def __call__(self, dataset: Select[ImageClassificationDatum]) -> None:
+    def __call__(self, dataset: Select[TImageClassificationDatum]) -> None:
         if self.classes is None and not self.balance:
             return

dataeval 0.82.0__py3-none-any.whl → 0.83.0__py3-none-any.whl

dataeval 0.82.0__py3-none-any.whl → 0.83.0__py3-none-any.whl