PyPI - dataeval - Versions diffs - 0.82.1__py3-none-any.whl → 0.84.0__py3-none-any.whl - Mend

dataeval 0.82.1-py3-none-any.whl → 0.84.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45)

dataeval/__init__.py +7 -2
dataeval/config.py +13 -3
dataeval/metadata/__init__.py +2 -2
dataeval/metadata/_ood.py +144 -27
dataeval/metrics/bias/__init__.py +11 -1
dataeval/metrics/bias/_balance.py +3 -3
dataeval/metrics/bias/_completeness.py +130 -0
dataeval/metrics/estimators/_ber.py +2 -1
dataeval/metrics/stats/_base.py +31 -36
dataeval/metrics/stats/_dimensionstats.py +2 -2
dataeval/metrics/stats/_hashstats.py +2 -2
dataeval/metrics/stats/_imagestats.py +4 -4
dataeval/metrics/stats/_labelstats.py +4 -45
dataeval/metrics/stats/_pixelstats.py +2 -2
dataeval/metrics/stats/_visualstats.py +2 -2
dataeval/outputs/__init__.py +4 -2
dataeval/outputs/_bias.py +31 -22
dataeval/outputs/_metadata.py +7 -0
dataeval/outputs/_stats.py +2 -3
dataeval/typing.py +43 -12
dataeval/utils/_array.py +26 -1
dataeval/utils/_mst.py +1 -2
dataeval/utils/data/_dataset.py +2 -0
dataeval/utils/data/_embeddings.py +115 -32
dataeval/utils/data/_images.py +38 -15
dataeval/utils/data/_selection.py +7 -8
dataeval/utils/data/_split.py +76 -129
dataeval/utils/data/datasets/_base.py +4 -2
dataeval/utils/data/datasets/_cifar10.py +17 -9
dataeval/utils/data/datasets/_milco.py +18 -12
dataeval/utils/data/datasets/_mnist.py +24 -8
dataeval/utils/data/datasets/_ships.py +18 -8
dataeval/utils/data/datasets/_types.py +1 -5
dataeval/utils/data/datasets/_voc.py +47 -24
dataeval/utils/data/selections/__init__.py +2 -0
dataeval/utils/data/selections/_classfilter.py +1 -1
dataeval/utils/data/selections/_prioritize.py +296 -0
dataeval/utils/data/selections/_shuffle.py +13 -4
dataeval/utils/metadata.py +1 -1
dataeval/utils/torch/_gmm.py +3 -2
{dataeval-0.82.1.dist-info → dataeval-0.84.0.dist-info}/METADATA +4 -4
{dataeval-0.82.1.dist-info → dataeval-0.84.0.dist-info}/RECORD +44 -43
dataeval/detectors/ood/metadata_ood_mi.py +0 -91
{dataeval-0.82.1.dist-info → dataeval-0.84.0.dist-info}/LICENSE.txt +0 -0
{dataeval-0.82.1.dist-info → dataeval-0.84.0.dist-info}/WHEEL +0 -0

dataeval/metrics/stats/_base.py CHANGED Viewed

@@ -2,6 +2,7 @@ from __future__ import annotations
 __all__ = []
+import math
 import re
 import warnings
 from collections import ChainMap
@@ -17,26 +18,13 @@ from numpy.typing import NDArray
 from dataeval.config import get_max_processes
 from dataeval.outputs._stats import BaseStatsOutput, SourceIndex
-from dataeval.typing import ArrayLike, Dataset, ObjectDetectionTarget
+from dataeval.typing import Array, ArrayLike, Dataset, ObjectDetectionTarget
 from dataeval.utils._array import to_numpy
 from dataeval.utils._image import normalize_image_shape, rescale
 DTYPE_REGEX = re.compile(r"NDArray\[np\.(.*?)\]")
-def normalize_box_shape(bounding_box: NDArray[Any]) -> NDArray[Any]:
-    """
-    Normalizes the bounding box shape into (N,4).
-    """
-    ndim = bounding_box.ndim
-    if ndim == 1:
-        return np.expand_dims(bounding_box, axis=0)
-    elif ndim > 2:
-        raise ValueError("Bounding boxes must have 2 dimensions: (# of boxes in an image, [X,Y,W,H]) -> (N,4)")
-    else:
-        return bounding_box
+BoundingBox = tuple[float, float, float, float]
 TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput, covariant=True)
@@ -46,11 +34,15 @@ class StatsProcessor(Generic[TStatsOutput]):
     image_function_map: dict[str, Callable[[StatsProcessor[TStatsOutput]], Any]] = {}
     channel_function_map: dict[str, Callable[[StatsProcessor[TStatsOutput]], Any]] = {}
-    def __init__(self, image: NDArray[Any], box: NDArray[Any] | None, per_channel: bool) -> None:
+    def __init__(self, image: NDArray[Any], box: BoundingBox | None, per_channel: bool) -> None:
         self.raw = image
         self.width: int = image.shape[-1]
         self.height: int = image.shape[-2]
-        self.box: NDArray[np.int64] = np.array([0, 0, self.width, self.height]) if box is None else box.astype(np.int64)
+        box = BoundingBox((0, 0, self.width, self.height)) if box is None else box
+        # Clip the bounding box to image
+        x0, y0 = (min(j, max(0, math.floor(box[i]))) for i, j in zip((0, 1), (self.width - 1, self.height - 1)))
+        x1, y1 = (min(j, max(1, math.ceil(box[i]))) for i, j in zip((2, 3), (self.width, self.height)))
+        self.box: NDArray[np.int64] = np.array([x0, y0, x1, y1], dtype=np.int64)
         self._per_channel = per_channel
         self._image = None
         self._shape = None
@@ -122,22 +114,17 @@ class StatsProcessorOutput:
 def process_stats(
     i: int,
-    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
-    per_box: bool,
+    image: ArrayLike,
+    boxes: list[BoundingBox] | None,
     per_channel: bool,
     stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
 ) -> StatsProcessorOutput:
-    data = dataset[i]
-    image, target = (to_numpy(cast(ArrayLike, data[0])), data[1]) if isinstance(data, tuple) else (to_numpy(data), None)
-    target = None if not isinstance(target, ObjectDetectionTarget) else target
-    boxes = to_numpy(target.boxes) if target is not None else None
+    image = to_numpy(image)
     results_list: list[dict[str, Any]] = []
     source_indices: list[SourceIndex] = []
     box_counts: list[int] = []
     warnings_list: list[str] = []
-    nboxes = [None] if boxes is None or not per_box else normalize_box_shape(boxes)
-    for i_b, box in enumerate(nboxes):
-        i_b = None if box is None else i_b
+    for i_b, box in [(None, None)] if boxes is None else enumerate(boxes):
         processor_list = [p(image, box, per_channel) for p in stats_processor_cls]
         if any(not p._is_valid_slice for p in processor_list) and i_b is not None and box is not None:
             warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} is out of bounds of {image.shape}.")
@@ -151,17 +138,28 @@ def process_stats(
 def process_stats_unpack(
-    i: int,
-    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
-    per_box: bool,
+    args: tuple[int, Array, list[BoundingBox] | None],
     per_channel: bool,
     stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
 ) -> StatsProcessorOutput:
-    return process_stats(i, dataset, per_box=per_box, per_channel=per_channel, stats_processor_cls=stats_processor_cls)
+    return process_stats(*args, per_channel=per_channel, stats_processor_cls=stats_processor_cls)
+def _enumerate(dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]], per_box: bool):
+    for i in range(len(dataset)):
+        d = dataset[i]
+        image = d[0] if isinstance(d, tuple) else d
+        if per_box and isinstance(d, tuple) and isinstance(d[1], ObjectDetectionTarget):
+            boxes = cast(Array, d[1].boxes)
+            target = [BoundingBox(float(box[i]) for i in range(4)) for box in boxes]
+        else:
+            target = None
+        yield i, image, target
 def run_stats(
-    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
     per_box: bool,
     per_channel: bool,
     stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
@@ -175,7 +173,7 @@ def run_stats(
     Parameters
     ----------
-    data : Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]]
+    data : Dataset[Array] | Dataset[tuple[Array, Any, Any]]
         A dataset of images and targets to compute statistics on.
     per_box : bool
         A flag which determines if the statistics should be evaluated on a per-box basis or not.
@@ -206,18 +204,15 @@ def run_stats(
     warning_list = []
     stats_processor_cls = stats_processor_cls if isinstance(stats_processor_cls, Iterable) else [stats_processor_cls]
-    # TODO: Introduce global controls for CPU job parallelism and GPU configurations
     with Pool(processes=get_max_processes()) as p:
         for r in tqdm.tqdm(
             p.imap(
                 partial(
                     process_stats_unpack,
-                    dataset=dataset,
-                    per_box=per_box,
                     per_channel=per_channel,
                     stats_processor_cls=stats_processor_cls,
                 ),
-                range(len(dataset)),
+                _enumerate(dataset, per_box),
             ),
             total=len(dataset),
         ):

dataeval/metrics/stats/_dimensionstats.py CHANGED Viewed

@@ -9,7 +9,7 @@ import numpy as np
 from dataeval.metrics.stats._base import StatsProcessor, run_stats
 from dataeval.outputs import DimensionStatsOutput
 from dataeval.outputs._base import set_metadata
-from dataeval.typing import ArrayLike, Dataset
+from dataeval.typing import Array, Dataset
 from dataeval.utils._image import get_bitdepth
@@ -34,7 +34,7 @@ class DimensionStatsProcessor(StatsProcessor[DimensionStatsOutput]):
 @set_metadata
 def dimensionstats(
-    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
     *,
     per_box: bool = False,
 ) -> DimensionStatsOutput:

dataeval/metrics/stats/_hashstats.py CHANGED Viewed

@@ -14,7 +14,7 @@ from scipy.fftpack import dct
 from dataeval.metrics.stats._base import StatsProcessor, run_stats
 from dataeval.outputs import HashStatsOutput
 from dataeval.outputs._base import set_metadata
-from dataeval.typing import ArrayLike, Dataset
+from dataeval.typing import Array, ArrayLike, Dataset
 from dataeval.utils._array import as_numpy
 from dataeval.utils._image import normalize_image_shape, rescale
@@ -105,7 +105,7 @@ class HashStatsProcessor(StatsProcessor[HashStatsOutput]):
 @set_metadata
 def hashstats(
-    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
     *,
     per_box: bool = False,
 ) -> HashStatsOutput:

dataeval/metrics/stats/_imagestats.py CHANGED Viewed

@@ -10,12 +10,12 @@ from dataeval.metrics.stats._pixelstats import PixelStatsProcessor
 from dataeval.metrics.stats._visualstats import VisualStatsProcessor
 from dataeval.outputs import ChannelStatsOutput, ImageStatsOutput
 from dataeval.outputs._base import set_metadata
-from dataeval.typing import ArrayLike, Dataset
+from dataeval.typing import Array, Dataset
 @overload
 def imagestats(
-    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
     *,
     per_box: bool = False,
     per_channel: Literal[True],
@@ -24,7 +24,7 @@ def imagestats(
 @overload
 def imagestats(
-    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
     *,
     per_box: bool = False,
     per_channel: Literal[False] = False,
@@ -33,7 +33,7 @@ def imagestats(
 @set_metadata
 def imagestats(
-    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
     *,
     per_box: bool = False,
     per_channel: bool = False,

dataeval/metrics/stats/_labelstats.py CHANGED Viewed

@@ -5,54 +5,16 @@ __all__ = []
 from collections import Counter, defaultdict
 from typing import Any, Mapping, TypeVar
-import numpy as np
 from dataeval.outputs import LabelStatsOutput
 from dataeval.outputs._base import set_metadata
-from dataeval.typing import AnnotatedDataset, ArrayLike
-from dataeval.utils._array import as_numpy
+from dataeval.typing import AnnotatedDataset
 from dataeval.utils.data._metadata import Metadata
 TValue = TypeVar("TValue")
-def _ensure_2d(labels: ArrayLike) -> ArrayLike:
-    if isinstance(labels, np.ndarray):
-        return labels[:, None]
-    else:
-        return [[lbl] for lbl in labels]  # type: ignore
-def _get_list_depth(lst):
-    if isinstance(lst, list) and lst:
-        return 1 + max(_get_list_depth(item) for item in lst)
-    return 0
-def _check_labels_dimension(labels: ArrayLike) -> ArrayLike:
-    # Check for nested lists beyond 2 levels
-    if isinstance(labels, np.ndarray):
-        if labels.ndim == 1:
-            return _ensure_2d(labels)
-        elif labels.ndim == 2:
-            return labels
-        else:
-            raise ValueError("The label array must not have more than 2 dimensions.")
-    elif isinstance(labels, list):
-        depth = _get_list_depth(labels)
-        if depth == 1:
-            return _ensure_2d(labels)
-        elif depth == 2:
-            return labels
-        else:
-            raise ValueError("The label list must not be empty or have more than 2 levels of nesting.")
-    else:
-        raise TypeError("Labels must be either a NumPy array or a list.")
 def _sort_to_list(d: Mapping[int, TValue]) -> list[TValue]:
-    return [v for _, v in sorted(d.items())]
+    return [t[1] for t in sorted(d.items())]
 @set_metadata
@@ -98,12 +60,9 @@ def labelstats(dataset: Metadata | AnnotatedDataset[Any]) -> LabelStatsOutput:
     label_per_image: list[int] = []
     index2label = dict(enumerate(dataset.class_names))
-    labels = [target.labels.tolist() for target in dataset.targets]
-    labels_2d = _check_labels_dimension(labels)
-    for i, group in enumerate(labels_2d):
-        group = as_numpy(group).tolist()
+    for i, target in enumerate(dataset.targets):
+        group = target.labels.tolist()
         # Count occurrences of each label in all sublists
         label_counts.update(group)

dataeval/metrics/stats/_pixelstats.py CHANGED Viewed

@@ -10,7 +10,7 @@ from scipy.stats import entropy, kurtosis, skew
 from dataeval.metrics.stats._base import StatsProcessor, run_stats
 from dataeval.outputs import PixelStatsOutput
 from dataeval.outputs._base import set_metadata
-from dataeval.typing import ArrayLike, Dataset
+from dataeval.typing import Array, Dataset
 class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
@@ -37,7 +37,7 @@ class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
 @set_metadata
 def pixelstats(
-    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
     *,
     per_box: bool = False,
     per_channel: bool = False,

dataeval/metrics/stats/_visualstats.py CHANGED Viewed

@@ -9,7 +9,7 @@ import numpy as np
 from dataeval.metrics.stats._base import StatsProcessor, run_stats
 from dataeval.outputs import VisualStatsOutput
 from dataeval.outputs._base import set_metadata
-from dataeval.typing import ArrayLike, Dataset
+from dataeval.typing import Array, Dataset
 from dataeval.utils._image import edge_filter
 QUARTILES = (0, 25, 50, 75, 100)
@@ -44,7 +44,7 @@ class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
 @set_metadata
 def visualstats(
-    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
     *,
     per_box: bool = False,
     per_channel: bool = False,

dataeval/outputs/__init__.py CHANGED Viewed

@@ -4,11 +4,11 @@ as well as runtime metadata for reproducibility and logging.
 """
 from ._base import ExecutionMetadata
-from ._bias import BalanceOutput, CoverageOutput, DiversityOutput, LabelParityOutput, ParityOutput
+from ._bias import BalanceOutput, CompletenessOutput, CoverageOutput, DiversityOutput, LabelParityOutput, ParityOutput
 from ._drift import DriftMMDOutput, DriftOutput
 from ._estimators import BEROutput, ClustererOutput, DivergenceOutput, UAPOutput
 from ._linters import DuplicatesOutput, OutliersOutput
-from ._metadata import MetadataDistanceOutput, MetadataDistanceValues, MostDeviatedFactorsOutput
+from ._metadata import MetadataDistanceOutput, MetadataDistanceValues, MostDeviatedFactorsOutput, OODPredictorOutput
 from ._ood import OODOutput, OODScoreOutput
 from ._stats import (
     ChannelStatsOutput,
@@ -29,6 +29,7 @@ __all__ = [
     "ChannelStatsOutput",
     "ClustererOutput",
     "CoverageOutput",
+    "CompletenessOutput",
     "DimensionStatsOutput",
     "DivergenceOutput",
     "DiversityOutput",
@@ -44,6 +45,7 @@ __all__ = [
     "MetadataDistanceValues",
     "MostDeviatedFactorsOutput",
     "OODOutput",
+    "OODPredictorOutput",
     "OODScoreOutput",
     "OutliersOutput",
     "ParityOutput",

dataeval/outputs/_bias.py CHANGED Viewed

@@ -14,9 +14,10 @@ with contextlib.suppress(ImportError):
     from matplotlib.figure import Figure
 from dataeval.outputs._base import Output
-from dataeval.typing import ArrayLike
-from dataeval.utils._array import to_numpy
+from dataeval.typing import ArrayLike, Dataset
+from dataeval.utils._array import as_numpy, channels_first_to_last
 from dataeval.utils._plot import heatmap
+from dataeval.utils.data._images import Images
 TData = TypeVar("TData", np.float64, NDArray[np.float64])
@@ -107,13 +108,13 @@ class CoverageOutput(Output):
     critical_value_radii: NDArray[np.float64]
     coverage_radius: float
-    def plot(self, images: ArrayLike, top_k: int = 6) -> Figure:
+    def plot(self, images: Images[Any] | Dataset[Any], top_k: int = 6) -> Figure:
         """
         Plot the top k images together for visualization.
         Parameters
         ----------
-        images : ArrayLike
+        images : Images or Dataset
             Original images (not embeddings) in (N, C, H, W) or (N, H, W) format
         top_k : int, default 6
             Number of images to plot (plotting assumes groups of 3)
@@ -130,46 +131,54 @@ class CoverageOutput(Output):
         import matplotlib.pyplot as plt
         # Determine which images to plot
-        highest_uncovered_indices = self.uncovered_indices[:top_k]
+        selected_indices = self.uncovered_indices[:top_k]
-        # Grab the images
-        selected_images = to_numpy(images)[highest_uncovered_indices]
+        images = Images(images) if isinstance(images, Dataset) else images
         # Plot the images
-        num_images = min(top_k, len(images))
-        ndim = selected_images.ndim
-        if ndim == 4:
-            selected_images = np.moveaxis(selected_images, 1, -1)
-        elif ndim == 3:
-            selected_images = np.repeat(selected_images[:, :, :, np.newaxis], 3, axis=-1)
-        else:
-            raise ValueError(
-                f"Expected a (N,C,H,W) or a (N, H, W) set of images, but got a {ndim}-dimensional set of images."
-            )
+        num_images = min(top_k, len(selected_indices))
         rows = int(np.ceil(num_images / 3))
         fig, axs = plt.subplots(rows, 3, figsize=(9, 3 * rows))
         if rows == 1:
             for j in range(3):
-                if j >= len(selected_images):
+                if j >= len(selected_indices):
                     continue
-                axs[j].imshow(selected_images[j])
+                image = channels_first_to_last(as_numpy(images[selected_indices[j]]))
+                axs[j].imshow(image)
                 axs[j].axis("off")
         else:
             for i in range(rows):
                 for j in range(3):
                     i_j = i * 3 + j
-                    if i_j >= len(selected_images):
+                    if i_j >= len(selected_indices):
                         continue
-                    axs[i, j].imshow(selected_images[i_j])
+                    image = channels_first_to_last(as_numpy(images[selected_indices[i_j]]))
+                    axs[i, j].imshow(image)
                     axs[i, j].axis("off")
         fig.tight_layout()
         return fig
+@dataclass(frozen=True)
+class CompletenessOutput(Output):
+    """
+    Output from the completeness function.
+    Attributes
+    ----------
+    fraction_filled : float
+        Fraction of boxes that contain at least one data point
+    empty_box_centers : List[np.ndarray]
+        List of coordinates for centers of empty boxes
+    """
+    fraction_filled: float
+    empty_box_centers: NDArray[np.float64]
 @dataclass(frozen=True)
 class BalanceOutput(Output):
     """

dataeval/outputs/_metadata.py CHANGED Viewed

@@ -52,3 +52,10 @@ class MetadataDistanceOutput(MappingOutput[str, MetadataDistanceValues]):
     value : :class:`.MetadataDistanceValues`
         Output per feature name containing the statistic, statistic location, distance, and pvalue.
     """
+class OODPredictorOutput(MappingOutput[str, float]):
+    """
+    Output class for results of :func:`find_ood_predictors` for the
+    mutual information between factors and being out of distribution
+    """

dataeval/outputs/_stats.py CHANGED Viewed

@@ -4,7 +4,7 @@ __all__ = []
 import contextlib
 from dataclasses import dataclass
-from typing import Any, Iterable, Optional, Union
+from typing import Any, Iterable, NamedTuple, Optional, Union
 import numpy as np
 from numpy.typing import NDArray
@@ -22,8 +22,7 @@ SOURCE_INDEX = "source_index"
 BOX_COUNT = "box_count"
-@dataclass(frozen=True)
-class SourceIndex:
+class SourceIndex(NamedTuple):
     """
     The indices of the source image, box and channel.

dataeval/typing.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """
-Common type hints used for interoperability with DataEval.
+Common type protocols used for interoperability with DataEval.
 """
 __all__ = [
@@ -16,13 +16,14 @@ __all__ = [
     "SegmentationTarget",
     "SegmentationDatum",
     "SegmentationDataset",
+    "Transform",
 ]
 import sys
 from typing import Any, Generic, Iterator, Protocol, Sequence, TypedDict, TypeVar, Union, runtime_checkable
-from typing_extensions import NotRequired, Required
+from typing_extensions import NotRequired, ReadOnly, Required
 if sys.version_info >= (3, 10):
     from typing import TypeAlias
@@ -66,6 +67,7 @@ class Array(Protocol):
     def __len__(self) -> int: ...
+T = TypeVar("T")
 _T_co = TypeVar("_T_co", covariant=True)
 _ScalarType = Union[int, float, bool, str]
 ArrayLike: TypeAlias = Union[Sequence[_ScalarType], Sequence[Sequence[_ScalarType]], Sequence[Array], Array]
@@ -89,8 +91,8 @@ class DatasetMetadata(TypedDict, total=False):
         A lookup table converting label value to class name
     """
-    id: Required[str]
-    index2label: NotRequired[dict[int, str]]
+    id: Required[ReadOnly[str]]
+    index2label: NotRequired[ReadOnly[dict[int, str]]]
 @runtime_checkable
@@ -140,7 +142,7 @@ class AnnotatedDataset(Dataset[_T_co], Generic[_T_co], Protocol):
 ImageClassificationDatum: TypeAlias = tuple[Array, Array, dict[str, Any]]
 """
-A type definition for an image classification datum tuple.
+Type alias for an image classification datum tuple.
 - :class:`Array` of shape (C, H, W) - Image data in channel, height, width format.
 - :class:`Array` of shape (N,) - Class label as one-hot encoded ground-truth or prediction confidences.
@@ -150,7 +152,7 @@ A type definition for an image classification datum tuple.
 ImageClassificationDataset: TypeAlias = AnnotatedDataset[ImageClassificationDatum]
 """
-A type definition for an :class:`AnnotatedDataset` of :class:`ImageClassificationDatum` elements.
+Type alias for an :class:`AnnotatedDataset` of :class:`ImageClassificationDatum` elements.
 """
 # ========== OBJECT DETECTION DATASETS ==========
@@ -159,7 +161,7 @@ A type definition for an :class:`AnnotatedDataset` of :class:`ImageClassificatio
 @runtime_checkable
 class ObjectDetectionTarget(Protocol):
     """
-    A protocol for targets in an Object Detection dataset.
+    Protocol for targets in an Object Detection dataset.
     Attributes
     ----------
@@ -180,7 +182,7 @@ class ObjectDetectionTarget(Protocol):
 ObjectDetectionDatum: TypeAlias = tuple[Array, ObjectDetectionTarget, dict[str, Any]]
 """
-A type definition for an object detection datum tuple.
+Type alias for an object detection datum tuple.
 - :class:`Array` of shape (C, H, W) - Image data in channel, height, width format.
 - :class:`ObjectDetectionTarget` - Object detection target information for the image.
@@ -190,7 +192,7 @@ A type definition for an object detection datum tuple.
 ObjectDetectionDataset: TypeAlias = AnnotatedDataset[ObjectDetectionDatum]
 """
-A type definition for an :class:`AnnotatedDataset` of :class:`ObjectDetectionDatum` elements.
+Type alias for an :class:`AnnotatedDataset` of :class:`ObjectDetectionDatum` elements.
 """
@@ -200,7 +202,7 @@ A type definition for an :class:`AnnotatedDataset` of :class:`ObjectDetectionDat
 @runtime_checkable
 class SegmentationTarget(Protocol):
     """
-    A protocol for targets in a Segmentation dataset.
+    Protocol for targets in a Segmentation dataset.
     Attributes
     ----------
@@ -221,7 +223,7 @@ class SegmentationTarget(Protocol):
 SegmentationDatum: TypeAlias = tuple[Array, SegmentationTarget, dict[str, Any]]
 """
-A type definition for an image classification datum tuple.
+Type alias for an image classification datum tuple.
 - :class:`Array` of shape (C, H, W) - Image data in channel, height, width format.
 - :class:`SegmentationTarget` - Segmentation target information for the image.
@@ -230,5 +232,34 @@ A type definition for an image classification datum tuple.
 SegmentationDataset: TypeAlias = AnnotatedDataset[SegmentationDatum]
 """
-A type definition for an :class:`AnnotatedDataset` of :class:`SegmentationDatum` elements.
+Type alias for an :class:`AnnotatedDataset` of :class:`SegmentationDatum` elements.
 """
+@runtime_checkable
+class Transform(Generic[T], Protocol):
+    """
+    Protocol defining a transform function.
+    Requires a `__call__` method that returns transformed data.
+    Example
+    -------
+    >>> from typing import Any
+    >>> from numpy.typing import NDArray
+    >>> class MyTransform:
+    ...     def __init__(self, divisor: float) -> None:
+    ...         self.divisor = divisor
+    ...
+    ...     def __call__(self, data: NDArray[Any], /) -> NDArray[Any]:
+    ...         return data / self.divisor
+    >>> my_transform = MyTransform(divisor=255.0)
+    >>> isinstance(my_transform, Transform)
+    True
+    >>> my_transform(np.array([1, 2, 3]))
+    array([0.004, 0.008, 0.012])
+    """
+    def __call__(self, data: T, /) -> T: ...

dataeval/utils/_array.py CHANGED Viewed

@@ -13,7 +13,7 @@ import torch
 from numpy.typing import NDArray
 from dataeval._log import LogMessage
-from dataeval.typing import ArrayLike
+from dataeval.typing import Array, ArrayLike
 _logger = logging.getLogger(__name__)
@@ -167,3 +167,28 @@ def flatten(array: ArrayLike) -> NDArray[Any]:
     """
     nparr = as_numpy(array)
     return nparr.reshape((nparr.shape[0], -1))
+_TArray = TypeVar("_TArray", bound=Array)
+def channels_first_to_last(array: _TArray) -> _TArray:
+    """
+    Converts array from channels first to channels last format
+    Parameters
+    ----------
+    array : ArrayLike
+        Input array
+    Returns
+    -------
+    ArrayLike
+        Converted array
+    """
+    if isinstance(array, np.ndarray):
+        return np.transpose(array, (1, 2, 0))
+    elif isinstance(array, torch.Tensor):
+        return torch.permute(array, (1, 2, 0))
+    else:
+        raise TypeError(f"Unsupported array type {type(array)} for conversion.")

dataeval/utils/_mst.py CHANGED Viewed

@@ -10,10 +10,9 @@ from scipy.sparse.csgraph import minimum_spanning_tree as mst
 from scipy.spatial.distance import pdist, squareform
 from sklearn.neighbors import NearestNeighbors
+from dataeval.config import EPSILON
 from dataeval.utils._array import flatten
-EPSILON = 1e-5
 def minimum_spanning_tree(X: NDArray[Any]) -> Any:
     """

dataeval 0.82.1__py3-none-any.whl → 0.84.0__py3-none-any.whl

dataeval 0.82.1-py3-none-any.whl → 0.84.0-py3-none-any.whl