PyPI - dataeval - Versions diffs - 0.87.0__py3-none-any.whl → 0.88.0__py3-none-any.whl - Mend

dataeval 0.87.0__py3-none-any.whl → 0.88.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

dataeval/_log.py +1 -1
dataeval/_version.py +2 -2
dataeval/data/_embeddings.py +78 -35
dataeval/data/_images.py +41 -8
dataeval/data/_metadata.py +294 -41
dataeval/data/_selection.py +22 -7
dataeval/data/_split.py +2 -1
dataeval/data/selections/_classfilter.py +4 -3
dataeval/data/selections/_indices.py +2 -1
dataeval/data/selections/_shuffle.py +3 -2
dataeval/detectors/drift/_base.py +2 -1
dataeval/detectors/drift/_mmd.py +2 -1
dataeval/detectors/drift/_nml/_base.py +1 -1
dataeval/detectors/drift/_nml/_chunk.py +2 -1
dataeval/detectors/drift/_nml/_result.py +3 -2
dataeval/detectors/drift/_nml/_thresholds.py +6 -5
dataeval/detectors/drift/_uncertainty.py +2 -1
dataeval/detectors/linters/duplicates.py +2 -1
dataeval/detectors/linters/outliers.py +4 -3
dataeval/detectors/ood/ae.py +1 -1
dataeval/detectors/ood/base.py +2 -1
dataeval/detectors/ood/mixin.py +2 -1
dataeval/metadata/_utils.py +1 -1
dataeval/metrics/bias/_balance.py +1 -1
dataeval/metrics/stats/_base.py +3 -29
dataeval/metrics/stats/_boxratiostats.py +2 -1
dataeval/metrics/stats/_dimensionstats.py +2 -1
dataeval/metrics/stats/_hashstats.py +2 -1
dataeval/metrics/stats/_pixelstats.py +2 -1
dataeval/metrics/stats/_visualstats.py +2 -1
dataeval/outputs/_base.py +2 -3
dataeval/outputs/_bias.py +2 -1
dataeval/outputs/_estimators.py +1 -1
dataeval/outputs/_linters.py +3 -3
dataeval/outputs/_stats.py +3 -3
dataeval/outputs/_utils.py +1 -1
dataeval/outputs/_workflows.py +29 -24
dataeval/typing.py +11 -9
dataeval/utils/_array.py +3 -2
dataeval/utils/_bin.py +2 -1
dataeval/utils/_method.py +2 -3
dataeval/utils/_multiprocessing.py +34 -0
dataeval/utils/_plot.py +2 -1
dataeval/utils/data/__init__.py +4 -5
dataeval/utils/data/{metadata.py → _merge.py} +3 -2
dataeval/utils/data/_validate.py +2 -1
dataeval/utils/data/collate.py +2 -1
dataeval/utils/torch/_internal.py +2 -1
dataeval/utils/torch/trainer.py +1 -1
dataeval/workflows/sufficiency.py +13 -9
{dataeval-0.87.0.dist-info → dataeval-0.88.0.dist-info}/METADATA +4 -5
dataeval-0.88.0.dist-info/RECORD +105 -0
dataeval/utils/data/_dataset.py +0 -253
dataeval-0.87.0.dist-info/RECORD +0 -105
{dataeval-0.87.0.dist-info → dataeval-0.88.0.dist-info}/WHEEL +0 -0
{dataeval-0.87.0.dist-info → dataeval-0.88.0.dist-info}/licenses/LICENSE +0 -0

dataeval/_log.py CHANGED Viewed

@@ -1,6 +1,6 @@
 __all__ = []
-from typing import Callable
+from collections.abc import Callable
 class LogMessage:

dataeval/_version.py CHANGED Viewed

@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '0.87.0'
-__version_tuple__ = version_tuple = (0, 87, 0)
+__version__ = version = '0.88.0'
+__version_tuple__ = version_tuple = (0, 88, 0)

dataeval/data/_embeddings.py CHANGED Viewed

@@ -5,8 +5,9 @@ __all__ = []
 import logging
 import math
 import os
+from collections.abc import Iterator, Sequence
 from pathlib import Path
-from typing import Any, Iterator, Sequence, cast
+from typing import Any, cast
 import torch
 import xxhash as xxh
@@ -15,7 +16,14 @@ from torch.utils.data import DataLoader, Subset
 from tqdm import tqdm
 from dataeval.config import DeviceLike, get_device
-from dataeval.typing import AnnotatedDataset, AnnotatedModel, Array, ArrayLike, Dataset, Transform
+from dataeval.typing import (
+    AnnotatedDataset,
+    AnnotatedModel,
+    Array,
+    ArrayLike,
+    Dataset,
+    Transform,
+)
 from dataeval.utils._array import as_numpy
 from dataeval.utils.torch.models import SupportsEncode
@@ -26,38 +34,41 @@ class Embeddings:
     """
     Collection of image embeddings from a dataset.
-    Embeddings are accessed by index or slice and are only loaded on-demand.
+    Embeddings are accessed by index or slice and are loaded on-demand.
     Parameters
     ----------
     dataset : ImageClassificationDataset or ObjectDetectionDataset
         Dataset to access original images from.
     batch_size : int
-        Batch size to use when encoding images.
+        Batch size to use when encoding images. When less than 1, automatically sets to 1 for safe processing.
     transforms : Transform or Sequence[Transform] or None, default None
-        Transforms to apply to images before encoding.
+        Image transformations to apply before encoding. When None, uses raw images without
+        preprocessing.
     model : torch.nn.Module or None, default None
-        Model to use for encoding images.
+        Neural network model that generates embeddings from images. When None, uses Flatten layer for simple
+        baseline compatibility with all DataEval tools without requiring pre-trained weights or GPU resources.
     device : DeviceLike or None, default None
-        The hardware device to use if specified, otherwise uses the DataEval
-        default or torch default.
+        Hardware device for computation. When None, automatically selects DataEval's configured device, falling
+        back to PyTorch's default.
     cache : Path, str, or bool, default False
-        Whether to cache the embeddings to a file or in memory.
-        When a Path or string is provided, embeddings will be cached to disk.
+        When True, caches embeddings in memory for faster repeated access.
+        When Path or string is provided, persists embeddings to disk for reuse across sessions.
+        Default False minimizes memory usage.
     verbose : bool, default False
-        Whether to print progress bar when encoding images.
+        When True, displays a progress bar when encoding images. Default False reduces console output
+        for cleaner automated workflows.
     Attributes
     ----------
     batch_size : int
-        Batch size to use when encoding images.
+        Number of images processed per batch during encoding. Minimum value of 1.
     cache : Path or bool
-        The path to cache embeddings to file, or True if caching to memory.
+        Disk path where embeddings are stored, or True when cached in memory.
     device : torch.device
-        The hardware device to use if specified, otherwise uses the DataEval
-        default or torch default.
+        Hardware device used for tensor computations.
     verbose : bool
-        Whether to print progress bar when encoding images.
+        Whether progress information is displayed during operations.
     """
     device: torch.device
@@ -66,6 +77,7 @@ class Embeddings:
     def __init__(
         self,
+        # Technically more permissive than ImageClassificationDataset or ObjectDetectionDataset
         dataset: Dataset[tuple[ArrayLike, Any, Any]] | Dataset[ArrayLike],
         batch_size: int,
         transforms: Transform[torch.Tensor] | Sequence[Transform[torch.Tensor]] | None = None,
@@ -80,8 +92,8 @@ class Embeddings:
         self._embeddings_only: bool = False
         self._dataset = dataset
-        model = torch.nn.Flatten() if model is None else model
         self._transforms = [transforms] if isinstance(transforms, Transform) else transforms
+        model = torch.nn.Flatten() if model is None else model
         self._model = model.to(self.device).eval() if isinstance(model, torch.nn.Module) else model
         self._encoder = model.encode if isinstance(model, SupportsEncode) else model
         self._collate_fn = lambda datum: [torch.as_tensor(d[0] if isinstance(d, tuple) else d) for d in datum]
@@ -110,7 +122,7 @@ class Embeddings:
         if isinstance(value, bool) and not value:
             self._cached_idx = set()
             self._embeddings = torch.empty(())
-        elif isinstance(value, (Path, str)):
+        elif isinstance(value, Path | str):
             value = self._resolve_path(value)
         if isinstance(value, Path) and value != getattr(self, "_cache", None):
@@ -127,20 +139,24 @@ class Embeddings:
     def to_tensor(self, indices: Sequence[int] | None = None) -> torch.Tensor:
         """
-        Converts dataset to embeddings.
+        Convert dataset items to embedding tensor.
+        Process specified dataset indices through the model in batches and
+        return concatenated embeddings as a single tensor.
         Parameters
         ----------
         indices : Sequence[int] or None, default None
-            The indices to convert to embeddings
+            Dataset indices to convert to embeddings. When None, processes entire dataset.
         Returns
         -------
         torch.Tensor
+            Concatenated embeddings with shape (n_samples, embedding_dim).
-        Warning
-        -------
-        Processing large quantities of data can be resource intensive.
+        Warnings
+        --------
+        Processing large datasets can be memory and compute intensive.
         """
         if indices is not None:
             return torch.vstack(list(self._batch(indices))).to(self.device)
@@ -148,35 +164,45 @@ class Embeddings:
     def to_numpy(self, indices: Sequence[int] | None = None) -> NDArray[Any]:
         """
-        Converts dataset to embeddings as numpy array.
+        Convert dataset items to embedding array.
         Parameters
         ----------
         indices : Sequence[int] or None, default None
-            The indices to convert to embeddings
+            Dataset indices to convert to embeddings. When None, processes entire dataset.
         Returns
         -------
         NDArray[Any]
+            Embedding array with shape (n_samples, embedding_dim).
         Warning
         -------
-        Processing large quantities of data can be resource intensive.
+        Processing large datasets can be memory and compute intensive.
         """
         return self.to_tensor(indices).cpu().numpy()
     def new(self, dataset: Dataset[tuple[ArrayLike, Any, Any]] | Dataset[ArrayLike]) -> Embeddings:
         """
-        Creates a new Embeddings object with the same parameters but a different dataset.
+        Create new Embeddings instance with a different dataset.
+        Generate a new Embeddings object using the same model, transforms,
+        and configuration but with a different dataset.
         Parameters
         ----------
         dataset : ImageClassificationDataset or ObjectDetectionDataset
-            Dataset to access original images from.
+            Dataset that provides images for the new Embeddings instance.
         Returns
         -------
         Embeddings
+            New Embeddings object configured identically to the current instance.
+        Raises
+        ------
+        ValueError
+            When called on embeddings-only instance that lacks a model.
         """
         if self._embeddings_only:
             raise ValueError("Embeddings object does not have a model.")
@@ -187,15 +213,15 @@ class Embeddings:
     @classmethod
     def from_array(cls, array: ArrayLike, device: DeviceLike | None = None) -> Embeddings:
         """
-        Instantiates a shallow Embeddings object using an array.
+        Create Embeddings instance from an existing image array.
         Parameters
         ----------
         array : ArrayLike
-            The array to convert to embeddings.
+            In-memory image data to wrap in an Embeddings object.
         device : DeviceLike or None, default None
-            The hardware device to use if specified, otherwise uses the DataEval
-            default or torch default.
+            Hardware device for computation. When None, automatically selects DataEval's configured device, falling
+            back to PyTorch's default.
         Returns
         -------
@@ -219,12 +245,15 @@ class Embeddings:
     def save(self, path: Path | str) -> None:
         """
-        Saves the embeddings to disk.
+        Save embeddings to disk.
+        Persist current embeddings to the specified file path for later
+        loading and reuse.
         Parameters
         ----------
         path : Path or str
-            The file path to save the embeddings to.
+            File path where embeddings will be saved.
         """
         self._save(self._resolve_path(path), True)
@@ -254,10 +283,24 @@ class Embeddings:
         """
         Loads the embeddings from disk.
+        Create an Embeddings instance from previously saved embedding data.
         Parameters
         ----------
         path : Path or str
-            The file path to load the embeddings from.
+            File path to load embeddings from.
+        Returns
+        -------
+        Embeddings
+            Embeddings-only instance containing the loaded data.
+        Raises
+        ------
+        FileNotFoundError
+            When the specified file path does not exist.
+        Exception
+            When file loading or parsing fails.
         """
         emb = Embeddings([], 0)
         path = Path(os.path.abspath(path)) if isinstance(path, str) else path

dataeval/data/_images.py CHANGED Viewed

@@ -2,7 +2,8 @@ from __future__ import annotations
 __all__ = []
-from typing import TYPE_CHECKING, Any, Generic, Iterator, Sequence, TypeVar, cast, overload
+from collections.abc import Iterator, Sequence
+from typing import TYPE_CHECKING, Any, Generic, TypeVar, cast, overload
 import numpy as np
@@ -19,12 +20,18 @@ class Images(Generic[T]):
     """
     Collection of image data from a dataset.
-    Images are accessed by index or slice and are only loaded on-demand.
+    Images are accessed by index or slice and are loaded on-demand for
+    memory-efficient processing of large datasets.
     Parameters
     ----------
     dataset : Dataset[tuple[T, ...]] or Dataset[T]
-        Dataset to access images from.
+        Dataset that provides image data for access and visualization.
+    Attributes
+    ----------
+    None
+        All dataset access is handled through indexing operations.
     """
     def __init__(
@@ -36,16 +43,20 @@ class Images(Generic[T]):
     def to_list(self) -> Sequence[T]:
         """
-        Converts entire dataset to a sequence of images.
+        Convert entire dataset to a sequence of images.
-        Warning
-        -------
-        Will load the entire dataset and return the images as a
-        single sequence of images in memory.
+        Load all images from the dataset and return a single sequence
+        in memory for batch processing or analysis.
         Returns
         -------
         list[T]
+            Complete sequence of all images in the dataset.
+        Warnings
+        --------
+        Loading entire dataset into memory can consume significant resources
+        for large image collections.
         """
         return self[:]
@@ -55,6 +66,28 @@ class Images(Generic[T]):
         images_per_row: int = 3,
         figsize: tuple[int, int] = (10, 10),
     ) -> Figure:
+        """
+        Display images in a grid layout.
+        Create matplotlib figure showing specified images arranged in a
+        grid format for visual inspection and comparison.
+        Parameters
+        ----------
+        indices : Sequence[int]
+            Dataset indices of images to display in the plot.
+        images_per_row : int, default 3
+            Number of images displayed per row in the grid. Default 3 provides a balanced layout
+            for most screen sizes.
+        figsize : tuple[int, int], default (10, 10)
+            Figure dimensions as (width, height) in inches. Default (10, 10)
+            accommodates typical grid layouts with readable detail.
+        Returns
+        -------
+        Figure
+            Matplotlib figure object containing the image grid display.
+        """
         import matplotlib.pyplot as plt
         num_images = len(indices)

dataeval 0.87.0__py3-none-any.whl → 0.88.0__py3-none-any.whl

dataeval 0.87.0__py3-none-any.whl → 0.88.0__py3-none-any.whl