PyPI - dataeval - Versions diffs - 0.82.0__py3-none-any.whl → 0.83.0__py3-none-any.whl - Mend

dataeval 0.82.0__py3-none-any.whl → 0.83.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57)

dataeval/__init__.py +7 -2
dataeval/config.py +78 -11
dataeval/detectors/drift/_mmd.py +9 -9
dataeval/detectors/drift/_torch.py +7 -7
dataeval/detectors/drift/_uncertainty.py +4 -4
dataeval/detectors/linters/duplicates.py +3 -3
dataeval/detectors/linters/outliers.py +3 -3
dataeval/detectors/ood/ae.py +5 -4
dataeval/detectors/ood/base.py +2 -2
dataeval/detectors/ood/mixin.py +1 -1
dataeval/detectors/ood/vae.py +2 -1
dataeval/metadata/__init__.py +2 -2
dataeval/metadata/_distance.py +11 -44
dataeval/metadata/_ood.py +152 -33
dataeval/metrics/bias/_balance.py +9 -5
dataeval/metrics/bias/_diversity.py +3 -0
dataeval/metrics/bias/_parity.py +2 -0
dataeval/metrics/estimators/_ber.py +2 -1
dataeval/metrics/stats/_base.py +20 -21
dataeval/metrics/stats/_boxratiostats.py +1 -1
dataeval/metrics/stats/_dimensionstats.py +2 -2
dataeval/metrics/stats/_hashstats.py +2 -2
dataeval/metrics/stats/_imagestats.py +8 -8
dataeval/metrics/stats/_pixelstats.py +2 -2
dataeval/metrics/stats/_visualstats.py +2 -2
dataeval/outputs/__init__.py +5 -0
dataeval/outputs/_base.py +50 -21
dataeval/outputs/_bias.py +1 -1
dataeval/outputs/_linters.py +4 -2
dataeval/outputs/_metadata.py +61 -0
dataeval/outputs/_stats.py +12 -6
dataeval/typing.py +40 -9
dataeval/utils/_mst.py +1 -2
dataeval/utils/data/_embeddings.py +23 -19
dataeval/utils/data/_metadata.py +16 -7
dataeval/utils/data/_selection.py +22 -15
dataeval/utils/data/_split.py +3 -2
dataeval/utils/data/datasets/_base.py +4 -2
dataeval/utils/data/datasets/_cifar10.py +17 -9
dataeval/utils/data/datasets/_milco.py +18 -12
dataeval/utils/data/datasets/_mnist.py +24 -8
dataeval/utils/data/datasets/_ships.py +18 -8
dataeval/utils/data/datasets/_types.py +1 -5
dataeval/utils/data/datasets/_voc.py +47 -24
dataeval/utils/data/selections/__init__.py +2 -0
dataeval/utils/data/selections/_classfilter.py +5 -3
dataeval/utils/data/selections/_prioritize.py +296 -0
dataeval/utils/data/selections/_shuffle.py +13 -4
dataeval/utils/torch/_gmm.py +3 -2
dataeval/utils/torch/_internal.py +5 -5
dataeval/utils/torch/trainer.py +8 -8
{dataeval-0.82.0.dist-info → dataeval-0.83.0.dist-info}/METADATA +4 -4
dataeval-0.83.0.dist-info/RECORD +105 -0
dataeval/detectors/ood/metadata_ood_mi.py +0 -93
dataeval-0.82.0.dist-info/RECORD +0 -104
{dataeval-0.82.0.dist-info → dataeval-0.83.0.dist-info}/LICENSE.txt +0 -0
{dataeval-0.82.0.dist-info → dataeval-0.83.0.dist-info}/WHEEL +0 -0

dataeval/outputs/_base.py CHANGED Viewed

@@ -4,11 +4,11 @@ __all__ = []
 import inspect
 import logging
-from collections.abc import Mapping
+from collections.abc import Collection, Mapping, Sequence
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from functools import partial, wraps
-from typing import Any, Callable, Iterator, TypeVar
+from typing import Any, Callable, Generic, Iterator, TypeVar, overload
 import numpy as np
 from typing_extensions import ParamSpec
@@ -56,16 +56,13 @@ class ExecutionMetadata:
         )
-class Output:
-    _meta: ExecutionMetadata | None = None
+T = TypeVar("T", covariant=True)
-    def __str__(self) -> str:
-        return f"{self.__class__.__name__}: {str(self.dict())}"
-    def dict(self) -> dict[str, Any]:
-        return {k: v for k, v in self.__dict__.items() if k != "_meta"}
+class GenericOutput(Generic[T]):
+    _meta: ExecutionMetadata | None = None
-    @property
+    def data(self) -> T: ...
     def meta(self) -> ExecutionMetadata:
         """
         Metadata about the execution of the function or method for the Output class.
@@ -73,34 +70,66 @@ class Output:
         return self._meta or ExecutionMetadata.empty()
-TKey = TypeVar("TKey", str, int, float, set)
-TValue = TypeVar("TValue")
+class Output(GenericOutput[dict[str, Any]]):
+    def data(self) -> dict[str, Any]:
+        return {k: v for k, v in self.__dict__.items() if k != "_meta"}
+    def __repr__(self) -> str:
+        return str(self)
-class MappingOutput(Mapping[TKey, TValue], Output):
+    def __str__(self) -> str:
+        return f"{self.__class__.__name__}({', '.join([f'{k}={v}' for k, v in self.data().items()])})"
+class BaseCollectionMixin(Collection[Any]):
     __slots__ = ["_data"]
+    def data(self) -> Any:
+        return self._data
+    def __len__(self) -> int:
+        return len(self._data)
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}({repr(self._data)})"
+    def __str__(self) -> str:
+        return str(self._data)
+TKey = TypeVar("TKey", str, int, float, set)
+TValue = TypeVar("TValue")
+class MappingOutput(Mapping[TKey, TValue], BaseCollectionMixin, GenericOutput[Mapping[TKey, TValue]]):
     def __init__(self, data: Mapping[TKey, TValue]):
         self._data = data
     def __getitem__(self, key: TKey) -> TValue:
-        return self._data.__getitem__(key)
+        return self._data[key]
     def __iter__(self) -> Iterator[TKey]:
-        return self._data.__iter__()
+        return iter(self._data)
-    def __len__(self) -> int:
-        return self._data.__len__()
-    def dict(self) -> dict[str, TValue]:
-        return {str(k): v for k, v in self._data.items()}
+class SequenceOutput(Sequence[TValue], BaseCollectionMixin, GenericOutput[Sequence[TValue]]):
+    def __init__(self, data: Sequence[TValue]):
+        self._data = data
+    @overload
+    def __getitem__(self, index: int) -> TValue: ...
+    @overload
+    def __getitem__(self, index: slice) -> Sequence[TValue]: ...
-    def __str__(self) -> str:
-        return str(self.dict())
+    def __getitem__(self, index: int | slice) -> TValue | Sequence[TValue]:
+        return self._data[index]
+    def __iter__(self) -> Iterator[TValue]:
+        return iter(self._data)
 P = ParamSpec("P")
-R = TypeVar("R", bound=Output)
+R = TypeVar("R", bound=GenericOutput)
 def set_metadata(fn: Callable[P, R] | None = None, *, state: list[str] | None = None) -> Callable[P, R]:

dataeval/outputs/_bias.py CHANGED Viewed

@@ -364,7 +364,7 @@ class DiversityOutput(Output):
                 col_labels,
                 xlabel="Factors",
                 ylabel="Class",
-                cbarlabel=f"Normalized {asdict(self.meta)['arguments']['method'].title()} Index",
+                cbarlabel=f"Normalized {asdict(self.meta())['arguments']['method'].title()} Index",
             )
         else:

dataeval/outputs/_linters.py CHANGED Viewed

@@ -24,7 +24,7 @@ TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, list[IndexIssueMap])
 @dataclass(frozen=True)
-class DuplicatesOutput(Generic[TIndexCollection], Output):
+class DuplicatesOutput(Output, Generic[TIndexCollection]):
     """
     Output class for :class:`.Duplicates` lint detector.
@@ -35,6 +35,8 @@ class DuplicatesOutput(Generic[TIndexCollection], Output):
     near: list[list[int] | dict[int, list[int]]]
         Indices of images that are near matches
+    Notes
+    -----
     - For a single dataset, indices are returned as a list of index groups.
     - For multiple datasets, indices are returned as dictionaries where the key is the
       index of the dataset, and the value is the list index groups from that dataset.
@@ -99,7 +101,7 @@ def _create_pandas_dataframe(class_wise):
 @dataclass(frozen=True)
-class OutliersOutput(Generic[TIndexIssueMap], Output):
+class OutliersOutput(Output, Generic[TIndexIssueMap]):
     """
     Output class for :class:`.Outliers` lint detector.

dataeval/outputs/_metadata.py ADDED Viewed

@@ -0,0 +1,61 @@
+from __future__ import annotations
+__all__ = []
+from typing import NamedTuple
+from dataeval.outputs._base import MappingOutput, SequenceOutput
+class MostDeviatedFactorsOutput(SequenceOutput[tuple[str, float]]):
+    """
+    Output class for results of :func:`.most_deviated_factors` for OOD samples with metadata.
+    Attributes
+    ----------
+    value : tuple[str, float]
+        A tuple of the factor name and deviation of the highest metadata deviation
+    """
+class MetadataDistanceValues(NamedTuple):
+    """
+    Statistics comparing metadata distance.
+    Attributes
+    ----------
+    statistic : float
+        the KS statistic
+    location : float
+        The value at which the KS statistic has its maximum, measured in IQR-normalized units relative
+        to the median of the reference distribution.
+    dist : float
+        The Earth Mover's Distance normalized by the interquartile range (IQR) of the reference
+    pvalue : float
+        The p-value from the KS two-sample test
+    """
+    statistic: float
+    location: float
+    dist: float
+    pvalue: float
+class MetadataDistanceOutput(MappingOutput[str, MetadataDistanceValues]):
+    """
+    Output class for results of ks_2samp featurewise comparisons of new metadata to reference metadata.
+    Attributes
+    ----------
+    key : str
+        Metadata feature names
+    value : :class:`.MetadataDistanceValues`
+        Output per feature name containing the statistic, statistic location, distance, and pvalue.
+    """
+class OODPredictorOutput(MappingOutput[str, float]):
+    """
+    Output class for results of :func:`find_ood_predictors` for the
+    mutual information between factors and being out of distribution
+    """

dataeval/outputs/_stats.py CHANGED Viewed

@@ -4,7 +4,7 @@ __all__ = []
 import contextlib
 from dataclasses import dataclass
-from typing import Iterable, Optional, Union
+from typing import Any, Iterable, Optional, Union
 import numpy as np
 from numpy.typing import NDArray
@@ -63,7 +63,7 @@ class BaseStatsOutput(Output):
     def __post_init__(self) -> None:
         length = len(self.source_index)
-        bad = {k: len(v) for k, v in self.dict().items() if k not in [SOURCE_INDEX, BOX_COUNT] and len(v) != length}
+        bad = {k: len(v) for k, v in self.data().items() if k not in [SOURCE_INDEX, BOX_COUNT] and len(v) != length}
         if bad:
             raise ValueError(f"All values must have the same length as source_index. Bad values: {str(bad)}.")
@@ -105,7 +105,7 @@ class BaseStatsOutput(Output):
     def _get_channels(
         self, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None
     ) -> tuple[int, list[bool] | None]:
-        source_index = self.dict()[SOURCE_INDEX]
+        source_index = self.data()[SOURCE_INDEX]
         raw_channels = int(max([si.channel or 0 for si in source_index])) + 1
         if isinstance(channel_index, int):
             max_channels = 1 if channel_index < raw_channels else raw_channels
@@ -127,15 +127,21 @@ class BaseStatsOutput(Output):
         return max_channels, ch_mask
+    def factors(self) -> dict[str, NDArray[Any]]:
+        return {
+            k: v
+            for k, v in self.data().items()
+            if k not in (SOURCE_INDEX, BOX_COUNT) and isinstance(v, np.ndarray) and v[v != 0].size > 0 and v.ndim == 1
+        }
     def plot(
         self, log: bool, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None
     ) -> None:
         max_channels, ch_mask = self._get_channels(channel_limit, channel_index)
-        d = {k: v for k, v in self.dict().items() if isinstance(v, np.ndarray) and v[v != 0].size > 0 and v.ndim == 1}
         if max_channels == 1:
-            histogram_plot(d, log)
+            histogram_plot(self.factors(), log)
         else:
-            channel_histogram_plot(d, log, max_channels, ch_mask)
+            channel_histogram_plot(self.factors(), log, max_channels, ch_mask)
 @dataclass(frozen=True)

dataeval/typing.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """
-Common type hints used for interoperability with DataEval.
+Common type protocols used for interoperability with DataEval.
 """
 __all__ = [
@@ -16,6 +16,7 @@ __all__ = [
     "SegmentationTarget",
     "SegmentationDatum",
     "SegmentationDataset",
+    "Transform",
 ]
@@ -66,6 +67,7 @@ class Array(Protocol):
     def __len__(self) -> int: ...
+T = TypeVar("T")
 _T_co = TypeVar("_T_co", covariant=True)
 _ScalarType = Union[int, float, bool, str]
 ArrayLike: TypeAlias = Union[Sequence[_ScalarType], Sequence[Sequence[_ScalarType]], Sequence[Array], Array]
@@ -140,7 +142,7 @@ class AnnotatedDataset(Dataset[_T_co], Generic[_T_co], Protocol):
 ImageClassificationDatum: TypeAlias = tuple[Array, Array, dict[str, Any]]
 """
-A type definition for an image classification datum tuple.
+Type alias for an image classification datum tuple.
 - :class:`Array` of shape (C, H, W) - Image data in channel, height, width format.
 - :class:`Array` of shape (N,) - Class label as one-hot encoded ground-truth or prediction confidences.
@@ -150,7 +152,7 @@ A type definition for an image classification datum tuple.
 ImageClassificationDataset: TypeAlias = AnnotatedDataset[ImageClassificationDatum]
 """
-A type definition for an :class:`AnnotatedDataset` of :class:`ImageClassificationDatum` elements.
+Type alias for an :class:`AnnotatedDataset` of :class:`ImageClassificationDatum` elements.
 """
 # ========== OBJECT DETECTION DATASETS ==========
@@ -159,7 +161,7 @@ A type definition for an :class:`AnnotatedDataset` of :class:`ImageClassificatio
 @runtime_checkable
 class ObjectDetectionTarget(Protocol):
     """
-    A protocol for targets in an Object Detection dataset.
+    Protocol for targets in an Object Detection dataset.
     Attributes
     ----------
@@ -180,7 +182,7 @@ class ObjectDetectionTarget(Protocol):
 ObjectDetectionDatum: TypeAlias = tuple[Array, ObjectDetectionTarget, dict[str, Any]]
 """
-A type definition for an object detection datum tuple.
+Type alias for an object detection datum tuple.
 - :class:`Array` of shape (C, H, W) - Image data in channel, height, width format.
 - :class:`ObjectDetectionTarget` - Object detection target information for the image.
@@ -190,7 +192,7 @@ A type definition for an object detection datum tuple.
 ObjectDetectionDataset: TypeAlias = AnnotatedDataset[ObjectDetectionDatum]
 """
-A type definition for an :class:`AnnotatedDataset` of :class:`ObjectDetectionDatum` elements.
+Type alias for an :class:`AnnotatedDataset` of :class:`ObjectDetectionDatum` elements.
 """
@@ -200,7 +202,7 @@ A type definition for an :class:`AnnotatedDataset` of :class:`ObjectDetectionDat
 @runtime_checkable
 class SegmentationTarget(Protocol):
     """
-    A protocol for targets in a Segmentation dataset.
+    Protocol for targets in a Segmentation dataset.
     Attributes
     ----------
@@ -221,7 +223,7 @@ class SegmentationTarget(Protocol):
 SegmentationDatum: TypeAlias = tuple[Array, SegmentationTarget, dict[str, Any]]
 """
-A type definition for an image classification datum tuple.
+Type alias for an image classification datum tuple.
 - :class:`Array` of shape (C, H, W) - Image data in channel, height, width format.
 - :class:`SegmentationTarget` - Segmentation target information for the image.
@@ -230,5 +232,34 @@ A type definition for an image classification datum tuple.
 SegmentationDataset: TypeAlias = AnnotatedDataset[SegmentationDatum]
 """
-A type definition for an :class:`AnnotatedDataset` of :class:`SegmentationDatum` elements.
+Type alias for an :class:`AnnotatedDataset` of :class:`SegmentationDatum` elements.
 """
+@runtime_checkable
+class Transform(Generic[T], Protocol):
+    """
+    Protocol defining a transform function.
+    Requires a `__call__` method that returns transformed data.
+    Example
+    -------
+    >>> from typing import Any
+    >>> from numpy.typing import NDArray
+    >>> class MyTransform:
+    ...     def __init__(self, divisor: float) -> None:
+    ...         self.divisor = divisor
+    ...
+    ...     def __call__(self, data: NDArray[Any], /) -> NDArray[Any]:
+    ...         return data / self.divisor
+    >>> my_transform = MyTransform(divisor=255.0)
+    >>> isinstance(my_transform, Transform)
+    True
+    >>> my_transform(np.array([1, 2, 3]))
+    array([0.004, 0.008, 0.012])
+    """
+    def __call__(self, data: T, /) -> T: ...

dataeval/utils/_mst.py CHANGED Viewed

@@ -10,10 +10,9 @@ from scipy.sparse.csgraph import minimum_spanning_tree as mst
 from scipy.spatial.distance import pdist, squareform
 from sklearn.neighbors import NearestNeighbors
+from dataeval.config import EPSILON
 from dataeval.utils._array import flatten
-EPSILON = 1e-5
 def minimum_spanning_tree(X: NDArray[Any]) -> Any:
     """

dataeval/utils/data/_embeddings.py CHANGED Viewed

@@ -9,7 +9,7 @@ import torch
 from torch.utils.data import DataLoader, Subset
 from tqdm import tqdm
-from dataeval.config import get_device
+from dataeval.config import DeviceLike, get_device
 from dataeval.typing import Array, Dataset
 from dataeval.utils.torch.models import SupportsEncode
@@ -24,13 +24,14 @@ class Embeddings:
     ----------
     dataset : ImageClassificationDataset or ObjectDetectionDataset
         Dataset to access original images from.
-    batch_size : int, optional
+    batch_size : int
         Batch size to use when encoding images.
-    model : torch.nn.Module, optional
+    model : torch.nn.Module or None, default None
         Model to use for encoding images.
-    device : torch.device, optional
-        Device to use for encoding images.
-    verbose : bool, optional
+    device : DeviceLike or None, default None
+        The hardware device to use if specified, otherwise uses the DataEval
+        default or torch default.
+    verbose : bool, default False
         Whether to print progress bar when encoding images.
     """
@@ -42,9 +43,8 @@ class Embeddings:
         self,
         dataset: Dataset[tuple[Array, Any, Any]],
         batch_size: int,
-        indices: Sequence[int] | None = None,
         model: torch.nn.Module | None = None,
-        device: torch.device | str | None = None,
+        device: DeviceLike | None = None,
         verbose: bool = False,
     ) -> None:
         self.device = get_device(device)
@@ -52,26 +52,32 @@ class Embeddings:
         self.verbose = verbose
         self._dataset = dataset
-        self._indices = indices if indices is not None else range(len(dataset))
         model = torch.nn.Flatten() if model is None else model
         self._model = model.to(self.device).eval()
         self._encoder = model.encode if isinstance(model, SupportsEncode) else model
         self._collate_fn = lambda datum: [torch.as_tensor(i) for i, _, _ in datum]
-    def to_tensor(self) -> torch.Tensor:
+    def to_tensor(self, indices: Sequence[int] | None = None) -> torch.Tensor:
         """
-        Converts entire dataset to embeddings.
+        Converts dataset to embeddings.
-        Warning
-        -------
-        Will process the entire dataset in batches and return
-        embeddings as a single Tensor in memory.
+        Parameters
+        ----------
+        indices : Sequence[int] or None, default None
+            The indices to convert to embeddings
         Returns
         -------
         torch.Tensor
+        Warning
+        -------
+        Processing large quantities of data can be resource intensive.
         """
-        return self[:]
+        if indices is not None:
+            return torch.vstack(list(self._batch(indices))).to(self.device)
+        else:
+            return self[:]
     # Reduce overhead cost by not tracking tensor gradients
     @torch.no_grad
@@ -86,9 +92,7 @@ class Embeddings:
             embeddings = self._encoder(torch.stack(images).to(self.device))
             yield embeddings
-    def __getitem__(self, key: int | slice | list[int], /) -> torch.Tensor:
-        if isinstance(key, list):
-            return torch.vstack(list(self._batch(key))).to(self.device)
+    def __getitem__(self, key: int | slice, /) -> torch.Tensor:
         if isinstance(key, slice):
             return torch.vstack(list(self._batch(range(len(self._dataset))[key]))).to(self.device)
         elif isinstance(key, int):

dataeval/utils/data/_metadata.py CHANGED Viewed

@@ -3,7 +3,7 @@ from __future__ import annotations
 __all__ = []
 import warnings
-from typing import TYPE_CHECKING, Any, Literal, Mapping, Sequence
+from typing import TYPE_CHECKING, Any, Literal, Mapping, Sequence, cast
 import numpy as np
 from numpy.typing import NDArray
@@ -11,6 +11,7 @@ from numpy.typing import NDArray
 from dataeval.typing import (
     AnnotatedDataset,
     Array,
+    ArrayLike,
     ObjectDetectionTarget,
 )
 from dataeval.utils._array import as_numpy, to_numpy
@@ -276,16 +277,12 @@ class Metadata:
         if self._processed and not force:
             return
-        # Trigger collate and merge if not yet done
-        self._collate()
-        self._merge()
+        # Create image indices from targets
+        self._image_indices = np.arange(len(self.raw)) if self.targets.source is None else self.targets.source
         # Validate the metadata dimensions
         self._validate()
-        # Create image indices from targets
-        self._image_indices = np.arange(len(self.raw)) if self.targets.source is None else self.targets.source
         # Include specified metadata keys
         if self.include:
             metadata = {i: self.merged[i] for i in self.include if i in self.merged}
@@ -358,3 +355,15 @@ class Metadata:
         )
         self._total_num_factors = len(self._discrete_factor_names + self._continuous_factor_names) + 1
         self._processed = True
+    def add_factors(self, factors: Mapping[str, ArrayLike]) -> None:
+        self._merge()
+        self._processed = False
+        target_len = len(self.targets.source) if self.targets.source is not None else len(self.targets)
+        if any(len(v) != target_len for v in factors.values()):
+            raise ValueError(
+                "The lists/arrays in the provided factors have a different length than the current metadata factors."
+            )
+        merged = cast(tuple[dict[str, ArrayLike], dict[str, list[str]]], self._merged)[0]
+        for k, v in factors.items():
+            merged[k] = v

dataeval/utils/data/_selection.py CHANGED Viewed

@@ -3,9 +3,9 @@ from __future__ import annotations
 __all__ = []
 from enum import IntEnum
-from typing import Any, Generic, Iterator, Sequence, TypeVar
+from typing import Generic, Iterator, Sequence, TypeVar
-from dataeval.typing import AnnotatedDataset, DatasetMetadata
+from dataeval.typing import AnnotatedDataset, DatasetMetadata, Transform
 _TDatum = TypeVar("_TDatum")
@@ -35,6 +35,8 @@ class Select(AnnotatedDataset[_TDatum]):
         The dataset to wrap.
     selections : Selection or list[Selection], optional
         The selection criteria to apply to the dataset.
+    transforms : Transform or list[Transform], optional
+        The transforms to apply to the dataset.
     Examples
     --------
@@ -67,13 +69,17 @@ class Select(AnnotatedDataset[_TDatum]):
     def __init__(
         self,
         dataset: AnnotatedDataset[_TDatum],
-        selections: Selection[_TDatum] | list[Selection[_TDatum]] | None = None,
+        selections: Selection[_TDatum] | Sequence[Selection[_TDatum]] | None = None,
+        transforms: Transform[_TDatum] | Sequence[Transform[_TDatum]] | None = None,
     ) -> None:
+        self.__dict__.update(dataset.__dict__)
         self._dataset = dataset
         self._size_limit = len(dataset)
         self._selection = list(range(self._size_limit))
-        self._selections = self._sort_selections(selections)
-        self.__dict__.update(dataset.__dict__)
+        self._selections = self._sort(selections)
+        self._transforms = (
+            [] if transforms is None else [transforms] if isinstance(transforms, Transform) else transforms
+        )
         # Ensure metadata is populated correctly as DatasetMetadata TypedDict
         _metadata = getattr(dataset, "metadata", {})
@@ -81,8 +87,7 @@ class Select(AnnotatedDataset[_TDatum]):
             _metadata["id"] = dataset.__class__.__name__
         self._metadata = DatasetMetadata(**_metadata)
-        if self._selections:
-            self._apply_selections()
+        self._select()
     @property
     def metadata(self) -> DatasetMetadata:
@@ -92,10 +97,11 @@ class Select(AnnotatedDataset[_TDatum]):
         nt = "\n    "
         title = f"{self.__class__.__name__} Dataset"
         sep = "-" * len(title)
-        selections = f"Selections: [{', '.join([str(s) for s in self._sort_selections(self._selections)])}]"
-        return f"{title}\n{sep}{nt}{selections}\n\n{self._dataset}"
+        selections = f"Selections: [{', '.join([str(s) for s in self._selections])}]"
+        transforms = f"Transforms: [{', '.join([str(t) for t in self._transforms])}]"
+        return f"{title}\n{sep}{nt}{selections}{nt}{transforms}{nt}Selected Size: {len(self)}\n\n{self._dataset}"
-    def _sort_selections(self, selections: Selection[_TDatum] | Sequence[Selection[_TDatum]] | None) -> list[Selection]:
+    def _sort(self, selections: Selection[_TDatum] | Sequence[Selection[_TDatum]] | None) -> list[Selection]:
         if not selections:
             return []
@@ -106,17 +112,18 @@ class Select(AnnotatedDataset[_TDatum]):
         selection_list = [selection for category in sorted(grouped) for selection in grouped[category]]
         return selection_list
-    def _apply_selections(self) -> None:
+    def _select(self) -> None:
         for selection in self._selections:
             selection(self)
         self._selection = self._selection[: self._size_limit]
-    def __getattr__(self, name: str, /) -> Any:
-        selfattr = getattr(self._dataset, name, None)
-        return selfattr if selfattr is not None else getattr(self._dataset, name)
+    def _transform(self, datum: _TDatum) -> _TDatum:
+        for t in self._transforms:
+            datum = t(datum)
+        return datum
     def __getitem__(self, index: int) -> _TDatum:
-        return self._dataset[self._selection[index]]
+        return self._transform(self._dataset[self._selection[index]])
     def __iter__(self) -> Iterator[_TDatum]:
         for i in range(len(self)):

dataeval/utils/data/_split.py CHANGED Viewed

@@ -12,6 +12,7 @@ from sklearn.metrics import silhouette_score
 from sklearn.model_selection import GroupKFold, KFold, StratifiedGroupKFold, StratifiedKFold
 from sklearn.utils.multiclass import type_of_target
+from dataeval.config import get_seed
 from dataeval.outputs._base import set_metadata
 from dataeval.outputs._utils import SplitDatasetOutput, TrainValSplit
@@ -212,9 +213,9 @@ def bin_kmeans(array: NDArray[Any]) -> NDArray[np.intp]:
         best_score = 0.50
     bin_index = np.zeros(len(array), dtype=np.intp)
     for k in range(2, 20):
-        clusterer = KMeans(n_clusters=k)
+        clusterer = KMeans(n_clusters=k, random_state=get_seed())
         cluster_labels = clusterer.fit_predict(array)
-        score = silhouette_score(array, cluster_labels, sample_size=25_000)
+        score = silhouette_score(array, cluster_labels, sample_size=25_000, random_state=get_seed())
         if score > best_score:
             best_score = score
             bin_index = cluster_labels.astype(np.intp)

dataeval/utils/data/datasets/_base.py CHANGED Viewed

@@ -4,7 +4,7 @@ __all__ = []
 from abc import abstractmethod
 from pathlib import Path
-from typing import Any, Generic, Iterator, Literal, NamedTuple, Sequence, TypeVar
+from typing import TYPE_CHECKING, Any, Generic, Iterator, Literal, NamedTuple, Sequence, TypeVar
 from dataeval.utils.data.datasets._fileio import _ensure_exists
 from dataeval.utils.data.datasets._mixin import BaseDatasetMixin
@@ -16,9 +16,11 @@ from dataeval.utils.data.datasets._types import (
     ObjectDetectionTarget,
     SegmentationDataset,
     SegmentationTarget,
-    Transform,
 )
+if TYPE_CHECKING:
+    from dataeval.typing import Transform
 _TArray = TypeVar("_TArray")
 _TTarget = TypeVar("_TTarget")
 _TRawTarget = TypeVar("_TRawTarget", list[int], list[str])

dataeval 0.82.0__py3-none-any.whl → 0.83.0__py3-none-any.whl

dataeval 0.82.0__py3-none-any.whl → 0.83.0__py3-none-any.whl