PyPI - dataeval - Versions diffs - 0.76.1__py3-none-any.whl → 0.82.0__py3-none-any.whl - Mend

dataeval 0.76.1__py3-none-any.whl → 0.82.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (113) hide show

dataeval/__init__.py +3 -3
dataeval/config.py +77 -0
dataeval/detectors/__init__.py +1 -1
dataeval/detectors/drift/__init__.py +6 -6
dataeval/detectors/drift/{base.py → _base.py} +40 -85
dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
dataeval/detectors/drift/{mmd.py → _mmd.py} +31 -43
dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +24 -7
dataeval/detectors/drift/updates.py +20 -3
dataeval/detectors/linters/__init__.py +3 -5
dataeval/detectors/linters/duplicates.py +13 -36
dataeval/detectors/linters/outliers.py +23 -148
dataeval/detectors/ood/__init__.py +1 -1
dataeval/detectors/ood/ae.py +30 -9
dataeval/detectors/ood/base.py +5 -4
dataeval/detectors/ood/mixin.py +21 -7
dataeval/detectors/ood/vae.py +73 -0
dataeval/metadata/__init__.py +6 -0
dataeval/metadata/_distance.py +167 -0
dataeval/metadata/_ood.py +217 -0
dataeval/metadata/_utils.py +44 -0
dataeval/metrics/__init__.py +1 -1
dataeval/metrics/bias/__init__.py +6 -4
dataeval/metrics/bias/{balance.py → _balance.py} +15 -101
dataeval/metrics/bias/_coverage.py +98 -0
dataeval/metrics/bias/{diversity.py → _diversity.py} +18 -111
dataeval/metrics/bias/{parity.py → _parity.py} +39 -77
dataeval/metrics/estimators/__init__.py +15 -4
dataeval/metrics/estimators/{ber.py → _ber.py} +42 -29
dataeval/metrics/estimators/_clusterer.py +44 -0
dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -30
dataeval/metrics/estimators/{uap.py → _uap.py} +4 -18
dataeval/metrics/stats/__init__.py +16 -13
dataeval/metrics/stats/{base.py → _base.py} +82 -133
dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +15 -18
dataeval/metrics/stats/_dimensionstats.py +75 -0
dataeval/metrics/stats/{hashstats.py → _hashstats.py} +21 -37
dataeval/metrics/stats/_imagestats.py +94 -0
dataeval/metrics/stats/_labelstats.py +131 -0
dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +19 -50
dataeval/metrics/stats/{visualstats.py → _visualstats.py} +23 -54
dataeval/outputs/__init__.py +53 -0
dataeval/{output.py → outputs/_base.py} +55 -25
dataeval/outputs/_bias.py +381 -0
dataeval/outputs/_drift.py +83 -0
dataeval/outputs/_estimators.py +114 -0
dataeval/outputs/_linters.py +184 -0
dataeval/{detectors/ood/output.py → outputs/_ood.py} +22 -22
dataeval/outputs/_stats.py +387 -0
dataeval/outputs/_utils.py +44 -0
dataeval/outputs/_workflows.py +364 -0
dataeval/typing.py +234 -0
dataeval/utils/__init__.py +2 -2
dataeval/utils/_array.py +169 -0
dataeval/utils/_bin.py +199 -0
dataeval/utils/_clusterer.py +144 -0
dataeval/utils/_fast_mst.py +189 -0
dataeval/utils/{image.py → _image.py} +6 -4
dataeval/utils/_method.py +14 -0
dataeval/utils/{shared.py → _mst.py} +3 -65
dataeval/utils/{plot.py → _plot.py} +6 -6
dataeval/utils/data/__init__.py +26 -0
dataeval/utils/data/_dataset.py +217 -0
dataeval/utils/data/_embeddings.py +104 -0
dataeval/utils/data/_images.py +68 -0
dataeval/utils/data/_metadata.py +360 -0
dataeval/utils/data/_selection.py +126 -0
dataeval/utils/{dataset/split.py → data/_split.py} +12 -38
dataeval/utils/data/_targets.py +85 -0
dataeval/utils/data/collate.py +103 -0
dataeval/utils/data/datasets/__init__.py +17 -0
dataeval/utils/data/datasets/_base.py +254 -0
dataeval/utils/data/datasets/_cifar10.py +134 -0
dataeval/utils/data/datasets/_fileio.py +168 -0
dataeval/utils/data/datasets/_milco.py +153 -0
dataeval/utils/data/datasets/_mixin.py +56 -0
dataeval/utils/data/datasets/_mnist.py +183 -0
dataeval/utils/data/datasets/_ships.py +123 -0
dataeval/utils/data/datasets/_types.py +52 -0
dataeval/utils/data/datasets/_voc.py +352 -0
dataeval/utils/data/selections/__init__.py +15 -0
dataeval/utils/data/selections/_classfilter.py +57 -0
dataeval/utils/data/selections/_indices.py +26 -0
dataeval/utils/data/selections/_limit.py +26 -0
dataeval/utils/data/selections/_reverse.py +18 -0
dataeval/utils/data/selections/_shuffle.py +29 -0
dataeval/utils/metadata.py +51 -376
dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
dataeval/utils/torch/{internal.py → _internal.py} +21 -51
dataeval/utils/torch/models.py +43 -2
dataeval/workflows/__init__.py +2 -1
dataeval/workflows/sufficiency.py +11 -346
{dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/METADATA +5 -2
dataeval-0.82.0.dist-info/RECORD +104 -0
dataeval/detectors/linters/clusterer.py +0 -512
dataeval/detectors/linters/merged_stats.py +0 -49
dataeval/detectors/ood/metadata_ks_compare.py +0 -129
dataeval/detectors/ood/metadata_least_likely.py +0 -119
dataeval/interop.py +0 -69
dataeval/metrics/bias/coverage.py +0 -194
dataeval/metrics/stats/datasetstats.py +0 -202
dataeval/metrics/stats/dimensionstats.py +0 -115
dataeval/metrics/stats/labelstats.py +0 -210
dataeval/utils/dataset/__init__.py +0 -7
dataeval/utils/dataset/datasets.py +0 -412
dataeval/utils/dataset/read.py +0 -63
dataeval-0.76.1.dist-info/RECORD +0 -67
/dataeval/{log.py → _log.py} +0 -0
/dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
{dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/LICENSE.txt +0 -0
{dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/WHEEL +0 -0

dataeval/__init__.py CHANGED Viewed

@@ -7,12 +7,12 @@ shifts that impact performance of deployed models.
 from __future__ import annotations
-__all__ = ["detectors", "log", "metrics", "utils", "workflows"]
-__version__ = "0.76.1"
+__all__ = ["config", "detectors", "log", "metrics", "typing", "utils", "workflows"]
+__version__ = "0.82.0"
 import logging
-from dataeval import detectors, metrics, utils, workflows
+from dataeval import config, detectors, metrics, typing, utils, workflows
 logging.getLogger(__name__).addHandler(logging.NullHandler())

dataeval/config.py ADDED Viewed

@@ -0,0 +1,77 @@
+"""
+Global configuration settings for DataEval.
+"""
+from __future__ import annotations
+__all__ = ["get_device", "set_device", "get_max_processes", "set_max_processes"]
+import torch
+from torch import device
+_device: device | None = None
+_processes: int | None = None
+def set_device(device: str | device | int) -> None:
+    """
+    Sets the default device to use when executing against a PyTorch backend.
+    Parameters
+    ----------
+    device : str or int or `torch.device`
+        The default device to use. See `torch.device <https://pytorch.org/docs/stable/tensor_attributes.html#torch.device>`_
+        documentation for more information.
+    """
+    global _device
+    _device = torch.device(device)
+def get_device(override: str | device | int | None = None) -> torch.device:
+    """
+    Returns the PyTorch device to use.
+    Parameters
+    ----------
+    override : str or int or `torch.device` or None, default None
+        The user specified override if provided, otherwise returns the default device.
+    Returns
+    -------
+    `torch.device`
+    """
+    if override is None:
+        global _device
+        return torch.get_default_device() if _device is None else _device
+    else:
+        return torch.device(override)
+def set_max_processes(processes: int | None) -> None:
+    """
+    Sets the maximum number of worker processes to use when running tasks that support parallel processing.
+    Parameters
+    ----------
+    processes : int or None
+        The maximum number of worker processes to use, or None to use
+        `os.process_cpu_count <https://docs.python.org/3/library/os.html#os.process_cpu_count>`_
+        to determine the number of worker processes.
+    """
+    global _processes
+    _processes = processes
+def get_max_processes() -> int | None:
+    """
+    Returns the maximum number of worker processes to use when running tasks that support parallel processing.
+    Returns
+    -------
+    int or None
+        The maximum number of worker processes to use, or None to use
+        `os.process_cpu_count <https://docs.python.org/3/library/os.html#os.process_cpu_count>`_
+        to determine the number of worker processes.
+    """
+    global _processes
+    return _processes

dataeval/detectors/__init__.py CHANGED Viewed

@@ -4,4 +4,4 @@ Detectors can determine if a dataset or individual images in a dataset are indic
 __all__ = ["drift", "linters", "ood"]
-from dataeval.detectors import drift, linters, ood
+from . import drift, linters, ood

dataeval/detectors/drift/__init__.py CHANGED Viewed

@@ -14,9 +14,9 @@ __all__ = [
 ]
 from dataeval.detectors.drift import updates
-from dataeval.detectors.drift.base import DriftOutput
-from dataeval.detectors.drift.cvm import DriftCVM
-from dataeval.detectors.drift.ks import DriftKS
-from dataeval.detectors.drift.mmd import DriftMMD, DriftMMDOutput
-from dataeval.detectors.drift.torch import preprocess_drift
-from dataeval.detectors.drift.uncertainty import DriftUncertainty
+from dataeval.detectors.drift._cvm import DriftCVM
+from dataeval.detectors.drift._ks import DriftKS
+from dataeval.detectors.drift._mmd import DriftMMD
+from dataeval.detectors.drift._torch import preprocess_drift
+from dataeval.detectors.drift._uncertainty import DriftUncertainty
+from dataeval.outputs._drift import DriftMMDOutput, DriftOutput

dataeval/detectors/drift/{base.py → _base.py} RENAMED Viewed

@@ -10,86 +10,29 @@ from __future__ import annotations
 __all__ = []
-from abc import ABC, abstractmethod
-from dataclasses import dataclass
+import math
+from abc import abstractmethod
 from functools import wraps
-from typing import Any, Callable, Literal, TypeVar
+from typing import Any, Callable, Literal, Protocol, TypeVar, runtime_checkable
 import numpy as np
-from numpy.typing import ArrayLike, NDArray
+from numpy.typing import NDArray
-from dataeval.interop import as_numpy
-from dataeval.output import Output, set_metadata
+from dataeval.outputs import DriftOutput
+from dataeval.outputs._base import set_metadata
+from dataeval.typing import Array, ArrayLike
+from dataeval.utils._array import as_numpy, to_numpy
 R = TypeVar("R")
-class UpdateStrategy(ABC):
+@runtime_checkable
+class UpdateStrategy(Protocol):
     """
-    Updates reference dataset for drift detector
-    Parameters
-    ----------
-    n : int
-        Update with last n instances seen by the detector.
-    """
-    def __init__(self, n: int) -> None:
-        self.n = n
-    @abstractmethod
-    def __call__(self, x_ref: NDArray[Any], x: NDArray[Any], count: int) -> NDArray[Any]:
-        """Abstract implementation of update strategy"""
-@dataclass(frozen=True)
-class DriftBaseOutput(Output):
-    """
-    Base output class for Drift Detector classes
-    Attributes
-    ----------
-    is_drift : bool
-        Drift prediction for the images
-    threshold : float
-        Threshold after multivariate correction if needed
+    Protocol for reference dataset update strategy for drift detectors
     """
-    is_drift: bool
-    threshold: float
-    p_val: float
-    distance: float
-@dataclass(frozen=True)
-class DriftOutput(DriftBaseOutput):
-    """
-    Output class for :class:`DriftCVM`, :class:`DriftKS`, and :class:`DriftUncertainty` drift detectors.
-    Attributes
-    ----------
-    is_drift : bool
-        :term:`Drift` prediction for the images
-    threshold : float
-        Threshold after multivariate correction if needed
-    feature_drift : NDArray
-        Feature-level array of images detected to have drifted
-    feature_threshold : float
-        Feature-level threshold to determine drift
-    p_vals : NDArray
-        Feature-level p-values
-    distances : NDArray
-        Feature-level distances
-    """
-    # is_drift: bool
-    # threshold: float
-    # p_val: float
-    # distance: float
-    feature_drift: NDArray[np.bool_]
-    feature_threshold: float
-    p_vals: NDArray[np.float32]
-    distances: NDArray[np.float32]
+    def __call__(self, x_ref: NDArray[Any], x: NDArray[Any], count: int) -> NDArray[Any]: ...
 def update_x_ref(fn: Callable[..., R]) -> Callable[..., R]:
@@ -196,7 +139,7 @@ class BaseDrift:
         if correction not in ["bonferroni", "fdr"]:
             raise ValueError("`correction` must be `bonferroni` or `fdr`.")
-        self._x_ref = as_numpy(x_ref)
+        self._x_ref = x_ref
         self.x_ref_preprocessed: bool = x_ref_preprocessed
         # Other attributes
@@ -204,25 +147,25 @@ class BaseDrift:
         self.update_x_ref = update_x_ref
         self.preprocess_fn = preprocess_fn
         self.correction = correction
-        self.n: int = len(self._x_ref)
+        self.n: int = len(x_ref)
         # Ref counter for preprocessed x
         self._x_refcount = 0
     @property
-    def x_ref(self) -> NDArray[Any]:
+    def x_ref(self) -> ArrayLike:
         """
         Retrieve the reference data, applying preprocessing if not already done.
         Returns
         -------
-        NDArray
+        ArrayLike
             The reference dataset (`x_ref`), preprocessed if needed.
         """
         if not self.x_ref_preprocessed:
             self.x_ref_preprocessed = True
             if self.preprocess_fn is not None:
-                self._x_ref = as_numpy(self.preprocess_fn(self._x_ref))
+                self._x_ref = self.preprocess_fn(self._x_ref)
         return self._x_ref
@@ -323,32 +266,44 @@ class BaseDriftUnivariate(BaseDrift):
         # lazy process n_features as needed
         if not isinstance(self._n_features, int):
             # compute number of features for the univariate tests
-            if not isinstance(self.preprocess_fn, Callable) or self.x_ref_preprocessed:
-                # infer features from preprocessed reference data
-                self._n_features = self.x_ref.reshape(self.x_ref.shape[0], -1).shape[-1]
-            else:
-                # infer number of features after applying preprocessing step
-                x = as_numpy(self.preprocess_fn(self._x_ref[0:1]))  # type: ignore
-                self._n_features = x.reshape(x.shape[0], -1).shape[-1]
+            x_ref = (
+                self.x_ref
+                if self.preprocess_fn is None or self.x_ref_preprocessed
+                else self.preprocess_fn(self._x_ref[0:1])
+            )
+            # infer features from preprocessed reference data
+            shape = x_ref.shape if isinstance(x_ref, Array) else as_numpy(x_ref).shape
+            self._n_features = int(math.prod(shape[1:]))  # Multiplies all channel sizes after first
         return self._n_features
     @preprocess_x
-    @abstractmethod
     def score(self, x: ArrayLike) -> tuple[NDArray[np.float32], NDArray[np.float32]]:
         """
-        Abstract method to calculate feature scores after preprocessing.
+        Calculates p-values and test statistics per feature.
         Parameters
         ----------
         x : ArrayLike
-            The batch of data to calculate univariate :term:`drift<Drift>` scores for each feature.
+            Batch of instances
         Returns
         -------
         tuple[NDArray, NDArray]
-            A tuple containing p-values and distance :term:`statistics<Statistics>` for each feature.
+            Feature level p-values and test statistics
         """
+        x_np = to_numpy(x)
+        x_np = x_np.reshape(x_np.shape[0], -1)
+        x_ref_np = as_numpy(self.x_ref)
+        x_ref_np = x_ref_np.reshape(x_ref_np.shape[0], -1)
+        p_val = np.zeros(self.n_features, dtype=np.float32)
+        dist = np.zeros_like(p_val)
+        for f in range(self.n_features):
+            dist[f], p_val[f] = self._score_fn(x_ref_np[:, f], x_np[:, f])
+        return p_val, dist
+    @abstractmethod
+    def _score_fn(self, x: NDArray[np.float32], y: NDArray[np.float32]) -> tuple[np.float32, np.float32]: ...
     def _apply_correction(self, p_vals: NDArray) -> tuple[bool, float]:
         """

dataeval/detectors/drift/{cvm.py → _cvm.py} RENAMED Viewed

@@ -13,11 +13,11 @@ __all__ = []
 from typing import Callable, Literal
 import numpy as np
-from numpy.typing import ArrayLike, NDArray
+from numpy.typing import NDArray
 from scipy.stats import cramervonmises_2samp
-from dataeval.detectors.drift.base import BaseDriftUnivariate, UpdateStrategy, preprocess_x
-from dataeval.interop import to_numpy
+from dataeval.detectors.drift._base import BaseDriftUnivariate, UpdateStrategy
+from dataeval.typing import ArrayLike
 class DriftCVM(BaseDriftUnivariate):
@@ -55,6 +55,21 @@ class DriftCVM(BaseDriftUnivariate):
         Number of features used in the statistical test. No need to pass it if no
         preprocessing takes place. In case of a preprocessing step, this can also
         be inferred automatically but could be more expensive to compute.
+    Example
+    -------
+    >>> from functools import partial
+    >>> from dataeval.detectors.drift import preprocess_drift
+    Use a preprocess function to encode images before testing for drift
+    >>> preprocess_fn = partial(preprocess_drift, model=encoder, batch_size=64)
+    >>> drift = DriftCVM(train_images, preprocess_fn=preprocess_fn)
+    Test incoming images for drift
+    >>> drift.predict(test_images).drifted
+    True
     """
     def __init__(
@@ -77,28 +92,6 @@ class DriftCVM(BaseDriftUnivariate):
             n_features=n_features,
         )
-    @preprocess_x
-    def score(self, x: ArrayLike) -> tuple[NDArray[np.float32], NDArray[np.float32]]:
-        """
-        Performs the two-sample Cramér-von Mises test(s), computing the :term:`p-value<P-value>` and
-        test statistic per feature.
-        Parameters
-        ----------
-        x : ArrayLike
-            Batch of instances.
-        Returns
-        -------
-        tuple[NDArray, NDArray]
-            Feature level p-values and CVM statistic
-        """
-        x_np = to_numpy(x)
-        x_np = x_np.reshape(x_np.shape[0], -1)
-        x_ref = self.x_ref.reshape(self.x_ref.shape[0], -1)
-        p_val = np.zeros(self.n_features, dtype=np.float32)
-        dist = np.zeros_like(p_val)
-        for f in range(self.n_features):
-            result = cramervonmises_2samp(x_ref[:, f], x_np[:, f], method="auto")
-            p_val[f], dist[f] = result.pvalue, result.statistic
-        return p_val, dist
+    def _score_fn(self, x: NDArray[np.float32], y: NDArray[np.float32]) -> tuple[np.float32, np.float32]:
+        result = cramervonmises_2samp(x, y, method="auto")
+        return np.float32(result.statistic), np.float32(result.pvalue)

dataeval/detectors/drift/{ks.py → _ks.py} RENAMED Viewed

@@ -13,11 +13,11 @@ __all__ = []
 from typing import Callable, Literal
 import numpy as np
-from numpy.typing import ArrayLike, NDArray
+from numpy.typing import NDArray
 from scipy.stats import ks_2samp
-from dataeval.detectors.drift.base import BaseDriftUnivariate, UpdateStrategy, preprocess_x
-from dataeval.interop import to_numpy
+from dataeval.detectors.drift._base import BaseDriftUnivariate, UpdateStrategy
+from dataeval.typing import ArrayLike
 class DriftKS(BaseDriftUnivariate):
@@ -58,6 +58,21 @@ class DriftKS(BaseDriftUnivariate):
         Number of features used in the statistical test. No need to pass it if no
         preprocessing takes place. In case of a preprocessing step, this can also
         be inferred automatically but could be more expensive to compute.
+    Example
+    -------
+    >>> from functools import partial
+    >>> from dataeval.detectors.drift import preprocess_drift
+    Use a preprocess function to encode images before testing for drift
+    >>> preprocess_fn = partial(preprocess_drift, model=encoder, batch_size=64)
+    >>> drift = DriftKS(train_images, preprocess_fn=preprocess_fn)
+    Test incoming images for drift
+    >>> drift.predict(test_images).drifted
+    True
     """
     def __init__(
@@ -84,26 +99,5 @@ class DriftKS(BaseDriftUnivariate):
         # Other attributes
         self.alternative = alternative
-    @preprocess_x
-    def score(self, x: ArrayLike) -> tuple[NDArray[np.float32], NDArray[np.float32]]:
-        """
-        Compute KS scores and :term:Statistics` per feature.
-        Parameters
-        ----------
-        x : ArrayLike
-            Batch of instances.
-        Returns
-        -------
-        tuple[NDArray, NDArray]
-            Feature level :term:p-values and KS statistic
-        """
-        x = to_numpy(x)
-        x = x.reshape(x.shape[0], -1)
-        x_ref = self.x_ref.reshape(self.x_ref.shape[0], -1)
-        p_val = np.zeros(self.n_features, dtype=np.float32)
-        dist = np.zeros_like(p_val)
-        for f in range(self.n_features):
-            dist[f], p_val[f] = ks_2samp(x_ref[:, f], x[:, f], alternative=self.alternative, method="exact")
-        return p_val, dist
+    def _score_fn(self, x: NDArray[np.float32], y: NDArray[np.float32]) -> tuple[np.float32, np.float32]:
+        return ks_2samp(x, y, alternative=self.alternative, method="exact")

dataeval/detectors/drift/{mmd.py → _mmd.py} RENAMED Viewed

@@ -10,43 +10,16 @@ from __future__ import annotations
 __all__ = []
-from dataclasses import dataclass
 from typing import Callable
 import torch
-from numpy.typing import ArrayLike
-from dataeval.detectors.drift.base import BaseDrift, DriftBaseOutput, UpdateStrategy, preprocess_x, update_x_ref
-from dataeval.detectors.drift.torch import GaussianRBF, mmd2_from_kernel_matrix
-from dataeval.interop import as_numpy
-from dataeval.output import set_metadata
-from dataeval.utils.torch.internal import get_device
-@dataclass(frozen=True)
-class DriftMMDOutput(DriftBaseOutput):
-    """
-    Output class for :class:`DriftMMD` :term:`drift<Drift>` detector.
-    Attributes
-    ----------
-    is_drift : bool
-        Drift prediction for the images
-    threshold : float
-        :term:`P-Value` used for significance of the permutation test
-    p_val : float
-        P-value obtained from the permutation test
-    distance : float
-        MMD^2 between the reference and test set
-    distance_threshold : float
-        MMD^2 threshold above which drift is flagged
-    """
-    # is_drift: bool
-    # threshold: float
-    # p_val: float
-    # distance: float
-    distance_threshold: float
+from dataeval.config import get_device
+from dataeval.detectors.drift._base import BaseDrift, UpdateStrategy, preprocess_x, update_x_ref
+from dataeval.detectors.drift._torch import GaussianRBF, mmd2_from_kernel_matrix
+from dataeval.outputs import DriftMMDOutput
+from dataeval.outputs._base import set_metadata
+from dataeval.typing import ArrayLike
 class DriftMMD(BaseDrift):
@@ -84,6 +57,21 @@ class DriftMMD(BaseDrift):
     device : str | None, default None
         Device type used. The default None uses the GPU and falls back on CPU.
         Can be specified by passing either 'cuda', 'gpu' or 'cpu'.
+    Example
+    -------
+    >>> from functools import partial
+    >>> from dataeval.detectors.drift import preprocess_drift
+    Use a preprocess function to encode images before testing for drift
+    >>> preprocess_fn = partial(preprocess_drift, model=encoder, batch_size=64)
+    >>> drift = DriftMMD(train_images, preprocess_fn=preprocess_fn)
+    Test incoming images for drift
+    >>> drift.predict(test_images).drifted
+    True
     """
     def __init__(
@@ -110,12 +98,12 @@ class DriftMMD(BaseDrift):
         self.device: torch.device = get_device(device)
         # initialize kernel
-        sigma_tensor = torch.from_numpy(as_numpy(sigma)).to(self.device) if sigma is not None else None
+        sigma_tensor = torch.as_tensor(sigma, device=self.device) if sigma is not None else None
         self._kernel = GaussianRBF(sigma_tensor).to(self.device)
         # compute kernel matrix for the reference data
         if self._infer_sigma or isinstance(sigma_tensor, torch.Tensor):
-            x = torch.from_numpy(self.x_ref).to(self.device)
+            x = torch.as_tensor(self.x_ref, device=self.device)
             self._k_xx = self._kernel(x, x, infer_sigma=self._infer_sigma)
             self._infer_sigma = False
         else:
@@ -147,21 +135,21 @@ class DriftMMD(BaseDrift):
             p-value obtained from the permutation test, MMD^2 between the reference and test set,
             and MMD^2 threshold above which :term:`drift<Drift>` is flagged
         """
-        x = as_numpy(x)
-        x_ref = torch.from_numpy(self.x_ref).to(self.device)
-        n = x.shape[0]
-        kernel_mat = self._kernel_matrix(x_ref, torch.from_numpy(x).to(self.device))
+        x_ref = torch.as_tensor(self.x_ref, device=self.device)
+        x_test = torch.as_tensor(x, device=self.device)
+        n = x_test.shape[0]
+        kernel_mat = self._kernel_matrix(x_ref, x_test)
         kernel_mat = kernel_mat - torch.diag(kernel_mat.diag())  # zero diagonal
         mmd2 = mmd2_from_kernel_matrix(kernel_mat, n, permute=False, zero_diag=False)
-        mmd2_permuted = torch.Tensor(
-            [mmd2_from_kernel_matrix(kernel_mat, n, permute=True, zero_diag=False) for _ in range(self.n_permutations)]
+        mmd2_permuted = torch.tensor(
+            [mmd2_from_kernel_matrix(kernel_mat, n, permute=True, zero_diag=False)] * self.n_permutations,
+            device=self.device,
         )
-        mmd2, mmd2_permuted = mmd2.detach().cpu(), mmd2_permuted.detach().cpu()
         p_val = (mmd2 <= mmd2_permuted).float().mean()
         # compute distance threshold
         idx_threshold = int(self.p_val * len(mmd2_permuted))
         distance_threshold = torch.sort(mmd2_permuted, descending=True).values[idx_threshold]
-        return p_val.numpy().item(), mmd2.numpy().item(), distance_threshold.numpy().item()
+        return float(p_val.item()), float(mmd2.item()), float(distance_threshold.item())
     @set_metadata
     @preprocess_x

dataeval/detectors/drift/{torch.py → _torch.py} RENAMED Viewed

@@ -17,7 +17,8 @@ import torch
 import torch.nn as nn
 from numpy.typing import NDArray
-from dataeval.utils.torch.internal import get_device, predict_batch
+from dataeval.config import get_device
+from dataeval.utils.torch._internal import predict_batch
 def mmd2_from_kernel_matrix(

dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} RENAMED Viewed

@@ -14,14 +14,16 @@ from functools import partial
 from typing import Callable, Literal
 import numpy as np
-from numpy.typing import ArrayLike, NDArray
+from numpy.typing import NDArray
 from scipy.special import softmax
 from scipy.stats import entropy
-from dataeval.detectors.drift.base import DriftOutput, UpdateStrategy
-from dataeval.detectors.drift.ks import DriftKS
-from dataeval.detectors.drift.torch import preprocess_drift
-from dataeval.utils.torch.internal import get_device
+from dataeval.config import get_device
+from dataeval.detectors.drift._base import UpdateStrategy
+from dataeval.detectors.drift._ks import DriftKS
+from dataeval.detectors.drift._torch import preprocess_drift
+from dataeval.outputs import DriftOutput
+from dataeval.typing import ArrayLike
 def classifier_uncertainty(
@@ -87,7 +89,7 @@ class DriftUncertainty:
         Reference data can optionally be updated using an UpdateStrategy class. Update
         using the last n instances seen by the detector with LastSeenUpdateStrategy
         or via reservoir sampling with ReservoirSamplingUpdateStrategy.
-    preds_type : "probs" | "logits", default "logits"
+    preds_type : "probs" | "logits", default "probs"
         Type of prediction output by the model. Options are 'probs' (in [0,1]) or
         'logits' (in [-inf,inf]).
     batch_size : int, default 32
@@ -98,7 +100,22 @@ class DriftUncertainty:
         objects to a batch which can be processed by the model.
     device : str | None, default None
         Device type used. The default None tries to use the GPU and falls back on
-        CPU if needed. Can be specified by passing either 'cuda', 'gpu' or 'cpu'.
+        CPU if needed. Can be specified by passing either 'cuda' or 'cpu'.
+    Example
+    -------
+    >>> model = ClassificationModel()
+    >>> drift = DriftUncertainty(x_ref, model=model, batch_size=20)
+    Verify reference images have not drifted
+    >>> drift.predict(x_ref.copy()).drifted
+    False
+    Test incoming images for drift
+    >>> drift.predict(x_test).drifted
+    True
     """
     def __init__(

dataeval/detectors/drift/updates.py CHANGED Viewed

@@ -7,15 +7,32 @@ from __future__ import annotations
 __all__ = ["LastSeenUpdate", "ReservoirSamplingUpdate"]
+from abc import ABC, abstractmethod
 from typing import Any
 import numpy as np
 from numpy.typing import NDArray
-from dataeval.detectors.drift.base import UpdateStrategy
+class BaseUpdateStrategy(ABC):
+    """
+    Updates reference dataset for drift detector
+    Parameters
+    ----------
+    n : int
+        Update with last n instances seen by the detector.
+    """
+    def __init__(self, n: int) -> None:
+        self.n = n
+    @abstractmethod
+    def __call__(self, x_ref: NDArray[Any], x: NDArray[Any], count: int) -> NDArray[Any]:
+        """Abstract implementation of update strategy"""
-class LastSeenUpdate(UpdateStrategy):
+class LastSeenUpdate(BaseUpdateStrategy):
     """
     Updates reference dataset for :term:`drift<Drift>` detector using last seen method.
@@ -30,7 +47,7 @@ class LastSeenUpdate(UpdateStrategy):
         return x_updated[-self.n :]
-class ReservoirSamplingUpdate(UpdateStrategy):
+class ReservoirSamplingUpdate(BaseUpdateStrategy):
     """
     Updates reference dataset for :term:`drift<Drift>` detector using reservoir sampling method.

dataeval 0.76.1__py3-none-any.whl → 0.82.0__py3-none-any.whl

dataeval 0.76.1__py3-none-any.whl → 0.82.0__py3-none-any.whl