PyPI - dataeval - Versions diffs - 0.84.0__py3-none-any.whl → 1.0.0__py3-none-any.whl - Mend

dataeval 0.84.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

dataeval/__init__.py +1 -1
dataeval/data/__init__.py +19 -0
dataeval/data/_embeddings.py +345 -0
dataeval/{utils/data → data}/_images.py +2 -2
dataeval/{utils/data → data}/_metadata.py +8 -7
dataeval/{utils/data → data}/_selection.py +22 -9
dataeval/{utils/data → data}/_split.py +1 -1
dataeval/data/selections/__init__.py +19 -0
dataeval/data/selections/_classbalance.py +37 -0
dataeval/data/selections/_classfilter.py +109 -0
dataeval/{utils/data → data}/selections/_indices.py +1 -1
dataeval/{utils/data → data}/selections/_limit.py +1 -1
dataeval/{utils/data → data}/selections/_prioritize.py +3 -3
dataeval/{utils/data → data}/selections/_reverse.py +1 -1
dataeval/{utils/data → data}/selections/_shuffle.py +3 -3
dataeval/detectors/drift/__init__.py +2 -2
dataeval/detectors/drift/_base.py +55 -203
dataeval/detectors/drift/_cvm.py +19 -30
dataeval/detectors/drift/_ks.py +18 -30
dataeval/detectors/drift/_mmd.py +189 -53
dataeval/detectors/drift/_uncertainty.py +52 -56
dataeval/detectors/drift/updates.py +13 -12
dataeval/detectors/linters/duplicates.py +6 -4
dataeval/detectors/linters/outliers.py +3 -3
dataeval/detectors/ood/ae.py +1 -1
dataeval/metadata/_distance.py +1 -1
dataeval/metadata/_ood.py +4 -4
dataeval/metrics/bias/_balance.py +1 -1
dataeval/metrics/bias/_diversity.py +1 -1
dataeval/metrics/bias/_parity.py +1 -1
dataeval/metrics/stats/_base.py +7 -7
dataeval/metrics/stats/_dimensionstats.py +2 -2
dataeval/metrics/stats/_hashstats.py +2 -2
dataeval/metrics/stats/_imagestats.py +4 -4
dataeval/metrics/stats/_labelstats.py +2 -2
dataeval/metrics/stats/_pixelstats.py +2 -2
dataeval/metrics/stats/_visualstats.py +2 -2
dataeval/outputs/_bias.py +1 -1
dataeval/typing.py +53 -19
dataeval/utils/__init__.py +2 -2
dataeval/utils/_array.py +18 -7
dataeval/utils/data/__init__.py +5 -20
dataeval/utils/data/_dataset.py +6 -4
dataeval/utils/data/collate.py +2 -0
dataeval/utils/datasets/__init__.py +17 -0
dataeval/utils/{data/datasets → datasets}/_base.py +10 -7
dataeval/utils/{data/datasets → datasets}/_cifar10.py +11 -11
dataeval/utils/{data/datasets → datasets}/_milco.py +44 -16
dataeval/utils/{data/datasets → datasets}/_mnist.py +11 -7
dataeval/utils/{data/datasets → datasets}/_ships.py +10 -6
dataeval/utils/{data/datasets → datasets}/_voc.py +43 -22
dataeval/utils/torch/_internal.py +12 -35
{dataeval-0.84.0.dist-info → dataeval-1.0.0.dist-info}/METADATA +2 -3
dataeval-1.0.0.dist-info/RECORD +107 -0
dataeval/detectors/drift/_torch.py +0 -222
dataeval/utils/data/_embeddings.py +0 -186
dataeval/utils/data/datasets/__init__.py +0 -17
dataeval/utils/data/selections/__init__.py +0 -17
dataeval/utils/data/selections/_classfilter.py +0 -59
dataeval-0.84.0.dist-info/RECORD +0 -106
/dataeval/{utils/data → data}/_targets.py +0 -0
/dataeval/utils/{metadata.py → data/metadata.py} +0 -0
/dataeval/utils/{data/datasets → datasets}/_fileio.py +0 -0
/dataeval/utils/{data/datasets → datasets}/_mixin.py +0 -0
/dataeval/utils/{data/datasets → datasets}/_types.py +0 -0
{dataeval-0.84.0.dist-info → dataeval-1.0.0.dist-info}/LICENSE.txt +0 -0
{dataeval-0.84.0.dist-info → dataeval-1.0.0.dist-info}/WHEEL +0 -0

dataeval/data/selections/_classfilter.py ADDED Viewed

@@ -0,0 +1,109 @@
+from __future__ import annotations
+__all__ = []
+from typing import Any, Generic, Iterable, Sequence, Sized, TypeVar, cast
+import numpy as np
+from numpy.typing import NDArray
+from dataeval.data._selection import Select, Selection, SelectionStage, Subselection
+from dataeval.typing import Array, ObjectDetectionDatum, ObjectDetectionTarget, SegmentationDatum, SegmentationTarget
+from dataeval.utils._array import as_numpy
+from dataeval.utils.data.metadata import flatten
+class ClassFilter(Selection[Any]):
+    """
+    Filter the dataset by class.
+    Parameters
+    ----------
+    classes : Sequence[int]
+        The classes to filter by.
+    filter_detections : bool, default True
+        Whether to filter detections from targets for object detection and segmentation datasets.
+    """
+    stage = SelectionStage.FILTER
+    def __init__(self, classes: Sequence[int], filter_detections: bool = True) -> None:
+        self.classes = classes
+        self.filter_detections = filter_detections
+    def __call__(self, dataset: Select[Any]) -> None:
+        if not self.classes:
+            return
+        selection = []
+        subselection = set()
+        for idx in dataset._selection:
+            target = dataset._dataset[idx][1]
+            if isinstance(target, Array):
+                # Get the label for the image
+                label = int(np.argmax(as_numpy(target)))
+                # Check to see if the label is in the classes to filter for
+                if label in self.classes:
+                    # Include the image
+                    selection.append(idx)
+            elif isinstance(target, (ObjectDetectionTarget, SegmentationTarget)):
+                # Get the set of labels from the target
+                labels = set(target.labels if isinstance(target.labels, Iterable) else [target.labels])
+                # Check to see if any labels are in the classes to filter for
+                if labels.intersection(self.classes):
+                    # Include the image
+                    selection.append(idx)
+                    # If we are filtering out other labels and there are other labels, add a subselection filter
+                    if self.filter_detections and labels.difference(self.classes):
+                        subselection.add(idx)
+            else:
+                raise TypeError(f"ClassFilter does not support targets of type {type(target)}.")
+        dataset._selection = selection
+        dataset._subselections.append((ClassFilterSubSelection(self.classes), subselection))
+_T = TypeVar("_T")
+_TDatum = TypeVar("_TDatum", ObjectDetectionDatum, SegmentationDatum)
+_TTarget = TypeVar("_TTarget", ObjectDetectionTarget, SegmentationTarget)
+def _try_mask_object(obj: _T, mask: NDArray[np.bool_]) -> _T:
+    if isinstance(obj, Sized) and not isinstance(obj, (str, bytes, bytearray)) and len(obj) == len(mask):
+        if isinstance(obj, Array):
+            return obj[mask]
+        elif isinstance(obj, Sequence):
+            return cast(_T, [item for i, item in enumerate(obj) if mask[i]])
+    return obj
+class ClassFilterTarget(Generic[_TTarget]):
+    def __init__(self, target: _TTarget, mask: NDArray[np.bool_]) -> None:
+        self.__dict__.update(target.__dict__)
+        self._length = len(target.labels) if isinstance(target.labels, Sized) else int(bool(target.labels))
+        self._mask = mask
+        self._target = target
+    def __getattribute__(self, name: str) -> Any:
+        if name in ("_length", "_mask", "_target") or name.startswith("__") and name.endswith("__"):
+            return super().__getattribute__(name)
+        attr = getattr(self._target, name)
+        return _try_mask_object(attr, self._mask)
+class ClassFilterSubSelection(Subselection[Any]):
+    def __init__(self, classes: Sequence[int]) -> None:
+        self.classes = classes
+    def __call__(self, datum: _TDatum) -> _TDatum:
+        # build a mask for any arrays
+        image, target, metadata = datum
+        mask = np.isin(as_numpy(target.labels), self.classes)
+        flattened_metadata = flatten(metadata)[0]
+        filtered_metadata = {k: _try_mask_object(v, mask) for k, v in flattened_metadata.items()}
+        # return a masked datum
+        filtered_datum = image, ClassFilterTarget(target, mask), filtered_metadata
+        return cast(_TDatum, filtered_datum)

dataeval/{utils/data → data}/selections/_indices.py RENAMED Viewed

@@ -4,7 +4,7 @@ __all__ = []
 from typing import Any, Sequence
-from dataeval.utils.data._selection import Select, Selection, SelectionStage
+from dataeval.data._selection import Select, Selection, SelectionStage
 class Indices(Selection[Any]):

dataeval/{utils/data → data}/selections/_limit.py RENAMED Viewed

@@ -4,7 +4,7 @@ __all__ = []
 from typing import Any
-from dataeval.utils.data._selection import Select, Selection, SelectionStage
+from dataeval.data._selection import Select, Selection, SelectionStage
 class Limit(Selection[Any]):

dataeval/{utils/data → data}/selections/_prioritize.py RENAMED Viewed

@@ -14,8 +14,8 @@ from sklearn.cluster import KMeans
 from sklearn.metrics import pairwise_distances
 from dataeval.config import EPSILON, DeviceLike, get_seed
-from dataeval.utils.data import Embeddings, Select
-from dataeval.utils.data._selection import Selection, SelectionStage
+from dataeval.data import Embeddings, Select
+from dataeval.data._selection import Selection, SelectionStage
 _logger = logging.getLogger(__name__)
@@ -272,7 +272,7 @@ class Prioritize(Selection[Any]):
             return _KMeansComplexitySorter(samples, self._c)
     def _to_normalized_ndarray(self, embeddings: Embeddings, selection: list[int] | None = None) -> NDArray[Any]:
-        emb: NDArray[Any] = embeddings.to_tensor(selection).cpu().numpy()
+        emb: NDArray[Any] = embeddings.to_numpy(selection)
         emb /= max(np.max(np.linalg.norm(emb, axis=1)), EPSILON)
         return emb

dataeval/{utils/data → data}/selections/_reverse.py RENAMED Viewed

@@ -4,7 +4,7 @@ __all__ = []
 from typing import Any
-from dataeval.utils.data._selection import Select, Selection, SelectionStage
+from dataeval.data._selection import Select, Selection, SelectionStage
 class Reverse(Selection[Any]):

dataeval/{utils/data → data}/selections/_shuffle.py RENAMED Viewed

@@ -8,9 +8,9 @@ import numpy as np
 from numpy.random import BitGenerator, Generator, SeedSequence
 from numpy.typing import NDArray
-from dataeval.typing import Array, ArrayLike
+from dataeval.data._selection import Select, Selection, SelectionStage
+from dataeval.typing import Array
 from dataeval.utils._array import as_numpy
-from dataeval.utils.data._selection import Select, Selection, SelectionStage
 class Shuffle(Selection[Any]):
@@ -30,7 +30,7 @@ class Shuffle(Selection[Any]):
     seed: int | NDArray[Any] | SeedSequence | BitGenerator | Generator | None
     stage = SelectionStage.ORDER
-    def __init__(self, seed: int | ArrayLike | SeedSequence | BitGenerator | Generator | None = None):
+    def __init__(self, seed: int | Sequence[int] | Array | SeedSequence | BitGenerator | Generator | None = None):
         self.seed = as_numpy(seed) if isinstance(seed, (Sequence, Array)) else seed
     def __call__(self, dataset: Select[Any]) -> None:

dataeval/detectors/drift/__init__.py CHANGED Viewed

@@ -9,14 +9,14 @@ __all__ = [
     "DriftMMDOutput",
     "DriftOutput",
     "DriftUncertainty",
-    "preprocess_drift",
+    "UpdateStrategy",
     "updates",
 ]
 from dataeval.detectors.drift import updates
+from dataeval.detectors.drift._base import UpdateStrategy
 from dataeval.detectors.drift._cvm import DriftCVM
 from dataeval.detectors.drift._ks import DriftKS
 from dataeval.detectors.drift._mmd import DriftMMD
-from dataeval.detectors.drift._torch import preprocess_drift
 from dataeval.detectors.drift._uncertainty import DriftUncertainty
 from dataeval.outputs._drift import DriftMMDOutput, DriftOutput

dataeval/detectors/drift/_base.py CHANGED Viewed

@@ -13,15 +13,16 @@ __all__ = []
 import math
 from abc import abstractmethod
 from functools import wraps
-from typing import Any, Callable, Literal, Protocol, TypeVar, runtime_checkable
+from typing import Callable, Literal, Protocol, TypeVar, runtime_checkable
 import numpy as np
 from numpy.typing import NDArray
+from dataeval.data import Embeddings
 from dataeval.outputs import DriftOutput
 from dataeval.outputs._base import set_metadata
-from dataeval.typing import Array, ArrayLike
-from dataeval.utils._array import as_numpy, to_numpy
+from dataeval.typing import Array
+from dataeval.utils._array import as_numpy, flatten
 R = TypeVar("R")
@@ -32,220 +33,88 @@ class UpdateStrategy(Protocol):
     Protocol for reference dataset update strategy for drift detectors
     """
-    def __call__(self, x_ref: NDArray[Any], x: NDArray[Any], count: int) -> NDArray[Any]: ...
+    def __call__(self, x_ref: NDArray[np.float32], x_new: NDArray[np.float32], count: int) -> NDArray[np.float32]: ...
-def update_x_ref(fn: Callable[..., R]) -> Callable[..., R]:
+def update_strategy(fn: Callable[..., R]) -> Callable[..., R]:
     """Decorator to update x_ref with x using selected update methodology"""
     @wraps(fn)
-    def _(self, x, *args, **kwargs) -> R:
-        output = fn(self, x, *args, **kwargs)
+    def _(self: BaseDrift, data: Embeddings | Array, *args, **kwargs) -> R:
+        output = fn(self, data, *args, **kwargs)
         # update reference dataset
-        if self.update_x_ref is not None:
-            self._x_ref = self.update_x_ref(self.x_ref, x, self.n)
+        if self.update_strategy is not None:
+            self._x_ref = self.update_strategy(self.x_ref, self._encode(data), self.n)
+            self.n += len(data)
-        # used for reservoir sampling
-        self.n += len(x)
-        return output
-    return _
-def preprocess_x(fn: Callable[..., R]) -> Callable[..., R]:
-    """Decorator to run preprocess_fn on x before calling wrapped function"""
-    @wraps(fn)
-    def _(self, x, *args, **kwargs) -> R:
-        if self._x_refcount == 0:
-            self._x = self._preprocess(x)
-        self._x_refcount += 1
-        output = fn(self, self._x, *args, **kwargs)
-        self._x_refcount -= 1
-        if self._x_refcount == 0:
-            del self._x
         return output
     return _
 class BaseDrift:
-    """
-    A generic :term:`drift<Drift>` detection component for preprocessing data and applying statistical correction.
-    This class handles common tasks related to drift detection, such as preprocessing
-    the reference data (`x_ref`), performing statistical correction (e.g., Bonferroni, FDR),
-    and updating the reference data if needed.
-    Parameters
-    ----------
-    x_ref : ArrayLike
-        The reference dataset used for drift detection. This is the baseline data against
-        which new data points will be compared.
-    p_val : float, optional
-        The significance level for detecting drift, by default 0.05.
-    x_ref_preprocessed : bool, optional
-        Flag indicating whether the reference data has already been preprocessed, by default False.
-    update_x_ref : UpdateStrategy, optional
-        A strategy object specifying how the reference data should be updated when drift is detected,
-        by default None.
-    preprocess_fn : Callable[[ArrayLike], ArrayLike], optional
-        A function to preprocess the data before drift detection, by default None.
-    correction : {'bonferroni', 'fdr'}, optional
-        Statistical correction method applied to p-values, by default "bonferroni".
-    Attributes
-    ----------
-    _x_ref : ArrayLike
-        The reference dataset that is either raw or preprocessed.
-    p_val : float
-        The significance level for drift detection.
-    update_x_ref : UpdateStrategy or None
-        The strategy for updating the reference data if applicable.
-    preprocess_fn : Callable or None
-        Function used for preprocessing input data before drift detection.
-    correction : str
-        Statistical correction method applied to p-values.
-    n : int
-        The number of samples in the reference dataset (`x_ref`).
-    x_ref_preprocessed : bool
-        A flag that indicates whether the reference dataset has been preprocessed.
-    _x_refcount : int
-        Counter for how many times the reference data has been accessed after preprocessing.
-    Methods
-    -------
-    x_ref:
-        Property that returns the reference dataset, and applies preprocessing if not already done.
-    _preprocess(x):
-        Preprocesses the given data using the specified `preprocess_fn` if provided.
-    """
+    p_val: float
+    update_strategy: UpdateStrategy | None
+    correction: Literal["bonferroni", "fdr"]
+    n: int
     def __init__(
         self,
-        x_ref: ArrayLike,
+        data: Embeddings | Array,
         p_val: float = 0.05,
-        x_ref_preprocessed: bool = False,
-        update_x_ref: UpdateStrategy | None = None,
-        preprocess_fn: Callable[..., ArrayLike] | None = None,
+        update_strategy: UpdateStrategy | None = None,
         correction: Literal["bonferroni", "fdr"] = "bonferroni",
     ) -> None:
         # Type checking
-        if preprocess_fn is not None and not isinstance(preprocess_fn, Callable):
-            raise ValueError("`preprocess_fn` is not a valid Callable.")
-        if update_x_ref is not None and not isinstance(update_x_ref, UpdateStrategy):
-            raise ValueError("`update_x_ref` is not a valid ReferenceUpdate class.")
+        if update_strategy is not None and not isinstance(update_strategy, UpdateStrategy):
+            raise ValueError("`update_strategy` is not a valid UpdateStrategy class.")
         if correction not in ["bonferroni", "fdr"]:
             raise ValueError("`correction` must be `bonferroni` or `fdr`.")
-        self._x_ref = x_ref
-        self.x_ref_preprocessed: bool = x_ref_preprocessed
-        # Other attributes
+        self._data = data
         self.p_val = p_val
-        self.update_x_ref = update_x_ref
-        self.preprocess_fn = preprocess_fn
+        self.update_strategy = update_strategy
         self.correction = correction
-        self.n: int = len(x_ref)
+        self.n = len(data)
-        # Ref counter for preprocessed x
-        self._x_refcount = 0
+        self._x_ref: NDArray[np.float32] | None = None
     @property
-    def x_ref(self) -> ArrayLike:
+    def x_ref(self) -> NDArray[np.float32]:
         """
-        Retrieve the reference data, applying preprocessing if not already done.
+        Retrieve the reference data of the drift detector.
         Returns
         -------
-        ArrayLike
-            The reference dataset (`x_ref`), preprocessed if needed.
+        NDArray[np.float32]
+            The reference data as a 32-bit floating point numpy array.
         """
-        if not self.x_ref_preprocessed:
-            self.x_ref_preprocessed = True
-            if self.preprocess_fn is not None:
-                self._x_ref = self.preprocess_fn(self._x_ref)
+        if self._x_ref is None:
+            self._x_ref = self._encode(self._data)
         return self._x_ref
-    def _preprocess(self, x: ArrayLike) -> ArrayLike:
-        """
-        Preprocess the given data before computing the :term:`drift<Drift>` scores.
-        Parameters
-        ----------
-        x : ArrayLike
-            The input data to preprocess.
-        Returns
-        -------
-        ArrayLike
-            The preprocessed input data.
-        """
-        if self.preprocess_fn is not None:
-            x = self.preprocess_fn(x)
-        return x
+    def _encode(self, data: Embeddings | Array) -> NDArray[np.float32]:
+        array = (
+            data.to_numpy().astype(np.float32)
+            if isinstance(data, Embeddings)
+            else self._data.new(data).to_numpy().astype(np.float32)
+            if isinstance(self._data, Embeddings)
+            else as_numpy(data).astype(np.float32)
+        )
+        return flatten(array)
 class BaseDriftUnivariate(BaseDrift):
-    """
-    Base class for :term:`drift<Drift>` detection methods using univariate statistical tests.
-    This class inherits from `BaseDrift` and serves as a generic component for detecting
-    distribution drift in univariate features. If the number of features `n_features` is greater
-    than 1, a multivariate correction method (e.g., Bonferroni or FDR) is applied to control
-    the :term:`false positive rate<False Positive Rate (FP)>`, ensuring it does not exceed the specified
-    :term:`p-value<P-Value>`.
-    Parameters
-    ----------
-    x_ref : ArrayLike
-        Reference data used as the baseline to compare against when detecting drift.
-    p_val : float, default 0.05
-        Significance level used for detecting drift.
-    x_ref_preprocessed : bool, default False
-        Indicates whether the reference data has been preprocessed.
-    update_x_ref : UpdateStrategy | None, default None
-        Strategy for updating the reference data when drift is detected.
-    preprocess_fn : Callable[ArrayLike] | None, default None
-        Function used to preprocess input data before detecting drift.
-    correction : 'bonferroni' | 'fdr', default 'bonferroni'
-        Multivariate correction method applied to p-values.
-    n_features : int | None, default None
-        Number of features used in the univariate drift tests. If not provided, it will
-        be inferred from the data.
-    Attributes
-    ----------
-    p_val : float
-        The significance level for drift detection.
-    correction : str
-        The method for controlling the :term:`False Discovery Rate (FDR)` or applying a Bonferroni correction.
-    update_x_ref : UpdateStrategy | None
-        Strategy for updating the reference data if applicable.
-    preprocess_fn : Callable | None
-        Function used for preprocessing input data before drift detection.
-    """
     def __init__(
         self,
-        x_ref: ArrayLike,
+        data: Embeddings | Array,
         p_val: float = 0.05,
-        x_ref_preprocessed: bool = False,
-        update_x_ref: UpdateStrategy | None = None,
-        preprocess_fn: Callable[[ArrayLike], ArrayLike] | None = None,
+        update_strategy: UpdateStrategy | None = None,
         correction: Literal["bonferroni", "fdr"] = "bonferroni",
         n_features: int | None = None,
     ) -> None:
-        super().__init__(
-            x_ref,
-            p_val,
-            x_ref_preprocessed,
-            update_x_ref,
-            preprocess_fn,
-            correction,
-        )
+        super().__init__(data, p_val, update_strategy, correction)
         self._n_features = n_features
@@ -255,8 +124,7 @@ class BaseDriftUnivariate(BaseDrift):
         Get the number of features in the reference data.
         If the number of features is not provided during initialization, it will be inferred
-        from the reference data (``x_ref``). If a preprocessing function is provided, the number
-        of features will be inferred after applying the preprocessing function.
+        from the reference data (``x_ref``).
         Returns
         -------
@@ -264,48 +132,36 @@ class BaseDriftUnivariate(BaseDrift):
             Number of features in the reference data.
         """
         # lazy process n_features as needed
-        if not isinstance(self._n_features, int):
-            # compute number of features for the univariate tests
-            x_ref = (
-                self.x_ref
-                if self.preprocess_fn is None or self.x_ref_preprocessed
-                else self.preprocess_fn(self._x_ref[0:1])
-            )
-            # infer features from preprocessed reference data
-            shape = x_ref.shape if isinstance(x_ref, Array) else as_numpy(x_ref).shape
-            self._n_features = int(math.prod(shape[1:]))  # Multiplies all channel sizes after first
+        if self._n_features is None:
+            self._n_features = int(math.prod(self._data[0].shape))
         return self._n_features
-    @preprocess_x
-    def score(self, x: ArrayLike) -> tuple[NDArray[np.float32], NDArray[np.float32]]:
+    def score(self, data: Embeddings | Array) -> tuple[NDArray[np.float32], NDArray[np.float32]]:
         """
         Calculates p-values and test statistics per feature.
         Parameters
         ----------
-        x : ArrayLike
-            Batch of instances
+        data : Embeddings or Array
+            Batch of instances to score.
         Returns
         -------
         tuple[NDArray, NDArray]
             Feature level p-values and test statistics
         """
-        x_np = to_numpy(x)
-        x_np = x_np.reshape(x_np.shape[0], -1)
-        x_ref_np = as_numpy(self.x_ref)
-        x_ref_np = x_ref_np.reshape(x_ref_np.shape[0], -1)
+        x_np = self._encode(data)
         p_val = np.zeros(self.n_features, dtype=np.float32)
         dist = np.zeros_like(p_val)
         for f in range(self.n_features):
-            dist[f], p_val[f] = self._score_fn(x_ref_np[:, f], x_np[:, f])
+            dist[f], p_val[f] = self._score_fn(self.x_ref[:, f], x_np[:, f])
         return p_val, dist
     @abstractmethod
     def _score_fn(self, x: NDArray[np.float32], y: NDArray[np.float32]) -> tuple[np.float32, np.float32]: ...
-    def _apply_correction(self, p_vals: NDArray) -> tuple[bool, float]:
+    def _apply_correction(self, p_vals: NDArray[np.float32]) -> tuple[bool, float]:
         """
         Apply the specified correction method (Bonferroni or FDR) to the p-values.
@@ -343,20 +199,16 @@ class BaseDriftUnivariate(BaseDrift):
             raise ValueError("`correction` needs to be either `bonferroni` or `fdr`.")
     @set_metadata
-    @preprocess_x
-    @update_x_ref
-    def predict(
-        self,
-        x: ArrayLike,
-    ) -> DriftOutput:
+    @update_strategy
+    def predict(self, data: Embeddings | Array) -> DriftOutput:
         """
         Predict whether a batch of data has drifted from the reference data and update
         reference data using specified update strategy.
         Parameters
         ----------
-        x : ArrayLike
-            Batch of instances.
+        data : Embeddings or Array
+            Batch of instances to predict drift on.
         Returns
         -------
@@ -365,7 +217,7 @@ class BaseDriftUnivariate(BaseDrift):
             p-values, threshold after multivariate correction if needed and test :term:`statistics<Statistics>`.
         """
         # compute drift scores
-        p_vals, dist = self.score(x)
+        p_vals, dist = self.score(data)
         feature_drift = (p_vals < self.p_val).astype(np.bool_)
         drift_pred, threshold = self._apply_correction(p_vals)

dataeval/detectors/drift/_cvm.py CHANGED Viewed

@@ -10,14 +10,15 @@ from __future__ import annotations
 __all__ = []
-from typing import Callable, Literal
+from typing import Literal
 import numpy as np
 from numpy.typing import NDArray
 from scipy.stats import cramervonmises_2samp
+from dataeval.data._embeddings import Embeddings
 from dataeval.detectors.drift._base import BaseDriftUnivariate, UpdateStrategy
-from dataeval.typing import ArrayLike
+from dataeval.typing import Array
 class DriftCVM(BaseDriftUnivariate):
@@ -31,40 +32,32 @@ class DriftCVM(BaseDriftUnivariate):
     Parameters
     ----------
-    x_ref : ArrayLike
+    data : Embeddings or Array
         Data used as reference distribution.
-    p_val : float | None, default 0.05
+    p_val : float or None, default 0.05
         :term:`p-value<P-Value>` used for significance of the statistical test for each feature.
         If the FDR correction method is used, this corresponds to the acceptable
         q-value.
-    x_ref_preprocessed : bool, default False
-        Whether the given reference data ``x_ref`` has been preprocessed yet.
-        If ``True``, only the test data ``x`` will be preprocessed at prediction time.
-        If ``False``, the reference data will also be preprocessed.
-    update_x_ref : UpdateStrategy | None, default None
+    update_strategy : UpdateStrategy or None, default None
         Reference data can optionally be updated using an UpdateStrategy class. Update
         using the last n instances seen by the detector with LastSeenUpdateStrategy
         or via reservoir sampling with ReservoirSamplingUpdateStrategy.
-    preprocess_fn : Callable | None, default None
-        Function to preprocess the data before computing the data drift metrics.
-        Typically a :term:`dimensionality reduction<Dimensionality Reduction>` technique.
-    correction : "bonferroni" | "fdr", default "bonferroni"
+    correction : "bonferroni" or "fdr", default "bonferroni"
         Correction type for multivariate data. Either 'bonferroni' or 'fdr' (False
         Discovery Rate).
-    n_features : int | None, default None
-        Number of features used in the statistical test. No need to pass it if no
-        preprocessing takes place. In case of a preprocessing step, this can also
-        be inferred automatically but could be more expensive to compute.
+    n_features : int or None, default None
+        Number of features used in the univariate drift tests. If not provided, it will
+        be inferred from the data.
     Example
     -------
-    >>> from functools import partial
-    >>> from dataeval.detectors.drift import preprocess_drift
+    >>> from dataeval.data import Embeddings
-    Use a preprocess function to encode images before testing for drift
+    Use Embeddings to encode images before testing for drift
-    >>> preprocess_fn = partial(preprocess_drift, model=encoder, batch_size=64)
-    >>> drift = DriftCVM(train_images, preprocess_fn=preprocess_fn)
+    >>> train_emb = Embeddings(train_images, model=encoder, batch_size=64)
+    >>> drift = DriftCVM(train_emb)
     Test incoming images for drift
@@ -74,20 +67,16 @@ class DriftCVM(BaseDriftUnivariate):
     def __init__(
         self,
-        x_ref: ArrayLike,
+        data: Embeddings | Array,
         p_val: float = 0.05,
-        x_ref_preprocessed: bool = False,
-        update_x_ref: UpdateStrategy | None = None,
-        preprocess_fn: Callable[[ArrayLike], ArrayLike] | None = None,
+        update_strategy: UpdateStrategy | None = None,
         correction: Literal["bonferroni", "fdr"] = "bonferroni",
         n_features: int | None = None,
     ) -> None:
         super().__init__(
-            x_ref=x_ref,
+            data=data,
             p_val=p_val,
-            x_ref_preprocessed=x_ref_preprocessed,
-            update_x_ref=update_x_ref,
-            preprocess_fn=preprocess_fn,
+            update_strategy=update_strategy,
             correction=correction,
             n_features=n_features,
         )

dataeval 0.84.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

dataeval 0.84.0__py3-none-any.whl → 1.0.0__py3-none-any.whl