PyPI - dataeval - Versions diffs - 0.84.0__py3-none-any.whl → 1.0.0__py3-none-any.whl - Mend

dataeval 0.84.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

dataeval/__init__.py +1 -1
dataeval/data/__init__.py +19 -0
dataeval/data/_embeddings.py +345 -0
dataeval/{utils/data → data}/_images.py +2 -2
dataeval/{utils/data → data}/_metadata.py +8 -7
dataeval/{utils/data → data}/_selection.py +22 -9
dataeval/{utils/data → data}/_split.py +1 -1
dataeval/data/selections/__init__.py +19 -0
dataeval/data/selections/_classbalance.py +37 -0
dataeval/data/selections/_classfilter.py +109 -0
dataeval/{utils/data → data}/selections/_indices.py +1 -1
dataeval/{utils/data → data}/selections/_limit.py +1 -1
dataeval/{utils/data → data}/selections/_prioritize.py +3 -3
dataeval/{utils/data → data}/selections/_reverse.py +1 -1
dataeval/{utils/data → data}/selections/_shuffle.py +3 -3
dataeval/detectors/drift/__init__.py +2 -2
dataeval/detectors/drift/_base.py +55 -203
dataeval/detectors/drift/_cvm.py +19 -30
dataeval/detectors/drift/_ks.py +18 -30
dataeval/detectors/drift/_mmd.py +189 -53
dataeval/detectors/drift/_uncertainty.py +52 -56
dataeval/detectors/drift/updates.py +13 -12
dataeval/detectors/linters/duplicates.py +6 -4
dataeval/detectors/linters/outliers.py +3 -3
dataeval/detectors/ood/ae.py +1 -1
dataeval/metadata/_distance.py +1 -1
dataeval/metadata/_ood.py +4 -4
dataeval/metrics/bias/_balance.py +1 -1
dataeval/metrics/bias/_diversity.py +1 -1
dataeval/metrics/bias/_parity.py +1 -1
dataeval/metrics/stats/_base.py +7 -7
dataeval/metrics/stats/_dimensionstats.py +2 -2
dataeval/metrics/stats/_hashstats.py +2 -2
dataeval/metrics/stats/_imagestats.py +4 -4
dataeval/metrics/stats/_labelstats.py +2 -2
dataeval/metrics/stats/_pixelstats.py +2 -2
dataeval/metrics/stats/_visualstats.py +2 -2
dataeval/outputs/_bias.py +1 -1
dataeval/typing.py +53 -19
dataeval/utils/__init__.py +2 -2
dataeval/utils/_array.py +18 -7
dataeval/utils/data/__init__.py +5 -20
dataeval/utils/data/_dataset.py +6 -4
dataeval/utils/data/collate.py +2 -0
dataeval/utils/datasets/__init__.py +17 -0
dataeval/utils/{data/datasets → datasets}/_base.py +10 -7
dataeval/utils/{data/datasets → datasets}/_cifar10.py +11 -11
dataeval/utils/{data/datasets → datasets}/_milco.py +44 -16
dataeval/utils/{data/datasets → datasets}/_mnist.py +11 -7
dataeval/utils/{data/datasets → datasets}/_ships.py +10 -6
dataeval/utils/{data/datasets → datasets}/_voc.py +43 -22
dataeval/utils/torch/_internal.py +12 -35
{dataeval-0.84.0.dist-info → dataeval-1.0.0.dist-info}/METADATA +2 -3
dataeval-1.0.0.dist-info/RECORD +107 -0
dataeval/detectors/drift/_torch.py +0 -222
dataeval/utils/data/_embeddings.py +0 -186
dataeval/utils/data/datasets/__init__.py +0 -17
dataeval/utils/data/selections/__init__.py +0 -17
dataeval/utils/data/selections/_classfilter.py +0 -59
dataeval-0.84.0.dist-info/RECORD +0 -106
/dataeval/{utils/data → data}/_targets.py +0 -0
/dataeval/utils/{metadata.py → data/metadata.py} +0 -0
/dataeval/utils/{data/datasets → datasets}/_fileio.py +0 -0
/dataeval/utils/{data/datasets → datasets}/_mixin.py +0 -0
/dataeval/utils/{data/datasets → datasets}/_types.py +0 -0
{dataeval-0.84.0.dist-info → dataeval-1.0.0.dist-info}/LICENSE.txt +0 -0
{dataeval-0.84.0.dist-info → dataeval-1.0.0.dist-info}/WHEEL +0 -0

dataeval/detectors/drift/_ks.py CHANGED Viewed

@@ -10,14 +10,15 @@ from __future__ import annotations
 __all__ = []
-from typing import Callable, Literal
+from typing import Literal
 import numpy as np
 from numpy.typing import NDArray
 from scipy.stats import ks_2samp
+from dataeval.data._embeddings import Embeddings
 from dataeval.detectors.drift._base import BaseDriftUnivariate, UpdateStrategy
-from dataeval.typing import ArrayLike
+from dataeval.typing import Array
 class DriftKS(BaseDriftUnivariate):
@@ -31,43 +32,34 @@ class DriftKS(BaseDriftUnivariate):
     Parameters
     ----------
-    x_ref : ArrayLike
+    data : Embeddings or Array
         Data used as reference distribution.
-    p_val : float | None, default 0.05
+    p_val : float or None, default 0.05
         :term:`p-value<P-Value>` used for significance of the statistical test for each feature.
         If the FDR correction method is used, this corresponds to the acceptable
         q-value.
-    x_ref_preprocessed : bool, default False
-        Whether the given reference data ``x_ref`` has been preprocessed yet.
-        If ``True``, only the test data ``x`` will be preprocessed at prediction time.
-        If ``False``, the reference data will also be preprocessed.
-    update_x_ref : UpdateStrategy | None, default None
+    update_strategy : UpdateStrategy or None, default None
         Reference data can optionally be updated using an UpdateStrategy class. Update
         using the last n instances seen by the detector with LastSeenUpdateStrategy
         or via reservoir sampling with ReservoirSamplingUpdateStrategy.
-    preprocess_fn : Callable | None, default None
-        Function to preprocess the data before computing the data :term:`drift<Drift>` metrics.
-        Typically a :term:`dimensionality reduction<Dimensionality Reduction>` technique.
-    correction : "bonferroni" | "fdr", default "bonferroni"
+    correction : "bonferroni" or "fdr", default "bonferroni"
         Correction type for multivariate data. Either 'bonferroni' or 'fdr' (False
         Discovery Rate).
-    alternative : "two-sided" | "less" | "greater", default "two-sided"
+    alternative : "two-sided", "less" or "greater", default "two-sided"
         Defines the alternative hypothesis. Options are 'two-sided', 'less' or
         'greater'.
     n_features : int | None, default None
-        Number of features used in the statistical test. No need to pass it if no
-        preprocessing takes place. In case of a preprocessing step, this can also
-        be inferred automatically but could be more expensive to compute.
+        Number of features used in the univariate drift tests. If not provided, it will
+        be inferred from the data.
     Example
     -------
-    >>> from functools import partial
-    >>> from dataeval.detectors.drift import preprocess_drift
+    >>> from dataeval.data import Embeddings
-    Use a preprocess function to encode images before testing for drift
+    Use Embeddings to encode images before testing for drift
-    >>> preprocess_fn = partial(preprocess_drift, model=encoder, batch_size=64)
-    >>> drift = DriftKS(train_images, preprocess_fn=preprocess_fn)
+    >>> train_emb = Embeddings(train_images, model=encoder, batch_size=64)
+    >>> drift = DriftKS(train_emb)
     Test incoming images for drift
@@ -77,21 +69,17 @@ class DriftKS(BaseDriftUnivariate):
     def __init__(
         self,
-        x_ref: ArrayLike,
+        data: Embeddings | Array,
         p_val: float = 0.05,
-        x_ref_preprocessed: bool = False,
-        update_x_ref: UpdateStrategy | None = None,
-        preprocess_fn: Callable[[ArrayLike], ArrayLike] | None = None,
+        update_strategy: UpdateStrategy | None = None,
         correction: Literal["bonferroni", "fdr"] = "bonferroni",
         alternative: Literal["two-sided", "less", "greater"] = "two-sided",
         n_features: int | None = None,
     ) -> None:
         super().__init__(
-            x_ref=x_ref,
+            data=data,
             p_val=p_val,
-            x_ref_preprocessed=x_ref_preprocessed,
-            update_x_ref=update_x_ref,
-            preprocess_fn=preprocess_fn,
+            update_strategy=update_strategy,
             correction=correction,
             n_features=n_features,
         )

dataeval/detectors/drift/_mmd.py CHANGED Viewed

@@ -10,16 +10,16 @@ from __future__ import annotations
 __all__ = []
-from typing import Callable
+from typing import Any, Callable
 import torch
 from dataeval.config import DeviceLike, get_device
-from dataeval.detectors.drift._base import BaseDrift, UpdateStrategy, preprocess_x, update_x_ref
-from dataeval.detectors.drift._torch import GaussianRBF, mmd2_from_kernel_matrix
+from dataeval.data._embeddings import Embeddings
+from dataeval.detectors.drift._base import BaseDrift, UpdateStrategy, update_strategy
 from dataeval.outputs import DriftMMDOutput
 from dataeval.outputs._base import set_metadata
-from dataeval.typing import ArrayLike
+from dataeval.typing import Array
 class DriftMMD(BaseDrift):
@@ -29,29 +29,20 @@ class DriftMMD(BaseDrift):
     Parameters
     ----------
-    x_ref : ArrayLike
+    data : Embeddings or Array
         Data used as reference distribution.
     p_val : float or None, default 0.05
         :term:`P-value` used for significance of the statistical test for each feature.
         If the FDR correction method is used, this corresponds to the acceptable
         q-value.
-    x_ref_preprocessed : bool, default False
-        Whether the given reference data ``x_ref`` has been preprocessed yet.
-        If ``True``, only the test data ``x`` will be preprocessed at prediction time.
-        If ``False``, the reference data will also be preprocessed.
-    update_x_ref : UpdateStrategy or None, default None
+    update_strategy : UpdateStrategy or None, default None
         Reference data can optionally be updated using an UpdateStrategy class. Update
         using the last n instances seen by the detector with LastSeenUpdateStrategy
         or via reservoir sampling with ReservoirSamplingUpdateStrategy.
-    preprocess_fn : Callable or None, default None
-        Function to preprocess the data before computing the data drift metrics.
-        Typically a :term:`dimensionality reduction<Dimensionality Reduction>` technique.
-    sigma : ArrayLike or None, default None
+    sigma : Array or None, default None
         Optionally set the internal GaussianRBF kernel bandwidth. Can also pass multiple
         bandwidth values as an array. The kernel evaluation is then averaged over
         those bandwidths.
-    configure_kernel_from_x_ref : bool, default True
-        Whether to already configure the kernel bandwidth from the reference data.
     n_permutations : int, default 100
         Number of permutations used in the permutation test.
     device : DeviceLike or None, default None
@@ -60,13 +51,12 @@ class DriftMMD(BaseDrift):
     Example
     -------
-    >>> from functools import partial
-    >>> from dataeval.detectors.drift import preprocess_drift
+    >>> from dataeval.data import Embeddings
-    Use a preprocess function to encode images before testing for drift
+    Use Embeddings to encode images before testing for drift
-    >>> preprocess_fn = partial(preprocess_drift, model=encoder, batch_size=64)
-    >>> drift = DriftMMD(train_images, preprocess_fn=preprocess_fn)
+    >>> train_emb = Embeddings(train_images, model=encoder, batch_size=64)
+    >>> drift = DriftMMD(train_emb)
     Test incoming images for drift
@@ -76,21 +66,14 @@ class DriftMMD(BaseDrift):
     def __init__(
         self,
-        x_ref: ArrayLike,
+        data: Embeddings | Array,
         p_val: float = 0.05,
-        x_ref_preprocessed: bool = False,
-        update_x_ref: UpdateStrategy | None = None,
-        preprocess_fn: Callable[..., ArrayLike] | None = None,
-        sigma: ArrayLike | None = None,
-        configure_kernel_from_x_ref: bool = True,
+        update_strategy: UpdateStrategy | None = None,
+        sigma: Array | None = None,
         n_permutations: int = 100,
         device: DeviceLike | None = None,
     ) -> None:
-        super().__init__(x_ref, p_val, x_ref_preprocessed, update_x_ref, preprocess_fn)
-        self._infer_sigma = configure_kernel_from_x_ref
-        if configure_kernel_from_x_ref and sigma is not None:
-            self._infer_sigma = False
+        super().__init__(data, p_val, update_strategy)
         self.n_permutations = n_permutations  # nb of iterations through permutation test
@@ -102,23 +85,20 @@ class DriftMMD(BaseDrift):
         self._kernel = GaussianRBF(sigma_tensor).to(self.device)
         # compute kernel matrix for the reference data
-        if self._infer_sigma or isinstance(sigma_tensor, torch.Tensor):
-            x = torch.as_tensor(self.x_ref, device=self.device)
-            self._k_xx = self._kernel(x, x, infer_sigma=self._infer_sigma)
-            self._infer_sigma = False
+        if isinstance(sigma_tensor, torch.Tensor):
+            self._k_xx = self._kernel(self.x_ref, self.x_ref)
         else:
-            self._k_xx, self._infer_sigma = None, True
+            self._k_xx = None
-    def _kernel_matrix(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+    def _kernel_matrix(self, x: Array, y: Array) -> torch.Tensor:
         """Compute and return full kernel matrix between arrays x and y."""
-        k_xy = self._kernel(x, y, self._infer_sigma)
-        k_xx = self._k_xx if self._k_xx is not None and self.update_x_ref is None else self._kernel(x, x)
+        k_xy = self._kernel(x, y)
+        k_xx = self._k_xx if self._k_xx is not None and self.update_strategy is None else self._kernel(x, x)
         k_yy = self._kernel(y, y)
         kernel_mat = torch.cat([torch.cat([k_xx, k_xy], 1), torch.cat([k_xy.T, k_yy], 1)], 0)
         return kernel_mat
-    @preprocess_x
-    def score(self, x: ArrayLike) -> tuple[float, float, float]:
+    def score(self, data: Embeddings | Array) -> tuple[float, float, float]:
         """
         Compute the :term:`p-value<P-Value>` resulting from a permutation test using the maximum mean
         discrepancy as a distance measure between the reference data and the data to
@@ -126,8 +106,8 @@ class DriftMMD(BaseDrift):
         Parameters
         ----------
-        x : ArrayLike
-            Batch of instances.
+        data : Embeddings or Array
+            Batch of instances to score.
         Returns
         -------
@@ -135,10 +115,9 @@ class DriftMMD(BaseDrift):
             p-value obtained from the permutation test, MMD^2 between the reference and test set,
             and MMD^2 threshold above which :term:`drift<Drift>` is flagged
         """
-        x_ref = torch.as_tensor(self.x_ref, device=self.device)
-        x_test = torch.as_tensor(x, device=self.device)
+        x_test = self._encode(data)
         n = x_test.shape[0]
-        kernel_mat = self._kernel_matrix(x_ref, x_test)
+        kernel_mat = self._kernel_matrix(self.x_ref, x_test)
         kernel_mat = kernel_mat - torch.diag(kernel_mat.diag())  # zero diagonal
         mmd2 = mmd2_from_kernel_matrix(kernel_mat, n, permute=False, zero_diag=False)
         mmd2_permuted = torch.tensor(
@@ -152,17 +131,16 @@ class DriftMMD(BaseDrift):
         return float(p_val.item()), float(mmd2.item()), float(distance_threshold.item())
     @set_metadata
-    @preprocess_x
-    @update_x_ref
-    def predict(self, x: ArrayLike) -> DriftMMDOutput:
+    @update_strategy
+    def predict(self, data: Embeddings | Array) -> DriftMMDOutput:
         """
         Predict whether a batch of data has drifted from the reference data and then
         updates reference data using specified strategy.
         Parameters
         ----------
-        x : ArrayLike
-            Batch of instances.
+        data : Embeddings or Array
+            Batch of instances to predict drift on.
         Returns
         -------
@@ -171,8 +149,166 @@ class DriftMMD(BaseDrift):
             threshold and MMD metric.
         """
         # compute drift scores
-        p_val, dist, distance_threshold = self.score(x)
+        p_val, dist, distance_threshold = self.score(data)
         drift_pred = bool(p_val < self.p_val)
         # populate drift dict
         return DriftMMDOutput(drift_pred, self.p_val, p_val, dist, distance_threshold)
+@torch.jit.script
+def _squared_pairwise_distance(
+    x: torch.Tensor, y: torch.Tensor, a_min: float = 1e-30
+) -> torch.Tensor:  # pragma: no cover - torch.jit.script code is compiled and copied
+    """
+    PyTorch pairwise squared Euclidean distance between samples x and y.
+    Parameters
+    ----------
+    x : torch.Tensor
+        Batch of instances of shape [Nx, features].
+    y : torch.Tensor
+        Batch of instances of shape [Ny, features].
+    a_min : float
+        Lower bound to clip distance values.
+    Returns
+    -------
+    torch.Tensor
+        Pairwise squared Euclidean distance [Nx, Ny].
+    """
+    x2 = x.pow(2).sum(dim=-1, keepdim=True)
+    y2 = y.pow(2).sum(dim=-1, keepdim=True)
+    dist = torch.addmm(y2.transpose(-2, -1), x, y.transpose(-2, -1), alpha=-2).add_(x2)
+    return dist.clamp_min_(a_min)
+def sigma_median(x: torch.Tensor, y: torch.Tensor, dist: torch.Tensor) -> torch.Tensor:
+    """
+    Bandwidth estimation using the median heuristic `Gretton2012`
+    Parameters
+    ----------
+    x : torch.Tensor
+        Tensor of instances with dimension [Nx, features].
+    y : torch.Tensor
+        Tensor of instances with dimension [Ny, features].
+    dist : torch.Tensor
+        Tensor with dimensions [Nx, Ny], containing the pairwise distances
+        between `x` and `y`.
+    Returns
+    -------
+    torch.Tensor
+        The computed bandwidth, `sigma`.
+    """
+    n = min(x.shape[0], y.shape[0])
+    n = n if (x[:n] == y[:n]).all() and x.shape == y.shape else 0
+    n_median = n + (torch.prod(torch.as_tensor(dist.shape)) - n) // 2 - 1
+    sigma = (0.5 * dist.flatten().sort().values[int(n_median)].unsqueeze(dim=-1)) ** 0.5
+    return sigma
+class GaussianRBF(torch.nn.Module):
+    """
+    Gaussian RBF kernel: k(x,y) = exp(-(1/(2*sigma^2)||x-y||^2).
+    A forward pass takes a batch of instances x [Nx, features] and
+    y [Ny, features] and returns the kernel matrix [Nx, Ny].
+    Parameters
+    ----------
+    sigma : torch.Tensor | None, default None
+        Bandwidth used for the kernel. Needn't be specified if being inferred or
+        trained. Can pass multiple values to eval kernel with and then average.
+    init_sigma_fn : Callable | None, default None
+        Function used to compute the bandwidth ``sigma``. Used when ``sigma`` is to be
+        inferred. The function's signature should take in the tensors ``x``, ``y`` and
+        ``dist`` and return ``sigma``. If ``None``, it is set to ``sigma_median``.
+    trainable : bool, default False
+        Whether or not to track gradients w.r.t. `sigma` to allow it to be trained.
+    """
+    def __init__(
+        self,
+        sigma: torch.Tensor | None = None,
+        init_sigma_fn: Callable | None = None,
+        trainable: bool = False,
+    ) -> None:
+        super().__init__()
+        init_sigma_fn = sigma_median if init_sigma_fn is None else init_sigma_fn
+        self.config: dict[str, Any] = {
+            "sigma": sigma,
+            "trainable": trainable,
+            "init_sigma_fn": init_sigma_fn,
+        }
+        if sigma is None:
+            self.log_sigma: torch.nn.Parameter = torch.nn.Parameter(torch.empty(1), requires_grad=trainable)
+            self.init_required: bool = True
+        else:
+            sigma = sigma.reshape(-1)  # [Ns,]
+            self.log_sigma: torch.nn.Parameter = torch.nn.Parameter(sigma.log(), requires_grad=trainable)
+            self.init_required: bool = False
+        self.init_sigma_fn = init_sigma_fn
+        self.trainable = trainable
+    @property
+    def sigma(self) -> torch.Tensor:
+        return self.log_sigma.exp()
+    def forward(
+        self,
+        x: Array,
+        y: Array,
+        infer_sigma: bool = False,
+    ) -> torch.Tensor:
+        x, y = torch.as_tensor(x), torch.as_tensor(y)
+        dist = _squared_pairwise_distance(x.flatten(1), y.flatten(1))  # [Nx, Ny]
+        if infer_sigma or self.init_required:
+            if self.trainable and infer_sigma:
+                raise ValueError("Gradients cannot be computed w.r.t. an inferred sigma value")
+            sigma = self.init_sigma_fn(x, y, dist)
+            with torch.no_grad():
+                self.log_sigma.copy_(sigma.log().clone())
+            self.init_required: bool = False
+        gamma = 1.0 / (2.0 * self.sigma**2)  # [Ns,]
+        # TODO: do matrix multiplication after all?
+        kernel_mat = torch.exp(-torch.cat([(g * dist)[None, :, :] for g in gamma], dim=0))  # [Ns, Nx, Ny]
+        return kernel_mat.mean(dim=0)  # [Nx, Ny]
+def mmd2_from_kernel_matrix(
+    kernel_mat: torch.Tensor, m: int, permute: bool = False, zero_diag: bool = True
+) -> torch.Tensor:
+    """
+    Compute maximum mean discrepancy (MMD^2) between 2 samples x and y from the
+    full kernel matrix between the samples.
+    Parameters
+    ----------
+    kernel_mat : torch.Tensor
+        Kernel matrix between samples x and y.
+    m : int
+        Number of instances in y.
+    permute : bool, default False
+        Whether to permute the row indices. Used for permutation tests.
+    zero_diag : bool, default True
+        Whether to zero out the diagonal of the kernel matrix.
+    Returns
+    -------
+    torch.Tensor
+        MMD^2 between the samples from the kernel matrix.
+    """
+    n = kernel_mat.shape[0] - m
+    if zero_diag:
+        kernel_mat = kernel_mat - torch.diag(kernel_mat.diag())
+    if permute:
+        idx = torch.randperm(kernel_mat.shape[0])
+        kernel_mat = kernel_mat[idx][:, idx]
+    k_xx, k_yy, k_xy = kernel_mat[:-m, :-m], kernel_mat[-m:, -m:], kernel_mat[-m:, :-m]
+    c_xx, c_yy = 1 / (n * (n - 1)), 1 / (m * (m - 1))
+    mmd2 = c_xx * k_xx.sum() + c_yy * k_yy.sum() - 2.0 * k_xy.mean()
+    return mmd2

dataeval/detectors/drift/_uncertainty.py CHANGED Viewed

@@ -10,33 +10,32 @@ from __future__ import annotations
 __all__ = []
-from functools import partial
-from typing import Callable, Literal
+from typing import Literal, Sequence, cast
 import numpy as np
-from numpy.typing import NDArray
+import torch
 from scipy.special import softmax
 from scipy.stats import entropy
-from dataeval.config import get_device
-from dataeval.detectors.drift._base import UpdateStrategy
+from dataeval.config import DeviceLike, get_device
+from dataeval.detectors.drift._base import BaseDrift, UpdateStrategy
 from dataeval.detectors.drift._ks import DriftKS
-from dataeval.detectors.drift._torch import preprocess_drift
 from dataeval.outputs import DriftOutput
-from dataeval.typing import ArrayLike
+from dataeval.typing import Array, Transform
+from dataeval.utils._array import as_numpy
+from dataeval.utils.torch._internal import predict_batch
 def classifier_uncertainty(
-    x: NDArray[np.float64],
-    model_fn: Callable,
+    preds: Array,
     preds_type: Literal["probs", "logits"] = "probs",
-) -> NDArray[np.float64]:
+) -> torch.Tensor:
     """
     Evaluate model_fn on x and transform predictions to prediction uncertainties.
     Parameters
     ----------
-    x : np.ndarray
+    x : Array
         Batch of instances.
     model_fn : Callable
         Function that evaluates a :term:`classification<Classification>` model on x in a single call (contains
@@ -50,23 +49,21 @@ def classifier_uncertainty(
     NDArray
         A scalar indication of uncertainty of the model on each instance in x.
     """
-    preds = model_fn(x)
+    preds_np = as_numpy(preds)
     if preds_type == "probs":
-        if np.abs(1 - np.sum(preds, axis=-1)).mean() > 1e-6:
+        if np.abs(1 - np.sum(preds_np, axis=-1)).mean() > 1e-6:
             raise ValueError("Probabilities across labels should sum to 1")
-        probs = preds
+        probs = preds_np
     elif preds_type == "logits":
-        probs = softmax(preds, axis=-1)
+        probs = softmax(preds_np, axis=-1)
     else:
         raise NotImplementedError("Only prediction types 'probs' and 'logits' supported.")
-    uncertainties = entropy(probs, axis=-1)
-    return uncertainties[:, None]  # Detectors expect N x d  # type: ignore
+    uncertainties = cast(np.ndarray, entropy(probs, axis=-1))
+    return torch.as_tensor(uncertainties[:, None])
-class DriftUncertainty:
+class DriftUncertainty(BaseDrift):
     """
     Test for a change in the number of instances falling into regions on which \
         the model is uncertain.
@@ -75,29 +72,27 @@ class DriftUncertainty:
     Parameters
     ----------
-    x_ref : ArrayLike
+    data : Array
         Data used as reference distribution.
     model : Callable
         :term:`Classification` model outputting class probabilities (or logits)
     p_val : float, default 0.05
         :term:`P-Value` used for the significance of the test.
-    x_ref_preprocessed : bool, default False
-        Whether the given reference data ``x_ref`` has been preprocessed yet.
-        If ``True``, only the test data ``x`` will be preprocessed at prediction time.
-        If ``False``, the reference data will also be preprocessed.
-    update_x_ref : UpdateStrategy or None, default None
+    update_strategy : UpdateStrategy or None, default None
         Reference data can optionally be updated using an UpdateStrategy class. Update
         using the last n instances seen by the detector with LastSeenUpdateStrategy
         or via reservoir sampling with ReservoirSamplingUpdateStrategy.
+    correction : "bonferroni" or "fdr", default "bonferroni"
+        Correction type for multivariate data. Either 'bonferroni' or 'fdr' (False
+        Discovery Rate).
     preds_type : "probs" or "logits", default "probs"
         Type of prediction output by the model. Options are 'probs' (in [0,1]) or
         'logits' (in [-inf,inf]).
     batch_size : int, default 32
         Batch size used to evaluate model. Only relevant when backend has been
         specified for batch prediction.
-    preprocess_batch_fn : Callable or None, default None
-        Optional batch preprocessing function. For example to convert a list of
-        objects to a batch which can be processed by the model.
+    transforms : Transform, Sequence[Transform] or None, default None
+        Transform(s) to apply to the data.
     device : DeviceLike or None, default None
         Device type used. The default None tries to use the GPU and falls back on
         CPU if needed. Can be specified by passing either 'cuda' or 'cpu'.
@@ -120,46 +115,47 @@ class DriftUncertainty:
     def __init__(
         self,
-        x_ref: ArrayLike,
-        model: Callable,
+        data: Array,
+        model: torch.nn.Module,
         p_val: float = 0.05,
-        x_ref_preprocessed: bool = False,
-        update_x_ref: UpdateStrategy | None = None,
+        update_strategy: UpdateStrategy | None = None,
+        correction: Literal["bonferroni", "fdr"] = "bonferroni",
         preds_type: Literal["probs", "logits"] = "probs",
         batch_size: int = 32,
-        preprocess_batch_fn: Callable | None = None,
-        device: str | None = None,
+        transforms: Transform[torch.Tensor] | Sequence[Transform[torch.Tensor]] | None = None,
+        device: DeviceLike | None = None,
     ) -> None:
-        def model_fn(x: NDArray) -> NDArray:
-            return preprocess_drift(
-                x,
-                model,  # type: ignore
-                batch_size=batch_size,
-                preprocess_batch_fn=preprocess_batch_fn,
-                device=get_device(device),
-            )
-        preprocess_fn = partial(
-            classifier_uncertainty,
-            model_fn=model_fn,
-            preds_type=preds_type,
-        )
+        self.model: torch.nn.Module = model
+        self.device: torch.device = get_device(device)
+        self.batch_size: int = batch_size
+        self.preds_type: Literal["probs", "logits"] = preds_type
+        self._transforms = (
+            [] if transforms is None else [transforms] if isinstance(transforms, Transform) else transforms
+        )
         self._detector = DriftKS(
-            x_ref=x_ref,
+            data=self._preprocess(data).cpu().numpy(),
             p_val=p_val,
-            x_ref_preprocessed=x_ref_preprocessed,
-            update_x_ref=update_x_ref,
-            preprocess_fn=preprocess_fn,  # type: ignore
+            update_strategy=update_strategy,
+            correction=correction,
         )
-    def predict(self, x: ArrayLike) -> DriftOutput:
+    def _transform(self, x: torch.Tensor) -> torch.Tensor:
+        for transform in self._transforms:
+            x = transform(x)
+        return x
+    def _preprocess(self, x: Array) -> torch.Tensor:
+        preds = predict_batch(x, self.model, self.device, self.batch_size, self._transform)
+        return classifier_uncertainty(preds, self.preds_type)
+    def predict(self, x: Array) -> DriftOutput:
         """
         Predict whether a batch of data has drifted from the reference data.
         Parameters
         ----------
-        x : ArrayLike
+        x : Array
             Batch of instances.
         Returns
@@ -168,4 +164,4 @@ class DriftUncertainty:
             Dictionary containing the drift prediction, :term:`p-value<P-Value>`, and threshold
             statistics.
         """
-        return self._detector.predict(x)
+        return self._detector.predict(self._preprocess(x).cpu().numpy())

dataeval 0.84.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

dataeval 0.84.0__py3-none-any.whl → 1.0.0__py3-none-any.whl