PyPI - dataeval - Versions diffs - 0.84.0__tar.gz → 0.84.1__tar.gz - Mend

dataeval 0.84.0__tar.gz → 0.84.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (109) hide show

{dataeval-0.84.0 → dataeval-0.84.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dataeval
-Version: 0.84.0
+Version: 0.84.1
 Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
 Home-page: https://dataeval.ai/
 License: MIT
@@ -82,8 +82,7 @@ using MAITE-compliant datasets and models.
 **Python versions:** 3.9 - 3.12
-**Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*,
-*Gradient*
+**Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*
 Choose your preferred method of installation below or follow our
 [installation guide](https://dataeval.readthedocs.io/en/v0.74.2/installation.html).

{dataeval-0.84.0 → dataeval-0.84.1}/README.md RENAMED Viewed

@@ -40,8 +40,7 @@ using MAITE-compliant datasets and models.
 **Python versions:** 3.9 - 3.12
-**Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*,
-*Gradient*
+**Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*
 Choose your preferred method of installation below or follow our
 [installation guide](https://dataeval.readthedocs.io/en/v0.74.2/installation.html).

{dataeval-0.84.0 → dataeval-0.84.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dataeval"
-version = "0.84.0" # dynamic
+version = "0.84.1" # dynamic
 description = "DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks"
 license = "MIT"
 readme = "README.md"

{dataeval-0.84.0 → dataeval-0.84.1}/src/dataeval/__init__.py RENAMED Viewed

@@ -8,7 +8,7 @@ shifts that impact performance of deployed models.
 from __future__ import annotations
 __all__ = ["config", "detectors", "log", "metrics", "typing", "utils", "workflows"]
-__version__ = "0.84.0"
+__version__ = "0.84.1"
 import logging

{dataeval-0.84.0 → dataeval-0.84.1}/src/dataeval/detectors/drift/__init__.py RENAMED Viewed

@@ -9,14 +9,14 @@ __all__ = [
     "DriftMMDOutput",
     "DriftOutput",
     "DriftUncertainty",
-    "preprocess_drift",
+    "UpdateStrategy",
     "updates",
 ]
 from dataeval.detectors.drift import updates
+from dataeval.detectors.drift._base import UpdateStrategy
 from dataeval.detectors.drift._cvm import DriftCVM
 from dataeval.detectors.drift._ks import DriftKS
 from dataeval.detectors.drift._mmd import DriftMMD
-from dataeval.detectors.drift._torch import preprocess_drift
 from dataeval.detectors.drift._uncertainty import DriftUncertainty
 from dataeval.outputs._drift import DriftMMDOutput, DriftOutput

dataeval-0.84.1/src/dataeval/detectors/drift/_base.py ADDED Viewed

@@ -0,0 +1,226 @@
+"""
+Source code derived from Alibi-Detect 0.11.4
+https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
+Original code Copyright (c) 2023 Seldon Technologies Ltd
+Licensed under Apache Software License (Apache 2.0)
+"""
+from __future__ import annotations
+__all__ = []
+import math
+from abc import abstractmethod
+from functools import wraps
+from typing import Callable, Literal, Protocol, TypeVar, runtime_checkable
+import numpy as np
+from numpy.typing import NDArray
+from dataeval.outputs import DriftOutput
+from dataeval.outputs._base import set_metadata
+from dataeval.typing import Array
+from dataeval.utils._array import as_numpy, flatten
+from dataeval.utils.data import Embeddings
+R = TypeVar("R")
+@runtime_checkable
+class UpdateStrategy(Protocol):
+    """
+    Protocol for reference dataset update strategy for drift detectors
+    """
+    def __call__(self, x_ref: NDArray[np.float32], x_new: NDArray[np.float32], count: int) -> NDArray[np.float32]: ...
+def update_strategy(fn: Callable[..., R]) -> Callable[..., R]:
+    """Decorator to update x_ref with x using selected update methodology"""
+    @wraps(fn)
+    def _(self: BaseDrift, data: Embeddings | Array, *args, **kwargs) -> R:
+        output = fn(self, data, *args, **kwargs)
+        # update reference dataset
+        if self.update_strategy is not None:
+            self._x_ref = self.update_strategy(self.x_ref, self._encode(data), self.n)
+            self.n += len(data)
+        return output
+    return _
+class BaseDrift:
+    p_val: float
+    update_strategy: UpdateStrategy | None
+    correction: Literal["bonferroni", "fdr"]
+    n: int
+    def __init__(
+        self,
+        data: Embeddings | Array,
+        p_val: float = 0.05,
+        update_strategy: UpdateStrategy | None = None,
+        correction: Literal["bonferroni", "fdr"] = "bonferroni",
+    ) -> None:
+        # Type checking
+        if update_strategy is not None and not isinstance(update_strategy, UpdateStrategy):
+            raise ValueError("`update_strategy` is not a valid UpdateStrategy class.")
+        if correction not in ["bonferroni", "fdr"]:
+            raise ValueError("`correction` must be `bonferroni` or `fdr`.")
+        self._data = data
+        self.p_val = p_val
+        self.update_strategy = update_strategy
+        self.correction = correction
+        self.n = len(data)
+        self._x_ref: NDArray[np.float32] | None = None
+    @property
+    def x_ref(self) -> NDArray[np.float32]:
+        """
+        Retrieve the reference data of the drift detector.
+        Returns
+        -------
+        NDArray[np.float32]
+            The reference data as a 32-bit floating point numpy array.
+        """
+        if self._x_ref is None:
+            self._x_ref = self._encode(self._data)
+        return self._x_ref
+    def _encode(self, data: Embeddings | Array) -> NDArray[np.float32]:
+        array = (
+            data.to_numpy().astype(np.float32)
+            if isinstance(data, Embeddings)
+            else self._data.new(data).to_numpy().astype(np.float32)
+            if isinstance(self._data, Embeddings)
+            else as_numpy(data).astype(np.float32)
+        )
+        return flatten(array)
+class BaseDriftUnivariate(BaseDrift):
+    def __init__(
+        self,
+        data: Embeddings | Array,
+        p_val: float = 0.05,
+        update_strategy: UpdateStrategy | None = None,
+        correction: Literal["bonferroni", "fdr"] = "bonferroni",
+        n_features: int | None = None,
+    ) -> None:
+        super().__init__(data, p_val, update_strategy, correction)
+        self._n_features = n_features
+    @property
+    def n_features(self) -> int:
+        """
+        Get the number of features in the reference data.
+        If the number of features is not provided during initialization, it will be inferred
+        from the reference data (``x_ref``).
+        Returns
+        -------
+        int
+            Number of features in the reference data.
+        """
+        # lazy process n_features as needed
+        if self._n_features is None:
+            self._n_features = int(math.prod(self._data[0].shape))
+        return self._n_features
+    def score(self, data: Embeddings | Array) -> tuple[NDArray[np.float32], NDArray[np.float32]]:
+        """
+        Calculates p-values and test statistics per feature.
+        Parameters
+        ----------
+        data : Embeddings or Array
+            Batch of instances to score.
+        Returns
+        -------
+        tuple[NDArray, NDArray]
+            Feature level p-values and test statistics
+        """
+        x_np = self._encode(data)
+        p_val = np.zeros(self.n_features, dtype=np.float32)
+        dist = np.zeros_like(p_val)
+        for f in range(self.n_features):
+            dist[f], p_val[f] = self._score_fn(self.x_ref[:, f], x_np[:, f])
+        return p_val, dist
+    @abstractmethod
+    def _score_fn(self, x: NDArray[np.float32], y: NDArray[np.float32]) -> tuple[np.float32, np.float32]: ...
+    def _apply_correction(self, p_vals: NDArray[np.float32]) -> tuple[bool, float]:
+        """
+        Apply the specified correction method (Bonferroni or FDR) to the p-values.
+        If the correction method is Bonferroni, the threshold for detecting :term:`drift<Drift>`
+        is divided by the number of features. For FDR, the correction is applied
+        using the Benjamini-Hochberg procedure.
+        Parameters
+        ----------
+        p_vals : NDArray
+            Array of p-values from the univariate tests for each feature.
+        Returns
+        -------
+        tuple[bool, float]
+            A tuple containing a boolean indicating if drift was detected and the
+            threshold after correction.
+        """
+        if self.correction == "bonferroni":
+            threshold = self.p_val / self.n_features
+            drift_pred = bool((p_vals < threshold).any())
+            return drift_pred, threshold
+        elif self.correction == "fdr":
+            n = p_vals.shape[0]
+            i = np.arange(n) + np.int_(1)
+            p_sorted = np.sort(p_vals)
+            q_threshold = self.p_val * i / n
+            below_threshold = p_sorted < q_threshold
+            try:
+                idx_threshold = int(np.where(below_threshold)[0].max())
+            except ValueError:  # sorted p-values not below thresholds
+                return bool(below_threshold.any()), q_threshold.min()
+            return bool(below_threshold.any()), q_threshold[idx_threshold]
+        else:
+            raise ValueError("`correction` needs to be either `bonferroni` or `fdr`.")
+    @set_metadata
+    @update_strategy
+    def predict(self, data: Embeddings | Array) -> DriftOutput:
+        """
+        Predict whether a batch of data has drifted from the reference data and update
+        reference data using specified update strategy.
+        Parameters
+        ----------
+        data : Embeddings or Array
+            Batch of instances to predict drift on.
+        Returns
+        -------
+        DriftOutput
+            Dictionary containing the :term:`drift<Drift>` prediction and optionally the feature level
+            p-values, threshold after multivariate correction if needed and test :term:`statistics<Statistics>`.
+        """
+        # compute drift scores
+        p_vals, dist = self.score(data)
+        feature_drift = (p_vals < self.p_val).astype(np.bool_)
+        drift_pred, threshold = self._apply_correction(p_vals)
+        return DriftOutput(
+            drift_pred, threshold, float(np.mean(p_vals)), float(np.mean(dist)), feature_drift, self.p_val, p_vals, dist
+        )

{dataeval-0.84.0 → dataeval-0.84.1}/src/dataeval/detectors/drift/_cvm.py RENAMED Viewed

@@ -10,14 +10,15 @@ from __future__ import annotations
 __all__ = []
-from typing import Callable, Literal
+from typing import Literal
 import numpy as np
 from numpy.typing import NDArray
 from scipy.stats import cramervonmises_2samp
 from dataeval.detectors.drift._base import BaseDriftUnivariate, UpdateStrategy
-from dataeval.typing import ArrayLike
+from dataeval.typing import Array
+from dataeval.utils.data._embeddings import Embeddings
 class DriftCVM(BaseDriftUnivariate):
@@ -31,40 +32,32 @@ class DriftCVM(BaseDriftUnivariate):
     Parameters
     ----------
-    x_ref : ArrayLike
+    data : Embeddings or Array
         Data used as reference distribution.
-    p_val : float | None, default 0.05
+    p_val : float or None, default 0.05
         :term:`p-value<P-Value>` used for significance of the statistical test for each feature.
         If the FDR correction method is used, this corresponds to the acceptable
         q-value.
-    x_ref_preprocessed : bool, default False
-        Whether the given reference data ``x_ref`` has been preprocessed yet.
-        If ``True``, only the test data ``x`` will be preprocessed at prediction time.
-        If ``False``, the reference data will also be preprocessed.
-    update_x_ref : UpdateStrategy | None, default None
+    update_strategy : UpdateStrategy or None, default None
         Reference data can optionally be updated using an UpdateStrategy class. Update
         using the last n instances seen by the detector with LastSeenUpdateStrategy
         or via reservoir sampling with ReservoirSamplingUpdateStrategy.
-    preprocess_fn : Callable | None, default None
-        Function to preprocess the data before computing the data drift metrics.
-        Typically a :term:`dimensionality reduction<Dimensionality Reduction>` technique.
-    correction : "bonferroni" | "fdr", default "bonferroni"
+    correction : "bonferroni" or "fdr", default "bonferroni"
         Correction type for multivariate data. Either 'bonferroni' or 'fdr' (False
         Discovery Rate).
-    n_features : int | None, default None
-        Number of features used in the statistical test. No need to pass it if no
-        preprocessing takes place. In case of a preprocessing step, this can also
-        be inferred automatically but could be more expensive to compute.
+    n_features : int or None, default None
+        Number of features used in the univariate drift tests. If not provided, it will
+        be inferred from the data.
     Example
     -------
-    >>> from functools import partial
-    >>> from dataeval.detectors.drift import preprocess_drift
+    >>> from dataeval.utils.data import Embeddings
-    Use a preprocess function to encode images before testing for drift
+    Use Embeddings to encode images before testing for drift
-    >>> preprocess_fn = partial(preprocess_drift, model=encoder, batch_size=64)
-    >>> drift = DriftCVM(train_images, preprocess_fn=preprocess_fn)
+    >>> train_emb = Embeddings(train_images, model=encoder, batch_size=64)
+    >>> drift = DriftCVM(train_emb)
     Test incoming images for drift
@@ -74,20 +67,16 @@ class DriftCVM(BaseDriftUnivariate):
     def __init__(
         self,
-        x_ref: ArrayLike,
+        data: Embeddings | Array,
         p_val: float = 0.05,
-        x_ref_preprocessed: bool = False,
-        update_x_ref: UpdateStrategy | None = None,
-        preprocess_fn: Callable[[ArrayLike], ArrayLike] | None = None,
+        update_strategy: UpdateStrategy | None = None,
         correction: Literal["bonferroni", "fdr"] = "bonferroni",
         n_features: int | None = None,
     ) -> None:
         super().__init__(
-            x_ref=x_ref,
+            data=data,
             p_val=p_val,
-            x_ref_preprocessed=x_ref_preprocessed,
-            update_x_ref=update_x_ref,
-            preprocess_fn=preprocess_fn,
+            update_strategy=update_strategy,
             correction=correction,
             n_features=n_features,
         )

{dataeval-0.84.0 → dataeval-0.84.1}/src/dataeval/detectors/drift/_ks.py RENAMED Viewed

@@ -10,14 +10,15 @@ from __future__ import annotations
 __all__ = []
-from typing import Callable, Literal
+from typing import Literal
 import numpy as np
 from numpy.typing import NDArray
 from scipy.stats import ks_2samp
 from dataeval.detectors.drift._base import BaseDriftUnivariate, UpdateStrategy
-from dataeval.typing import ArrayLike
+from dataeval.typing import Array
+from dataeval.utils.data._embeddings import Embeddings
 class DriftKS(BaseDriftUnivariate):
@@ -31,43 +32,34 @@ class DriftKS(BaseDriftUnivariate):
     Parameters
     ----------
-    x_ref : ArrayLike
+    data : Embeddings or Array
         Data used as reference distribution.
-    p_val : float | None, default 0.05
+    p_val : float or None, default 0.05
         :term:`p-value<P-Value>` used for significance of the statistical test for each feature.
         If the FDR correction method is used, this corresponds to the acceptable
         q-value.
-    x_ref_preprocessed : bool, default False
-        Whether the given reference data ``x_ref`` has been preprocessed yet.
-        If ``True``, only the test data ``x`` will be preprocessed at prediction time.
-        If ``False``, the reference data will also be preprocessed.
-    update_x_ref : UpdateStrategy | None, default None
+    update_strategy : UpdateStrategy or None, default None
         Reference data can optionally be updated using an UpdateStrategy class. Update
         using the last n instances seen by the detector with LastSeenUpdateStrategy
         or via reservoir sampling with ReservoirSamplingUpdateStrategy.
-    preprocess_fn : Callable | None, default None
-        Function to preprocess the data before computing the data :term:`drift<Drift>` metrics.
-        Typically a :term:`dimensionality reduction<Dimensionality Reduction>` technique.
-    correction : "bonferroni" | "fdr", default "bonferroni"
+    correction : "bonferroni" or "fdr", default "bonferroni"
         Correction type for multivariate data. Either 'bonferroni' or 'fdr' (False
         Discovery Rate).
-    alternative : "two-sided" | "less" | "greater", default "two-sided"
+    alternative : "two-sided", "less" or "greater", default "two-sided"
         Defines the alternative hypothesis. Options are 'two-sided', 'less' or
         'greater'.
     n_features : int | None, default None
-        Number of features used in the statistical test. No need to pass it if no
-        preprocessing takes place. In case of a preprocessing step, this can also
-        be inferred automatically but could be more expensive to compute.
+        Number of features used in the univariate drift tests. If not provided, it will
+        be inferred from the data.
     Example
     -------
-    >>> from functools import partial
-    >>> from dataeval.detectors.drift import preprocess_drift
+    >>> from dataeval.utils.data import Embeddings
-    Use a preprocess function to encode images before testing for drift
+    Use Embeddings to encode images before testing for drift
-    >>> preprocess_fn = partial(preprocess_drift, model=encoder, batch_size=64)
-    >>> drift = DriftKS(train_images, preprocess_fn=preprocess_fn)
+    >>> train_emb = Embeddings(train_images, model=encoder, batch_size=64)
+    >>> drift = DriftKS(train_emb)
     Test incoming images for drift
@@ -77,21 +69,17 @@ class DriftKS(BaseDriftUnivariate):
     def __init__(
         self,
-        x_ref: ArrayLike,
+        data: Embeddings | Array,
         p_val: float = 0.05,
-        x_ref_preprocessed: bool = False,
-        update_x_ref: UpdateStrategy | None = None,
-        preprocess_fn: Callable[[ArrayLike], ArrayLike] | None = None,
+        update_strategy: UpdateStrategy | None = None,
         correction: Literal["bonferroni", "fdr"] = "bonferroni",
         alternative: Literal["two-sided", "less", "greater"] = "two-sided",
         n_features: int | None = None,
     ) -> None:
         super().__init__(
-            x_ref=x_ref,
+            data=data,
             p_val=p_val,
-            x_ref_preprocessed=x_ref_preprocessed,
-            update_x_ref=update_x_ref,
-            preprocess_fn=preprocess_fn,
+            update_strategy=update_strategy,
             correction=correction,
             n_features=n_features,
         )

dataeval 0.84.0__tar.gz → 0.84.1__tar.gz

dataeval 0.84.0__tar.gz → 0.84.1__tar.gz