PyPI - dataeval - Versions diffs - 0.72.0__py3-none-any.whl → 0.72.2__py3-none-any.whl - Mend

dataeval 0.72.0__py3-none-any.whl → 0.72.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90) hide show

dataeval/__init__.py +4 -4
dataeval/detectors/__init__.py +4 -3
dataeval/detectors/drift/__init__.py +10 -11
dataeval/{_internal/detectors → detectors}/drift/base.py +51 -102
dataeval/{_internal/detectors → detectors}/drift/cvm.py +9 -8
dataeval/{_internal/detectors → detectors}/drift/ks.py +11 -10
dataeval/{_internal/detectors → detectors}/drift/mmd.py +33 -34
dataeval/{_internal/detectors → detectors}/drift/torch.py +15 -13
dataeval/{_internal/detectors → detectors}/drift/uncertainty.py +12 -9
dataeval/detectors/drift/updates.py +61 -0
dataeval/detectors/linters/__init__.py +3 -3
dataeval/{_internal/detectors → detectors/linters}/clusterer.py +47 -45
dataeval/{_internal/detectors → detectors/linters}/duplicates.py +20 -10
dataeval/{_internal/detectors → detectors/linters}/merged_stats.py +3 -1
dataeval/{_internal/detectors → detectors/linters}/outliers.py +19 -26
dataeval/detectors/ood/__init__.py +8 -16
dataeval/{_internal/detectors → detectors}/ood/ae.py +9 -9
dataeval/{_internal/detectors → detectors}/ood/aegmm.py +10 -30
dataeval/{_internal/detectors → detectors}/ood/base.py +27 -21
dataeval/{_internal/detectors → detectors}/ood/llr.py +27 -23
dataeval/detectors/ood/metadata_ks_compare.py +99 -0
dataeval/detectors/ood/metadata_least_likely.py +119 -0
dataeval/detectors/ood/metadata_ood_mi.py +92 -0
dataeval/{_internal/detectors → detectors}/ood/vae.py +11 -13
dataeval/{_internal/detectors → detectors}/ood/vaegmm.py +10 -32
dataeval/{_internal/interop.py → interop.py} +12 -7
dataeval/metrics/__init__.py +1 -1
dataeval/metrics/bias/__init__.py +4 -4
dataeval/{_internal/metrics → metrics/bias}/balance.py +70 -4
dataeval/{_internal/metrics → metrics/bias}/coverage.py +10 -8
dataeval/{_internal/metrics → metrics/bias}/diversity.py +54 -20
dataeval/metrics/bias/metadata.py +275 -0
dataeval/{_internal/metrics → metrics/bias}/parity.py +21 -17
dataeval/metrics/estimators/__init__.py +3 -3
dataeval/{_internal/metrics → metrics/estimators}/ber.py +31 -28
dataeval/{_internal/metrics → metrics/estimators}/divergence.py +15 -16
dataeval/{_internal/metrics → metrics/estimators}/uap.py +8 -6
dataeval/metrics/stats/__init__.py +7 -7
dataeval/{_internal/metrics → metrics}/stats/base.py +66 -40
dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py +19 -15
dataeval/{_internal/metrics → metrics}/stats/datasetstats.py +19 -17
dataeval/{_internal/metrics → metrics}/stats/dimensionstats.py +12 -10
dataeval/metrics/stats/hashstats.py +156 -0
dataeval/{_internal/metrics → metrics}/stats/labelstats.py +8 -6
dataeval/{_internal/metrics → metrics}/stats/pixelstats.py +12 -11
dataeval/{_internal/metrics → metrics}/stats/visualstats.py +14 -13
dataeval/{_internal/output.py → output.py} +26 -6
dataeval/utils/__init__.py +8 -4
dataeval/utils/image.py +71 -0
dataeval/utils/shared.py +151 -0
dataeval/utils/split_dataset.py +486 -0
dataeval/utils/tensorflow/__init__.py +9 -7
dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/autoencoder.py +64 -68
dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py} +10 -9
dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/pixelcnn.py +18 -22
dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py +3 -1
dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py +18 -18
dataeval/utils/tensorflow/loss/__init__.py +6 -2
dataeval/utils/torch/__init__.py +7 -3
dataeval/{_internal/models/pytorch → utils/torch}/blocks.py +19 -14
dataeval/{_internal → utils/torch}/datasets.py +49 -43
dataeval/utils/torch/models.py +138 -0
dataeval/{_internal/models/pytorch/autoencoder.py → utils/torch/trainer.py} +12 -141
dataeval/{_internal → utils/torch}/utils.py +3 -1
dataeval/workflows/__init__.py +1 -1
dataeval/{_internal/workflows → workflows}/sufficiency.py +42 -37
{dataeval-0.72.0.dist-info → dataeval-0.72.2.dist-info}/METADATA +7 -5
dataeval-0.72.2.dist-info/RECORD +72 -0
dataeval/_internal/detectors/__init__.py +0 -0
dataeval/_internal/detectors/drift/__init__.py +0 -0
dataeval/_internal/detectors/ood/__init__.py +0 -0
dataeval/_internal/metrics/__init__.py +0 -0
dataeval/_internal/metrics/stats/hashstats.py +0 -75
dataeval/_internal/metrics/utils.py +0 -447
dataeval/_internal/models/__init__.py +0 -0
dataeval/_internal/models/pytorch/__init__.py +0 -0
dataeval/_internal/models/pytorch/utils.py +0 -67
dataeval/_internal/models/tensorflow/__init__.py +0 -0
dataeval/_internal/workflows/__init__.py +0 -0
dataeval/detectors/drift/kernels/__init__.py +0 -10
dataeval/detectors/drift/updates/__init__.py +0 -7
dataeval/utils/tensorflow/models/__init__.py +0 -9
dataeval/utils/tensorflow/recon/__init__.py +0 -3
dataeval/utils/torch/datasets/__init__.py +0 -12
dataeval/utils/torch/models/__init__.py +0 -11
dataeval/utils/torch/trainer/__init__.py +0 -7
dataeval-0.72.0.dist-info/RECORD +0 -80
/dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/gmm.py +0 -0
{dataeval-0.72.0.dist-info → dataeval-0.72.2.dist-info}/LICENSE.txt +0 -0
{dataeval-0.72.0.dist-info → dataeval-0.72.2.dist-info}/WHEEL +0 -0

dataeval/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-__version__ = "0.72.0"
+__version__ = "0.72.2"
 from importlib.util import find_spec
@@ -8,16 +8,16 @@ _IS_TENSORFLOW_AVAILABLE = find_spec("tensorflow") is not None and find_spec("te
 del find_spec
-from . import detectors, metrics  # noqa: E402
+from dataeval import detectors, metrics  # noqa: E402
 __all__ = ["detectors", "metrics"]
 if _IS_TORCH_AVAILABLE:  # pragma: no cover
-    from . import workflows
+    from dataeval import workflows
     __all__ += ["workflows"]
 if _IS_TENSORFLOW_AVAILABLE or _IS_TORCH_AVAILABLE:  # pragma: no cover
-    from . import utils
+    from dataeval import utils
     __all__ += ["utils"]

dataeval/detectors/__init__.py CHANGED Viewed

@@ -3,12 +3,13 @@ Detectors can determine if a dataset or individual images in a dataset are indic
 """
 from dataeval import _IS_TENSORFLOW_AVAILABLE
-from . import drift, linters
+from dataeval.detectors import drift, linters
 __all__ = ["drift", "linters"]
 if _IS_TENSORFLOW_AVAILABLE:  # pragma: no cover
-    from . import ood
+    from dataeval.detectors import ood
     __all__ += ["ood"]
+del _IS_TENSORFLOW_AVAILABLE

dataeval/detectors/drift/__init__.py CHANGED Viewed

@@ -1,21 +1,20 @@
 """
-Drift detectors identify if the statistical properties of the data has changed.
+:term:`Drift` detectors identify if the statistical properties of the data has changed.
 """
 from dataeval import _IS_TORCH_AVAILABLE
-from dataeval._internal.detectors.drift.base import DriftOutput
-from dataeval._internal.detectors.drift.cvm import DriftCVM
-from dataeval._internal.detectors.drift.ks import DriftKS
-from . import updates
+from dataeval.detectors.drift import updates
+from dataeval.detectors.drift.base import DriftOutput
+from dataeval.detectors.drift.cvm import DriftCVM
+from dataeval.detectors.drift.ks import DriftKS
 __all__ = ["DriftCVM", "DriftKS", "DriftOutput", "updates"]
 if _IS_TORCH_AVAILABLE:  # pragma: no cover
-    from dataeval._internal.detectors.drift.mmd import DriftMMD, DriftMMDOutput
-    from dataeval._internal.detectors.drift.torch import preprocess_drift
-    from dataeval._internal.detectors.drift.uncertainty import DriftUncertainty
+    from dataeval.detectors.drift.mmd import DriftMMD, DriftMMDOutput
+    from dataeval.detectors.drift.torch import preprocess_drift
+    from dataeval.detectors.drift.uncertainty import DriftUncertainty
-    from . import kernels
+    __all__ += ["DriftMMD", "DriftMMDOutput", "DriftUncertainty", "preprocess_drift"]
-    __all__ += ["DriftMMD", "DriftMMDOutput", "DriftUncertainty", "kernels", "preprocess_drift"]
+del _IS_TORCH_AVAILABLE

dataeval/{_internal/detectors → detectors}/drift/base.py RENAMED Viewed

@@ -8,16 +8,38 @@ Licensed under Apache Software License (Apache 2.0)
 from __future__ import annotations
+__all__ = ["DriftOutput"]
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from functools import wraps
-from typing import Callable, Literal
+from typing import Any, Callable, Literal, TypeVar
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
-from dataeval._internal.interop import as_numpy, to_numpy
-from dataeval._internal.output import OutputMetadata, set_metadata
+from dataeval.interop import as_numpy, to_numpy
+from dataeval.output import OutputMetadata, set_metadata
+R = TypeVar("R")
+class UpdateStrategy(ABC):
+    """
+    Updates reference dataset for drift detector
+    Parameters
+    ----------
+    n : int
+        Update with last n instances seen by the detector.
+    """
+    def __init__(self, n: int) -> None:
+        self.n = n
+    @abstractmethod
+    def __call__(self, x_ref: NDArray[Any], x: NDArray[Any], count: int) -> NDArray[Any]:
+        """Abstract implementation of update strategy"""
 @dataclass(frozen=True)
@@ -47,7 +69,7 @@ class DriftOutput(DriftBaseOutput):
     Attributes
     ----------
     is_drift : bool
-        Drift prediction for the images
+        :term:`Drift` prediction for the images
     threshold : float
         Threshold after multivariate correction if needed
     feature_drift : NDArray
@@ -70,9 +92,11 @@ class DriftOutput(DriftBaseOutput):
     distances: NDArray[np.float32]
-def update_x_ref(fn):
+def update_x_ref(fn: Callable[..., R]) -> Callable[..., R]:
+    """Decorator to update x_ref with x using selected update methodology"""
     @wraps(fn)
-    def _(self, x, *args, **kwargs):
+    def _(self, x, *args, **kwargs) -> R:
         output = fn(self, x, *args, **kwargs)
         # update reference dataset
@@ -86,9 +110,11 @@ def update_x_ref(fn):
     return _
-def preprocess_x(fn):
+def preprocess_x(fn: Callable[..., R]) -> Callable[..., R]:
+    """Decorator to run preprocess_fn on x before calling wrapped function"""
     @wraps(fn)
-    def _(self, x, *args, **kwargs):
+    def _(self, x, *args, **kwargs) -> R:
         if self._x_refcount == 0:
             self._x = self._preprocess(x)
         self._x_refcount += 1
@@ -101,73 +127,9 @@ def preprocess_x(fn):
     return _
-class UpdateStrategy(ABC):
-    """
-    Updates reference dataset for drift detector
-    Parameters
-    ----------
-    n : int
-        Update with last n instances seen by the detector.
-    """
-    def __init__(self, n: int):
-        self.n = n
-    @abstractmethod
-    def __call__(self, x_ref: NDArray, x: NDArray, count: int) -> NDArray:
-        """Abstract implementation of update strategy"""
-class LastSeenUpdate(UpdateStrategy):
-    """
-    Updates reference dataset for drift detector using last seen method.
-    Parameters
-    ----------
-    n : int
-        Update with last n instances seen by the detector.
-    """
-    def __call__(self, x_ref: NDArray, x: NDArray, count: int) -> NDArray:
-        x_updated = np.concatenate([x_ref, x], axis=0)
-        return x_updated[-self.n :]
-class ReservoirSamplingUpdate(UpdateStrategy):
-    """
-    Updates reference dataset for drift detector using reservoir sampling method.
-    Parameters
-    ----------
-    n : int
-        Update with last n instances seen by the detector.
-    """
-    def __call__(self, x_ref: NDArray, x: NDArray, count: int) -> NDArray:
-        if x.shape[0] + count <= self.n:
-            return np.concatenate([x_ref, x], axis=0)
-        n_ref = x_ref.shape[0]
-        output_size = min(self.n, n_ref + x.shape[0])
-        shape = (output_size,) + x.shape[1:]
-        x_reservoir = np.zeros(shape, dtype=x_ref.dtype)
-        x_reservoir[:n_ref] = x_ref
-        for item in x:
-            count += 1
-            if n_ref < self.n:
-                x_reservoir[n_ref, :] = item
-                n_ref += 1
-            else:
-                r = np.random.randint(0, count)
-                if r < self.n:
-                    x_reservoir[r, :] = item
-        return x_reservoir
 class BaseDrift:
     """
-    A generic drift detection component for preprocessing data and applying statistical correction.
+    A generic :term:`drift<Drift>` detection component for preprocessing data and applying statistical correction.
     This class handles common tasks related to drift detection, such as preprocessing
     the reference data (`x_ref`), performing statistical correction (e.g., Bonferroni, FDR),
@@ -223,7 +185,7 @@ class BaseDrift:
         p_val: float = 0.05,
         x_ref_preprocessed: bool = False,
         update_x_ref: UpdateStrategy | None = None,
-        preprocess_fn: Callable[[ArrayLike], ArrayLike] | None = None,
+        preprocess_fn: Callable[..., ArrayLike] | None = None,
         correction: Literal["bonferroni", "fdr"] = "bonferroni",
     ) -> None:
         # Type checking
@@ -235,20 +197,20 @@ class BaseDrift:
             raise ValueError("`correction` must be `bonferroni` or `fdr`.")
         self._x_ref = to_numpy(x_ref)
-        self.x_ref_preprocessed = x_ref_preprocessed
+        self.x_ref_preprocessed: bool = x_ref_preprocessed
         # Other attributes
         self.p_val = p_val
         self.update_x_ref = update_x_ref
         self.preprocess_fn = preprocess_fn
         self.correction = correction
-        self.n = len(self._x_ref)
+        self.n: int = len(self._x_ref)
         # Ref counter for preprocessed x
         self._x_refcount = 0
     @property
-    def x_ref(self) -> NDArray:
+    def x_ref(self) -> NDArray[Any]:
         """
         Retrieve the reference data, applying preprocessing if not already done.
@@ -266,7 +228,7 @@ class BaseDrift:
     def _preprocess(self, x: ArrayLike) -> ArrayLike:
         """
-        Preprocess the given data before computing the drift scores.
+        Preprocess the given data before computing the :term:`drift<Drift>` scores.
         Parameters
         ----------
@@ -285,12 +247,13 @@ class BaseDrift:
 class BaseDriftUnivariate(BaseDrift):
     """
-    Base class for drift detection methods using univariate statistical tests.
+    Base class for :term:`drift<Drift>` detection methods using univariate statistical tests.
     This class inherits from `BaseDrift` and serves as a generic component for detecting
     distribution drift in univariate features. If the number of features `n_features` is greater
     than 1, a multivariate correction method (e.g., Bonferroni or FDR) is applied to control
-    the false positive rate, ensuring it does not exceed the specified p-value.
+    the :term:`false positive rate<False Positive Rate (FP)>`, ensuring it does not exceed the specified
+    :term:`p-value<P-Value>`.
     Parameters
     ----------
@@ -312,28 +275,14 @@ class BaseDriftUnivariate(BaseDrift):
     Attributes
     ----------
-    _n_features : int | None
-        Number of features in the data. If not provided, it is lazily inferred from the
-        input data and any preprocessing function.
     p_val : float
         The significance level for drift detection.
     correction : str
-        The method for controlling the false discovery rate or applying a Bonferroni correction.
+        The method for controlling the :term:`False Discovery Rate (FDR)` or applying a Bonferroni correction.
     update_x_ref : UpdateStrategy | None
         Strategy for updating the reference data if applicable.
     preprocess_fn : Callable | None
         Function used for preprocessing input data before drift detection.
-    Methods
-    -------
-    n_features:
-        Property that returns the number of features, inferring it if necessary.
-    score(x):
-        Abstract method to compute univariate feature scores after preprocessing.
-    _apply_correction(p_vals):
-        Apply a statistical correction to p-values to account for multiple testing.
-    predict(x):
-        Predict whether drift has occurred on a batch of data, applying multivariate correction if needed.
     """
     def __init__(
@@ -393,19 +342,19 @@ class BaseDriftUnivariate(BaseDrift):
         Parameters
         ----------
         x : ArrayLike
-            The batch of data to calculate univariate drift scores for each feature.
+            The batch of data to calculate univariate :term:`drift<Drift>` scores for each feature.
         Returns
         -------
         tuple[NDArray, NDArray]
-            A tuple containing p-values and distance statistics for each feature.
+            A tuple containing p-values and distance :term:`statistics<Statistics>` for each feature.
         """
     def _apply_correction(self, p_vals: NDArray) -> tuple[bool, float]:
         """
         Apply the specified correction method (Bonferroni or FDR) to the p-values.
-        If the correction method is Bonferroni, the threshold for detecting drift
+        If the correction method is Bonferroni, the threshold for detecting :term:`drift<Drift>`
         is divided by the number of features. For FDR, the correction is applied
         using the Benjamini-Hochberg procedure.
@@ -426,7 +375,7 @@ class BaseDriftUnivariate(BaseDrift):
             return drift_pred, threshold
         elif self.correction == "fdr":
             n = p_vals.shape[0]
-            i = np.arange(n) + 1
+            i = np.arange(n) + np.int_(1)
             p_sorted = np.sort(p_vals)
             q_threshold = self.p_val * i / n
             below_threshold = p_sorted < q_threshold
@@ -438,7 +387,7 @@ class BaseDriftUnivariate(BaseDrift):
         else:
             raise ValueError("`correction` needs to be either `bonferroni` or `fdr`.")
-    @set_metadata("dataeval.detectors")
+    @set_metadata()
     @preprocess_x
     @update_x_ref
     def predict(
@@ -457,8 +406,8 @@ class BaseDriftUnivariate(BaseDrift):
         Returns
         -------
         DriftOutput
-            Dictionary containing the drift prediction and optionally the feature level
-            p-values, threshold after multivariate correction if needed and test statistics.
+            Dictionary containing the :term:`drift<Drift>` prediction and optionally the feature level
+            p-values, threshold after multivariate correction if needed and test :term:`statistics<Statistics>`.
         """
         # compute drift scores
         p_vals, dist = self.score(x)

dataeval/{_internal/detectors → detectors}/drift/cvm.py RENAMED Viewed

@@ -8,32 +8,33 @@ Licensed under Apache Software License (Apache 2.0)
 from __future__ import annotations
+__all__ = ["DriftCVM"]
 from typing import Callable, Literal
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 from scipy.stats import cramervonmises_2samp
-from dataeval._internal.interop import to_numpy
-from .base import BaseDriftUnivariate, UpdateStrategy, preprocess_x
+from dataeval.detectors.drift.base import BaseDriftUnivariate, UpdateStrategy, preprocess_x
+from dataeval.interop import to_numpy
 class DriftCVM(BaseDriftUnivariate):
     """
-    Drift detector employing the Cramér-von Mises (CVM) distribution test.
+    :term:`Drift` detector employing the :term:`Cramér-von Mises (CVM) Drift Detection` test.
     The CVM test detects changes in the distribution of continuous
     univariate data. For multivariate data, a separate CVM test is applied to each
     feature, and the obtained p-values are aggregated via the Bonferroni or
-    False Discovery Rate (FDR) corrections.
+    :term:`False Discovery Rate (FDR)` corrections.
     Parameters
     ----------
     x_ref : ArrayLike
         Data used as reference distribution.
     p_val : float | None, default 0.05
-        p-value used for significance of the statistical test for each feature.
+        :term:`p-value<P-Value>` used for significance of the statistical test for each feature.
         If the FDR correction method is used, this corresponds to the acceptable
         q-value.
     x_ref_preprocessed : bool, default False
@@ -46,7 +47,7 @@ class DriftCVM(BaseDriftUnivariate):
         or via reservoir sampling with ReservoirSamplingUpdateStrategy.
     preprocess_fn : Callable | None, default None
         Function to preprocess the data before computing the data drift metrics.
-        Typically a dimensionality reduction technique.
+        Typically a :term:`dimensionality reduction<Dimensionality Reduction>` technique.
     correction : "bonferroni" | "fdr", default "bonferroni"
         Correction type for multivariate data. Either 'bonferroni' or 'fdr' (False
         Discovery Rate).
@@ -79,7 +80,7 @@ class DriftCVM(BaseDriftUnivariate):
     @preprocess_x
     def score(self, x: ArrayLike) -> tuple[NDArray[np.float32], NDArray[np.float32]]:
         """
-        Performs the two-sample Cramér-von Mises test(s), computing the p-value and
+        Performs the two-sample Cramér-von Mises test(s), computing the :term:`p-value<P-value>` and
         test statistic per feature.
         Parameters

dataeval/{_internal/detectors → detectors}/drift/ks.py RENAMED Viewed

@@ -8,23 +8,24 @@ Licensed under Apache Software License (Apache 2.0)
 from __future__ import annotations
+__all__ = ["DriftKS"]
 from typing import Callable, Literal
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 from scipy.stats import ks_2samp
-from dataeval._internal.interop import to_numpy
-from .base import BaseDriftUnivariate, UpdateStrategy, preprocess_x
+from dataeval.detectors.drift.base import BaseDriftUnivariate, UpdateStrategy, preprocess_x
+from dataeval.interop import to_numpy
 class DriftKS(BaseDriftUnivariate):
     """
-    Drift detector employing the Kolmogorov-Smirnov (KS) distribution test.
+    :term:`Drift` detector employing the Kolmogorov-Smirnov (KS) distribution test.
     The KS test detects changes in the maximum distance between two data
-    distributions with Bonferroni or False Discovery Rate (FDR) correction
+    distributions with Bonferroni or :term:`False Discovery Rate (FDR)` correction
     for multivariate data.
     Parameters
@@ -32,7 +33,7 @@ class DriftKS(BaseDriftUnivariate):
     x_ref : ArrayLike
         Data used as reference distribution.
     p_val : float | None, default 0.05
-        p-value used for significance of the statistical test for each feature.
+        :term:`p-value<P-Value>` used for significance of the statistical test for each feature.
         If the FDR correction method is used, this corresponds to the acceptable
         q-value.
     x_ref_preprocessed : bool, default False
@@ -44,8 +45,8 @@ class DriftKS(BaseDriftUnivariate):
         using the last n instances seen by the detector with LastSeenUpdateStrategy
         or via reservoir sampling with ReservoirSamplingUpdateStrategy.
     preprocess_fn : Callable | None, default None
-        Function to preprocess the data before computing the data drift metrics.
-        Typically a dimensionality reduction technique.
+        Function to preprocess the data before computing the data :term:`drift<Drift>` metrics.
+        Typically a :term:`dimensionality reduction<Dimensionality Reduction>` technique.
     correction : "bonferroni" | "fdr", default "bonferroni"
         Correction type for multivariate data. Either 'bonferroni' or 'fdr' (False
         Discovery Rate).
@@ -85,7 +86,7 @@ class DriftKS(BaseDriftUnivariate):
     @preprocess_x
     def score(self, x: ArrayLike) -> tuple[NDArray[np.float32], NDArray[np.float32]]:
         """
-        Compute KS scores and statistics per feature.
+        Compute KS scores and :term:`statistics<Statistics>` per feature.
         Parameters
         ----------
@@ -95,7 +96,7 @@ class DriftKS(BaseDriftUnivariate):
         Returns
         -------
         tuple[NDArray, NDArray]
-            Feature level p-values and KS statistic
+            Feature level :term:`p-values<P-Value>` and KS statistic
         """
         x = to_numpy(x)
         x = x.reshape(x.shape[0], -1)

dataeval/{_internal/detectors → detectors}/drift/mmd.py RENAMED Viewed

@@ -8,30 +8,31 @@ Licensed under Apache Software License (Apache 2.0)
 from __future__ import annotations
+__all__ = ["DriftMMD", "DriftMMDOutput"]
 from dataclasses import dataclass
 from typing import Callable
 import torch
 from numpy.typing import ArrayLike
-from dataeval._internal.interop import as_numpy
-from dataeval._internal.output import set_metadata
-from .base import BaseDrift, DriftBaseOutput, UpdateStrategy, preprocess_x, update_x_ref
-from .torch import GaussianRBF, get_device, mmd2_from_kernel_matrix
+from dataeval.detectors.drift.base import BaseDrift, DriftBaseOutput, UpdateStrategy, preprocess_x, update_x_ref
+from dataeval.detectors.drift.torch import _GaussianRBF, _mmd2_from_kernel_matrix, get_device
+from dataeval.interop import as_numpy
+from dataeval.output import set_metadata
 @dataclass(frozen=True)
 class DriftMMDOutput(DriftBaseOutput):
     """
-    Output class for :class:`DriftMMD` drift detector
+    Output class for :class:`DriftMMD` :term:`drift<Drift>` detector
     Attributes
     ----------
     is_drift : bool
         Drift prediction for the images
     threshold : float
-        P-value used for significance of the permutation test
+        :term:`P-Value` used for significance of the permutation test
     p_val : float
         P-value obtained from the permutation test
     distance : float
@@ -49,14 +50,14 @@ class DriftMMDOutput(DriftBaseOutput):
 class DriftMMD(BaseDrift):
     """
-    Maximum Mean Discrepancy (MMD) data drift detector using a permutation test.
+    :term:`Maximum Mean Discrepancy (MMD) Drift Detection` algorithm using a permutation test.
     Parameters
     ----------
     x_ref : ArrayLike
         Data used as reference distribution.
     p_val : float | None, default 0.05
-        p-value used for significance of the statistical test for each feature.
+        :term:`P-value` used for significance of the statistical test for each feature.
         If the FDR correction method is used, this corresponds to the acceptable
         q-value.
     x_ref_preprocessed : bool, default False
@@ -69,11 +70,9 @@ class DriftMMD(BaseDrift):
         or via reservoir sampling with ReservoirSamplingUpdateStrategy.
     preprocess_fn : Callable | None, default None
         Function to preprocess the data before computing the data drift metrics.
-        Typically a dimensionality reduction technique.
-    kernel : Callable, default GaussianRBF
-        Kernel used for the MMD computation, defaults to Gaussian RBF kernel.
+        Typically a :term:`dimensionality reduction<Dimensionality Reduction>` technique.
     sigma : ArrayLike | None, default None
-        Optionally set the GaussianRBF kernel bandwidth. Can also pass multiple
+        Optionally set the internal GaussianRBF kernel bandwidth. Can also pass multiple
         bandwidth values as an array. The kernel evaluation is then averaged over
         those bandwidths.
     configure_kernel_from_x_ref : bool, default True
@@ -91,48 +90,47 @@ class DriftMMD(BaseDrift):
         p_val: float = 0.05,
         x_ref_preprocessed: bool = False,
         update_x_ref: UpdateStrategy | None = None,
-        preprocess_fn: Callable[[ArrayLike], ArrayLike] | None = None,
-        kernel: Callable = GaussianRBF,
+        preprocess_fn: Callable[..., ArrayLike] | None = None,
         sigma: ArrayLike | None = None,
         configure_kernel_from_x_ref: bool = True,
         n_permutations: int = 100,
-        device: str | None = None,
+        device: str | torch.device | None = None,
     ) -> None:
         super().__init__(x_ref, p_val, x_ref_preprocessed, update_x_ref, preprocess_fn)
-        self.infer_sigma = configure_kernel_from_x_ref
+        self._infer_sigma = configure_kernel_from_x_ref
         if configure_kernel_from_x_ref and sigma is not None:
-            self.infer_sigma = False
+            self._infer_sigma = False
         self.n_permutations = n_permutations  # nb of iterations through permutation test
         # set device
-        self.device = get_device(device)
+        self.device: torch.device = get_device(device)
         # initialize kernel
         sigma_tensor = torch.from_numpy(as_numpy(sigma)).to(self.device) if sigma is not None else None
-        self.kernel = kernel(sigma_tensor).to(self.device) if kernel == GaussianRBF else kernel
+        self._kernel = _GaussianRBF(sigma_tensor).to(self.device)
         # compute kernel matrix for the reference data
-        if self.infer_sigma or isinstance(sigma_tensor, torch.Tensor):
+        if self._infer_sigma or isinstance(sigma_tensor, torch.Tensor):
             x = torch.from_numpy(self.x_ref).to(self.device)
-            self.k_xx = self.kernel(x, x, infer_sigma=self.infer_sigma)
-            self.infer_sigma = False
+            self._k_xx = self._kernel(x, x, infer_sigma=self._infer_sigma)
+            self._infer_sigma = False
         else:
-            self.k_xx, self.infer_sigma = None, True
+            self._k_xx, self._infer_sigma = None, True
     def _kernel_matrix(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         """Compute and return full kernel matrix between arrays x and y."""
-        k_xy = self.kernel(x, y, self.infer_sigma)
-        k_xx = self.k_xx if self.k_xx is not None and self.update_x_ref is None else self.kernel(x, x)
-        k_yy = self.kernel(y, y)
+        k_xy = self._kernel(x, y, self._infer_sigma)
+        k_xx = self._k_xx if self._k_xx is not None and self.update_x_ref is None else self._kernel(x, x)
+        k_yy = self._kernel(y, y)
         kernel_mat = torch.cat([torch.cat([k_xx, k_xy], 1), torch.cat([k_xy.T, k_yy], 1)], 0)
         return kernel_mat
     @preprocess_x
     def score(self, x: ArrayLike) -> tuple[float, float, float]:
         """
-        Compute the p-value resulting from a permutation test using the maximum mean
+        Compute the :term:`p-value<P-Value>` resulting from a permutation test using the maximum mean
         discrepancy as a distance measure between the reference data and the data to
         be tested.
@@ -145,25 +143,25 @@ class DriftMMD(BaseDrift):
         -------
         tuple(float, float, float)
             p-value obtained from the permutation test, MMD^2 between the reference and test set,
-            and MMD^2 threshold above which drift is flagged
+            and MMD^2 threshold above which :term:`drift<Drift>` is flagged
         """
         x = as_numpy(x)
         x_ref = torch.from_numpy(self.x_ref).to(self.device)
         n = x.shape[0]
         kernel_mat = self._kernel_matrix(x_ref, torch.from_numpy(x).to(self.device))
         kernel_mat = kernel_mat - torch.diag(kernel_mat.diag())  # zero diagonal
-        mmd2 = mmd2_from_kernel_matrix(kernel_mat, n, permute=False, zero_diag=False)
+        mmd2 = _mmd2_from_kernel_matrix(kernel_mat, n, permute=False, zero_diag=False)
         mmd2_permuted = torch.Tensor(
-            [mmd2_from_kernel_matrix(kernel_mat, n, permute=True, zero_diag=False) for _ in range(self.n_permutations)]
+            [_mmd2_from_kernel_matrix(kernel_mat, n, permute=True, zero_diag=False) for _ in range(self.n_permutations)]
         )
         mmd2, mmd2_permuted = mmd2.detach().cpu(), mmd2_permuted.detach().cpu()
         p_val = (mmd2 <= mmd2_permuted).float().mean()
         # compute distance threshold
         idx_threshold = int(self.p_val * len(mmd2_permuted))
         distance_threshold = torch.sort(mmd2_permuted, descending=True).values[idx_threshold]
-        return p_val.numpy().item(), mmd2.numpy().item(), distance_threshold.numpy()
+        return p_val.numpy().item(), mmd2.numpy().item(), distance_threshold.numpy().item()
-    @set_metadata("dataeval.detectors")
+    @set_metadata()
     @preprocess_x
     @update_x_ref
     def predict(self, x: ArrayLike) -> DriftMMDOutput:
@@ -179,7 +177,8 @@ class DriftMMD(BaseDrift):
         Returns
         -------
         DriftMMDOutput
-            Output class containing the drift prediction, p-value, threshold and MMD metric.
+            Output class containing the :term:`drift<Drift>` prediction, :term:`p-value<P-Value>`,
+            threshold and MMD metric.
         """
         # compute drift scores
         p_val, dist, distance_threshold = self.score(x)

dataeval 0.72.0__py3-none-any.whl → 0.72.2__py3-none-any.whl

dataeval 0.72.0__py3-none-any.whl → 0.72.2__py3-none-any.whl