PyPI - dataeval - Versions diffs - 0.72.1__py3-none-any.whl → 0.73.0__py3-none-any.whl - Mend

dataeval 0.72.1__py3-none-any.whl → 0.73.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (95)

dataeval/__init__.py +4 -4
dataeval/detectors/__init__.py +4 -3
dataeval/detectors/drift/__init__.py +9 -10
dataeval/{_internal/detectors → detectors}/drift/base.py +39 -91
dataeval/{_internal/detectors → detectors}/drift/cvm.py +4 -3
dataeval/{_internal/detectors → detectors}/drift/ks.py +4 -3
dataeval/{_internal/detectors → detectors}/drift/mmd.py +23 -25
dataeval/{_internal/detectors → detectors}/drift/torch.py +13 -11
dataeval/{_internal/detectors → detectors}/drift/uncertainty.py +7 -5
dataeval/detectors/drift/updates.py +61 -0
dataeval/detectors/linters/__init__.py +3 -3
dataeval/{_internal/detectors → detectors/linters}/clusterer.py +41 -39
dataeval/{_internal/detectors → detectors/linters}/duplicates.py +19 -9
dataeval/{_internal/detectors → detectors/linters}/merged_stats.py +3 -1
dataeval/{_internal/detectors → detectors/linters}/outliers.py +14 -21
dataeval/detectors/ood/__init__.py +6 -6
dataeval/{_internal/detectors → detectors}/ood/ae.py +20 -12
dataeval/detectors/ood/aegmm.py +66 -0
dataeval/{_internal/detectors → detectors}/ood/base.py +33 -21
dataeval/{_internal/detectors → detectors}/ood/llr.py +43 -33
dataeval/detectors/ood/metadata_ks_compare.py +99 -0
dataeval/detectors/ood/metadata_least_likely.py +119 -0
dataeval/detectors/ood/metadata_ood_mi.py +92 -0
dataeval/{_internal/detectors → detectors}/ood/vae.py +23 -17
dataeval/detectors/ood/vaegmm.py +75 -0
dataeval/interop.py +56 -0
dataeval/metrics/__init__.py +1 -1
dataeval/metrics/bias/__init__.py +4 -4
dataeval/{_internal/metrics → metrics/bias}/balance.py +75 -13
dataeval/{_internal/metrics → metrics/bias}/coverage.py +41 -7
dataeval/{_internal/metrics → metrics/bias}/diversity.py +75 -18
dataeval/metrics/bias/metadata.py +358 -0
dataeval/{_internal/metrics → metrics/bias}/parity.py +54 -44
dataeval/metrics/estimators/__init__.py +3 -3
dataeval/{_internal/metrics → metrics/estimators}/ber.py +25 -22
dataeval/{_internal/metrics → metrics/estimators}/divergence.py +11 -12
dataeval/{_internal/metrics → metrics/estimators}/uap.py +5 -3
dataeval/metrics/stats/__init__.py +7 -7
dataeval/{_internal/metrics → metrics}/stats/base.py +59 -35
dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py +18 -14
dataeval/{_internal/metrics → metrics}/stats/datasetstats.py +18 -16
dataeval/{_internal/metrics → metrics}/stats/dimensionstats.py +9 -7
dataeval/metrics/stats/hashstats.py +156 -0
dataeval/{_internal/metrics → metrics}/stats/labelstats.py +5 -3
dataeval/{_internal/metrics → metrics}/stats/pixelstats.py +9 -8
dataeval/{_internal/metrics → metrics}/stats/visualstats.py +10 -9
dataeval/{_internal/output.py → output.py} +26 -6
dataeval/utils/__init__.py +8 -3
dataeval/utils/image.py +71 -0
dataeval/utils/lazy.py +26 -0
dataeval/utils/metadata.py +258 -0
dataeval/utils/shared.py +151 -0
dataeval/{_internal → utils}/split_dataset.py +98 -33
dataeval/utils/tensorflow/__init__.py +7 -6
dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/gmm.py +8 -2
dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py} +28 -18
dataeval/{_internal/models/tensorflow/pixelcnn.py → utils/tensorflow/_internal/models.py} +387 -97
dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py +15 -6
dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py +84 -85
dataeval/utils/tensorflow/loss/__init__.py +6 -2
dataeval/utils/torch/__init__.py +7 -3
dataeval/{_internal/models/pytorch → utils/torch}/blocks.py +19 -14
dataeval/{_internal → utils/torch}/datasets.py +48 -42
dataeval/utils/torch/models.py +138 -0
dataeval/{_internal/models/pytorch/autoencoder.py → utils/torch/trainer.py} +7 -136
dataeval/{_internal → utils/torch}/utils.py +3 -1
dataeval/workflows/__init__.py +1 -1
dataeval/{_internal/workflows → workflows}/sufficiency.py +39 -34
{dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/METADATA +4 -3
dataeval-0.73.0.dist-info/RECORD +73 -0
dataeval/_internal/detectors/__init__.py +0 -0
dataeval/_internal/detectors/drift/__init__.py +0 -0
dataeval/_internal/detectors/ood/__init__.py +0 -0
dataeval/_internal/detectors/ood/aegmm.py +0 -78
dataeval/_internal/detectors/ood/vaegmm.py +0 -89
dataeval/_internal/interop.py +0 -49
dataeval/_internal/metrics/__init__.py +0 -0
dataeval/_internal/metrics/stats/hashstats.py +0 -75
dataeval/_internal/metrics/utils.py +0 -447
dataeval/_internal/models/__init__.py +0 -0
dataeval/_internal/models/pytorch/__init__.py +0 -0
dataeval/_internal/models/pytorch/utils.py +0 -67
dataeval/_internal/models/tensorflow/__init__.py +0 -0
dataeval/_internal/models/tensorflow/autoencoder.py +0 -320
dataeval/_internal/workflows/__init__.py +0 -0
dataeval/detectors/drift/kernels/__init__.py +0 -10
dataeval/detectors/drift/updates/__init__.py +0 -8
dataeval/utils/tensorflow/models/__init__.py +0 -9
dataeval/utils/tensorflow/recon/__init__.py +0 -3
dataeval/utils/torch/datasets/__init__.py +0 -12
dataeval/utils/torch/models/__init__.py +0 -11
dataeval/utils/torch/trainer/__init__.py +0 -7
dataeval-0.72.1.dist-info/RECORD +0 -81
{dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/LICENSE.txt +0 -0
{dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/WHEEL +0 -0

dataeval/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-__version__ = "0.72.1"
+__version__ = "0.73.0"
 from importlib.util import find_spec
@@ -8,16 +8,16 @@ _IS_TENSORFLOW_AVAILABLE = find_spec("tensorflow") is not None and find_spec("te
 del find_spec
-from . import detectors, metrics  # noqa: E402
+from dataeval import detectors, metrics  # noqa: E402
 __all__ = ["detectors", "metrics"]
 if _IS_TORCH_AVAILABLE:  # pragma: no cover
-    from . import workflows
+    from dataeval import workflows
     __all__ += ["workflows"]
 if _IS_TENSORFLOW_AVAILABLE or _IS_TORCH_AVAILABLE:  # pragma: no cover
-    from . import utils
+    from dataeval import utils
     __all__ += ["utils"]

dataeval/detectors/__init__.py CHANGED Viewed

@@ -3,12 +3,13 @@ Detectors can determine if a dataset or individual images in a dataset are indic
 """
 from dataeval import _IS_TENSORFLOW_AVAILABLE
-from . import drift, linters
+from dataeval.detectors import drift, linters
 __all__ = ["drift", "linters"]
 if _IS_TENSORFLOW_AVAILABLE:  # pragma: no cover
-    from . import ood
+    from dataeval.detectors import ood
     __all__ += ["ood"]
+del _IS_TENSORFLOW_AVAILABLE

dataeval/detectors/drift/__init__.py CHANGED Viewed

@@ -3,19 +3,18 @@
 """
 from dataeval import _IS_TORCH_AVAILABLE
-from dataeval._internal.detectors.drift.base import DriftOutput
-from dataeval._internal.detectors.drift.cvm import DriftCVM
-from dataeval._internal.detectors.drift.ks import DriftKS
-from . import updates
+from dataeval.detectors.drift import updates
+from dataeval.detectors.drift.base import DriftOutput
+from dataeval.detectors.drift.cvm import DriftCVM
+from dataeval.detectors.drift.ks import DriftKS
 __all__ = ["DriftCVM", "DriftKS", "DriftOutput", "updates"]
 if _IS_TORCH_AVAILABLE:  # pragma: no cover
-    from dataeval._internal.detectors.drift.mmd import DriftMMD, DriftMMDOutput
-    from dataeval._internal.detectors.drift.torch import preprocess_drift
-    from dataeval._internal.detectors.drift.uncertainty import DriftUncertainty
+    from dataeval.detectors.drift.mmd import DriftMMD, DriftMMDOutput
+    from dataeval.detectors.drift.torch import preprocess_drift
+    from dataeval.detectors.drift.uncertainty import DriftUncertainty
-    from . import kernels
+    __all__ += ["DriftMMD", "DriftMMDOutput", "DriftUncertainty", "preprocess_drift"]
-    __all__ += ["DriftMMD", "DriftMMDOutput", "DriftUncertainty", "kernels", "preprocess_drift"]
+del _IS_TORCH_AVAILABLE

dataeval/{_internal/detectors → detectors}/drift/base.py RENAMED Viewed

@@ -8,16 +8,38 @@ Licensed under Apache Software License (Apache 2.0)
 from __future__ import annotations
+__all__ = ["DriftOutput"]
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from functools import wraps
-from typing import Callable, Literal
+from typing import Any, Callable, Literal, TypeVar
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
-from dataeval._internal.interop import as_numpy, to_numpy
-from dataeval._internal.output import OutputMetadata, set_metadata
+from dataeval.interop import as_numpy, to_numpy
+from dataeval.output import OutputMetadata, set_metadata
+R = TypeVar("R")
+class UpdateStrategy(ABC):
+    """
+    Updates reference dataset for drift detector
+    Parameters
+    ----------
+    n : int
+        Update with last n instances seen by the detector.
+    """
+    def __init__(self, n: int) -> None:
+        self.n = n
+    @abstractmethod
+    def __call__(self, x_ref: NDArray[Any], x: NDArray[Any], count: int) -> NDArray[Any]:
+        """Abstract implementation of update strategy"""
 @dataclass(frozen=True)
@@ -70,9 +92,11 @@ class DriftOutput(DriftBaseOutput):
     distances: NDArray[np.float32]
-def update_x_ref(fn):
+def update_x_ref(fn: Callable[..., R]) -> Callable[..., R]:
+    """Decorator to update x_ref with x using selected update methodology"""
     @wraps(fn)
-    def _(self, x, *args, **kwargs):
+    def _(self, x, *args, **kwargs) -> R:
         output = fn(self, x, *args, **kwargs)
         # update reference dataset
@@ -86,9 +110,11 @@ def update_x_ref(fn):
     return _
-def preprocess_x(fn):
+def preprocess_x(fn: Callable[..., R]) -> Callable[..., R]:
+    """Decorator to run preprocess_fn on x before calling wrapped function"""
     @wraps(fn)
-    def _(self, x, *args, **kwargs):
+    def _(self, x, *args, **kwargs) -> R:
         if self._x_refcount == 0:
             self._x = self._preprocess(x)
         self._x_refcount += 1
@@ -101,70 +127,6 @@ def preprocess_x(fn):
     return _
-class UpdateStrategy(ABC):
-    """
-    Updates reference dataset for :term:`drift<Drift>` detector
-    Parameters
-    ----------
-    n : int
-        Update with last n instances seen by the detector.
-    """
-    def __init__(self, n: int):
-        self.n = n
-    @abstractmethod
-    def __call__(self, x_ref: NDArray, x: NDArray, count: int) -> NDArray:
-        """Abstract implementation of update strategy"""
-class LastSeenUpdate(UpdateStrategy):
-    """
-    Updates reference dataset for :term:`drift<Drift>` detector using last seen method.
-    Parameters
-    ----------
-    n : int
-        Update with last n instances seen by the detector.
-    """
-    def __call__(self, x_ref: NDArray, x: NDArray, count: int) -> NDArray:
-        x_updated = np.concatenate([x_ref, x], axis=0)
-        return x_updated[-self.n :]
-class ReservoirSamplingUpdate(UpdateStrategy):
-    """
-    Updates reference dataset for :term:`drift<Drift>` detector using reservoir sampling method.
-    Parameters
-    ----------
-    n : int
-        Update with last n instances seen by the detector.
-    """
-    def __call__(self, x_ref: NDArray, x: NDArray, count: int) -> NDArray:
-        if x.shape[0] + count <= self.n:
-            return np.concatenate([x_ref, x], axis=0)
-        n_ref = x_ref.shape[0]
-        output_size = min(self.n, n_ref + x.shape[0])
-        shape = (output_size,) + x.shape[1:]
-        x_reservoir = np.zeros(shape, dtype=x_ref.dtype)
-        x_reservoir[:n_ref] = x_ref
-        for item in x:
-            count += 1
-            if n_ref < self.n:
-                x_reservoir[n_ref, :] = item
-                n_ref += 1
-            else:
-                r = np.random.randint(0, count)
-                if r < self.n:
-                    x_reservoir[r, :] = item
-        return x_reservoir
 class BaseDrift:
     """
     A generic :term:`drift<Drift>` detection component for preprocessing data and applying statistical correction.
@@ -223,7 +185,7 @@ class BaseDrift:
         p_val: float = 0.05,
         x_ref_preprocessed: bool = False,
         update_x_ref: UpdateStrategy | None = None,
-        preprocess_fn: Callable[[ArrayLike], ArrayLike] | None = None,
+        preprocess_fn: Callable[..., ArrayLike] | None = None,
         correction: Literal["bonferroni", "fdr"] = "bonferroni",
     ) -> None:
         # Type checking
@@ -235,20 +197,20 @@ class BaseDrift:
             raise ValueError("`correction` must be `bonferroni` or `fdr`.")
         self._x_ref = to_numpy(x_ref)
-        self.x_ref_preprocessed = x_ref_preprocessed
+        self.x_ref_preprocessed: bool = x_ref_preprocessed
         # Other attributes
         self.p_val = p_val
         self.update_x_ref = update_x_ref
         self.preprocess_fn = preprocess_fn
         self.correction = correction
-        self.n = len(self._x_ref)
+        self.n: int = len(self._x_ref)
         # Ref counter for preprocessed x
         self._x_refcount = 0
     @property
-    def x_ref(self) -> NDArray:
+    def x_ref(self) -> NDArray[Any]:
         """
         Retrieve the reference data, applying preprocessing if not already done.
@@ -313,9 +275,6 @@ class BaseDriftUnivariate(BaseDrift):
     Attributes
     ----------
-    _n_features : int | None
-        Number of features in the data. If not provided, it is lazily inferred from the
-        input data and any preprocessing function.
     p_val : float
         The significance level for drift detection.
     correction : str
@@ -324,17 +283,6 @@ class BaseDriftUnivariate(BaseDrift):
         Strategy for updating the reference data if applicable.
     preprocess_fn : Callable | None
         Function used for preprocessing input data before drift detection.
-    Methods
-    -------
-    n_features:
-        Property that returns the number of features, inferring it if necessary.
-    score(x):
-        Abstract method to compute univariate feature scores after preprocessing.
-    _apply_correction(p_vals):
-        Apply a statistical correction to p-values to account for multiple testing.
-    predict(x):
-        Predict whether drift has occurred on a batch of data, applying multivariate correction if needed.
     """
     def __init__(
@@ -427,7 +375,7 @@ class BaseDriftUnivariate(BaseDrift):
             return drift_pred, threshold
         elif self.correction == "fdr":
             n = p_vals.shape[0]
-            i = np.arange(n) + 1
+            i = np.arange(n) + np.int_(1)
             p_sorted = np.sort(p_vals)
             q_threshold = self.p_val * i / n
             below_threshold = p_sorted < q_threshold
@@ -439,7 +387,7 @@ class BaseDriftUnivariate(BaseDrift):
         else:
             raise ValueError("`correction` needs to be either `bonferroni` or `fdr`.")
-    @set_metadata("dataeval.detectors")
+    @set_metadata()
     @preprocess_x
     @update_x_ref
     def predict(

dataeval/{_internal/detectors → detectors}/drift/cvm.py RENAMED Viewed

@@ -8,15 +8,16 @@ Licensed under Apache Software License (Apache 2.0)
 from __future__ import annotations
+__all__ = ["DriftCVM"]
 from typing import Callable, Literal
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 from scipy.stats import cramervonmises_2samp
-from dataeval._internal.interop import to_numpy
-from .base import BaseDriftUnivariate, UpdateStrategy, preprocess_x
+from dataeval.detectors.drift.base import BaseDriftUnivariate, UpdateStrategy, preprocess_x
+from dataeval.interop import to_numpy
 class DriftCVM(BaseDriftUnivariate):

dataeval/{_internal/detectors → detectors}/drift/ks.py RENAMED Viewed

@@ -8,15 +8,16 @@ Licensed under Apache Software License (Apache 2.0)
 from __future__ import annotations
+__all__ = ["DriftKS"]
 from typing import Callable, Literal
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 from scipy.stats import ks_2samp
-from dataeval._internal.interop import to_numpy
-from .base import BaseDriftUnivariate, UpdateStrategy, preprocess_x
+from dataeval.detectors.drift.base import BaseDriftUnivariate, UpdateStrategy, preprocess_x
+from dataeval.interop import to_numpy
 class DriftKS(BaseDriftUnivariate):

dataeval/{_internal/detectors → detectors}/drift/mmd.py RENAMED Viewed

@@ -8,17 +8,18 @@ Licensed under Apache Software License (Apache 2.0)
 from __future__ import annotations
+__all__ = ["DriftMMD", "DriftMMDOutput"]
 from dataclasses import dataclass
 from typing import Callable
 import torch
 from numpy.typing import ArrayLike
-from dataeval._internal.interop import as_numpy
-from dataeval._internal.output import set_metadata
-from .base import BaseDrift, DriftBaseOutput, UpdateStrategy, preprocess_x, update_x_ref
-from .torch import GaussianRBF, get_device, mmd2_from_kernel_matrix
+from dataeval.detectors.drift.base import BaseDrift, DriftBaseOutput, UpdateStrategy, preprocess_x, update_x_ref
+from dataeval.detectors.drift.torch import _GaussianRBF, _mmd2_from_kernel_matrix, get_device
+from dataeval.interop import as_numpy
+from dataeval.output import set_metadata
 @dataclass(frozen=True)
@@ -70,10 +71,8 @@ class DriftMMD(BaseDrift):
     preprocess_fn : Callable | None, default None
         Function to preprocess the data before computing the data drift metrics.
         Typically a :term:`dimensionality reduction<Dimensionality Reduction>` technique.
-    kernel : Callable, default GaussianRBF
-        Kernel used for the MMD computation, defaults to Gaussian RBF kernel.
     sigma : ArrayLike | None, default None
-        Optionally set the GaussianRBF kernel bandwidth. Can also pass multiple
+        Optionally set the internal GaussianRBF kernel bandwidth. Can also pass multiple
         bandwidth values as an array. The kernel evaluation is then averaged over
         those bandwidths.
     configure_kernel_from_x_ref : bool, default True
@@ -91,41 +90,40 @@ class DriftMMD(BaseDrift):
         p_val: float = 0.05,
         x_ref_preprocessed: bool = False,
         update_x_ref: UpdateStrategy | None = None,
-        preprocess_fn: Callable[[ArrayLike], ArrayLike] | None = None,
-        kernel: Callable = GaussianRBF,
+        preprocess_fn: Callable[..., ArrayLike] | None = None,
         sigma: ArrayLike | None = None,
         configure_kernel_from_x_ref: bool = True,
         n_permutations: int = 100,
-        device: str | None = None,
+        device: str | torch.device | None = None,
     ) -> None:
         super().__init__(x_ref, p_val, x_ref_preprocessed, update_x_ref, preprocess_fn)
-        self.infer_sigma = configure_kernel_from_x_ref
+        self._infer_sigma = configure_kernel_from_x_ref
         if configure_kernel_from_x_ref and sigma is not None:
-            self.infer_sigma = False
+            self._infer_sigma = False
         self.n_permutations = n_permutations  # nb of iterations through permutation test
         # set device
-        self.device = get_device(device)
+        self.device: torch.device = get_device(device)
         # initialize kernel
         sigma_tensor = torch.from_numpy(as_numpy(sigma)).to(self.device) if sigma is not None else None
-        self.kernel = kernel(sigma_tensor).to(self.device) if kernel == GaussianRBF else kernel
+        self._kernel = _GaussianRBF(sigma_tensor).to(self.device)
         # compute kernel matrix for the reference data
-        if self.infer_sigma or isinstance(sigma_tensor, torch.Tensor):
+        if self._infer_sigma or isinstance(sigma_tensor, torch.Tensor):
             x = torch.from_numpy(self.x_ref).to(self.device)
-            self.k_xx = self.kernel(x, x, infer_sigma=self.infer_sigma)
-            self.infer_sigma = False
+            self._k_xx = self._kernel(x, x, infer_sigma=self._infer_sigma)
+            self._infer_sigma = False
         else:
-            self.k_xx, self.infer_sigma = None, True
+            self._k_xx, self._infer_sigma = None, True
     def _kernel_matrix(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         """Compute and return full kernel matrix between arrays x and y."""
-        k_xy = self.kernel(x, y, self.infer_sigma)
-        k_xx = self.k_xx if self.k_xx is not None and self.update_x_ref is None else self.kernel(x, x)
-        k_yy = self.kernel(y, y)
+        k_xy = self._kernel(x, y, self._infer_sigma)
+        k_xx = self._k_xx if self._k_xx is not None and self.update_x_ref is None else self._kernel(x, x)
+        k_yy = self._kernel(y, y)
         kernel_mat = torch.cat([torch.cat([k_xx, k_xy], 1), torch.cat([k_xy.T, k_yy], 1)], 0)
         return kernel_mat
@@ -152,9 +150,9 @@ class DriftMMD(BaseDrift):
         n = x.shape[0]
         kernel_mat = self._kernel_matrix(x_ref, torch.from_numpy(x).to(self.device))
         kernel_mat = kernel_mat - torch.diag(kernel_mat.diag())  # zero diagonal
-        mmd2 = mmd2_from_kernel_matrix(kernel_mat, n, permute=False, zero_diag=False)
+        mmd2 = _mmd2_from_kernel_matrix(kernel_mat, n, permute=False, zero_diag=False)
         mmd2_permuted = torch.Tensor(
-            [mmd2_from_kernel_matrix(kernel_mat, n, permute=True, zero_diag=False) for _ in range(self.n_permutations)]
+            [_mmd2_from_kernel_matrix(kernel_mat, n, permute=True, zero_diag=False) for _ in range(self.n_permutations)]
         )
         mmd2, mmd2_permuted = mmd2.detach().cpu(), mmd2_permuted.detach().cpu()
         p_val = (mmd2 <= mmd2_permuted).float().mean()
@@ -163,7 +161,7 @@ class DriftMMD(BaseDrift):
         distance_threshold = torch.sort(mmd2_permuted, descending=True).values[idx_threshold]
         return p_val.numpy().item(), mmd2.numpy().item(), distance_threshold.numpy().item()
-    @set_metadata("dataeval.detectors")
+    @set_metadata()
     @preprocess_x
     @update_x_ref
     def predict(self, x: ArrayLike) -> DriftMMDOutput:

dataeval/{_internal/detectors → detectors}/drift/torch.py RENAMED Viewed

@@ -8,8 +8,10 @@ Licensed under Apache Software License (Apache 2.0)
 from __future__ import annotations
+__all__ = []
 from functools import partial
-from typing import Callable
+from typing import Any, Callable
 import numpy as np
 import torch
@@ -42,7 +44,7 @@ def get_device(device: str | torch.device | None = None) -> torch.device:
     return torch_device
-def mmd2_from_kernel_matrix(
+def _mmd2_from_kernel_matrix(
     kernel_mat: torch.Tensor, m: int, permute: bool = False, zero_diag: bool = True
 ) -> torch.Tensor:
     """
@@ -78,13 +80,13 @@ def mmd2_from_kernel_matrix(
 def predict_batch(
-    x: NDArray | torch.Tensor,
+    x: NDArray[Any] | torch.Tensor,
     model: Callable | nn.Module | nn.Sequential,
     device: torch.device | None = None,
     batch_size: int = int(1e10),
     preprocess_fn: Callable | None = None,
     dtype: type[np.generic] | torch.dtype = np.float32,
-) -> NDArray | torch.Tensor | tuple:
+) -> NDArray[Any] | torch.Tensor | tuple[Any, ...]:
     """
     Make batch predictions on a model.
@@ -154,13 +156,13 @@ def predict_batch(
 def preprocess_drift(
-    x: NDArray,
+    x: NDArray[Any],
     model: nn.Module,
-    device: torch.device | None = None,
+    device: str | torch.device | None = None,
     preprocess_batch_fn: Callable | None = None,
     batch_size: int = int(1e10),
     dtype: type[np.generic] | torch.dtype = np.float32,
-) -> NDArray | torch.Tensor | tuple:
+) -> NDArray[Any] | torch.Tensor | tuple[Any, ...]:
     """
     Prediction function used for preprocessing step of drift detector.
@@ -189,7 +191,7 @@ def preprocess_drift(
     return predict_batch(
         x,
         model,
-        device=device,
+        device=get_device(device),
         batch_size=batch_size,
         preprocess_fn=preprocess_batch_fn,
         dtype=dtype,
@@ -197,7 +199,7 @@ def preprocess_drift(
 @torch.jit.script
-def squared_pairwise_distance(
+def _squared_pairwise_distance(
     x: torch.Tensor, y: torch.Tensor, a_min: float = 1e-30
 ) -> torch.Tensor:  # pragma: no cover - torch.jit.script code is compiled and copied
     """
@@ -249,7 +251,7 @@ def sigma_median(x: torch.Tensor, y: torch.Tensor, dist: torch.Tensor) -> torch.
     return sigma
-class GaussianRBF(nn.Module):
+class _GaussianRBF(nn.Module):
     """
     Gaussian RBF kernel: k(x,y) = exp(-(1/(2*sigma^2)||x-y||^2).
@@ -303,7 +305,7 @@ class GaussianRBF(nn.Module):
         infer_sigma: bool = False,
     ) -> torch.Tensor:
         x, y = torch.as_tensor(x), torch.as_tensor(y)
-        dist = squared_pairwise_distance(x.flatten(1), y.flatten(1))  # [Nx, Ny]
+        dist = _squared_pairwise_distance(x.flatten(1), y.flatten(1))  # [Nx, Ny]
         if infer_sigma or self.init_required:
             if self.trainable and infer_sigma:

dataeval/{_internal/detectors → detectors}/drift/uncertainty.py RENAMED Viewed

@@ -8,6 +8,8 @@ Licensed under Apache Software License (Apache 2.0)
 from __future__ import annotations
+__all__ = ["DriftUncertainty"]
 from functools import partial
 from typing import Callable, Literal
@@ -16,16 +18,16 @@ from numpy.typing import ArrayLike, NDArray
 from scipy.special import softmax
 from scipy.stats import entropy
-from .base import DriftOutput, UpdateStrategy
-from .ks import DriftKS
-from .torch import get_device, preprocess_drift
+from dataeval.detectors.drift.base import DriftOutput, UpdateStrategy
+from dataeval.detectors.drift.ks import DriftKS
+from dataeval.detectors.drift.torch import get_device, preprocess_drift
 def classifier_uncertainty(
-    x: NDArray,
+    x: NDArray[np.float64],
     model_fn: Callable,
     preds_type: Literal["probs", "logits"] = "probs",
-) -> NDArray:
+) -> NDArray[np.float64]:
     """
     Evaluate model_fn on x and transform predictions to prediction uncertainties.

dataeval/detectors/drift/updates.py ADDED Viewed

@@ -0,0 +1,61 @@
+"""
+Update strategies inform how the :term:`drift<Drift>` detector classes update the reference data when monitoring
+for drift.
+"""
+from __future__ import annotations
+__all__ = ["LastSeenUpdate", "ReservoirSamplingUpdate"]
+from typing import Any
+import numpy as np
+from numpy.typing import NDArray
+from dataeval.detectors.drift.base import UpdateStrategy
+class LastSeenUpdate(UpdateStrategy):
+    """
+    Updates reference dataset for :term:`drift<Drift>` detector using last seen method.
+    Parameters
+    ----------
+    n : int
+        Update with last n instances seen by the detector.
+    """
+    def __call__(self, x_ref: NDArray[Any], x: NDArray[Any], count: int) -> NDArray[Any]:
+        x_updated = np.concatenate([x_ref, x], axis=0)
+        return x_updated[-self.n :]
+class ReservoirSamplingUpdate(UpdateStrategy):
+    """
+    Updates reference dataset for :term:`drift<Drift>` detector using reservoir sampling method.
+    Parameters
+    ----------
+    n : int
+        Update with last n instances seen by the detector.
+    """
+    def __call__(self, x_ref: NDArray[Any], x: NDArray[Any], count: int) -> NDArray[Any]:
+        if x.shape[0] + count <= self.n:
+            return np.concatenate([x_ref, x], axis=0)
+        n_ref = x_ref.shape[0]
+        output_size = min(self.n, n_ref + x.shape[0])
+        shape = (output_size,) + x.shape[1:]
+        x_reservoir = np.zeros(shape, dtype=x_ref.dtype)
+        x_reservoir[:n_ref] = x_ref
+        for item in x:
+            count += 1
+            if n_ref < self.n:
+                x_reservoir[n_ref, :] = item
+                n_ref += 1
+            else:
+                r = np.random.randint(0, count)
+                if r < self.n:
+                    x_reservoir[r, :] = item
+        return x_reservoir

dataeval/detectors/linters/__init__.py CHANGED Viewed

@@ -2,9 +2,9 @@
 Linters help identify potential issues in training and test data and are an important aspect of data cleaning.
 """
-from dataeval._internal.detectors.clusterer import Clusterer, ClustererOutput
-from dataeval._internal.detectors.duplicates import Duplicates, DuplicatesOutput
-from dataeval._internal.detectors.outliers import Outliers, OutliersOutput
+from dataeval.detectors.linters.clusterer import Clusterer, ClustererOutput
+from dataeval.detectors.linters.duplicates import Duplicates, DuplicatesOutput
+from dataeval.detectors.linters.outliers import Outliers, OutliersOutput
 __all__ = [
     "Clusterer",

dataeval 0.72.1__py3-none-any.whl → 0.73.0__py3-none-any.whl

dataeval 0.72.1__py3-none-any.whl → 0.73.0__py3-none-any.whl