PyPI - dataeval - Versions diffs - 0.73.0__tar.gz → 0.74.0__tar.gz - Mend

dataeval 0.73.0__tar.gz → 0.74.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (84) hide show

{dataeval-0.73.0 → dataeval-0.74.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dataeval
-Version: 0.73.0
+Version: 0.74.0
 Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
 Home-page: https://dataeval.ai/
 License: MIT
@@ -23,7 +23,6 @@ Classifier: Topic :: Scientific/Engineering
 Provides-Extra: all
 Provides-Extra: tensorflow
 Provides-Extra: torch
-Requires-Dist: hdbscan (>=0.8.36)
 Requires-Dist: markupsafe (<3.0.2) ; extra == "tensorflow" or extra == "all"
 Requires-Dist: matplotlib ; extra == "torch" or extra == "all"
 Requires-Dist: numpy (>1.24.3)

{dataeval-0.73.0 → dataeval-0.74.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dataeval"
-version = "0.73.0" # dynamic
+version = "0.74.0" # dynamic
 description = "DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks"
 license = "MIT"
 readme = "README.md"
@@ -42,7 +42,6 @@ packages = [
 [tool.poetry.dependencies]
 # required
 python = ">=3.9,<3.13"
-hdbscan = {version = ">=0.8.36"}
 numpy = {version = ">1.24.3"}
 pillow = {version = ">=10.3.0"}
 scipy = {version = ">=1.10"}
@@ -69,8 +68,7 @@ all = ["matplotlib", "markupsafe", "tensorflow", "tensorflow_probability", "tf-k
 optional = true
 [tool.poetry.group.dev.dependencies]
-tox = {version = "*"}
-tox-uv = {version = "*"}
+nox = {version = "*", extras = ["uv"]}
 uv = {version = "*"}
 poetry = {version = "*"}
 poetry-lock-groups-plugin = {version = "*"}
@@ -122,7 +120,6 @@ files = ["src/dataeval/__init__.py"]
 name = "dataeval"
 [tool.poetry2conda.dependencies]
-nvidia-cudnn-cu11 = { name = "cudnn" }
 tensorflow_probability = { name = "tensorflow-probability" }
 torch = { name = "pytorch" }
 xxhash = { name = "python-xxhash" }
@@ -145,6 +142,9 @@ parallel = true
 exclude_also = [
   "raise NotImplementedError",
   "if TYPE_CHECKING:",
+  "if _IS_TENSORFLOW_AVAILABLE",
+  "if _IS_TORCH_AVAILABLE",
+  "if _IS_TORCHVISION_AVAILABLE",
 ]
 include = ["*/src/dataeval/*"]
 omit = [
@@ -164,6 +164,7 @@ exclude = [
   "*env*",
   "output",
   "_build",
+  ".nox",
   ".tox",
   "prototype",
 ]

{dataeval-0.73.0 → dataeval-0.74.0}/src/dataeval/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
-__version__ = "0.73.0"
+__version__ = "0.74.0"
 from importlib.util import find_spec
@@ -12,12 +12,12 @@ from dataeval import detectors, metrics  # noqa: E402
 __all__ = ["detectors", "metrics"]
-if _IS_TORCH_AVAILABLE:  # pragma: no cover
+if _IS_TORCH_AVAILABLE:
     from dataeval import workflows
     __all__ += ["workflows"]
-if _IS_TENSORFLOW_AVAILABLE or _IS_TORCH_AVAILABLE:  # pragma: no cover
+if _IS_TENSORFLOW_AVAILABLE or _IS_TORCH_AVAILABLE:
     from dataeval import utils
     __all__ += ["utils"]

{dataeval-0.73.0 → dataeval-0.74.0}/src/dataeval/detectors/__init__.py RENAMED Viewed

@@ -7,7 +7,7 @@ from dataeval.detectors import drift, linters
 __all__ = ["drift", "linters"]
-if _IS_TENSORFLOW_AVAILABLE:  # pragma: no cover
+if _IS_TENSORFLOW_AVAILABLE:
     from dataeval.detectors import ood
     __all__ += ["ood"]

{dataeval-0.73.0 → dataeval-0.74.0}/src/dataeval/detectors/drift/__init__.py RENAMED Viewed

@@ -10,7 +10,7 @@ from dataeval.detectors.drift.ks import DriftKS
 __all__ = ["DriftCVM", "DriftKS", "DriftOutput", "updates"]
-if _IS_TORCH_AVAILABLE:  # pragma: no cover
+if _IS_TORCH_AVAILABLE:
     from dataeval.detectors.drift.mmd import DriftMMD, DriftMMDOutput
     from dataeval.detectors.drift.torch import preprocess_drift
     from dataeval.detectors.drift.uncertainty import DriftUncertainty

{dataeval-0.73.0 → dataeval-0.74.0}/src/dataeval/detectors/drift/base.py RENAMED Viewed

@@ -18,7 +18,7 @@ from typing import Any, Callable, Literal, TypeVar
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
-from dataeval.interop import as_numpy, to_numpy
+from dataeval.interop import as_numpy
 from dataeval.output import OutputMetadata, set_metadata
 R = TypeVar("R")
@@ -196,7 +196,7 @@ class BaseDrift:
         if correction not in ["bonferroni", "fdr"]:
             raise ValueError("`correction` must be `bonferroni` or `fdr`.")
-        self._x_ref = to_numpy(x_ref)
+        self._x_ref = as_numpy(x_ref)
         self.x_ref_preprocessed: bool = x_ref_preprocessed
         # Other attributes

{dataeval-0.73.0 → dataeval-0.74.0}/src/dataeval/detectors/drift/torch.py RENAMED Viewed

@@ -10,7 +10,6 @@ from __future__ import annotations
 __all__ = []
-from functools import partial
 from typing import Any, Callable
 import numpy as np
@@ -18,30 +17,7 @@ import torch
 import torch.nn as nn
 from numpy.typing import NDArray
-def get_device(device: str | torch.device | None = None) -> torch.device:
-    """
-    Instantiates a PyTorch device object.
-    Parameters
-    ----------
-    device : str | torch.device | None, default None
-        Either ``None``, a str ('gpu' or 'cpu') indicating the device to choose, or an
-        already instantiated device object. If ``None``, the GPU is selected if it is
-        detected, otherwise the CPU is used as a fallback.
-    Returns
-    -------
-    The instantiated device object.
-    """
-    if isinstance(device, torch.device):  # Already a torch device
-        return device
-    else:  # Instantiate device
-        if device is None or device.lower() in ["gpu", "cuda"]:
-            torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        else:
-            torch_device = torch.device("cpu")
-    return torch_device
+from dataeval.utils.torch.utils import get_device, predict_batch
 def _mmd2_from_kernel_matrix(
@@ -79,82 +55,6 @@ def _mmd2_from_kernel_matrix(
     return mmd2
-def predict_batch(
-    x: NDArray[Any] | torch.Tensor,
-    model: Callable | nn.Module | nn.Sequential,
-    device: torch.device | None = None,
-    batch_size: int = int(1e10),
-    preprocess_fn: Callable | None = None,
-    dtype: type[np.generic] | torch.dtype = np.float32,
-) -> NDArray[Any] | torch.Tensor | tuple[Any, ...]:
-    """
-    Make batch predictions on a model.
-    Parameters
-    ----------
-    x : np.ndarray | torch.Tensor
-        Batch of instances.
-    model : Callable | nn.Module | nn.Sequential
-        PyTorch model.
-    device : torch.device | None, default None
-        Device type used. The default None tries to use the GPU and falls back on CPU.
-        Can be specified by passing either torch.device('cuda') or torch.device('cpu').
-    batch_size : int, default 1e10
-        Batch size used during prediction.
-    preprocess_fn : Callable | None, default None
-        Optional preprocessing function for each batch.
-    dtype : np.dtype | torch.dtype, default np.float32
-        Model output type, either a :term:`NumPy` or torch dtype, e.g. np.float32 or torch.float32.
-    Returns
-    -------
-    NDArray | torch.Tensor | tuple
-        Numpy array, torch tensor or tuples of those with model outputs.
-    """
-    device = get_device(device)
-    if isinstance(x, np.ndarray):
-        x = torch.from_numpy(x)
-    n = len(x)
-    n_minibatch = int(np.ceil(n / batch_size))
-    return_np = not isinstance(dtype, torch.dtype)
-    preds = []
-    with torch.no_grad():
-        for i in range(n_minibatch):
-            istart, istop = i * batch_size, min((i + 1) * batch_size, n)
-            x_batch = x[istart:istop]
-            if isinstance(preprocess_fn, Callable):
-                x_batch = preprocess_fn(x_batch)
-            preds_tmp = model(x_batch.to(device))
-            if isinstance(preds_tmp, (list, tuple)):
-                if len(preds) == 0:  # init tuple with lists to store predictions
-                    preds = tuple([] for _ in range(len(preds_tmp)))
-                for j, p in enumerate(preds_tmp):
-                    if isinstance(p, torch.Tensor):
-                        p = p.cpu()
-                    preds[j].append(p if not return_np or isinstance(p, np.ndarray) else p.numpy())
-            elif isinstance(preds_tmp, (np.ndarray, torch.Tensor)):
-                if isinstance(preds_tmp, torch.Tensor):
-                    preds_tmp = preds_tmp.cpu()
-                if isinstance(preds, tuple):
-                    preds = list(preds)
-                preds.append(
-                    preds_tmp
-                    if not return_np or isinstance(preds_tmp, np.ndarray)  # type: ignore
-                    else preds_tmp.numpy()
-                )
-            else:
-                raise TypeError(
-                    f"Model output type {type(preds_tmp)} not supported. The model \
-                    output type needs to be one of list, tuple, NDArray or \
-                    torch.Tensor."
-                )
-    concat = partial(np.concatenate, axis=0) if return_np else partial(torch.cat, dim=0)
-    out: tuple | np.ndarray | torch.Tensor = (
-        tuple(concat(p) for p in preds) if isinstance(preds, tuple) else concat(preds)  # type: ignore
-    )
-    return out
 def preprocess_drift(
     x: NDArray[Any],
     model: nn.Module,

{dataeval-0.73.0 → dataeval-0.74.0}/src/dataeval/detectors/linters/clusterer.py RENAMED Viewed

@@ -480,7 +480,7 @@ class Clusterer:
             samples = self.clusters[level][cluster_id].samples
             if len(samples) >= self._min_num_samples_per_cluster:
                 duplicates_std.append(self.clusters[level][cluster_id].dist_std)
-        diag_mask = np.ones_like(self._sqdmat, dtype=bool)
+        diag_mask = np.ones_like(self._sqdmat, dtype=np.bool_)
         np.fill_diagonal(diag_mask, 0)
         diag_mask = np.triu(diag_mask)

dataeval-0.74.0/src/dataeval/detectors/ood/__init__.py ADDED Viewed

@@ -0,0 +1,22 @@
+"""
+Out-of-distribution (OOD)` detectors identify data that is different from the data used to train a particular model.
+"""
+from dataeval import _IS_TENSORFLOW_AVAILABLE, _IS_TORCH_AVAILABLE
+from dataeval.detectors.ood.base import OODOutput, OODScoreOutput
+__all__ = ["OODOutput", "OODScoreOutput"]
+if _IS_TENSORFLOW_AVAILABLE:
+    from dataeval.detectors.ood.ae import OOD_AE
+    from dataeval.detectors.ood.aegmm import OOD_AEGMM
+    from dataeval.detectors.ood.llr import OOD_LLR
+    from dataeval.detectors.ood.vae import OOD_VAE
+    from dataeval.detectors.ood.vaegmm import OOD_VAEGMM
+    __all__ += ["OOD_AE", "OOD_AEGMM", "OOD_LLR", "OOD_VAE", "OOD_VAEGMM"]
+elif _IS_TORCH_AVAILABLE:
+    from dataeval.detectors.ood.ae_torch import OOD_AE
+    __all__ += ["OOD_AE", "OODOutput"]

{dataeval-0.73.0 → dataeval-0.74.0}/src/dataeval/detectors/ood/ae.py RENAMED Viewed

@@ -15,7 +15,8 @@ from typing import TYPE_CHECKING, Callable
 import numpy as np
 from numpy.typing import ArrayLike
-from dataeval.detectors.ood.base import OODBase, OODScoreOutput
+from dataeval.detectors.ood.base import OODScoreOutput
+from dataeval.detectors.ood.base_tf import OODBase
 from dataeval.interop import as_numpy
 from dataeval.utils.lazy import lazyload
 from dataeval.utils.tensorflow._internal.utils import predict_batch

dataeval-0.74.0/src/dataeval/detectors/ood/ae_torch.py ADDED Viewed

@@ -0,0 +1,70 @@
+"""
+Adapted for Pytorch from
+Source code derived from Alibi-Detect 0.11.4
+https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
+Original code Copyright (c) 2023 Seldon Technologies Ltd
+Licensed under Apache Software License (Apache 2.0)
+"""
+from __future__ import annotations
+from typing import Callable
+import numpy as np
+import torch
+from numpy.typing import ArrayLike
+from dataeval.detectors.ood.base import OODScoreOutput
+from dataeval.detectors.ood.base_torch import OODBase
+from dataeval.interop import as_numpy
+from dataeval.utils.torch.utils import predict_batch
+class OOD_AE(OODBase):
+    """
+    Autoencoder based out-of-distribution detector.
+    Parameters
+    ----------
+    model : AriaAutoencoder
+        An Autoencoder model.
+    """
+    def __init__(self, model: torch.nn.Module, device: str | torch.device | None = None) -> None:
+        super().__init__(model, device)
+    def fit(
+        self,
+        x_ref: ArrayLike,
+        threshold_perc: float,
+        loss_fn: Callable[..., torch.nn.Module] | None = None,
+        optimizer: torch.optim.Optimizer | None = None,
+        epochs: int = 20,
+        batch_size: int = 64,
+        verbose: bool = False,
+    ) -> None:
+        if loss_fn is None:
+            loss_fn = torch.nn.MSELoss()
+        if optimizer is None:
+            optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)
+        super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
+    def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
+        self._validate(X := as_numpy(X))
+        # reconstruct instances
+        X_recon = predict_batch(X, self.model, batch_size=batch_size)
+        # compute feature and instance level scores
+        fscore = np.power(X - X_recon, 2)
+        fscore_flat = fscore.reshape(fscore.shape[0], -1).copy()
+        n_score_features = int(np.ceil(fscore_flat.shape[1]))
+        sorted_fscore = np.sort(fscore_flat, axis=1)
+        sorted_fscore_perc = sorted_fscore[:, -n_score_features:]
+        iscore = np.mean(sorted_fscore_perc, axis=1)
+        return OODScoreOutput(iscore, fscore)

{dataeval-0.73.0 → dataeval-0.74.0}/src/dataeval/detectors/ood/aegmm.py RENAMED Viewed

@@ -14,7 +14,8 @@ from typing import TYPE_CHECKING, Callable
 from numpy.typing import ArrayLike
-from dataeval.detectors.ood.base import OODGMMBase, OODScoreOutput
+from dataeval.detectors.ood.base import OODScoreOutput
+from dataeval.detectors.ood.base_tf import OODBaseGMM
 from dataeval.interop import to_numpy
 from dataeval.utils.lazy import lazyload
 from dataeval.utils.tensorflow._internal.gmm import gmm_energy
@@ -32,7 +33,7 @@ else:
     tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
-class OOD_AEGMM(OODGMMBase):
+class OOD_AEGMM(OODBaseGMM):
     """
     AE with Gaussian Mixture Model based outlier detector.
@@ -62,5 +63,5 @@ class OOD_AEGMM(OODGMMBase):
     def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
         self._validate(X := to_numpy(X))
         _, z, _ = predict_batch(X, self.model, batch_size=batch_size)
-        energy, _ = gmm_energy(z, self.gmm_params, return_mean=False)
+        energy, _ = gmm_energy(z, self._gmm_params, return_mean=False)
         return OODScoreOutput(energy.numpy())  # type: ignore

{dataeval-0.73.0 → dataeval-0.74.0}/src/dataeval/detectors/ood/base.py RENAMED Viewed

@@ -12,23 +12,14 @@ __all__ = ["OODOutput", "OODScoreOutput"]
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Callable, Literal, cast
+from typing import Callable, Generic, Literal, TypeVar
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 from dataeval.interop import to_numpy
 from dataeval.output import OutputMetadata, set_metadata
-from dataeval.utils.lazy import lazyload
-from dataeval.utils.tensorflow._internal.gmm import GaussianMixtureModelParams, gmm_params
-from dataeval.utils.tensorflow._internal.trainer import trainer
-if TYPE_CHECKING:
-    import tensorflow as tf
-    import tf_keras as keras
-else:
-    tf = lazyload("tensorflow")
-    keras = lazyload("tf_keras")
+from dataeval.utils.gmm import GaussianMixtureModelParams
 @dataclass(frozen=True)
@@ -85,16 +76,62 @@ class OODScoreOutput(OutputMetadata):
         return self.instance_score if ood_type == "instance" or self.feature_score is None else self.feature_score
-class OODBase(ABC):
-    def __init__(self, model: keras.Model) -> None:
-        self.model = model
+TGMMData = TypeVar("TGMMData")
+class OODGMMMixin(Generic[TGMMData]):
+    _gmm_params: GaussianMixtureModelParams[TGMMData]
+TModel = TypeVar("TModel", bound=Callable)
+TLossFn = TypeVar("TLossFn", bound=Callable)
+TOptimizer = TypeVar("TOptimizer")
+class OODFitMixin(Generic[TLossFn, TOptimizer], ABC):
+    @abstractmethod
+    def fit(
+        self,
+        x_ref: ArrayLike,
+        threshold_perc: float,
+        loss_fn: TLossFn | None,
+        optimizer: TOptimizer | None,
+        epochs: int,
+        batch_size: int,
+        verbose: bool,
+    ) -> None:
+        """
+        Train the model and infer the threshold value.
+        Parameters
+        ----------
+        x_ref : ArrayLike
+            Training data.
+        threshold_perc : float, default 100.0
+            Percentage of reference data that is normal.
+        loss_fn : TLossFn
+            Loss function used for training.
+        optimizer : TOptimizer
+            Optimizer used for training.
+        epochs : int, default 20
+            Number of training epochs.
+        batch_size : int, default 64
+            Batch size used for training.
+        verbose : bool, default True
+            Whether to print training progress.
+        """
-        self._ref_score: OODScoreOutput
-        self._threshold_perc: float
-        self._data_info: tuple[tuple, type] | None = None
-        if not isinstance(model, keras.Model):
-            raise TypeError("Model should be of type 'keras.Model'.")
+class OODBaseMixin(Generic[TModel], ABC):
+    _ref_score: OODScoreOutput
+    _threshold_perc: float
+    _data_info: tuple[tuple, type] | None = None
+    def __init__(
+        self,
+        model: TModel,
+    ) -> None:
+        self.model = model
     def _get_data_info(self, X: NDArray) -> tuple[tuple, type]:
         if not isinstance(X, np.ndarray):
@@ -107,9 +144,8 @@ class OODBase(ABC):
             raise RuntimeError(f"Expect data of type: {self._data_info[1]} and shape: {self._data_info[0]}. \
                                Provided data is type: {check_data_info[1]} and shape: {check_data_info[0]}.")
-    def _validate_state(self, X: NDArray, additional_attrs: list[str] | None = None) -> None:
-        attrs = ["_data_info", "_threshold_perc", "_ref_score"]
-        attrs = attrs if additional_attrs is None else attrs + additional_attrs
+    def _validate_state(self, X: NDArray) -> None:
+        attrs = [k for c in self.__class__.mro()[:-1][::-1] if hasattr(c, "__annotations__") for k in c.__annotations__]
         if not all(hasattr(self, attr) for attr in attrs) or any(getattr(self, attr) for attr in attrs) is None:
             raise RuntimeError("Metric needs to be `fit` before method call.")
         self._validate(X)
@@ -140,52 +176,6 @@ class OODBase(ABC):
     def _threshold_score(self, ood_type: Literal["feature", "instance"] = "instance") -> np.floating:
         return np.percentile(self._ref_score.get(ood_type), self._threshold_perc)
-    def fit(
-        self,
-        x_ref: ArrayLike,
-        threshold_perc: float,
-        loss_fn: Callable[..., tf.Tensor],
-        optimizer: keras.optimizers.Optimizer,
-        epochs: int,
-        batch_size: int,
-        verbose: bool,
-    ) -> None:
-        """
-        Train the model and infer the threshold value.
-        Parameters
-        ----------
-        x_ref : ArrayLike
-            Training data.
-        threshold_perc : float, default 100.0
-            Percentage of reference data that is normal.
-        loss_fn : Callable | None, default None
-            Loss function used for training.
-        optimizer : Optimizer, default keras.optimizers.Adam
-            Optimizer used for training.
-        epochs : int, default 20
-            Number of training epochs.
-        batch_size : int, default 64
-            Batch size used for training.
-        verbose : bool, default True
-            Whether to print training progress.
-        """
-        # Train the model
-        trainer(
-            model=self.model,
-            loss_fn=loss_fn,
-            x_train=to_numpy(x_ref),
-            optimizer=optimizer,
-            epochs=epochs,
-            batch_size=batch_size,
-            verbose=verbose,
-        )
-        # Infer the threshold values
-        self._ref_score = self.score(x_ref, batch_size)
-        self._threshold_perc = threshold_perc
     @set_metadata()
     def predict(
         self,
@@ -215,43 +205,3 @@ class OODBase(ABC):
         score = self.score(X, batch_size=batch_size)
         ood_pred = score.get(ood_type) > self._threshold_score(ood_type)
         return OODOutput(is_ood=ood_pred, **score.dict())
-class OODGMMBase(OODBase):
-    def __init__(self, model: keras.Model) -> None:
-        super().__init__(model)
-        self.gmm_params: GaussianMixtureModelParams
-    def _validate_state(self, X: NDArray, additional_attrs: list[str] | None = None) -> None:
-        if additional_attrs is None:
-            additional_attrs = ["gmm_params"]
-        super()._validate_state(X, additional_attrs)
-    def fit(
-        self,
-        x_ref: ArrayLike,
-        threshold_perc: float,
-        loss_fn: Callable[..., tf.Tensor],
-        optimizer: keras.optimizers.Optimizer,
-        epochs: int,
-        batch_size: int,
-        verbose: bool,
-    ) -> None:
-        # Train the model
-        trainer(
-            model=self.model,
-            loss_fn=loss_fn,
-            x_train=to_numpy(x_ref),
-            optimizer=optimizer,
-            epochs=epochs,
-            batch_size=batch_size,
-            verbose=verbose,
-        )
-        # Calculate the GMM parameters
-        _, z, gamma = cast(tuple[tf.Tensor, tf.Tensor, tf.Tensor], self.model(x_ref))
-        self.gmm_params = gmm_params(z, gamma)
-        # Infer the threshold values
-        self._ref_score = self.score(x_ref, batch_size)
-        self._threshold_perc = threshold_perc

dataeval 0.73.0__tar.gz → 0.74.0__tar.gz

dataeval 0.73.0__tar.gz → 0.74.0__tar.gz