dataeval 0.73.1__tar.gz → 0.74.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataeval-0.73.1 → dataeval-0.74.1}/PKG-INFO +3 -9
- {dataeval-0.73.1 → dataeval-0.74.1}/pyproject.toml +6 -15
- dataeval-0.74.1/src/dataeval/__init__.py +17 -0
- dataeval-0.74.1/src/dataeval/detectors/__init__.py +7 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/drift/base.py +3 -3
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/drift/mmd.py +1 -1
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/drift/torch.py +1 -101
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/linters/clusterer.py +3 -3
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/linters/duplicates.py +4 -4
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/linters/outliers.py +4 -4
- dataeval-0.74.1/src/dataeval/detectors/ood/__init__.py +15 -0
- dataeval-0.73.1/src/dataeval/detectors/ood/ae.py → dataeval-0.74.1/src/dataeval/detectors/ood/ae_torch.py +22 -27
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/ood/base.py +63 -113
- dataeval-0.74.1/src/dataeval/detectors/ood/base_torch.py +109 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/ood/metadata_ks_compare.py +52 -14
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/interop.py +1 -1
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/metrics/bias/__init__.py +3 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/metrics/bias/balance.py +73 -70
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/metrics/bias/coverage.py +4 -4
- dataeval-0.74.1/src/dataeval/metrics/bias/diversity.py +238 -0
- dataeval-0.74.1/src/dataeval/metrics/bias/metadata_preprocessing.py +285 -0
- dataeval-0.74.1/src/dataeval/metrics/bias/metadata_utils.py +229 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/metrics/bias/parity.py +51 -161
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/metrics/estimators/ber.py +3 -3
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/metrics/estimators/divergence.py +3 -3
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/metrics/estimators/uap.py +3 -3
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/metrics/stats/base.py +2 -2
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/metrics/stats/boxratiostats.py +1 -1
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/metrics/stats/datasetstats.py +6 -6
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/metrics/stats/dimensionstats.py +1 -1
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/metrics/stats/hashstats.py +1 -1
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/metrics/stats/labelstats.py +3 -3
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/metrics/stats/pixelstats.py +1 -1
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/metrics/stats/visualstats.py +1 -1
- dataeval-0.74.1/src/dataeval/output.py +114 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/utils/__init__.py +1 -7
- dataeval-0.74.1/src/dataeval/utils/gmm.py +26 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/utils/metadata.py +29 -9
- dataeval-0.74.1/src/dataeval/utils/torch/gmm.py +98 -0
- dataeval-0.74.1/src/dataeval/utils/torch/models.py +330 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/utils/torch/trainer.py +84 -5
- dataeval-0.74.1/src/dataeval/utils/torch/utils.py +169 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/workflows/sufficiency.py +4 -4
- dataeval-0.73.1/src/dataeval/__init__.py +0 -23
- dataeval-0.73.1/src/dataeval/detectors/__init__.py +0 -15
- dataeval-0.73.1/src/dataeval/detectors/ood/__init__.py +0 -15
- dataeval-0.73.1/src/dataeval/detectors/ood/aegmm.py +0 -66
- dataeval-0.73.1/src/dataeval/detectors/ood/llr.py +0 -302
- dataeval-0.73.1/src/dataeval/detectors/ood/vae.py +0 -97
- dataeval-0.73.1/src/dataeval/detectors/ood/vaegmm.py +0 -75
- dataeval-0.73.1/src/dataeval/metrics/bias/diversity.py +0 -307
- dataeval-0.73.1/src/dataeval/metrics/bias/metadata.py +0 -440
- dataeval-0.73.1/src/dataeval/output.py +0 -90
- dataeval-0.73.1/src/dataeval/utils/lazy.py +0 -26
- dataeval-0.73.1/src/dataeval/utils/tensorflow/__init__.py +0 -19
- dataeval-0.73.1/src/dataeval/utils/tensorflow/_internal/gmm.py +0 -123
- dataeval-0.73.1/src/dataeval/utils/tensorflow/_internal/loss.py +0 -121
- dataeval-0.73.1/src/dataeval/utils/tensorflow/_internal/models.py +0 -1394
- dataeval-0.73.1/src/dataeval/utils/tensorflow/_internal/trainer.py +0 -114
- dataeval-0.73.1/src/dataeval/utils/tensorflow/_internal/utils.py +0 -256
- dataeval-0.73.1/src/dataeval/utils/tensorflow/loss/__init__.py +0 -11
- dataeval-0.73.1/src/dataeval/utils/torch/models.py +0 -138
- dataeval-0.73.1/src/dataeval/utils/torch/utils.py +0 -63
- {dataeval-0.73.1 → dataeval-0.74.1}/LICENSE.txt +0 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/README.md +0 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/drift/__init__.py +0 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/drift/cvm.py +0 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/drift/ks.py +0 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/drift/uncertainty.py +0 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/drift/updates.py +0 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/linters/__init__.py +0 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/linters/merged_stats.py +0 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/ood/metadata_least_likely.py +0 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/ood/metadata_ood_mi.py +0 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/metrics/__init__.py +0 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/metrics/estimators/__init__.py +0 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/metrics/stats/__init__.py +0 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/py.typed +0 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/utils/image.py +0 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/utils/shared.py +0 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/utils/split_dataset.py +0 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/utils/torch/__init__.py +0 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/utils/torch/blocks.py +0 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/utils/torch/datasets.py +0 -0
- {dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/workflows/__init__.py +0 -0

{dataeval-0.73.1 → dataeval-0.74.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dataeval
-Version: 0.73.1
+Version: 0.74.1
 Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
 Home-page: https://dataeval.ai/
 License: MIT
@@ -21,18 +21,12 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Classifier: Topic :: Scientific/Engineering
 Provides-Extra: all
-Provides-Extra: tensorflow
 Provides-Extra: torch
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: matplotlib ; extra == "torch" or extra == "all"
-Requires-Dist: numpy (>1.24.3)
+Requires-Dist: matplotlib ; extra == "all"
+Requires-Dist: numpy (>=1.24.3)
 Requires-Dist: pillow (>=10.3.0)
 Requires-Dist: scikit-learn (>=1.5.0)
 Requires-Dist: scipy (>=1.10)
-Requires-Dist: tensorflow (>=2.16,<2.18) ; extra == "tensorflow" or extra == "all"
-Requires-Dist: tensorflow_probability (>=0.24,<0.25) ; extra == "tensorflow" or extra == "all"
-Requires-Dist: tf-keras (>=2.16,<2.18) ; extra == "tensorflow" or extra == "all"
 Requires-Dist: torch (>=2.2.0) ; extra == "torch" or extra == "all"
 Requires-Dist: torchvision (>=0.17.0) ; extra == "torch" or extra == "all"
 Requires-Dist: tqdm

{dataeval-0.73.1 → dataeval-0.74.1}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dataeval"
-version = "0.73.1"
+version = "0.74.1" # dynamic
 description = "DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks"
 license = "MIT"
 readme = "README.md"
@@ -42,8 +42,7 @@ packages = [
 [tool.poetry.dependencies]
 # required
 python = ">=3.9,<3.13"
-
-numpy = {version = ">1.24.3"}
+numpy = {version = ">=1.24.3"}
 pillow = {version = ">=10.3.0"}
 scipy = {version = ">=1.10"}
 scikit-learn = {version = ">=1.5.0"}
@@ -53,17 +52,12 @@ xxhash = {version = ">=3.3"}

 # optional
 matplotlib = {version = "*", optional = true}
-markupsafe = {version = "<3.0.2", optional = true}
-tensorflow = {version = ">=2.16,<2.18", optional = true}
-tensorflow_probability = {version = ">=0.24,<0.25", optional = true}
-tf-keras = {version = ">=2.16,<2.18", optional = true}
 torch = {version = ">=2.2.0", source = "pytorch", optional = true}
 torchvision = {version = ">=0.17.0", source = "pytorch", optional = true}

 [tool.poetry.extras]
-
-
-all = ["matplotlib", "markupsafe", "tensorflow", "tensorflow_probability", "tf-keras", "torch", "torchvision"]
+torch = ["torch", "torchvision"]
+all = ["matplotlib", "torch", "torchvision"]

 [tool.poetry.group.dev]
 optional = true
@@ -89,6 +83,7 @@ pyright = {version = "*", extras = ["nodejs"]}
 maite = {version = "*"}
 pandas = {version = "*"}
 seaborn = {version = "*"}
+numpy = {version = ">=2.0.2"}
 # docs
 certifi = {version = ">=2024.07.04"}
 enum_tools = {version = ">=0.12.0", extras = ["sphinx"]}
@@ -105,7 +100,7 @@ markupsafe = {version = "<3.0.2", optional = true}

 [[tool.poetry.source]]
 name = "pytorch"
-url = "https://download.pytorch.org/whl/
+url = "https://download.pytorch.org/whl/cu118"
 priority = "explicit"

 [tool.poetry-dynamic-versioning]
@@ -121,7 +116,6 @@ files = ["src/dataeval/__init__.py"]
 name = "dataeval"

 [tool.poetry2conda.dependencies]
-tensorflow_probability = { name = "tensorflow-probability" }
 torch = { name = "pytorch" }
 xxhash = { name = "python-xxhash" }

@@ -142,8 +136,6 @@ parallel = true
 [tool.coverage.report]
 exclude_also = [
     "raise NotImplementedError",
-    "if TYPE_CHECKING:",
-    "if _IS_TENSORFLOW_AVAILABLE",
     "if _IS_TORCH_AVAILABLE",
     "if _IS_TORCHVISION_AVAILABLE",
 ]
@@ -151,7 +143,6 @@ include = ["*/src/dataeval/*"]
 omit = [
     "*/torch/blocks.py",
     "*/torch/utils.py",
-    "*/tensorflow/_internal/models.py",
 ]
 fail_under = 90


dataeval-0.74.1/src/dataeval/__init__.py
@@ -0,0 +1,17 @@
+__version__ = "0.74.1"
+
+from importlib.util import find_spec
+
+_IS_TORCH_AVAILABLE = find_spec("torch") is not None
+_IS_TORCHVISION_AVAILABLE = find_spec("torchvision") is not None
+
+del find_spec
+
+from dataeval import detectors, metrics  # noqa: E402
+
+__all__ = ["detectors", "metrics"]
+
+if _IS_TORCH_AVAILABLE:
+    from dataeval import utils, workflows
+
+    __all__ += ["utils", "workflows"]

{dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/drift/base.py
@@ -19,7 +19,7 @@ import numpy as np
 from numpy.typing import ArrayLike, NDArray

 from dataeval.interop import as_numpy
-from dataeval.output import
+from dataeval.output import Output, set_metadata

 R = TypeVar("R")

@@ -43,7 +43,7 @@ class UpdateStrategy(ABC):


 @dataclass(frozen=True)
-class DriftBaseOutput(
+class DriftBaseOutput(Output):
     """
     Base output class for Drift detector classes

@@ -387,7 +387,7 @@ class BaseDriftUnivariate(BaseDrift):
         else:
             raise ValueError("`correction` needs to be either `bonferroni` or `fdr`.")

-    @set_metadata
+    @set_metadata
     @preprocess_x
     @update_x_ref
     def predict(

{dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/drift/mmd.py
@@ -161,7 +161,7 @@ class DriftMMD(BaseDrift):
         distance_threshold = torch.sort(mmd2_permuted, descending=True).values[idx_threshold]
         return p_val.numpy().item(), mmd2.numpy().item(), distance_threshold.numpy().item()

-    @set_metadata
+    @set_metadata
     @preprocess_x
     @update_x_ref
     def predict(self, x: ArrayLike) -> DriftMMDOutput:

{dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/drift/torch.py
@@ -10,7 +10,6 @@ from __future__ import annotations

 __all__ = []

-from functools import partial
 from typing import Any, Callable

 import numpy as np
@@ -18,30 +17,7 @@ import torch
 import torch.nn as nn
 from numpy.typing import NDArray

-
-def get_device(device: str | torch.device | None = None) -> torch.device:
-    """
-    Instantiates a PyTorch device object.
-
-    Parameters
-    ----------
-    device : str | torch.device | None, default None
-        Either ``None``, a str ('gpu' or 'cpu') indicating the device to choose, or an
-        already instantiated device object. If ``None``, the GPU is selected if it is
-        detected, otherwise the CPU is used as a fallback.
-
-    Returns
-    -------
-    The instantiated device object.
-    """
-    if isinstance(device, torch.device):  # Already a torch device
-        return device
-    else:  # Instantiate device
-        if device is None or device.lower() in ["gpu", "cuda"]:
-            torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        else:
-            torch_device = torch.device("cpu")
-        return torch_device
+from dataeval.utils.torch.utils import get_device, predict_batch


 def _mmd2_from_kernel_matrix(
@@ -79,82 +55,6 @@ def _mmd2_from_kernel_matrix(
     return mmd2


-def predict_batch(
-    x: NDArray[Any] | torch.Tensor,
-    model: Callable | nn.Module | nn.Sequential,
-    device: torch.device | None = None,
-    batch_size: int = int(1e10),
-    preprocess_fn: Callable | None = None,
-    dtype: type[np.generic] | torch.dtype = np.float32,
-) -> NDArray[Any] | torch.Tensor | tuple[Any, ...]:
-    """
-    Make batch predictions on a model.
-
-    Parameters
-    ----------
-    x : np.ndarray | torch.Tensor
-        Batch of instances.
-    model : Callable | nn.Module | nn.Sequential
-        PyTorch model.
-    device : torch.device | None, default None
-        Device type used. The default None tries to use the GPU and falls back on CPU.
-        Can be specified by passing either torch.device('cuda') or torch.device('cpu').
-    batch_size : int, default 1e10
-        Batch size used during prediction.
-    preprocess_fn : Callable | None, default None
-        Optional preprocessing function for each batch.
-    dtype : np.dtype | torch.dtype, default np.float32
-        Model output type, either a :term:`NumPy` or torch dtype, e.g. np.float32 or torch.float32.
-
-    Returns
-    -------
-    NDArray | torch.Tensor | tuple
-        Numpy array, torch tensor or tuples of those with model outputs.
-    """
-    device = get_device(device)
-    if isinstance(x, np.ndarray):
-        x = torch.from_numpy(x)
-    n = len(x)
-    n_minibatch = int(np.ceil(n / batch_size))
-    return_np = not isinstance(dtype, torch.dtype)
-    preds = []
-    with torch.no_grad():
-        for i in range(n_minibatch):
-            istart, istop = i * batch_size, min((i + 1) * batch_size, n)
-            x_batch = x[istart:istop]
-            if isinstance(preprocess_fn, Callable):
-                x_batch = preprocess_fn(x_batch)
-            preds_tmp = model(x_batch.to(device))
-            if isinstance(preds_tmp, (list, tuple)):
-                if len(preds) == 0:  # init tuple with lists to store predictions
-                    preds = tuple([] for _ in range(len(preds_tmp)))
-                for j, p in enumerate(preds_tmp):
-                    if isinstance(p, torch.Tensor):
-                        p = p.cpu()
-                    preds[j].append(p if not return_np or isinstance(p, np.ndarray) else p.numpy())
-            elif isinstance(preds_tmp, (np.ndarray, torch.Tensor)):
-                if isinstance(preds_tmp, torch.Tensor):
-                    preds_tmp = preds_tmp.cpu()
-                if isinstance(preds, tuple):
-                    preds = list(preds)
-                preds.append(
-                    preds_tmp
-                    if not return_np or isinstance(preds_tmp, np.ndarray)  # type: ignore
-                    else preds_tmp.numpy()
-                )
-            else:
-                raise TypeError(
-                    f"Model output type {type(preds_tmp)} not supported. The model \
-                    output type needs to be one of list, tuple, NDArray or \
-                    torch.Tensor."
-                )
-    concat = partial(np.concatenate, axis=0) if return_np else partial(torch.cat, dim=0)
-    out: tuple | np.ndarray | torch.Tensor = (
-        tuple(concat(p) for p in preds) if isinstance(preds, tuple) else concat(preds)  # type: ignore
-    )
-    return out
-
-
 def preprocess_drift(
     x: NDArray[Any],
     model: nn.Module,

{dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/linters/clusterer.py
@@ -11,12 +11,12 @@ from scipy.cluster.hierarchy import linkage
 from scipy.spatial.distance import pdist, squareform

 from dataeval.interop import to_numpy
-from dataeval.output import
+from dataeval.output import Output, set_metadata
 from dataeval.utils.shared import flatten


 @dataclass(frozen=True)
-class ClustererOutput(
+class ClustererOutput(Output):
     """
     Output class for :class:`Clusterer` lint detector

@@ -495,7 +495,7 @@ class Clusterer:
         return exact_dupes, near_dupes

     # TODO: Move data input to evaluate from class
-    @set_metadata(["data"])
+    @set_metadata(state=["data"])
     def evaluate(self) -> ClustererOutput:
         """Finds and flags indices of the data for Outliers and :term:`duplicates<Duplicates>`


{dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/linters/duplicates.py
@@ -9,7 +9,7 @@ from numpy.typing import ArrayLike

 from dataeval.detectors.linters.merged_stats import combine_stats, get_dataset_step_from_idx
 from dataeval.metrics.stats.hashstats import HashStatsOutput, hashstats
-from dataeval.output import
+from dataeval.output import Output, set_metadata

 DuplicateGroup = list[int]
 DatasetDuplicateGroupMap = dict[int, DuplicateGroup]
@@ -17,7 +17,7 @@ TIndexCollection = TypeVar("TIndexCollection", DuplicateGroup, DatasetDuplicateG


 @dataclass(frozen=True)
-class DuplicatesOutput(Generic[TIndexCollection],
+class DuplicatesOutput(Generic[TIndexCollection], Output):
     """
     Output class for :class:`Duplicates` lint detector

@@ -89,7 +89,7 @@ class Duplicates:
     @overload
     def from_stats(self, hashes: Sequence[HashStatsOutput]) -> DuplicatesOutput[DatasetDuplicateGroupMap]: ...

-    @set_metadata(["only_exact"])
+    @set_metadata(state=["only_exact"])
     def from_stats(
         self, hashes: HashStatsOutput | Sequence[HashStatsOutput]
     ) -> DuplicatesOutput[DuplicateGroup] | DuplicatesOutput[DatasetDuplicateGroupMap]:
@@ -138,7 +138,7 @@ class Duplicates:

         return DuplicatesOutput(**duplicates)

-    @set_metadata(["only_exact"])
+    @set_metadata(state=["only_exact"])
     def evaluate(self, data: Iterable[ArrayLike]) -> DuplicatesOutput[DuplicateGroup]:
         """
         Returns duplicate image indices for both exact matches and near matches

{dataeval-0.73.1 → dataeval-0.74.1}/src/dataeval/detectors/linters/outliers.py
@@ -14,7 +14,7 @@ from dataeval.metrics.stats.datasetstats import DatasetStatsOutput, datasetstats
 from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput
 from dataeval.metrics.stats.pixelstats import PixelStatsOutput
 from dataeval.metrics.stats.visualstats import VisualStatsOutput
-from dataeval.output import
+from dataeval.output import Output, set_metadata

 IndexIssueMap = dict[int, dict[str, float]]
 OutlierStatsOutput = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
@@ -22,7 +22,7 @@ TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, list[IndexIssueMap])


 @dataclass(frozen=True)
-class OutliersOutput(Generic[TIndexIssueMap],
+class OutliersOutput(Generic[TIndexIssueMap], Output):
     """
     Output class for :class:`Outliers` lint detector

@@ -159,7 +159,7 @@ class Outliers:
     @overload
     def from_stats(self, stats: Sequence[OutlierStatsOutput]) -> OutliersOutput[list[IndexIssueMap]]: ...

-    @set_metadata(["outlier_method", "outlier_threshold"])
+    @set_metadata(state=["outlier_method", "outlier_threshold"])
     def from_stats(
         self, stats: OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
     ) -> OutliersOutput[IndexIssueMap] | OutliersOutput[list[IndexIssueMap]]:
@@ -228,7 +228,7 @@ class Outliers:

         return OutliersOutput(output_list)

-    @set_metadata(["use_dimension", "use_pixel", "use_visual", "outlier_method", "outlier_threshold"])
+    @set_metadata(state=["use_dimension", "use_pixel", "use_visual", "outlier_method", "outlier_threshold"])
     def evaluate(self, data: Iterable[ArrayLike]) -> OutliersOutput[IndexIssueMap]:
         """
         Returns indices of Outliers with the issues identified for each
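
Across the three linter modules the edits are the same pair: the output dataclasses now inherit from the new Output base class, and set_metadata is called with an explicit state= keyword. The evaluate signatures shown above are unchanged; a hedged sketch based only on those signatures (default constructors assumed, not confirmed by this diff):

    import numpy as np
    from dataeval.detectors.linters import Duplicates, Outliers

    images = [np.random.rand(3, 64, 64) for _ in range(16)]  # any Iterable[ArrayLike]

    dupes = Duplicates().evaluate(images)   # -> DuplicatesOutput[DuplicateGroup]
    issues = Outliers().evaluate(images)    # -> OutliersOutput[IndexIssueMap]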

dataeval-0.74.1/src/dataeval/detectors/ood/__init__.py
@@ -0,0 +1,15 @@
+"""
+Out-of-distribution (OOD)` detectors identify data that is different from the data used to train a particular model.
+"""
+
+from dataeval import _IS_TORCH_AVAILABLE
+from dataeval.detectors.ood.base import OODOutput, OODScoreOutput
+
+__all__ = ["OODOutput", "OODScoreOutput"]
+
+if _IS_TORCH_AVAILABLE:
+    from dataeval.detectors.ood.ae_torch import OOD_AE
+
+    __all__ += ["OOD_AE"]
+
+del _IS_TORCH_AVAILABLE

dataeval-0.73.1/src/dataeval/detectors/ood/ae.py → dataeval-0.74.1/src/dataeval/detectors/ood/ae_torch.py
@@ -1,4 +1,6 @@
 """
+Adapted for Pytorch from
+
 Source code derived from Alibi-Detect 0.11.4
 https://github.com/SeldonIO/alibi-detect/tree/v0.11.4

@@ -8,55 +10,48 @@ Licensed under Apache Software License (Apache 2.0)

 from __future__ import annotations

-
-
-from typing import TYPE_CHECKING, Callable
+from typing import Callable

 import numpy as np
+import torch
 from numpy.typing import ArrayLike

-from dataeval.detectors.ood.base import
+from dataeval.detectors.ood.base import OODScoreOutput
+from dataeval.detectors.ood.base_torch import OODBase
 from dataeval.interop import as_numpy
-from dataeval.utils.
-from dataeval.utils.tensorflow._internal.utils import predict_batch
-
-if TYPE_CHECKING:
-    import tensorflow as tf
-    import tf_keras as keras
-
-    import dataeval.utils.tensorflow._internal.models as tf_models
-else:
-    tf = lazyload("tensorflow")
-    keras = lazyload("tf_keras")
-    tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
+from dataeval.utils.torch.utils import predict_batch


 class OOD_AE(OODBase):
     """
-    Autoencoder
+    Autoencoder based out-of-distribution detector.

     Parameters
     ----------
-    model :
-
+    model : AriaAutoencoder
+        An Autoencoder model.
     """

-    def __init__(self, model:
-        super().__init__(model)
+    def __init__(self, model: torch.nn.Module, device: str | torch.device | None = None) -> None:
+        super().__init__(model, device)

     def fit(
         self,
         x_ref: ArrayLike,
-        threshold_perc: float
-        loss_fn: Callable[...,
-        optimizer:
+        threshold_perc: float,
+        loss_fn: Callable[..., torch.nn.Module] | None = None,
+        optimizer: torch.optim.Optimizer | None = None,
         epochs: int = 20,
         batch_size: int = 64,
-        verbose: bool =
+        verbose: bool = False,
     ) -> None:
         if loss_fn is None:
-            loss_fn =
-
+            loss_fn = torch.nn.MSELoss()
+
+        if optimizer is None:
+            optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)
+
+        super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)

     def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
         self._validate(X := as_numpy(X))