dataeval 0.76.1__py3-none-any.whl → 0.82.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. dataeval/__init__.py +3 -3
  2. dataeval/config.py +77 -0
  3. dataeval/detectors/__init__.py +1 -1
  4. dataeval/detectors/drift/__init__.py +6 -6
  5. dataeval/detectors/drift/{base.py → _base.py} +40 -85
  6. dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
  7. dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
  8. dataeval/detectors/drift/{mmd.py → _mmd.py} +31 -43
  9. dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
  10. dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +24 -7
  11. dataeval/detectors/drift/updates.py +20 -3
  12. dataeval/detectors/linters/__init__.py +3 -5
  13. dataeval/detectors/linters/duplicates.py +13 -36
  14. dataeval/detectors/linters/outliers.py +23 -148
  15. dataeval/detectors/ood/__init__.py +1 -1
  16. dataeval/detectors/ood/ae.py +30 -9
  17. dataeval/detectors/ood/base.py +5 -4
  18. dataeval/detectors/ood/mixin.py +21 -7
  19. dataeval/detectors/ood/vae.py +73 -0
  20. dataeval/metadata/__init__.py +6 -0
  21. dataeval/metadata/_distance.py +167 -0
  22. dataeval/metadata/_ood.py +217 -0
  23. dataeval/metadata/_utils.py +44 -0
  24. dataeval/metrics/__init__.py +1 -1
  25. dataeval/metrics/bias/__init__.py +6 -4
  26. dataeval/metrics/bias/{balance.py → _balance.py} +15 -101
  27. dataeval/metrics/bias/_coverage.py +98 -0
  28. dataeval/metrics/bias/{diversity.py → _diversity.py} +18 -111
  29. dataeval/metrics/bias/{parity.py → _parity.py} +39 -77
  30. dataeval/metrics/estimators/__init__.py +15 -4
  31. dataeval/metrics/estimators/{ber.py → _ber.py} +42 -29
  32. dataeval/metrics/estimators/_clusterer.py +44 -0
  33. dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -30
  34. dataeval/metrics/estimators/{uap.py → _uap.py} +4 -18
  35. dataeval/metrics/stats/__init__.py +16 -13
  36. dataeval/metrics/stats/{base.py → _base.py} +82 -133
  37. dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +15 -18
  38. dataeval/metrics/stats/_dimensionstats.py +75 -0
  39. dataeval/metrics/stats/{hashstats.py → _hashstats.py} +21 -37
  40. dataeval/metrics/stats/_imagestats.py +94 -0
  41. dataeval/metrics/stats/_labelstats.py +131 -0
  42. dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +19 -50
  43. dataeval/metrics/stats/{visualstats.py → _visualstats.py} +23 -54
  44. dataeval/outputs/__init__.py +53 -0
  45. dataeval/{output.py → outputs/_base.py} +55 -25
  46. dataeval/outputs/_bias.py +381 -0
  47. dataeval/outputs/_drift.py +83 -0
  48. dataeval/outputs/_estimators.py +114 -0
  49. dataeval/outputs/_linters.py +184 -0
  50. dataeval/{detectors/ood/output.py → outputs/_ood.py} +22 -22
  51. dataeval/outputs/_stats.py +387 -0
  52. dataeval/outputs/_utils.py +44 -0
  53. dataeval/outputs/_workflows.py +364 -0
  54. dataeval/typing.py +234 -0
  55. dataeval/utils/__init__.py +2 -2
  56. dataeval/utils/_array.py +169 -0
  57. dataeval/utils/_bin.py +199 -0
  58. dataeval/utils/_clusterer.py +144 -0
  59. dataeval/utils/_fast_mst.py +189 -0
  60. dataeval/utils/{image.py → _image.py} +6 -4
  61. dataeval/utils/_method.py +14 -0
  62. dataeval/utils/{shared.py → _mst.py} +3 -65
  63. dataeval/utils/{plot.py → _plot.py} +6 -6
  64. dataeval/utils/data/__init__.py +26 -0
  65. dataeval/utils/data/_dataset.py +217 -0
  66. dataeval/utils/data/_embeddings.py +104 -0
  67. dataeval/utils/data/_images.py +68 -0
  68. dataeval/utils/data/_metadata.py +360 -0
  69. dataeval/utils/data/_selection.py +126 -0
  70. dataeval/utils/{dataset/split.py → data/_split.py} +12 -38
  71. dataeval/utils/data/_targets.py +85 -0
  72. dataeval/utils/data/collate.py +103 -0
  73. dataeval/utils/data/datasets/__init__.py +17 -0
  74. dataeval/utils/data/datasets/_base.py +254 -0
  75. dataeval/utils/data/datasets/_cifar10.py +134 -0
  76. dataeval/utils/data/datasets/_fileio.py +168 -0
  77. dataeval/utils/data/datasets/_milco.py +153 -0
  78. dataeval/utils/data/datasets/_mixin.py +56 -0
  79. dataeval/utils/data/datasets/_mnist.py +183 -0
  80. dataeval/utils/data/datasets/_ships.py +123 -0
  81. dataeval/utils/data/datasets/_types.py +52 -0
  82. dataeval/utils/data/datasets/_voc.py +352 -0
  83. dataeval/utils/data/selections/__init__.py +15 -0
  84. dataeval/utils/data/selections/_classfilter.py +57 -0
  85. dataeval/utils/data/selections/_indices.py +26 -0
  86. dataeval/utils/data/selections/_limit.py +26 -0
  87. dataeval/utils/data/selections/_reverse.py +18 -0
  88. dataeval/utils/data/selections/_shuffle.py +29 -0
  89. dataeval/utils/metadata.py +51 -376
  90. dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
  91. dataeval/utils/torch/{internal.py → _internal.py} +21 -51
  92. dataeval/utils/torch/models.py +43 -2
  93. dataeval/workflows/__init__.py +2 -1
  94. dataeval/workflows/sufficiency.py +11 -346
  95. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/METADATA +5 -2
  96. dataeval-0.82.0.dist-info/RECORD +104 -0
  97. dataeval/detectors/linters/clusterer.py +0 -512
  98. dataeval/detectors/linters/merged_stats.py +0 -49
  99. dataeval/detectors/ood/metadata_ks_compare.py +0 -129
  100. dataeval/detectors/ood/metadata_least_likely.py +0 -119
  101. dataeval/interop.py +0 -69
  102. dataeval/metrics/bias/coverage.py +0 -194
  103. dataeval/metrics/stats/datasetstats.py +0 -202
  104. dataeval/metrics/stats/dimensionstats.py +0 -115
  105. dataeval/metrics/stats/labelstats.py +0 -210
  106. dataeval/utils/dataset/__init__.py +0 -7
  107. dataeval/utils/dataset/datasets.py +0 -412
  108. dataeval/utils/dataset/read.py +0 -63
  109. dataeval-0.76.1.dist-info/RECORD +0 -67
  110. /dataeval/{log.py → _log.py} +0 -0
  111. /dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
  112. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/LICENSE.txt +0 -0
  113. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/WHEEL +0 -0
dataeval/detectors/linters/__init__.py

@@ -3,14 +3,12 @@ Linters help identify potential issues in training and test data and are an impo
 """
 
 __all__ = [
-    "Clusterer",
-    "ClustererOutput",
     "Duplicates",
     "DuplicatesOutput",
     "Outliers",
    "OutliersOutput",
 ]
 
-from dataeval.detectors.linters.clusterer import Clusterer, ClustererOutput
-from dataeval.detectors.linters.duplicates import Duplicates, DuplicatesOutput
-from dataeval.detectors.linters.outliers import Outliers, OutliersOutput
+from dataeval.detectors.linters.duplicates import Duplicates
+from dataeval.detectors.linters.outliers import Outliers
+from dataeval.outputs._linters import DuplicatesOutput, OutliersOutput
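The net effect of this hunk: the Clusterer lint detector is gone, and the output dataclasses now live in the consolidated dataeval.outputs package while remaining re-exported from the linters namespace. A minimal sketch of the import migration for downstream code, assuming no renames beyond what this hunk shows:

    # dataeval 0.82.0: detectors stay put; output classes are also
    # importable from the new consolidated outputs package
    from dataeval.detectors.linters import Duplicates, Outliers
    from dataeval.outputs import DuplicatesOutput, OutliersOutput

    # dataeval 0.76.1 equivalent (Clusterer/ClustererOutput no longer exist):
    # from dataeval.detectors.linters import Clusterer, Duplicates, Outliers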
dataeval/detectors/linters/duplicates.py

@@ -2,39 +2,15 @@ from __future__ import annotations
 
 __all__ = []
 
-from dataclasses import dataclass
-from typing import Generic, Iterable, Sequence, TypeVar, overload
+from typing import Any, Sequence, overload
 
-from numpy.typing import ArrayLike
-
-from dataeval.detectors.linters.merged_stats import combine_stats, get_dataset_step_from_idx
-from dataeval.metrics.stats.hashstats import HashStatsOutput, hashstats
-from dataeval.output import Output, set_metadata
-
-DuplicateGroup = list[int]
-DatasetDuplicateGroupMap = dict[int, DuplicateGroup]
-TIndexCollection = TypeVar("TIndexCollection", DuplicateGroup, DatasetDuplicateGroupMap)
-
-
-@dataclass(frozen=True)
-class DuplicatesOutput(Generic[TIndexCollection], Output):
-    """
-    Output class for :class:`Duplicates` lint detector.
-
-    Attributes
-    ----------
-    exact : list[list[int] | dict[int, list[int]]]
-        Indices of images that are exact matches
-    near: list[list[int] | dict[int, list[int]]]
-        Indices of images that are near matches
-
-    - For a single dataset, indices are returned as a list of index groups.
-    - For multiple datasets, indices are returned as dictionaries where the key is the
-      index of the dataset, and the value is the list index groups from that dataset.
-    """
-
-    exact: list[TIndexCollection]
-    near: list[TIndexCollection]
+from dataeval.metrics.stats import hashstats
+from dataeval.metrics.stats._base import combine_stats, get_dataset_step_from_idx
+from dataeval.outputs import DuplicatesOutput, HashStatsOutput
+from dataeval.outputs._base import set_metadata
+from dataeval.outputs._linters import DatasetDuplicateGroupMap, DuplicateGroup
+from dataeval.typing import Array, Dataset
+from dataeval.utils.data._images import Images
 
 
 class Duplicates:
@@ -134,14 +110,14 @@ class Duplicates:
         return DuplicatesOutput(**duplicates)
 
     @set_metadata(state=["only_exact"])
-    def evaluate(self, data: Iterable[ArrayLike]) -> DuplicatesOutput[DuplicateGroup]:
+    def evaluate(self, data: Dataset[Array] | Dataset[tuple[Array, Any, Any]]) -> DuplicatesOutput[DuplicateGroup]:
         """
         Returns duplicate image indices for both exact matches and near matches
 
         Parameters
         ----------
-        data : Iterable[ArrayLike], shape - (N, C, H, W) | StatsOutput | Sequence[StatsOutput]
-            A dataset of images in an ArrayLike format or the output(s) from a hashstats analysis
+        data : Iterable[Array], shape - (N, C, H, W) | Dataset[tuple[Array, Any, Any]]
+            A dataset of images in an Array format or the output(s) from a hashstats analysis
 
         Returns
         -------
@@ -158,6 +134,7 @@ class Duplicates:
         >>> all_dupes.evaluate(duplicate_images)
         DuplicatesOutput(exact=[[3, 20], [16, 37]], near=[[3, 20, 22], [12, 18], [13, 36], [14, 31], [17, 27], [19, 38, 47]])
         """  # noqa: E501
-        self.stats = hashstats(data)
+        images = Images(data) if isinstance(data, Dataset) else data
+        self.stats = hashstats(images)
         duplicates = self._get_duplicates(self.stats.dict())
         return DuplicatesOutput(**duplicates)
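Taken together, the duplicates.py hunks change Duplicates.evaluate from accepting a bare iterable of arrays to accepting a Dataset, which is wrapped in the internal Images view before hashing. A hedged usage sketch; the dataset variable is illustrative, and per the new signature any Dataset[Array] or Dataset[tuple[Array, Any, Any]] should work:

    from dataeval.detectors.linters import Duplicates

    dupes = Duplicates()
    result = dupes.evaluate(dataset)  # Dataset[Array] | Dataset[tuple[Array, Any, Any]]
    print(result.exact)               # e.g. [[3, 20], [16, 37]]
    print(result.near)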
dataeval/detectors/linters/outliers.py

@@ -2,141 +2,19 @@ from __future__ import annotations
 
 __all__ = []
 
-import contextlib
-from dataclasses import dataclass
-from typing import Generic, Iterable, Literal, Sequence, TypeVar, Union, overload
+from typing import Any, Literal, Sequence, overload
 
 import numpy as np
-from numpy.typing import ArrayLike, NDArray
-
-from dataeval.detectors.linters.merged_stats import combine_stats, get_dataset_step_from_idx
-from dataeval.metrics.stats.base import BOX_COUNT, SOURCE_INDEX
-from dataeval.metrics.stats.datasetstats import DatasetStatsOutput, datasetstats
-from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput
-from dataeval.metrics.stats.labelstats import LabelStatsOutput
-from dataeval.metrics.stats.pixelstats import PixelStatsOutput
-from dataeval.metrics.stats.visualstats import VisualStatsOutput
-from dataeval.output import Output, set_metadata
-
-with contextlib.suppress(ImportError):
-    import pandas as pd
-
-
-IndexIssueMap = dict[int, dict[str, float]]
-OutlierStatsOutput = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
-TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, list[IndexIssueMap])
-
-
-def _reorganize_by_class_and_metric(result, lstats):
-    """Flip result from grouping by image to grouping by class and metric"""
-    metrics = {}
-    class_wise = {label: {} for label in lstats.image_indices_per_label}
-
-    # Group metrics and calculate class-wise counts
-    for img, group in result.items():
-        for extreme in group:
-            metrics.setdefault(extreme, []).append(img)
-            for label, images in lstats.image_indices_per_label.items():
-                if img in images:
-                    class_wise[label][extreme] = class_wise[label].get(extreme, 0) + 1
-
-    return metrics, class_wise
-
-
-def _create_table(metrics, class_wise):
-    """Create table for displaying the results"""
-    max_class_length = max(len(str(label)) for label in class_wise) + 2
-    max_total = max(len(metrics[group]) for group in metrics) + 2
-
-    table_header = " | ".join(
-        [f"{'Class':>{max_class_length}}"]
-        + [f"{group:^{max(5, len(str(group))) + 2}}" for group in sorted(metrics.keys())]
-        + [f"{'Total':<{max_total}}"]
-    )
-    table_rows = []
-
-    for class_cat, results in class_wise.items():
-        table_value = [f"{class_cat:>{max_class_length}}"]
-        total = 0
-        for group in sorted(metrics.keys()):
-            count = results.get(group, 0)
-            table_value.append(f"{count:^{max(5, len(str(group))) + 2}}")
-            total += count
-        table_value.append(f"{total:^{max_total}}")
-        table_rows.append(" | ".join(table_value))
-
-    table = [table_header] + table_rows
-    return table
-
-
-def _create_pandas_dataframe(class_wise):
-    """Create data for pandas dataframe"""
-    data = []
-    for label, metrics_dict in class_wise.items():
-        row = {"Class": label}
-        total = sum(metrics_dict.values())
-        row.update(metrics_dict)  # Add metric counts
-        row["Total"] = total
-        data.append(row)
-    return data
-
-
-@dataclass(frozen=True)
-class OutliersOutput(Generic[TIndexIssueMap], Output):
-    """
-    Output class for :class:`Outliers` lint detector.
+from numpy.typing import NDArray
 
-    Attributes
-    ----------
-    issues : dict[int, dict[str, float]] | list[dict[int, dict[str, float]]]
-        Indices of image Outliers with their associated issue type and calculated values.
-
-        - For a single dataset, a dictionary containing the indices of outliers and
-          a dictionary showing the issues and calculated values for the given index.
-        - For multiple stats outputs, a list of dictionaries containing the indices of
-          outliers and their associated issues and calculated values.
-    """
-
-    issues: TIndexIssueMap
-
-    def __len__(self) -> int:
-        if isinstance(self.issues, dict):
-            return len(self.issues)
-        else:
-            return sum(len(d) for d in self.issues)
-
-    def to_table(self, labelstats: LabelStatsOutput) -> str:
-        if isinstance(self.issues, dict):
-            metrics, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
-            listed_table = _create_table(metrics, classwise)
-            table = "\n".join(listed_table)
-        else:
-            outertable = []
-            for d in self.issues:
-                metrics, classwise = _reorganize_by_class_and_metric(d, labelstats)
-                listed_table = _create_table(metrics, classwise)
-                str_table = "\n".join(listed_table)
-                outertable.append(str_table)
-            table = "\n\n".join(outertable)
-        return table
-
-    def to_dataframe(self, labelstats: LabelStatsOutput) -> pd.DataFrame:
-        import pandas as pd
-
-        if isinstance(self.issues, dict):
-            _, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
-            data = _create_pandas_dataframe(classwise)
-            df = pd.DataFrame(data)
-        else:
-            df_list = []
-            for i, d in enumerate(self.issues):
-                _, classwise = _reorganize_by_class_and_metric(d, labelstats)
-                data = _create_pandas_dataframe(classwise)
-                single_df = pd.DataFrame(data)
-                single_df["Dataset"] = i
-                df_list.append(single_df)
-            df = pd.concat(df_list)
-        return df
+from dataeval.metrics.stats._base import combine_stats, get_dataset_step_from_idx
+from dataeval.metrics.stats._imagestats import imagestats
+from dataeval.outputs import DimensionStatsOutput, ImageStatsOutput, OutliersOutput, PixelStatsOutput, VisualStatsOutput
+from dataeval.outputs._base import set_metadata
+from dataeval.outputs._linters import IndexIssueMap, OutlierStatsOutput
+from dataeval.outputs._stats import BOX_COUNT, SOURCE_INDEX
+from dataeval.typing import Array, Dataset
+from dataeval.utils.data._images import Images
 
 
 def _get_outlier_mask(
@@ -226,7 +104,7 @@ class Outliers:
         outlier_method: Literal["zscore", "modzscore", "iqr"] = "modzscore",
         outlier_threshold: float | None = None,
     ):
-        self.stats: DatasetStatsOutput
+        self.stats: ImageStatsOutput
        self.use_dimension = use_dimension
         self.use_pixel = use_pixel
         self.use_visual = use_visual
@@ -247,23 +125,23 @@ class Outliers:
         return dict(sorted(flagged_images.items()))
 
     @overload
-    def from_stats(self, stats: OutlierStatsOutput | DatasetStatsOutput) -> OutliersOutput[IndexIssueMap]: ...
+    def from_stats(self, stats: OutlierStatsOutput | ImageStatsOutput) -> OutliersOutput[IndexIssueMap]: ...
 
     @overload
     def from_stats(self, stats: Sequence[OutlierStatsOutput]) -> OutliersOutput[list[IndexIssueMap]]: ...
 
     @set_metadata(state=["outlier_method", "outlier_threshold"])
     def from_stats(
-        self, stats: OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
+        self, stats: OutlierStatsOutput | ImageStatsOutput | Sequence[OutlierStatsOutput]
     ) -> OutliersOutput[IndexIssueMap] | OutliersOutput[list[IndexIssueMap]]:
         """
         Returns indices of Outliers with the issues identified for each.
 
         Parameters
         ----------
-        stats : OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
+        stats : OutlierStatsOutput | ImageStatsOutput | Sequence[OutlierStatsOutput]
             The output(s) from a dimensionstats, pixelstats, or visualstats metric
-            analysis or an aggregate DatasetStatsOutput
+            analysis or an aggregate ImageStatsOutput
 
         Returns
         -------
@@ -290,11 +168,7 @@ class Outliers:
         >>> results.issues[1]
         {}
         """  # noqa: E501
-        if isinstance(stats, DatasetStatsOutput):
-            outliers = self._get_outliers({k: v for o in stats._outputs() for k, v in o.dict().items()})
-            return OutliersOutput(outliers)
-
-        if isinstance(stats, (DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)):
+        if isinstance(stats, (ImageStatsOutput, DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)):
             return OutliersOutput(self._get_outliers(stats.dict()))
 
         if not isinstance(stats, Sequence):
@@ -305,7 +179,7 @@ class Outliers:
         stats_map: dict[type, list[int]] = {}
         for i, stats_output in enumerate(stats):
             if not isinstance(
-                stats_output, (DatasetStatsOutput, DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)
+                stats_output, (ImageStatsOutput, DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)
             ):
                 raise TypeError(
                     "Invalid stats output type; only use output from dimensionstats, pixelstats or visualstats."
@@ -323,14 +197,14 @@ class Outliers:
         return OutliersOutput(output_list)
 
     @set_metadata(state=["use_dimension", "use_pixel", "use_visual", "outlier_method", "outlier_threshold"])
-    def evaluate(self, data: Iterable[ArrayLike]) -> OutliersOutput[IndexIssueMap]:
+    def evaluate(self, data: Dataset[Array] | Dataset[tuple[Array, Any, Any]]) -> OutliersOutput[IndexIssueMap]:
         """
         Returns indices of Outliers with the issues identified for each
 
         Parameters
         ----------
-        data : Iterable[ArrayLike], shape - (C, H, W)
-            A dataset of images in an ArrayLike format
+        data : Iterable[Array], shape - (C, H, W)
+            A dataset of images in an Array format
 
         Returns
         -------
@@ -347,8 +221,9 @@ class Outliers:
         >>> list(results.issues)
         [10, 12]
         >>> results.issues[10]
-        {'skew': -3.906, 'kurtosis': 13.266, 'entropy': 0.2128, 'contrast': 1.25, 'zeros': 0.05493}
+        {'contrast': 1.25, 'zeros': 0.05493, 'skew': -3.906, 'kurtosis': 13.266, 'entropy': 0.2128}
         """
-        self.stats = datasetstats(images=data)
+        images = Images(data) if isinstance(data, Dataset) else data
+        self.stats = imagestats(images)
         outliers = self._get_outliers(self.stats.dict())
         return OutliersOutput(outliers)
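As with Duplicates.evaluate, the convenience path now wraps a Dataset in the internal Images view and computes aggregate stats via imagestats. A minimal sketch, using the constructor flags shown in the __init__ hunk above:

    outliers = Outliers(use_dimension=True, use_pixel=True, use_visual=True)
    results = outliers.evaluate(dataset)  # Dataset[Array] | Dataset[tuple[Array, Any, Any]]
    print(results.issues)                 # {index: {issue_name: value, ...}, ...}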
dataeval/detectors/ood/__init__.py

@@ -5,4 +5,4 @@ Out-of-distribution (OOD) detectors identify data that is different from the dat
 __all__ = ["OODOutput", "OODScoreOutput", "OOD_AE"]
 
 from dataeval.detectors.ood.ae import OOD_AE
-from dataeval.detectors.ood.output import OODOutput, OODScoreOutput
+from dataeval.outputs._ood import OODOutput, OODScoreOutput
dataeval/detectors/ood/ae.py

@@ -16,12 +16,12 @@ from typing import Callable
 
 import numpy as np
 import torch
-from numpy.typing import ArrayLike
+from numpy.typing import NDArray
 
 from dataeval.detectors.ood.base import OODBase
-from dataeval.detectors.ood.output import OODScoreOutput
-from dataeval.interop import as_numpy
-from dataeval.utils.torch.internal import predict_batch
+from dataeval.outputs import OODScoreOutput
+from dataeval.typing import ArrayLike
+from dataeval.utils.torch._internal import predict_batch
 
 
 class OOD_AE(OODBase):
@@ -30,8 +30,31 @@ class OOD_AE(OODBase):
 
     Parameters
     ----------
-    model : Autoencoder
-        An Autoencoder model.
+    model : torch.nn.Module
+        An autoencoder model to use for encoding and reconstruction of images
+        for detection of out-of-distribution samples.
+    device : str or torch.Device or None, default None
+        The device to use for the detector. None will default to the global
+        configuration selection if set, otherwise "cuda" then "cpu" by availability.
+
+    Example
+    -------
+    Perform out-of-distribution detection on test data.
+
+    >>> from dataeval.utils.torch.models import AE
+
+    >>> input_shape = train_images[0].shape
+    >>> ood = OOD_AE(AE(input_shape))
+
+    Train the autoencoder using the training data.
+
+    >>> ood.fit(train_images, threshold_perc=99, epochs=20)
+
+    Test for out-of-distribution samples on the test data.
+
+    >>> output = ood.predict(test_images)
+    >>> output.is_ood
+    array([ True,  True, False,  True,  True,  True,  True,  True])
     """
 
     def __init__(self, model: torch.nn.Module, device: str | torch.device | None = None) -> None:
@@ -55,9 +78,7 @@ class OOD_AE(OODBase):
 
         super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
 
-    def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
-        self._validate(X := as_numpy(X))
-
+    def _score(self, X: NDArray[np.float32], batch_size: int = int(1e10)) -> OODScoreOutput:
         # reconstruct instances
         X_recon = predict_batch(X, self.model, batch_size=batch_size)
 
dataeval/detectors/ood/base.py

@@ -13,12 +13,13 @@ __all__ = []
 from typing import Callable, cast
 
 import torch
-from numpy.typing import ArrayLike
 
+from dataeval.config import get_device
 from dataeval.detectors.ood.mixin import OODBaseMixin, OODFitMixin, OODGMMMixin
-from dataeval.interop import to_numpy
-from dataeval.utils.torch.gmm import GaussianMixtureModelParams, gmm_params
-from dataeval.utils.torch.internal import get_device, trainer
+from dataeval.typing import ArrayLike
+from dataeval.utils._array import to_numpy
+from dataeval.utils.torch._gmm import GaussianMixtureModelParams, gmm_params
+from dataeval.utils.torch._internal import trainer
 
 
 class OODBase(OODBaseMixin[torch.nn.Module], OODFitMixin[Callable[..., torch.nn.Module], torch.optim.Optimizer]):
dataeval/detectors/ood/mixin.py

@@ -1,17 +1,17 @@
 from __future__ import annotations
 
-from dataeval.detectors.ood.output import OODOutput, OODScoreOutput
-
 __all__ = []
 
 from abc import ABC, abstractmethod
 from typing import Callable, Generic, Literal, TypeVar
 
 import numpy as np
-from numpy.typing import ArrayLike, NDArray
+from numpy.typing import NDArray
 
-from dataeval.interop import to_numpy
-from dataeval.output import set_metadata
+from dataeval.outputs import OODOutput, OODScoreOutput
+from dataeval.outputs._base import set_metadata
+from dataeval.typing import ArrayLike
+from dataeval.utils._array import as_numpy, to_numpy
 
 TGMMParams = TypeVar("TGMMParams")
 
@@ -73,6 +73,9 @@ class OODBaseMixin(Generic[TModel], ABC):
     def _get_data_info(self, X: NDArray) -> tuple[tuple, type]:
         if not isinstance(X, np.ndarray):
             raise TypeError("Dataset should of type: `NDArray`.")
+        if np.min(X) < 0 or np.max(X) > 1:
+            raise ValueError("Embeddings must be on the unit interval [0-1].")
+
         return X.shape[1:], X.dtype.type
 
     def _validate(self, X: NDArray) -> None:
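This new check means anything passed into score() or predict() must already be scaled onto [0, 1], or a ValueError is raised. A hedged pre-scaling sketch, assuming uint8 image data and an already-constructed detector named ood:

    import numpy as np

    images = images.astype(np.float32) / 255.0  # map uint8 pixel values onto [0, 1]
    assert images.min() >= 0.0 and images.max() <= 1.0
    output = ood.predict(images)                # passes the new range validation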
@@ -90,7 +93,7 @@ class OODBaseMixin(Generic[TModel], ABC):
         self._validate(X)
 
     @abstractmethod
-    def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput: ...
+    def _score(self, X: NDArray[np.float32], batch_size: int = int(1e10)) -> OODScoreOutput: ...
 
     @set_metadata
     def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
@@ -105,11 +108,17 @@ class OODBaseMixin(Generic[TModel], ABC):
             Number of instances to process in each batch.
             Use a smaller batch size if your dataset is large or if you encounter memory issues.
 
+        Raises
+        ------
+        ValueError
+            X input data must be unit interval [0-1].
+
         Returns
         -------
         OODScoreOutput
             An object containing the instance-level and feature-level OOD scores.
         """
+        self._validate(X := as_numpy(X).astype(np.float32))
         return self._score(X, batch_size)
 
     def _threshold_score(self, ood_type: Literal["feature", "instance"] = "instance") -> np.floating:
@@ -134,12 +143,17 @@ class OODBaseMixin(Generic[TModel], ABC):
         ood_type : "feature" | "instance", default "instance"
             Predict out-of-distribution at the 'feature' or 'instance' level.
 
+        Raises
+        ------
+        ValueError
+            X input data must be unit interval [0-1].
+
         Returns
         -------
         Dictionary containing the outlier predictions for the selected level,
         and the OOD scores for the data including both 'instance' and 'feature' (if present) level scores.
         """
-        self._validate_state(X := to_numpy(X))
+        self._validate_state(X := to_numpy(X).astype(np.float32))
         # compute outlier scores
         score = self.score(X, batch_size=batch_size)
         ood_pred = score.get(ood_type) > self._threshold_score(ood_type)
@@ -0,0 +1,73 @@
1
+ """
2
+ Adapted for Pytorch from
3
+
4
+ Source code derived from Alibi-Detect 0.11.4
5
+ https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
6
+
7
+ Original code Copyright (c) 2023 Seldon Technologies Ltd
8
+ Licensed under Apache Software License (Apache 2.0)
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ __all__ = []
14
+
15
+ from typing import Callable
16
+
17
+ import numpy as np
18
+ import torch
19
+
20
+ from dataeval.detectors.ood.base import OODBase
21
+ from dataeval.outputs import OODScoreOutput
22
+ from dataeval.typing import ArrayLike
23
+ from dataeval.utils._array import as_numpy
24
+ from dataeval.utils.torch._internal import predict_batch
25
+
26
+
27
+ class OOD_VAE(OODBase):
28
+ """
29
+ Autoencoder based out-of-distribution detector.
30
+
31
+ Parameters
32
+ ----------
33
+ model : Autoencoder
34
+ An Autoencoder model.
35
+ """
36
+
37
+ def __init__(self, model: torch.nn.Module, device: str | torch.device | None = None) -> None:
38
+ super().__init__(model, device)
39
+
40
+ def fit(
41
+ self,
42
+ x_ref: ArrayLike,
43
+ threshold_perc: float,
44
+ loss_fn: Callable[..., torch.nn.Module] | None = None,
45
+ optimizer: torch.optim.Optimizer | None = None,
46
+ epochs: int = 20,
47
+ batch_size: int = 64,
48
+ verbose: bool = False,
49
+ ) -> None:
50
+ if loss_fn is None:
51
+ loss_fn = torch.nn.MSELoss()
52
+
53
+ if optimizer is None:
54
+ optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-4)
55
+
56
+ super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
57
+
58
+ def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
59
+ self._validate(X := as_numpy(X))
60
+
61
+ # reconstruct instances
62
+ X_recon = predict_batch(X, self.model, batch_size=batch_size)[0] # don't need mu or logvar from model
63
+
64
+ # compute feature and instance level scores
65
+ fscore = np.power(X.reshape((len(X), -1)) - X_recon, 2)
66
+ # fscore_flat = fscore.reshape(fscore.shape[0], -1).copy()
67
+ # n_score_features = int(np.ceil(fscore_flat.shape[1]))
68
+ # sorted_fscore = np.sort(fscore_flat, axis=1)
69
+ # sorted_fscore_perc = sorted_fscore[:, -n_score_features:]
70
+ # iscore = np.mean(sorted_fscore_perc, axis=1)
71
+ iscore = np.sum(fscore, axis=1)
72
+
73
+ return OODScoreOutput(iscore, fscore)
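Usage presumably mirrors the OOD_AE docstring example above, except that the model's forward pass is expected to return a tuple whose first element is the reconstruction (the mu/logvar outputs are discarded in _score). A sketch under that assumption; the VAE import path below is hypothetical and not confirmed by this diff:

    from dataeval.utils.torch.models import VAE  # hypothetical import path

    ood = OOD_VAE(VAE(input_shape))              # model returns (recon, mu, logvar)
    ood.fit(train_images, threshold_perc=99, epochs=20)
    print(ood.predict(test_images).is_ood)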
dataeval/metadata/__init__.py

@@ -0,0 +1,6 @@
+"""Explanatory functions using metadata and additional features such as ood or drift"""
+
+__all__ = ["most_deviated_factors", "metadata_distance"]
+
+from dataeval.metadata._distance import metadata_distance
+from dataeval.metadata._ood import most_deviated_factors
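Only the exported names and the module docstring are confirmed by this hunk; the signatures live in the new _distance.py and _ood.py modules listed above. A minimal import sketch:

    from dataeval.metadata import metadata_distance, most_deviated_factors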