dataeval 0.72.1__py3-none-any.whl → 0.73.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +4 -4
- dataeval/detectors/__init__.py +4 -3
- dataeval/detectors/drift/__init__.py +9 -10
- dataeval/{_internal/detectors → detectors}/drift/base.py +39 -91
- dataeval/{_internal/detectors → detectors}/drift/cvm.py +4 -3
- dataeval/{_internal/detectors → detectors}/drift/ks.py +4 -3
- dataeval/{_internal/detectors → detectors}/drift/mmd.py +23 -25
- dataeval/{_internal/detectors → detectors}/drift/torch.py +13 -11
- dataeval/{_internal/detectors → detectors}/drift/uncertainty.py +7 -5
- dataeval/detectors/drift/updates.py +61 -0
- dataeval/detectors/linters/__init__.py +3 -3
- dataeval/{_internal/detectors → detectors/linters}/clusterer.py +41 -39
- dataeval/{_internal/detectors → detectors/linters}/duplicates.py +19 -9
- dataeval/{_internal/detectors → detectors/linters}/merged_stats.py +3 -1
- dataeval/{_internal/detectors → detectors/linters}/outliers.py +14 -21
- dataeval/detectors/ood/__init__.py +6 -6
- dataeval/{_internal/detectors → detectors}/ood/ae.py +20 -12
- dataeval/detectors/ood/aegmm.py +66 -0
- dataeval/{_internal/detectors → detectors}/ood/base.py +33 -21
- dataeval/{_internal/detectors → detectors}/ood/llr.py +43 -33
- dataeval/detectors/ood/metadata_ks_compare.py +99 -0
- dataeval/detectors/ood/metadata_least_likely.py +119 -0
- dataeval/detectors/ood/metadata_ood_mi.py +92 -0
- dataeval/{_internal/detectors → detectors}/ood/vae.py +23 -17
- dataeval/detectors/ood/vaegmm.py +75 -0
- dataeval/interop.py +56 -0
- dataeval/metrics/__init__.py +1 -1
- dataeval/metrics/bias/__init__.py +4 -4
- dataeval/{_internal/metrics → metrics/bias}/balance.py +75 -13
- dataeval/{_internal/metrics → metrics/bias}/coverage.py +41 -7
- dataeval/{_internal/metrics → metrics/bias}/diversity.py +75 -18
- dataeval/metrics/bias/metadata.py +358 -0
- dataeval/{_internal/metrics → metrics/bias}/parity.py +54 -44
- dataeval/metrics/estimators/__init__.py +3 -3
- dataeval/{_internal/metrics → metrics/estimators}/ber.py +25 -22
- dataeval/{_internal/metrics → metrics/estimators}/divergence.py +11 -12
- dataeval/{_internal/metrics → metrics/estimators}/uap.py +5 -3
- dataeval/metrics/stats/__init__.py +7 -7
- dataeval/{_internal/metrics → metrics}/stats/base.py +59 -35
- dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py +18 -14
- dataeval/{_internal/metrics → metrics}/stats/datasetstats.py +18 -16
- dataeval/{_internal/metrics → metrics}/stats/dimensionstats.py +9 -7
- dataeval/metrics/stats/hashstats.py +156 -0
- dataeval/{_internal/metrics → metrics}/stats/labelstats.py +5 -3
- dataeval/{_internal/metrics → metrics}/stats/pixelstats.py +9 -8
- dataeval/{_internal/metrics → metrics}/stats/visualstats.py +10 -9
- dataeval/{_internal/output.py → output.py} +26 -6
- dataeval/utils/__init__.py +8 -3
- dataeval/utils/image.py +71 -0
- dataeval/utils/lazy.py +26 -0
- dataeval/utils/metadata.py +258 -0
- dataeval/utils/shared.py +151 -0
- dataeval/{_internal → utils}/split_dataset.py +98 -33
- dataeval/utils/tensorflow/__init__.py +7 -6
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/gmm.py +8 -2
- dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py} +28 -18
- dataeval/{_internal/models/tensorflow/pixelcnn.py → utils/tensorflow/_internal/models.py} +387 -97
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py +15 -6
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py +84 -85
- dataeval/utils/tensorflow/loss/__init__.py +6 -2
- dataeval/utils/torch/__init__.py +7 -3
- dataeval/{_internal/models/pytorch → utils/torch}/blocks.py +19 -14
- dataeval/{_internal → utils/torch}/datasets.py +48 -42
- dataeval/utils/torch/models.py +138 -0
- dataeval/{_internal/models/pytorch/autoencoder.py → utils/torch/trainer.py} +7 -136
- dataeval/{_internal → utils/torch}/utils.py +3 -1
- dataeval/workflows/__init__.py +1 -1
- dataeval/{_internal/workflows → workflows}/sufficiency.py +39 -34
- {dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/METADATA +4 -3
- dataeval-0.73.0.dist-info/RECORD +73 -0
- dataeval/_internal/detectors/__init__.py +0 -0
- dataeval/_internal/detectors/drift/__init__.py +0 -0
- dataeval/_internal/detectors/ood/__init__.py +0 -0
- dataeval/_internal/detectors/ood/aegmm.py +0 -78
- dataeval/_internal/detectors/ood/vaegmm.py +0 -89
- dataeval/_internal/interop.py +0 -49
- dataeval/_internal/metrics/__init__.py +0 -0
- dataeval/_internal/metrics/stats/hashstats.py +0 -75
- dataeval/_internal/metrics/utils.py +0 -447
- dataeval/_internal/models/__init__.py +0 -0
- dataeval/_internal/models/pytorch/__init__.py +0 -0
- dataeval/_internal/models/pytorch/utils.py +0 -67
- dataeval/_internal/models/tensorflow/__init__.py +0 -0
- dataeval/_internal/models/tensorflow/autoencoder.py +0 -320
- dataeval/_internal/workflows/__init__.py +0 -0
- dataeval/detectors/drift/kernels/__init__.py +0 -10
- dataeval/detectors/drift/updates/__init__.py +0 -8
- dataeval/utils/tensorflow/models/__init__.py +0 -9
- dataeval/utils/tensorflow/recon/__init__.py +0 -3
- dataeval/utils/torch/datasets/__init__.py +0 -12
- dataeval/utils/torch/models/__init__.py +0 -11
- dataeval/utils/torch/trainer/__init__.py +0 -7
- dataeval-0.72.1.dist-info/RECORD +0 -81
- {dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/WHEEL +0 -0
dataeval/{_internal/detectors → detectors/linters}/clusterer.py:

```diff
@@ -1,16 +1,18 @@
 from __future__ import annotations
 
+__all__ = ["ClustererOutput", "Clusterer"]
+
 from dataclasses import dataclass
-from typing import Iterable, NamedTuple, cast
+from typing import Any, Iterable, NamedTuple, cast
 
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 from scipy.cluster.hierarchy import linkage
 from scipy.spatial.distance import pdist, squareform
 
-from dataeval.
-from dataeval.
-from dataeval.
+from dataeval.interop import to_numpy
+from dataeval.output import OutputMetadata, set_metadata
+from dataeval.utils.shared import flatten
 
 
 @dataclass(frozen=True)
@@ -36,7 +38,7 @@ class ClustererOutput(OutputMetadata):
     potential_duplicates: list[list[int]]
 
 
-def extend_linkage(link_arr: NDArray) -> NDArray:
+def _extend_linkage(link_arr: NDArray) -> NDArray:
     """
     Adds a column to the linkage matrix link_arr that tracks the new id assigned
     to each row
@@ -60,10 +62,10 @@ def extend_linkage(link_arr: NDArray) -> NDArray:
     return arr
 
 
-class Cluster:
+class _Cluster:
     __slots__ = "merged", "samples", "sample_dist", "is_copy", "count", "dist_avg", "dist_std", "out1", "out2"
 
-    def __init__(self, merged: int, samples: NDArray, sample_dist: float | NDArray, is_copy: bool = False):
+    def __init__(self, merged: int, samples: NDArray, sample_dist: float | NDArray, is_copy: bool = False) -> None:
         self.merged = merged
         self.samples = np.array(samples, dtype=np.int32)
         self.sample_dist = np.array([sample_dist] if np.isscalar(sample_dist) else sample_dist)
@@ -85,8 +87,8 @@ class Cluster:
         self.out1 = dist > out1
         self.out2 = dist > out2
 
-    def copy(self) ->
-        return
+    def copy(self) -> _Cluster:
+        return _Cluster(False, self.samples, self.sample_dist, True)
 
     def __repr__(self) -> str:
         _params = {
@@ -98,32 +100,32 @@ class Cluster:
         return f"{self.__class__.__name__}(**{repr(_params)})"
 
 
-class
-    def __init__(self, *args,
-        super().__init__(*args
+class _Clusters(dict[int, dict[int, _Cluster]]):
+    def __init__(self, *args: dict[int, dict[int, _Cluster]]) -> None:
+        super().__init__(*args)
         self.max_level: int = 1
 
 
-class
+class _ClusterPosition(NamedTuple):
     """Keeps track of a cluster's level and ID"""
 
     level: int
     cid: int
 
 
-class
+class _ClusterMergeEntry:
     __slots__ = "level", "outer_cluster", "inner_cluster", "status"
 
-    def __init__(self, level: int, outer_cluster: int, inner_cluster: int, status: int):
+    def __init__(self, level: int, outer_cluster: int, inner_cluster: int, status: int) -> None:
        self.level = level
        self.outer_cluster = outer_cluster
        self.inner_cluster = inner_cluster
        self.status = status
 
-    def __lt__(self, value:
+    def __lt__(self, value: _ClusterMergeEntry) -> bool:
         return self.level.__lt__(value.level)
 
-    def __gt__(self, value:
+    def __gt__(self, value: _ClusterMergeEntry) -> bool:
         return self.level.__gt__(value.level)
 
 
@@ -153,36 +155,36 @@ class Clusterer:
     >>> cluster = Clusterer(dataset)
     """
 
-    def __init__(self, dataset: ArrayLike):
+    def __init__(self, dataset: ArrayLike) -> None:
         # Allows an update to dataset to reset the state rather than instantiate a new class
         self._on_init(dataset)
 
     def _on_init(self, dataset: ArrayLike):
-        self._data: NDArray = flatten(to_numpy(dataset))
+        self._data: NDArray[Any] = flatten(to_numpy(dataset))
         self._validate_data(self._data)
         self._num_samples = len(self._data)
 
-        self._darr: NDArray = pdist(self._data, metric="euclidean")
-        self._sqdmat: NDArray = squareform(self._darr)
-        self._larr: NDArray =
+        self._darr: NDArray[np.floating[Any]] = pdist(self._data, metric="euclidean")
+        self._sqdmat: NDArray[np.floating[Any]] = squareform(self._darr)
+        self._larr: NDArray[np.floating[Any]] = _extend_linkage(linkage(self._darr))
         self._max_clusters: int = np.count_nonzero(self._larr[:, 3] == 2)
 
         min_num = int(self._num_samples * 0.05)
-        self._min_num_samples_per_cluster = min(max(2, min_num), 100)
+        self._min_num_samples_per_cluster: int = min(max(2, min_num), 100)
 
-        self._clusters = None
-        self._last_good_merge_levels = None
+        self._clusters: _Clusters | None = None
+        self._last_good_merge_levels: dict[int, int] | None = None
 
     @property
-    def data(self) -> NDArray:
+    def data(self) -> NDArray[Any]:
         return self._data
 
     @data.setter
-    def data(self, x: ArrayLike):
+    def data(self, x: ArrayLike) -> None:
         self._on_init(x)
 
     @property
-    def clusters(self) ->
+    def clusters(self) -> _Clusters:
         if self._clusters is None:
             self._clusters = self._create_clusters()
         return self._clusters
@@ -209,11 +211,11 @@ class Clusterer:
         if features < 1:
             raise ValueError(f"Samples should have at least 1 feature; got {features}")
 
-    def _create_clusters(self) ->
+    def _create_clusters(self) -> _Clusters:
         """Generates clusters based on linkage matrix"""
         next_cluster_id = 0
-        cluster_map: dict[int,
-        clusters:
+        cluster_map: dict[int, _ClusterPosition] = {}  # Dictionary to associate new cluster ids with actual clusters
+        clusters: _Clusters = _Clusters()
 
         # Walking through the linkage array to generate clusters
         for arr_i in self._larr:
@@ -240,7 +242,7 @@ class Clusterer:
                 # Update clusters to include previously skipped levels
                 clusters = self._fill_levels(clusters, left, right)
             elif left or right:
-                child, other_id = cast(tuple[
+                child, other_id = cast(tuple[_ClusterPosition, int], (left, right_id) if left else (right, left_id))
                 cc = clusters[child.level][child.cid]
                 samples = np.concatenate([cc.samples, [other_id]])
                 sample_dist = np.concatenate([cc.sample_dist, sample_dist])
@@ -254,12 +256,12 @@ class Clusterer:
             if level not in clusters:
                 clusters[level] = {}
 
-            clusters[level][cid] =
-            cluster_map[int(arr_i[-1])] =
+            clusters[level][cid] = _Cluster(merged, samples, sample_dist)
+            cluster_map[int(arr_i[-1])] = _ClusterPosition(level, cid)
 
         return clusters
 
-    def _fill_levels(self, clusters:
+    def _fill_levels(self, clusters: _Clusters, left: _ClusterPosition, right: _ClusterPosition) -> _Clusters:
         # Sets each level's cluster info if it does not exist
         if left.level != right.level:
             (level, cid), max_level = (left, right[0]) if left[0] < right[0] else (right, left[0])
@@ -312,7 +314,7 @@ class Clusterer:
         mask2 = mask2_vals < one_std_check
         return np.logical_or(desired_merge, mask2)
 
-    def _generate_merge_list(self, cluster_matrix: NDArray) -> list[
+    def _generate_merge_list(self, cluster_matrix: NDArray) -> list[_ClusterMergeEntry]:
         """
         Runs through the clusters dictionary determining when clusters merge,
         and how close are those clusters when they merge.
@@ -329,7 +331,7 @@ class Clusterer:
         """
         intra_max = []
         merge_mean = []
-        merge_list: list[
+        merge_list: list[_ClusterMergeEntry] = []
 
         for level, cluster_set in self.clusters.items():
             for outer_cluster, cluster in cluster_set.items():
@@ -356,7 +358,7 @@ class Clusterer:
                 # Calculate the corresponding distance stats
                 distance_stats_arr = aggregate_func(distances)
                 merge_mean.append(distance_stats_arr)
-                merge_list.append(
+                merge_list.append(_ClusterMergeEntry(level, outer_cluster, inner_cluster, 0))
 
         all_merge_indices = self._calc_merge_indices(merge_mean=merge_mean, intra_max=intra_max)
 
@@ -493,7 +495,7 @@ class Clusterer:
         return exact_dupes, near_dupes
 
     # TODO: Move data input to evaluate from class
-    @set_metadata(
+    @set_metadata(["data"])
     def evaluate(self) -> ClustererOutput:
         """Finds and flags indices of the data for Outliers and :term:`duplicates<Duplicates>`
 
```
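The renamed `_extend_linkage` keeps its documented behavior: it appends a column to the SciPy linkage matrix recording the id of the cluster each merge row creates. As a rough sketch of the idea, not the package's exact implementation: `linkage()` over n samples produces n - 1 merge rows, and SciPy numbers the cluster created by row i as n + i, so the extra column follows directly from the row index.

```python
import numpy as np
from numpy.typing import NDArray
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import pdist


def extend_linkage_sketch(link_arr: NDArray) -> NDArray:
    # linkage() on n samples yields n - 1 merge rows; SciPy assigns the
    # cluster created by row i the id n + i, so the new column is just
    # the shifted row index.
    n = len(link_arr) + 1
    ids = np.arange(n, n + len(link_arr)).reshape(-1, 1)
    return np.hstack([link_arr, ids])


data = np.random.default_rng(0).normal(size=(10, 3))
larr = extend_linkage_sketch(linkage(pdist(data)))
print(larr.shape)  # (9, 5): the four standard linkage columns plus the id column
```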
dataeval/{_internal/detectors → detectors/linters}/duplicates.py:

```diff
@@ -1,13 +1,15 @@
 from __future__ import annotations
 
+__all__ = ["DuplicatesOutput", "Duplicates"]
+
 from dataclasses import dataclass
-from typing import Generic, Iterable, Sequence, TypeVar
+from typing import Generic, Iterable, Sequence, TypeVar, overload
 
 from numpy.typing import ArrayLike
 
-from dataeval.
-from dataeval.
-from dataeval.
+from dataeval.detectors.linters.merged_stats import combine_stats, get_dataset_step_from_idx
+from dataeval.metrics.stats.hashstats import HashStatsOutput, hashstats
+from dataeval.output import OutputMetadata, set_metadata
 
 DuplicateGroup = list[int]
 DatasetDuplicateGroupMap = dict[int, DuplicateGroup]
@@ -58,7 +60,7 @@ class Duplicates:
     >>> exact_dupes = Duplicates(only_exact=True)
     """
 
-    def __init__(self, only_exact: bool = False):
+    def __init__(self, only_exact: bool = False) -> None:
         self.stats: HashStatsOutput
         self.only_exact = only_exact
 
@@ -81,8 +83,16 @@ class Duplicates:
             "near": sorted(near),
         }
 
-    @
-    def from_stats(self, hashes: HashStatsOutput
+    @overload
+    def from_stats(self, hashes: HashStatsOutput) -> DuplicatesOutput[DuplicateGroup]: ...
+
+    @overload
+    def from_stats(self, hashes: Sequence[HashStatsOutput]) -> DuplicatesOutput[DatasetDuplicateGroupMap]: ...
+
+    @set_metadata(["only_exact"])
+    def from_stats(
+        self, hashes: HashStatsOutput | Sequence[HashStatsOutput]
+    ) -> DuplicatesOutput[DuplicateGroup] | DuplicatesOutput[DatasetDuplicateGroupMap]:
         """
         Returns duplicate image indices for both exact matches and near matches
 
@@ -128,8 +138,8 @@ class Duplicates:
 
         return DuplicatesOutput(**duplicates)
 
-    @set_metadata(
-    def evaluate(self, data: Iterable[ArrayLike]) -> DuplicatesOutput:
+    @set_metadata(["only_exact"])
+    def evaluate(self, data: Iterable[ArrayLike]) -> DuplicatesOutput[DuplicateGroup]:
         """
         Returns duplicate image indices for both exact matches and near matches
 
```
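The `from_stats` rewrite is the standard `typing.overload` pattern: two stub signatures give the type checker precise return types (a single `HashStatsOutput` maps to `DuplicatesOutput[DuplicateGroup]`, a sequence to `DuplicatesOutput[DatasetDuplicateGroupMap]`), while one runtime implementation handles both. A minimal, self-contained sketch of the pattern with illustrative names, not dataeval's:

```python
from __future__ import annotations

from typing import Sequence, overload


@overload
def summarize(x: int) -> str: ...
@overload
def summarize(x: Sequence[int]) -> list[str]: ...


def summarize(x: int | Sequence[int]) -> str | list[str]:
    # Only this implementation exists at runtime; the stubs above let a
    # type checker narrow the return type per call site.
    if isinstance(x, int):
        return f"value={x}"
    return [f"value={v}" for v in x]


print(summarize(3))       # a checker sees str here
print(summarize([1, 2]))  # and list[str] here
```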
dataeval/{_internal/detectors → detectors/linters}/merged_stats.py:

```diff
@@ -1,11 +1,13 @@
 from __future__ import annotations
 
+__all__ = []
+
 from copy import deepcopy
 from typing import Sequence, TypeVar
 
 import numpy as np
 
-from dataeval.
+from dataeval.metrics.stats.base import BaseStatsOutput
 
 TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput)
 
```
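The `TypeVar(..., bound=BaseStatsOutput)` is what lets `combine_stats` report that combining, say, pixel-stats outputs yields the same concrete output type rather than the base class. A generic sketch of why the bound matters, with illustrative classes rather than dataeval's:

```python
from typing import TypeVar


class BaseOutput: ...
class PixelOutput(BaseOutput): ...


T = TypeVar("T", bound=BaseOutput)


def combine(parts: list[T]) -> T:
    # With a bound TypeVar the checker infers combine(list[PixelOutput])
    # as PixelOutput, while still restricting T to BaseOutput subclasses.
    return parts[0]


first: PixelOutput = combine([PixelOutput(), PixelOutput()])
```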
dataeval/{_internal/detectors → detectors/linters}/outliers.py:

```diff
@@ -1,18 +1,20 @@
 from __future__ import annotations
 
+__all__ = ["OutliersOutput", "Outliers"]
+
 from dataclasses import dataclass
 from typing import Generic, Iterable, Literal, Sequence, TypeVar, Union, overload
 
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 
-from dataeval.
-from dataeval.
-from dataeval.
-from dataeval.
-from dataeval.
-from dataeval.
-from dataeval.
+from dataeval.detectors.linters.merged_stats import combine_stats, get_dataset_step_from_idx
+from dataeval.metrics.stats.base import BOX_COUNT, SOURCE_INDEX
+from dataeval.metrics.stats.datasetstats import DatasetStatsOutput, datasetstats
+from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput
+from dataeval.metrics.stats.pixelstats import PixelStatsOutput
+from dataeval.metrics.stats.visualstats import VisualStatsOutput
+from dataeval.output import OutputMetadata, set_metadata
 
 IndexIssueMap = dict[int, dict[str, float]]
 OutlierStatsOutput = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
@@ -37,7 +39,7 @@ class OutliersOutput(Generic[TIndexIssueMap], OutputMetadata):
 
     issues: TIndexIssueMap
 
-    def __len__(self):
+    def __len__(self) -> int:
         if isinstance(self.issues, dict):
             return len(self.issues)
         else:
@@ -157,10 +159,10 @@ class Outliers:
     @overload
     def from_stats(self, stats: Sequence[OutlierStatsOutput]) -> OutliersOutput[list[IndexIssueMap]]: ...
 
-    @set_metadata(
+    @set_metadata(["outlier_method", "outlier_threshold"])
     def from_stats(
         self, stats: OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
-    ) -> OutliersOutput:
+    ) -> OutliersOutput[IndexIssueMap] | OutliersOutput[list[IndexIssueMap]]:
         """
         Returns indices of Outliers with the issues identified for each
 
@@ -195,7 +197,7 @@ class Outliers:
         {}
         """  # noqa: E501
         if isinstance(stats, DatasetStatsOutput):
-            outliers = self._get_outliers({k: v for o in stats.
+            outliers = self._get_outliers({k: v for o in stats._outputs() for k, v in o.dict().items()})
             return OutliersOutput(outliers)
 
         if isinstance(stats, (DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)):
@@ -226,16 +228,7 @@ class Outliers:
 
         return OutliersOutput(output_list)
 
-    @set_metadata(
-        "dataeval.detectors",
-        [
-            "use_dimension",
-            "use_pixel",
-            "use_visual",
-            "outlier_method",
-            "outlier_threshold",
-        ],
-    )
+    @set_metadata(["use_dimension", "use_pixel", "use_visual", "outlier_method", "outlier_threshold"])
     def evaluate(self, data: Iterable[ArrayLike]) -> OutliersOutput[IndexIssueMap]:
         """
         Returns indices of Outliers with the issues identified for each
```
dataeval/detectors/ood/__init__.py:

```diff
@@ -5,11 +5,11 @@ Out-of-distribution (OOD)` detectors identify data that is different from the da
 from dataeval import _IS_TENSORFLOW_AVAILABLE
 
 if _IS_TENSORFLOW_AVAILABLE:  # pragma: no cover
-    from dataeval.
-    from dataeval.
-    from dataeval.
-    from dataeval.
-    from dataeval.
-    from dataeval.
+    from dataeval.detectors.ood.ae import OOD_AE
+    from dataeval.detectors.ood.aegmm import OOD_AEGMM
+    from dataeval.detectors.ood.base import OODOutput, OODScoreOutput
+    from dataeval.detectors.ood.llr import OOD_LLR
+    from dataeval.detectors.ood.vae import OOD_VAE
+    from dataeval.detectors.ood.vaegmm import OOD_VAEGMM
 
 __all__ = ["OOD_AE", "OOD_AEGMM", "OOD_LLR", "OOD_VAE", "OOD_VAEGMM", "OODOutput", "OODScoreOutput"]
```
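The guarded block keys off `dataeval._IS_TENSORFLOW_AVAILABLE` so the OOD detectors only import when the optional TensorFlow extra is installed. How the flag is computed is not part of this diff; a common approach, shown purely as an assumption:

```python
# Hypothetical sketch: probe for the optional dependency without importing it.
from importlib.util import find_spec

_IS_TENSORFLOW_AVAILABLE: bool = find_spec("tensorflow") is not None
```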
dataeval/{_internal/detectors → detectors}/ood/ae.py:

```diff
@@ -8,18 +8,27 @@ Licensed under Apache Software License (Apache 2.0)
 
 from __future__ import annotations
 
-
+__all__ = ["OOD_AE"]
+
+from typing import TYPE_CHECKING, Callable
 
 import numpy as np
-import tensorflow as tf
-import tf_keras as keras
 from numpy.typing import ArrayLike
 
-from dataeval.
-from dataeval.
-from dataeval.
-from dataeval.
-
+from dataeval.detectors.ood.base import OODBase, OODScoreOutput
+from dataeval.interop import as_numpy
+from dataeval.utils.lazy import lazyload
+from dataeval.utils.tensorflow._internal.utils import predict_batch
+
+if TYPE_CHECKING:
+    import tensorflow as tf
+    import tf_keras as keras
+
+    import dataeval.utils.tensorflow._internal.models as tf_models
+else:
+    tf = lazyload("tensorflow")
+    keras = lazyload("tf_keras")
+    tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
 
 
 class OOD_AE(OODBase):
@@ -32,7 +41,7 @@ class OOD_AE(OODBase):
         An :term:`autoencoder<Autoencoder>` model.
     """
 
-    def __init__(self, model: AE) -> None:
+    def __init__(self, model: tf_models.AE) -> None:
         super().__init__(model)
 
     def fit(
@@ -40,7 +49,7 @@ class OOD_AE(OODBase):
         x_ref: ArrayLike,
         threshold_perc: float = 100.0,
         loss_fn: Callable[..., tf.Tensor] | None = None,
-        optimizer: keras.optimizers.Optimizer =
+        optimizer: keras.optimizers.Optimizer | None = None,
         epochs: int = 20,
         batch_size: int = 64,
         verbose: bool = True,
@@ -49,8 +58,7 @@ class OOD_AE(OODBase):
             loss_fn = keras.losses.MeanSquaredError()
         super().fit(as_numpy(x_ref), threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
 
-
-    def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
+    def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
         self._validate(X := as_numpy(X))
 
         # reconstruct instances
```
dataeval/detectors/ood/aegmm.py (new file):

```diff
@@ -0,0 +1,66 @@
+"""
+Source code derived from Alibi-Detect 0.11.4
+https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
+
+Original code Copyright (c) 2023 Seldon Technologies Ltd
+Licensed under Apache Software License (Apache 2.0)
+"""
+
+from __future__ import annotations
+
+__all__ = ["OOD_AEGMM"]
+
+from typing import TYPE_CHECKING, Callable
+
+from numpy.typing import ArrayLike
+
+from dataeval.detectors.ood.base import OODGMMBase, OODScoreOutput
+from dataeval.interop import to_numpy
+from dataeval.utils.lazy import lazyload
+from dataeval.utils.tensorflow._internal.gmm import gmm_energy
+from dataeval.utils.tensorflow._internal.loss import LossGMM
+from dataeval.utils.tensorflow._internal.utils import predict_batch
+
+if TYPE_CHECKING:
+    import tensorflow as tf
+    import tf_keras as keras
+
+    import dataeval.utils.tensorflow._internal.models as tf_models
+else:
+    tf = lazyload("tensorflow")
+    keras = lazyload("tf_keras")
+    tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
+
+
+class OOD_AEGMM(OODGMMBase):
+    """
+    AE with Gaussian Mixture Model based outlier detector.
+
+    Parameters
+    ----------
+    model : AEGMM
+        An AEGMM model.
+    """
+
+    def __init__(self, model: tf_models.AEGMM) -> None:
+        super().__init__(model)
+
+    def fit(
+        self,
+        x_ref: ArrayLike,
+        threshold_perc: float = 100.0,
+        loss_fn: Callable[..., tf.Tensor] | None = None,
+        optimizer: keras.optimizers.Optimizer | None = None,
+        epochs: int = 20,
+        batch_size: int = 64,
+        verbose: bool = True,
+    ) -> None:
+        if loss_fn is None:
+            loss_fn = LossGMM()
+        super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
+
+    def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
+        self._validate(X := to_numpy(X))
+        _, z, _ = predict_batch(X, self.model, batch_size=batch_size)
+        energy, _ = gmm_energy(z, self.gmm_params, return_mean=False)
+        return OODScoreOutput(energy.numpy())  # type: ignore
```
dataeval/{_internal/detectors → detectors}/ood/base.py:

```diff
@@ -8,19 +8,27 @@ Licensed under Apache Software License (Apache 2.0)
 
 from __future__ import annotations
 
+__all__ = ["OODOutput", "OODScoreOutput"]
+
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Callable, Literal, cast
+from typing import TYPE_CHECKING, Callable, Literal, cast
 
 import numpy as np
-import tensorflow as tf
-import tf_keras as keras
 from numpy.typing import ArrayLike, NDArray
 
-from dataeval.
-from dataeval.
-from dataeval.
-from dataeval._internal.
+from dataeval.interop import to_numpy
+from dataeval.output import OutputMetadata, set_metadata
+from dataeval.utils.lazy import lazyload
+from dataeval.utils.tensorflow._internal.gmm import GaussianMixtureModelParams, gmm_params
+from dataeval.utils.tensorflow._internal.trainer import trainer
+
+if TYPE_CHECKING:
+    import tensorflow as tf
+    import tf_keras as keras
+else:
+    tf = lazyload("tensorflow")
+    keras = lazyload("tf_keras")
 
 
 @dataclass(frozen=True)
@@ -61,7 +69,7 @@ class OODScoreOutput(OutputMetadata):
     instance_score: NDArray[np.float32]
     feature_score: NDArray[np.float32] | None = None
 
-    def get(self, ood_type: Literal["instance", "feature"]) -> NDArray:
+    def get(self, ood_type: Literal["instance", "feature"]) -> NDArray[np.float32]:
         """
         Returns either the instance or feature score
 
@@ -107,6 +115,9 @@ class OODBase(ABC):
         self._validate(X)
 
     @abstractmethod
+    def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput: ...
+
+    @set_metadata()
     def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
         """
         Compute the :term:`out of distribution<Out-of-distribution (OOD)>` scores for a given dataset.
@@ -124,6 +135,7 @@ class OODBase(ABC):
         OODScoreOutput
             An object containing the instance-level and feature-level OOD scores.
         """
+        return self._score(X, batch_size)
 
     def _threshold_score(self, ood_type: Literal["feature", "instance"] = "instance") -> np.floating:
         return np.percentile(self._ref_score.get(ood_type), self._threshold_perc)
@@ -131,12 +143,12 @@ class OODBase(ABC):
     def fit(
         self,
         x_ref: ArrayLike,
-        threshold_perc: float
-        loss_fn: Callable[..., tf.Tensor]
-        optimizer: keras.optimizers.Optimizer
-        epochs: int
-        batch_size: int
-        verbose: bool
+        threshold_perc: float,
+        loss_fn: Callable[..., tf.Tensor],
+        optimizer: keras.optimizers.Optimizer,
+        epochs: int,
+        batch_size: int,
+        verbose: bool,
     ) -> None:
         """
         Train the model and infer the threshold value.
@@ -174,7 +186,7 @@ class OODBase(ABC):
         self._ref_score = self.score(x_ref, batch_size)
         self._threshold_perc = threshold_perc
 
-    @set_metadata(
+    @set_metadata()
     def predict(
         self,
         X: ArrayLike,
@@ -218,12 +230,12 @@ class OODGMMBase(OODBase):
     def fit(
         self,
         x_ref: ArrayLike,
-        threshold_perc: float
-        loss_fn: Callable[..., tf.Tensor]
-        optimizer: keras.optimizers.Optimizer
-        epochs: int
-        batch_size: int
-        verbose: bool
+        threshold_perc: float,
+        loss_fn: Callable[..., tf.Tensor],
+        optimizer: keras.optimizers.Optimizer,
+        epochs: int,
+        batch_size: int,
+        verbose: bool,
     ) -> None:
         # Train the model
         trainer(
```