dataeval 0.64.0-py3-none-any.whl → 0.66.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +13 -9
- dataeval/_internal/detectors/clusterer.py +63 -49
- dataeval/_internal/detectors/drift/base.py +248 -51
- dataeval/_internal/detectors/drift/cvm.py +28 -26
- dataeval/_internal/detectors/drift/ks.py +31 -28
- dataeval/_internal/detectors/drift/mmd.py +62 -42
- dataeval/_internal/detectors/drift/torch.py +69 -60
- dataeval/_internal/detectors/drift/uncertainty.py +32 -32
- dataeval/_internal/detectors/duplicates.py +67 -31
- dataeval/_internal/detectors/ood/ae.py +15 -29
- dataeval/_internal/detectors/ood/aegmm.py +33 -27
- dataeval/_internal/detectors/ood/base.py +86 -47
- dataeval/_internal/detectors/ood/llr.py +34 -31
- dataeval/_internal/detectors/ood/vae.py +32 -31
- dataeval/_internal/detectors/ood/vaegmm.py +34 -28
- dataeval/_internal/detectors/{linter.py → outliers.py} +60 -38
- dataeval/_internal/flags.py +44 -21
- dataeval/_internal/interop.py +5 -3
- dataeval/_internal/metrics/balance.py +42 -5
- dataeval/_internal/metrics/ber.py +11 -8
- dataeval/_internal/metrics/coverage.py +15 -8
- dataeval/_internal/metrics/divergence.py +41 -7
- dataeval/_internal/metrics/diversity.py +57 -19
- dataeval/_internal/metrics/parity.py +141 -66
- dataeval/_internal/metrics/stats.py +330 -313
- dataeval/_internal/metrics/uap.py +33 -4
- dataeval/_internal/metrics/utils.py +79 -40
- dataeval/_internal/models/pytorch/autoencoder.py +127 -22
- dataeval/_internal/models/tensorflow/autoencoder.py +33 -30
- dataeval/_internal/models/tensorflow/gmm.py +4 -2
- dataeval/_internal/models/tensorflow/losses.py +17 -13
- dataeval/_internal/models/tensorflow/pixelcnn.py +19 -18
- dataeval/_internal/models/tensorflow/trainer.py +10 -7
- dataeval/_internal/models/tensorflow/utils.py +23 -20
- dataeval/_internal/output.py +85 -0
- dataeval/_internal/utils.py +5 -3
- dataeval/_internal/workflows/sufficiency.py +122 -121
- dataeval/detectors/__init__.py +6 -25
- dataeval/detectors/drift/__init__.py +16 -0
- dataeval/detectors/drift/kernels/__init__.py +6 -0
- dataeval/detectors/drift/updates/__init__.py +3 -0
- dataeval/detectors/linters/__init__.py +5 -0
- dataeval/detectors/ood/__init__.py +11 -0
- dataeval/flags/__init__.py +2 -2
- dataeval/metrics/__init__.py +2 -26
- dataeval/metrics/bias/__init__.py +14 -0
- dataeval/metrics/estimators/__init__.py +9 -0
- dataeval/metrics/stats/__init__.py +6 -0
- dataeval/tensorflow/__init__.py +3 -0
- dataeval/tensorflow/loss/__init__.py +3 -0
- dataeval/tensorflow/models/__init__.py +5 -0
- dataeval/tensorflow/recon/__init__.py +3 -0
- dataeval/torch/__init__.py +3 -0
- dataeval/{models/torch → torch/models}/__init__.py +1 -2
- dataeval/torch/trainer/__init__.py +3 -0
- dataeval/utils/__init__.py +3 -6
- dataeval/workflows/__init__.py +2 -4
- {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/METADATA +1 -1
- dataeval-0.66.0.dist-info/RECORD +72 -0
- dataeval/_internal/metrics/base.py +0 -10
- dataeval/models/__init__.py +0 -15
- dataeval/models/tensorflow/__init__.py +0 -6
- dataeval-0.64.0.dist-info/RECORD +0 -60
- {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/WHEEL +0 -0
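Taken together, the file moves above are a reorganization of the public namespace: drift, linter, and OOD detectors each get a dedicated subpackage, the metrics split into bias, estimators, and stats, and framework-specific code moves under dataeval.tensorflow and dataeval.torch. A rough before/after import sketch follows; the subpackage paths come from the file list, but the re-exported names (e.g. Outliers) are assumptions, since this diff does not show the new __init__.py contents:

```python
# 0.64.0 layout (old paths, illustrative)
# from dataeval.detectors import Linter
# from dataeval.models import torch as torch_models

# 0.66.0 layout: subpackage paths follow the file list above;
# the exported names are assumptions, not confirmed by this diff.
from dataeval.detectors.linters import Outliers    # detectors/linter.py -> detectors/outliers.py
from dataeval.detectors.drift import updates       # drift detectors get their own subpackage
from dataeval.metrics import bias, estimators      # metrics split by category
from dataeval.torch import models                  # dataeval/models/torch -> dataeval/torch/models
```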
dataeval/__init__.py
CHANGED
@@ -1,18 +1,22 @@
+__version__ = "0.66.0"
+
 from importlib.util import find_spec
 
-
+_IS_TORCH_AVAILABLE = find_spec("torch") is not None
+_IS_TENSORFLOW_AVAILABLE = find_spec("tensorflow") is not None and find_spec("tensorflow_probability") is not None
+
+del find_spec
 
-
+from . import detectors, flags, metrics  # noqa: E402
 
 __all__ = ["detectors", "flags", "metrics"]
 
-if
-    from . import
+if _IS_TORCH_AVAILABLE:  # pragma: no cover
+    from . import torch, utils, workflows
 
-    __all__ += ["
-elif find_spec("tensorflow") is not None:  # pragma: no cover
-    from . import models
+    __all__ += ["torch", "utils", "workflows"]
 
-
+if _IS_TENSORFLOW_AVAILABLE:  # pragma: no cover
+    from . import tensorflow
 
-
+    __all__ += ["tensorflow"]
dataeval/_internal/detectors/clusterer.py
CHANGED
@@ -1,26 +1,52 @@
-from
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Iterable, NamedTuple, cast
 
 import numpy as np
-from numpy.typing import ArrayLike
+from numpy.typing import ArrayLike, NDArray
 from scipy.cluster.hierarchy import linkage
 from scipy.spatial.distance import pdist, squareform
 
 from dataeval._internal.interop import to_numpy
+from dataeval._internal.metrics.utils import flatten
+from dataeval._internal.output import OutputMetadata, set_metadata
 
 
-def extend_linkage(link_arr: np.ndarray) -> np.ndarray:
+@dataclass(frozen=True)
+class ClustererOutput(OutputMetadata):
+    """
+    Attributes
+    ----------
+    outliers : List[int]
+        Indices that do not fall within a cluster
+    potential_outliers : List[int]
+        Indices which are near the border between belonging in the cluster and being an outlier
+    duplicates : List[List[int]]
+        Groups of indices that are exact duplicates
+    potential_duplicates : List[List[int]]
+        Groups of indices which are not exact but closely related data points
+    """
+
+    outliers: list[int]
+    potential_outliers: list[int]
+    duplicates: list[list[int]]
+    potential_duplicates: list[list[int]]
+
+
+def extend_linkage(link_arr: NDArray) -> NDArray:
     """
     Adds a column to the linkage matrix link_arr that tracks the new id assigned
     to each row
 
     Parameters
     ----------
-    link_arr : np.ndarray
+    link_arr : NDArray
         linkage matrix
 
     Returns
     -------
-    np.ndarray
+    NDArray
         linkage matrix with adjusted shape, new shape (link_arr.shape[0], link_arr.shape[1]+1)
     """
     # Adjusting linkage matrix to accommodate renumbering
@@ -35,7 +61,7 @@ def extend_linkage(link_arr: np.ndarray) -> np.ndarray:
 class Cluster:
     __slots__ = "merged", "samples", "sample_dist", "is_copy", "count", "dist_avg", "dist_std", "out1", "out2"
 
-    def __init__(self, merged: int, samples:
+    def __init__(self, merged: int, samples: NDArray, sample_dist: float | NDArray, is_copy: bool = False):
         self.merged = merged
         self.samples = np.array(samples, dtype=np.int32)
         self.sample_dist = np.array([sample_dist] if np.isscalar(sample_dist) else sample_dist)
@@ -57,7 +83,7 @@ class Cluster:
         self.out1 = dist > out1
         self.out2 = dist > out2
 
-    def copy(self) ->
+    def copy(self) -> Cluster:
         return Cluster(False, self.samples, self.sample_dist, True)
 
     def __repr__(self) -> str:
@@ -70,7 +96,7 @@ class Cluster:
         return f"{self.__class__.__name__}(**{repr(_params)})"
 
 
-class Clusters(
+class Clusters(dict[int, dict[int, Cluster]]):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.max_level: int = 1
@@ -92,10 +118,10 @@ class ClusterMergeEntry:
         self.inner_cluster = inner_cluster
         self.status = status
 
-    def __lt__(self, value:
+    def __lt__(self, value: ClusterMergeEntry) -> bool:
         return self.level.__lt__(value.level)
 
-    def __gt__(self, value:
+    def __gt__(self, value: ClusterMergeEntry) -> bool:
         return self.level.__gt__(value.level)
 
 
@@ -130,13 +156,13 @@ class Clusterer:
         self._on_init(dataset)
 
     def _on_init(self, dataset: ArrayLike):
-        self._data:
+        self._data: NDArray = flatten(to_numpy(dataset))
         self._validate_data(self._data)
         self._num_samples = len(self._data)
 
-        self._darr:
-        self._sqdmat:
-        self._larr:
+        self._darr: NDArray = pdist(self._data, metric="euclidean")
+        self._sqdmat: NDArray = squareform(self._darr)
+        self._larr: NDArray = extend_linkage(linkage(self._darr))
         self._max_clusters: int = np.count_nonzero(self._larr[:, 3] == 2)
 
         min_num = int(self._num_samples * 0.05)
@@ -146,7 +172,7 @@ class Clusterer:
         self._last_good_merge_levels = None
 
     @property
-    def data(self) ->
+    def data(self) -> NDArray:
         return self._data
 
     @data.setter
@@ -160,16 +186,16 @@ class Clusterer:
         return self._clusters
 
     @property
-    def last_good_merge_levels(self) ->
+    def last_good_merge_levels(self) -> dict[int, int]:
         if self._last_good_merge_levels is None:
             self._last_good_merge_levels = self._get_last_merge_levels()
         return self._last_good_merge_levels
 
     @classmethod
-    def _validate_data(cls, x:
+    def _validate_data(cls, x: NDArray):
         """Checks that the data has the correct size, shape, and format"""
         if not isinstance(x, np.ndarray):
-            raise TypeError(f"Data should be of type
+            raise TypeError(f"Data should be of type NDArray; got {type(x)}")
 
         if x.ndim != 2:
             raise ValueError(
@@ -184,7 +210,7 @@ class Clusterer:
     def _create_clusters(self) -> Clusters:
         """Generates clusters based on linkage matrix"""
         next_cluster_id = 0
-        cluster_map:
+        cluster_map: dict[int, ClusterPosition] = {}  # Dictionary to associate new cluster ids with actual clusters
         clusters: Clusters = Clusters()
 
         # Walking through the linkage array to generate clusters
@@ -212,7 +238,7 @@ class Clusterer:
                 # Update clusters to include previously skipped levels
                 clusters = self._fill_levels(clusters, left, right)
             elif left or right:
-                child, other_id = cast(
+                child, other_id = cast(tuple[ClusterPosition, int], (left, right_id) if left else (right, left_id))
                 cc = clusters[child.level][child.cid]
                 samples = np.concatenate([cc.samples, [other_id]])
                 sample_dist = np.concatenate([cc.sample_dist, sample_dist])
@@ -240,7 +266,7 @@ class Clusterer:
             clusters[level_id].setdefault(cid, cluster)
         return clusters
 
-    def _get_cluster_distances(self) ->
+    def _get_cluster_distances(self) -> NDArray:
         """Calculates the minimum distances between clusters are each level"""
         # Cluster distance matrix
         max_level = self.clusters.max_level
@@ -261,7 +287,7 @@ class Clusterer:
 
         return cluster_matrix
 
-    def _calc_merge_indices(self, merge_mean:
+    def _calc_merge_indices(self, merge_mean: list[NDArray], intra_max: list[float]) -> NDArray:
         """
         Determine what clusters should be merged and return their indices
         """
@@ -284,7 +310,7 @@ class Clusterer:
         mask2 = mask2_vals < one_std_check
         return np.logical_or(desired_merge, mask2)
 
-    def _generate_merge_list(self, cluster_matrix:
+    def _generate_merge_list(self, cluster_matrix: NDArray) -> list[ClusterMergeEntry]:
         """
         Runs through the clusters dictionary determining when clusters merge,
         and how close are those clusters when they merge.
@@ -301,7 +327,7 @@ class Clusterer:
         """
         intra_max = []
         merge_mean = []
-        merge_list:
+        merge_list: list[ClusterMergeEntry] = []
 
         for level, cluster_set in self.clusters.items():
             for outer_cluster, cluster in cluster_set.items():
@@ -339,7 +365,7 @@ class Clusterer:
 
         return merge_list
 
-    def _get_last_merge_levels(self) ->
+    def _get_last_merge_levels(self) -> dict[int, int]:
         """
         Creates a dictionary for important cluster ids mapped to their last good merge level
 
@@ -348,7 +374,7 @@ class Clusterer:
         Dict[int, int]
             A mapping of a cluster id to its last good merge level
         """
-        last_merge_levels:
+        last_merge_levels: dict[int, int] = {}
 
         if self._max_clusters <= 1:
             last_merge_levels = {0: int(self._num_samples * 0.1)}
@@ -371,7 +397,7 @@ class Clusterer:
 
         return last_merge_levels
 
-    def find_outliers(self, last_merge_levels:
+    def find_outliers(self, last_merge_levels: dict[int, int]) -> tuple[list[int], list[int]]:
         """
         Retrieves outliers based on when the sample was added to the cluster
        and how far it was from the cluster when it was added
@@ -415,9 +441,9 @@ class Clusterer:
 
         return sorted(outliers), sorted(possible_outliers)
 
-    def _sorted_union_find(self, index_groups: Iterable[Iterable[int]]) ->
+    def _sorted_union_find(self, index_groups: Iterable[Iterable[int]]) -> list[list[int]]:
         """Merges and sorts groups of indices that share any common index"""
-        groups:
+        groups: list[list[int]] = []
         for indices in zip(*index_groups):
             indices = set(indices)
             temp = []
@@ -430,7 +456,7 @@ class Clusterer:
             groups = temp
         return sorted(groups)
 
-    def find_duplicates(self, last_merge_levels:
+    def find_duplicates(self, last_merge_levels: dict[int, int]) -> tuple[list[list[int]], list[list[int]]]:
         """
         Finds duplicate and near duplicate data based on the last good merge levels when building the cluster
 
@@ -464,35 +490,23 @@ class Clusterer:
 
         return exact_dupes, near_dupes
 
-
+    # TODO: Move data input to evaluate from class
+    @set_metadata("dataeval.detectors", ["data"])
+    def evaluate(self) -> ClustererOutput:
         """Finds and flags indices of the data for outliers and duplicates
 
         Returns
         -------
-
-        outliers
-            List of indices that do not fall within a cluster
-        potential_outliers :
-            List of indices which are near the border between belonging in the cluster and being an outlier
-        duplicates :
-            List of groups of indices that are exact duplicates
-        potential_duplicates :
-            List of groups of indices which are not exact but closely related data points
+        ClustererOutput
+            The outliers and duplicate indices found in the data
 
         Example
         -------
         >>> cluster.evaluate()
-
+        ClustererOutput(outliers=[18, 21, 34, 35, 45], potential_outliers=[13, 15, 42], duplicates=[[9, 24], [23, 48]], potential_duplicates=[[1, 11]])
         """  # noqa: E501
 
         outliers, potential_outliers = self.find_outliers(self.last_good_merge_levels)
         duplicates, potential_duplicates = self.find_duplicates(self.last_good_merge_levels)
 
-
-            "outliers": outliers,
-            "potential_outliers": potential_outliers,
-            "duplicates": duplicates,
-            "potential_duplicates": potential_duplicates,
-        }
-
-        return ret
+        return ClustererOutput(outliers, potential_outliers, duplicates, potential_duplicates)