dataeval 0.72.0__py3-none-any.whl → 0.72.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +4 -4
- dataeval/detectors/__init__.py +4 -3
- dataeval/detectors/drift/__init__.py +10 -11
- dataeval/{_internal/detectors → detectors}/drift/base.py +51 -102
- dataeval/{_internal/detectors → detectors}/drift/cvm.py +9 -8
- dataeval/{_internal/detectors → detectors}/drift/ks.py +11 -10
- dataeval/{_internal/detectors → detectors}/drift/mmd.py +33 -34
- dataeval/{_internal/detectors → detectors}/drift/torch.py +15 -13
- dataeval/{_internal/detectors → detectors}/drift/uncertainty.py +12 -9
- dataeval/detectors/drift/updates.py +61 -0
- dataeval/detectors/linters/__init__.py +3 -3
- dataeval/{_internal/detectors → detectors/linters}/clusterer.py +47 -45
- dataeval/{_internal/detectors → detectors/linters}/duplicates.py +20 -10
- dataeval/{_internal/detectors → detectors/linters}/merged_stats.py +3 -1
- dataeval/{_internal/detectors → detectors/linters}/outliers.py +19 -26
- dataeval/detectors/ood/__init__.py +8 -16
- dataeval/{_internal/detectors → detectors}/ood/ae.py +9 -9
- dataeval/{_internal/detectors → detectors}/ood/aegmm.py +10 -30
- dataeval/{_internal/detectors → detectors}/ood/base.py +27 -21
- dataeval/{_internal/detectors → detectors}/ood/llr.py +27 -23
- dataeval/detectors/ood/metadata_ks_compare.py +99 -0
- dataeval/detectors/ood/metadata_least_likely.py +119 -0
- dataeval/detectors/ood/metadata_ood_mi.py +92 -0
- dataeval/{_internal/detectors → detectors}/ood/vae.py +11 -13
- dataeval/{_internal/detectors → detectors}/ood/vaegmm.py +10 -32
- dataeval/{_internal/interop.py → interop.py} +12 -7
- dataeval/metrics/__init__.py +1 -1
- dataeval/metrics/bias/__init__.py +4 -4
- dataeval/{_internal/metrics → metrics/bias}/balance.py +70 -4
- dataeval/{_internal/metrics → metrics/bias}/coverage.py +10 -8
- dataeval/{_internal/metrics → metrics/bias}/diversity.py +54 -20
- dataeval/metrics/bias/metadata.py +275 -0
- dataeval/{_internal/metrics → metrics/bias}/parity.py +21 -17
- dataeval/metrics/estimators/__init__.py +3 -3
- dataeval/{_internal/metrics → metrics/estimators}/ber.py +31 -28
- dataeval/{_internal/metrics → metrics/estimators}/divergence.py +15 -16
- dataeval/{_internal/metrics → metrics/estimators}/uap.py +8 -6
- dataeval/metrics/stats/__init__.py +7 -7
- dataeval/{_internal/metrics → metrics}/stats/base.py +66 -40
- dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py +19 -15
- dataeval/{_internal/metrics → metrics}/stats/datasetstats.py +19 -17
- dataeval/{_internal/metrics → metrics}/stats/dimensionstats.py +12 -10
- dataeval/metrics/stats/hashstats.py +156 -0
- dataeval/{_internal/metrics → metrics}/stats/labelstats.py +8 -6
- dataeval/{_internal/metrics → metrics}/stats/pixelstats.py +12 -11
- dataeval/{_internal/metrics → metrics}/stats/visualstats.py +14 -13
- dataeval/{_internal/output.py → output.py} +26 -6
- dataeval/utils/__init__.py +8 -4
- dataeval/utils/image.py +71 -0
- dataeval/utils/shared.py +151 -0
- dataeval/utils/split_dataset.py +486 -0
- dataeval/utils/tensorflow/__init__.py +9 -7
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/autoencoder.py +64 -68
- dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py} +10 -9
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/pixelcnn.py +18 -22
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py +3 -1
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py +18 -18
- dataeval/utils/tensorflow/loss/__init__.py +6 -2
- dataeval/utils/torch/__init__.py +7 -3
- dataeval/{_internal/models/pytorch → utils/torch}/blocks.py +19 -14
- dataeval/{_internal → utils/torch}/datasets.py +49 -43
- dataeval/utils/torch/models.py +138 -0
- dataeval/{_internal/models/pytorch/autoencoder.py → utils/torch/trainer.py} +12 -141
- dataeval/{_internal → utils/torch}/utils.py +3 -1
- dataeval/workflows/__init__.py +1 -1
- dataeval/{_internal/workflows → workflows}/sufficiency.py +42 -37
- {dataeval-0.72.0.dist-info → dataeval-0.72.2.dist-info}/METADATA +7 -5
- dataeval-0.72.2.dist-info/RECORD +72 -0
- dataeval/_internal/detectors/__init__.py +0 -0
- dataeval/_internal/detectors/drift/__init__.py +0 -0
- dataeval/_internal/detectors/ood/__init__.py +0 -0
- dataeval/_internal/metrics/__init__.py +0 -0
- dataeval/_internal/metrics/stats/hashstats.py +0 -75
- dataeval/_internal/metrics/utils.py +0 -447
- dataeval/_internal/models/__init__.py +0 -0
- dataeval/_internal/models/pytorch/__init__.py +0 -0
- dataeval/_internal/models/pytorch/utils.py +0 -67
- dataeval/_internal/models/tensorflow/__init__.py +0 -0
- dataeval/_internal/workflows/__init__.py +0 -0
- dataeval/detectors/drift/kernels/__init__.py +0 -10
- dataeval/detectors/drift/updates/__init__.py +0 -7
- dataeval/utils/tensorflow/models/__init__.py +0 -9
- dataeval/utils/tensorflow/recon/__init__.py +0 -3
- dataeval/utils/torch/datasets/__init__.py +0 -12
- dataeval/utils/torch/models/__init__.py +0 -11
- dataeval/utils/torch/trainer/__init__.py +0 -7
- dataeval-0.72.0.dist-info/RECORD +0 -80
- /dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/gmm.py +0 -0
- {dataeval-0.72.0.dist-info → dataeval-0.72.2.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.72.0.dist-info → dataeval-0.72.2.dist-info}/WHEEL +0 -0
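The dominant change in 0.72.2 is the promotion of the private `dataeval._internal` tree into the public package layout, as the renames above show. A hedged sketch of what this means for imports, using only names that appear in the diffs below (the 0.72.0 path is reconstructed from the file moves and may not match the old re-exports exactly):

```python
# 0.72.0 -- implementation lived under dataeval._internal (per the moves above):
#   from dataeval._internal.detectors.clusterer import Clusterer
# 0.72.2 -- the same classes are importable from the public modules:
from dataeval.detectors.linters import Clusterer, Duplicates, Outliers
from dataeval.detectors.drift.updates import LastSeenUpdate, ReservoirSamplingUpdate
```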
dataeval/detectors/drift/torch.py

```diff
@@ -8,8 +8,10 @@ Licensed under Apache Software License (Apache 2.0)
 
 from __future__ import annotations
 
+__all__ = []
+
 from functools import partial
-from typing import Callable
+from typing import Any, Callable
 
 import numpy as np
 import torch
@@ -42,7 +44,7 @@ def get_device(device: str | torch.device | None = None) -> torch.device:
     return torch_device
 
 
-def mmd2_from_kernel_matrix(
+def _mmd2_from_kernel_matrix(
     kernel_mat: torch.Tensor, m: int, permute: bool = False, zero_diag: bool = True
 ) -> torch.Tensor:
     """
```
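The rename to `_mmd2_from_kernel_matrix` (like the other underscore renames below) marks the helper as private API. For context, a minimal sketch of the standard unbiased MMD² estimator such a helper computes, written from the textbook formulation rather than copied from dataeval's source; names are illustrative:

```python
import torch

def mmd2_sketch(kernel_mat: torch.Tensor, m: int, zero_diag: bool = True) -> torch.Tensor:
    """Unbiased MMD^2 from a kernel matrix over the stacked samples [x; y],
    assuming the last m rows/columns belong to the second sample y."""
    n = kernel_mat.shape[0] - m
    if zero_diag:
        # Drop the k(z, z) self-similarity terms from the within-sample sums
        kernel_mat = kernel_mat - torch.diag(kernel_mat.diag())
    k_xx = kernel_mat[:-m, :-m]  # within first sample
    k_yy = kernel_mat[-m:, -m:]  # within second sample
    k_xy = kernel_mat[-m:, :-m]  # across samples
    c_xx, c_yy = 1 / (n * (n - 1)), 1 / (m * (m - 1))
    return c_xx * k_xx.sum() + c_yy * k_yy.sum() - 2.0 * k_xy.mean()
```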
dataeval/detectors/drift/torch.py (continued)

```diff
@@ -78,13 +80,13 @@ def mmd2_from_kernel_matrix(
 
 
 def predict_batch(
-    x: NDArray | torch.Tensor,
+    x: NDArray[Any] | torch.Tensor,
     model: Callable | nn.Module | nn.Sequential,
     device: torch.device | None = None,
     batch_size: int = int(1e10),
     preprocess_fn: Callable | None = None,
     dtype: type[np.generic] | torch.dtype = np.float32,
-) -> NDArray | torch.Tensor | tuple:
+) -> NDArray[Any] | torch.Tensor | tuple[Any, ...]:
     """
     Make batch predictions on a model.
 
@@ -102,7 +104,7 @@ def predict_batch(
     preprocess_fn : Callable | None, default None
         Optional preprocessing function for each batch.
     dtype : np.dtype | torch.dtype, default np.float32
-        Model output type, either a numpy or torch dtype, e.g. np.float32 or torch.float32.
+        Model output type, either a :term:`NumPy` or torch dtype, e.g. np.float32 or torch.float32.
 
     Returns
     -------
@@ -154,13 +156,13 @@ def predict_batch(
 
 
 def preprocess_drift(
-    x: NDArray,
+    x: NDArray[Any],
     model: nn.Module,
-    device: torch.device | None = None,
+    device: str | torch.device | None = None,
     preprocess_batch_fn: Callable | None = None,
     batch_size: int = int(1e10),
     dtype: type[np.generic] | torch.dtype = np.float32,
-) -> NDArray | torch.Tensor | tuple:
+) -> NDArray[Any] | torch.Tensor | tuple[Any, ...]:
     """
     Prediction function used for preprocessing step of drift detector.
 
@@ -179,7 +181,7 @@ def preprocess_drift(
     batch_size : int, default 1e10
         Batch size used during prediction.
     dtype : np.dtype | torch.dtype, default np.float32
-        Model output type, either a numpy or torch dtype, e.g. np.float32 or torch.float32.
+        Model output type, either a :term:`NumPy` or torch dtype, e.g. np.float32 or torch.float32.
 
     Returns
     -------
@@ -189,7 +191,7 @@ def preprocess_drift(
     return predict_batch(
         x,
         model,
-        device=device,
+        device=get_device(device),
         batch_size=batch_size,
         preprocess_fn=preprocess_batch_fn,
         dtype=dtype,
```
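The one behavioral fix here: `preprocess_drift` widens `device` to accept a string and resolves it with `get_device` before delegating to `predict_batch`, so `device="cpu"` or `"cuda"` now works rather than only a `torch.device`. A usage sketch (the encoder and data are illustrative):

```python
import numpy as np
import torch.nn as nn

from dataeval.detectors.drift.torch import preprocess_drift

x = np.random.rand(64, 3, 16, 16).astype(np.float32)
encoder = nn.Sequential(nn.Flatten(), nn.LazyLinear(32))  # illustrative embedding model

# As of 0.72.2, plain strings are resolved through get_device() internally
features = preprocess_drift(x, encoder, device="cpu", batch_size=16)
```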
dataeval/detectors/drift/torch.py (continued)

```diff
@@ -197,7 +199,7 @@ def preprocess_drift(
 
 
 @torch.jit.script
-def squared_pairwise_distance(
+def _squared_pairwise_distance(
     x: torch.Tensor, y: torch.Tensor, a_min: float = 1e-30
 ) -> torch.Tensor:  # pragma: no cover - torch.jit.script code is compiled and copied
     """
@@ -249,7 +251,7 @@ def sigma_median(x: torch.Tensor, y: torch.Tensor, dist: torch.Tensor) -> torch.Tensor:
     return sigma
 
 
-class GaussianRBF(nn.Module):
+class _GaussianRBF(nn.Module):
     """
     Gaussian RBF kernel: k(x,y) = exp(-(1/(2*sigma^2)||x-y||^2).
 
@@ -303,7 +305,7 @@ class GaussianRBF(nn.Module):
         infer_sigma: bool = False,
     ) -> torch.Tensor:
         x, y = torch.as_tensor(x), torch.as_tensor(y)
-        dist = squared_pairwise_distance(x.flatten(1), y.flatten(1))  # [Nx, Ny]
+        dist = _squared_pairwise_distance(x.flatten(1), y.flatten(1))  # [Nx, Ny]
 
         if infer_sigma or self.init_required:
             if self.trainable and infer_sigma:
```
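`_GaussianRBF.forward` pairs the docstring's formula with `_squared_pairwise_distance`; an equivalent computation using stock `torch.cdist` (a sketch, not the jit-scripted implementation):

```python
import torch

def gaussian_rbf(x: torch.Tensor, y: torch.Tensor, sigma: float) -> torch.Tensor:
    # k(x, y) = exp(-||x - y||^2 / (2 * sigma^2)), computed on flattened inputs
    dist = torch.cdist(x.flatten(1), y.flatten(1)) ** 2  # [Nx, Ny] squared distances
    return torch.exp(-dist / (2 * sigma**2))

k = gaussian_rbf(torch.randn(4, 8), torch.randn(5, 8), sigma=1.0)  # [4, 5], values in (0, 1]
```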
dataeval/detectors/drift/uncertainty.py

```diff
@@ -8,6 +8,8 @@ Licensed under Apache Software License (Apache 2.0)
 
 from __future__ import annotations
 
+__all__ = ["DriftUncertainty"]
+
 from functools import partial
 from typing import Callable, Literal
 
@@ -16,16 +18,16 @@ from numpy.typing import ArrayLike, NDArray
 from scipy.special import softmax
 from scipy.stats import entropy
 
-from .base import DriftOutput, UpdateStrategy
-from .ks import DriftKS
-from .torch import get_device, preprocess_drift
+from dataeval.detectors.drift.base import DriftOutput, UpdateStrategy
+from dataeval.detectors.drift.ks import DriftKS
+from dataeval.detectors.drift.torch import get_device, preprocess_drift
 
 
 def classifier_uncertainty(
-    x: NDArray,
+    x: NDArray[np.float64],
     model_fn: Callable,
     preds_type: Literal["probs", "logits"] = "probs",
-) -> NDArray:
+) -> NDArray[np.float64]:
     """
     Evaluate model_fn on x and transform predictions to prediction uncertainties.
 
@@ -34,7 +36,7 @@ def classifier_uncertainty(
     x : np.ndarray
         Batch of instances.
     model_fn : Callable
-        Function that evaluates a classification model on x in a single call (contains
+        Function that evaluates a :term:`classification<Classification>` model on x in a single call (contains
         batching logic if necessary).
     preds_type : "probs" | "logits", default "probs"
        Type of prediction output by the model. Options are 'probs' (in [0,1]) or
@@ -73,9 +75,9 @@ class DriftUncertainty:
     x_ref : ArrayLike
         Data used as reference distribution.
     model : Callable
-        Classification model outputting class probabilities (or logits)
+        :term:`Classification` model outputting class probabilities (or logits)
     p_val : float, default 0.05
-        p-value used for the significance of the test.
+        :term:`P-Value` used for the significance of the test.
     x_ref_preprocessed : bool, default False
         Whether the given reference data ``x_ref`` has been preprocessed yet.
         If ``True``, only the test data ``x`` will be preprocessed at prediction time.
@@ -145,6 +147,7 @@ class DriftUncertainty:
         Returns
         -------
         DriftUnvariateOutput
-            Dictionary containing the drift prediction, p-value, and threshold statistics.
+            Dictionary containing the drift prediction, :term:`p-value<P-Value>`, and threshold
+            statistics.
         """
         return self._detector.predict(x)
```
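Taken together, `classifier_uncertainty` converts model outputs into prediction entropies (via `softmax` and `entropy`) and `DriftUncertainty` feeds them to a `DriftKS` test. A hypothetical end-to-end sketch; only the constructor arguments documented above are used, and the model and data names are illustrative:

```python
import numpy as np
import torch.nn as nn

from dataeval.detectors.drift.uncertainty import DriftUncertainty

x_ref = np.random.rand(128, 3, 16, 16).astype(np.float32)  # reference distribution
x_new = np.random.rand(64, 3, 16, 16).astype(np.float32)   # batch to check

classifier = nn.Sequential(nn.Flatten(), nn.LazyLinear(10))  # illustrative model
detector = DriftUncertainty(x_ref, model=classifier, p_val=0.05)
result = detector.predict(x_new)  # KS test over the two entropy distributions
```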
dataeval/detectors/drift/updates.py (new file)

```diff
@@ -0,0 +1,61 @@
+"""
+Update strategies inform how the :term:`drift<Drift>` detector classes update the reference data when monitoring
+for drift.
+"""
+
+from __future__ import annotations
+
+__all__ = ["LastSeenUpdate", "ReservoirSamplingUpdate"]
+
+from typing import Any
+
+import numpy as np
+from numpy.typing import NDArray
+
+from dataeval.detectors.drift.base import UpdateStrategy
+
+
+class LastSeenUpdate(UpdateStrategy):
+    """
+    Updates reference dataset for :term:`drift<Drift>` detector using last seen method.
+
+    Parameters
+    ----------
+    n : int
+        Update with last n instances seen by the detector.
+    """
+
+    def __call__(self, x_ref: NDArray[Any], x: NDArray[Any], count: int) -> NDArray[Any]:
+        x_updated = np.concatenate([x_ref, x], axis=0)
+        return x_updated[-self.n :]
+
+
+class ReservoirSamplingUpdate(UpdateStrategy):
+    """
+    Updates reference dataset for :term:`drift<Drift>` detector using reservoir sampling method.
+
+    Parameters
+    ----------
+    n : int
+        Update with last n instances seen by the detector.
+    """
+
+    def __call__(self, x_ref: NDArray[Any], x: NDArray[Any], count: int) -> NDArray[Any]:
+        if x.shape[0] + count <= self.n:
+            return np.concatenate([x_ref, x], axis=0)
+
+        n_ref = x_ref.shape[0]
+        output_size = min(self.n, n_ref + x.shape[0])
+        shape = (output_size,) + x.shape[1:]
+        x_reservoir = np.zeros(shape, dtype=x_ref.dtype)
+        x_reservoir[:n_ref] = x_ref
+        for item in x:
+            count += 1
+            if n_ref < self.n:
+                x_reservoir[n_ref, :] = item
+                n_ref += 1
+            else:
+                r = np.random.randint(0, count)
+                if r < self.n:
+                    x_reservoir[r, :] = item
+        return x_reservoir
```
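Both strategies cap the reference set at `n`. `LastSeenUpdate` is a sliding window; `ReservoirSamplingUpdate` is classic reservoir sampling: once the reservoir is full, the item that is `count` instances deep replaces a random slot with probability `n/count`, keeping the reservoir a uniform sample of everything seen. A usage sketch; `UpdateStrategy.__init__` is assumed to store `n` (per the class docstrings), which is not shown in this diff:

```python
import numpy as np

from dataeval.detectors.drift.updates import LastSeenUpdate, ReservoirSamplingUpdate

x_ref = np.random.rand(100, 8)
batch = np.random.rand(32, 8)

window = LastSeenUpdate(200)              # keep only the newest 200 instances
reservoir = ReservoirSamplingUpdate(200)  # keep a uniform sample of all instances seen

x_ref = window(x_ref, batch, count=100)      # newest rows, at most 200
sample = reservoir(x_ref, batch, count=132)  # uniform reservoir, at most 200
```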
dataeval/detectors/linters/__init__.py

```diff
@@ -2,9 +2,9 @@
 Linters help identify potential issues in training and test data and are an important aspect of data cleaning.
 """
 
-from dataeval._internal.detectors.clusterer import Clusterer, ClustererOutput
-from dataeval._internal.detectors.duplicates import Duplicates, DuplicatesOutput
-from dataeval._internal.detectors.outliers import Outliers, OutliersOutput
+from dataeval.detectors.linters.clusterer import Clusterer, ClustererOutput
+from dataeval.detectors.linters.duplicates import Duplicates, DuplicatesOutput
+from dataeval.detectors.linters.outliers import Outliers, OutliersOutput
 
 __all__ = [
     "Clusterer",
```
dataeval/detectors/linters/clusterer.py

```diff
@@ -1,16 +1,18 @@
 from __future__ import annotations
 
+__all__ = ["ClustererOutput", "Clusterer"]
+
 from dataclasses import dataclass
-from typing import Iterable, NamedTuple, cast
+from typing import Any, Iterable, NamedTuple, cast
 
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 from scipy.cluster.hierarchy import linkage
 from scipy.spatial.distance import pdist, squareform
 
-from dataeval._internal.interop import to_numpy
-from dataeval._internal.metrics.utils import flatten
-from dataeval._internal.output import OutputMetadata, set_metadata
+from dataeval.interop import to_numpy
+from dataeval.output import OutputMetadata, set_metadata
+from dataeval.utils.shared import flatten
 
 
 @dataclass(frozen=True)
@@ -25,7 +27,7 @@ class ClustererOutput(OutputMetadata):
     potential_outliers : List[int]
         Indices which are near the border between belonging in the cluster and being an outlier
     duplicates : List[List[int]]
-        Groups of indices that are exact duplicates
+        Groups of indices that are exact :term:`duplicates<Duplicates>`
     potential_duplicates : List[List[int]]
         Groups of indices which are not exact but closely related data points
     """
@@ -36,7 +38,7 @@ class ClustererOutput(OutputMetadata):
     potential_duplicates: list[list[int]]
 
 
-def extend_linkage(link_arr: NDArray) -> NDArray:
+def _extend_linkage(link_arr: NDArray) -> NDArray:
     """
     Adds a column to the linkage matrix link_arr that tracks the new id assigned
     to each row
@@ -60,10 +62,10 @@ def extend_linkage(link_arr: NDArray) -> NDArray:
     return arr
 
 
-class Cluster:
+class _Cluster:
     __slots__ = "merged", "samples", "sample_dist", "is_copy", "count", "dist_avg", "dist_std", "out1", "out2"
 
-    def __init__(self, merged: int, samples: NDArray, sample_dist: float | NDArray, is_copy: bool = False):
+    def __init__(self, merged: int, samples: NDArray, sample_dist: float | NDArray, is_copy: bool = False) -> None:
         self.merged = merged
         self.samples = np.array(samples, dtype=np.int32)
         self.sample_dist = np.array([sample_dist] if np.isscalar(sample_dist) else sample_dist)
@@ -85,8 +87,8 @@ class Cluster:
         self.out1 = dist > out1
         self.out2 = dist > out2
 
-    def copy(self) -> Cluster:
-        return Cluster(False, self.samples, self.sample_dist, True)
+    def copy(self) -> _Cluster:
+        return _Cluster(False, self.samples, self.sample_dist, True)
 
     def __repr__(self) -> str:
         _params = {
```
dataeval/detectors/linters/clusterer.py (continued)

```diff
@@ -98,38 +100,38 @@ class Cluster:
         return f"{self.__class__.__name__}(**{repr(_params)})"
 
 
-class Clusters(dict[int, dict[int, Cluster]]):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
+class _Clusters(dict[int, dict[int, _Cluster]]):
+    def __init__(self, *args: dict[int, dict[int, _Cluster]]) -> None:
+        super().__init__(*args)
         self.max_level: int = 1
 
 
-class ClusterPosition(NamedTuple):
+class _ClusterPosition(NamedTuple):
     """Keeps track of a cluster's level and ID"""
 
     level: int
     cid: int
 
 
-class ClusterMergeEntry:
+class _ClusterMergeEntry:
     __slots__ = "level", "outer_cluster", "inner_cluster", "status"
 
-    def __init__(self, level: int, outer_cluster: int, inner_cluster: int, status: int):
+    def __init__(self, level: int, outer_cluster: int, inner_cluster: int, status: int) -> None:
         self.level = level
         self.outer_cluster = outer_cluster
         self.inner_cluster = inner_cluster
         self.status = status
 
-    def __lt__(self, value: ClusterMergeEntry) -> bool:
+    def __lt__(self, value: _ClusterMergeEntry) -> bool:
         return self.level.__lt__(value.level)
 
-    def __gt__(self, value: ClusterMergeEntry) -> bool:
+    def __gt__(self, value: _ClusterMergeEntry) -> bool:
         return self.level.__gt__(value.level)
 
 
 class Clusterer:
     """
-    Uses hierarchical clustering to flag dataset properties of interest like outliers and duplicates
+    Uses hierarchical clustering to flag dataset properties of interest like Outliers and :term:`duplicates<Duplicates>`
 
     Parameters
     ----------
@@ -153,36 +155,36 @@ class Clusterer:
     >>> cluster = Clusterer(dataset)
     """
 
-    def __init__(self, dataset: ArrayLike):
+    def __init__(self, dataset: ArrayLike) -> None:
         # Allows an update to dataset to reset the state rather than instantiate a new class
         self._on_init(dataset)
 
     def _on_init(self, dataset: ArrayLike):
-        self._data: NDArray = flatten(to_numpy(dataset))
+        self._data: NDArray[Any] = flatten(to_numpy(dataset))
         self._validate_data(self._data)
         self._num_samples = len(self._data)
 
-        self._darr: NDArray = pdist(self._data, metric="euclidean")
-        self._sqdmat: NDArray = squareform(self._darr)
-        self._larr: NDArray = extend_linkage(linkage(self._darr))
+        self._darr: NDArray[np.floating[Any]] = pdist(self._data, metric="euclidean")
+        self._sqdmat: NDArray[np.floating[Any]] = squareform(self._darr)
+        self._larr: NDArray[np.floating[Any]] = _extend_linkage(linkage(self._darr))
         self._max_clusters: int = np.count_nonzero(self._larr[:, 3] == 2)
 
         min_num = int(self._num_samples * 0.05)
-        self._min_num_samples_per_cluster = min(max(2, min_num), 100)
+        self._min_num_samples_per_cluster: int = min(max(2, min_num), 100)
 
-        self._clusters = None
-        self._last_good_merge_levels = None
+        self._clusters: _Clusters | None = None
+        self._last_good_merge_levels: dict[int, int] | None = None
 
     @property
-    def data(self) -> NDArray:
+    def data(self) -> NDArray[Any]:
         return self._data
 
     @data.setter
-    def data(self, x: ArrayLike):
+    def data(self, x: ArrayLike) -> None:
         self._on_init(x)
 
     @property
-    def clusters(self) -> Clusters:
+    def clusters(self) -> _Clusters:
         if self._clusters is None:
             self._clusters = self._create_clusters()
         return self._clusters
```
dataeval/detectors/linters/clusterer.py (continued)

```diff
@@ -209,11 +211,11 @@ class Clusterer:
         if features < 1:
             raise ValueError(f"Samples should have at least 1 feature; got {features}")
 
-    def _create_clusters(self) -> Clusters:
+    def _create_clusters(self) -> _Clusters:
         """Generates clusters based on linkage matrix"""
         next_cluster_id = 0
-        cluster_map: dict[int, ClusterPosition] = {}  # Dictionary to associate new cluster ids with actual clusters
-        clusters: Clusters = Clusters()
+        cluster_map: dict[int, _ClusterPosition] = {}  # Dictionary to associate new cluster ids with actual clusters
+        clusters: _Clusters = _Clusters()
 
         # Walking through the linkage array to generate clusters
         for arr_i in self._larr:
@@ -240,7 +242,7 @@ class Clusterer:
                 # Update clusters to include previously skipped levels
                 clusters = self._fill_levels(clusters, left, right)
             elif left or right:
-                child, other_id = cast(tuple[ClusterPosition, int], (left, right_id) if left else (right, left_id))
+                child, other_id = cast(tuple[_ClusterPosition, int], (left, right_id) if left else (right, left_id))
                 cc = clusters[child.level][child.cid]
                 samples = np.concatenate([cc.samples, [other_id]])
                 sample_dist = np.concatenate([cc.sample_dist, sample_dist])
@@ -254,12 +256,12 @@ class Clusterer:
             if level not in clusters:
                 clusters[level] = {}
 
-            clusters[level][cid] = Cluster(merged, samples, sample_dist)
-            cluster_map[int(arr_i[-1])] = ClusterPosition(level, cid)
+            clusters[level][cid] = _Cluster(merged, samples, sample_dist)
+            cluster_map[int(arr_i[-1])] = _ClusterPosition(level, cid)
 
         return clusters
 
-    def _fill_levels(self, clusters: Clusters, left: ClusterPosition, right: ClusterPosition) -> Clusters:
+    def _fill_levels(self, clusters: _Clusters, left: _ClusterPosition, right: _ClusterPosition) -> _Clusters:
         # Sets each level's cluster info if it does not exist
         if left.level != right.level:
             (level, cid), max_level = (left, right[0]) if left[0] < right[0] else (right, left[0])
@@ -312,7 +314,7 @@ class Clusterer:
         mask2 = mask2_vals < one_std_check
         return np.logical_or(desired_merge, mask2)
 
-    def _generate_merge_list(self, cluster_matrix: NDArray) -> list[ClusterMergeEntry]:
+    def _generate_merge_list(self, cluster_matrix: NDArray) -> list[_ClusterMergeEntry]:
         """
         Runs through the clusters dictionary determining when clusters merge,
         and how close are those clusters when they merge.
@@ -329,7 +331,7 @@ class Clusterer:
         """
         intra_max = []
         merge_mean = []
-        merge_list: list[ClusterMergeEntry] = []
+        merge_list: list[_ClusterMergeEntry] = []
 
         for level, cluster_set in self.clusters.items():
             for outer_cluster, cluster in cluster_set.items():
@@ -356,7 +358,7 @@ class Clusterer:
                 # Calculate the corresponding distance stats
                 distance_stats_arr = aggregate_func(distances)
                 merge_mean.append(distance_stats_arr)
-                merge_list.append(ClusterMergeEntry(level, outer_cluster, inner_cluster, 0))
+                merge_list.append(_ClusterMergeEntry(level, outer_cluster, inner_cluster, 0))
 
         all_merge_indices = self._calc_merge_indices(merge_mean=merge_mean, intra_max=intra_max)
 
@@ -401,7 +403,7 @@ class Clusterer:
 
     def find_outliers(self, last_merge_levels: dict[int, int]) -> tuple[list[int], list[int]]:
         """
-        Retrieves outliers based on when the sample was added to the cluster
+        Retrieves Outliers based on when the sample was added to the cluster
         and how far it was from the cluster when it was added
 
         Parameters
@@ -470,7 +472,7 @@ class Clusterer:
         Returns
         -------
         Tuple[List[List[int]], List[List[int]]]
-            The exact duplicates and near duplicates as lists of related indices
+            The exact :term:`duplicates<Duplicates>` and near duplicates as lists of related indices
         """
 
         duplicates_std = []
@@ -493,14 +495,14 @@ class Clusterer:
         return exact_dupes, near_dupes
 
     # TODO: Move data input to evaluate from class
-    @set_metadata("dataeval.detectors", ["data"])
+    @set_metadata(["data"])
     def evaluate(self) -> ClustererOutput:
-        """Finds and flags indices of the data for outliers and duplicates
+        """Finds and flags indices of the data for Outliers and :term:`duplicates<Duplicates>`
 
         Returns
        -------
         ClustererOutput
-            The outliers and duplicate indices found in the data
+            The Outliers and duplicate indices found in the data
 
         Example
         -------
```
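Basic usage stays a two-liner after the restructure; a sketch using only the output fields documented above (the random dataset is illustrative):

```python
import numpy as np

from dataeval.detectors.linters import Clusterer

dataset = np.random.rand(50, 16)  # 50 samples; Clusterer flattens inputs internally
results = Clusterer(dataset).evaluate()

# Fields per ClustererOutput: outliers, potential_outliers,
# duplicates, potential_duplicates
print(results.outliers, results.duplicates)
```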
dataeval/detectors/linters/duplicates.py

```diff
@@ -1,13 +1,15 @@
 from __future__ import annotations
 
+__all__ = ["DuplicatesOutput", "Duplicates"]
+
 from dataclasses import dataclass
-from typing import Generic, Iterable, Sequence, TypeVar
+from typing import Generic, Iterable, Sequence, TypeVar, overload
 
 from numpy.typing import ArrayLike
 
-from dataeval._internal.detectors.merged_stats import combine_stats, get_dataset_step_from_idx
-from dataeval._internal.metrics.stats.hashstats import HashStatsOutput, hashstats
-from dataeval._internal.output import OutputMetadata, set_metadata
+from dataeval.detectors.linters.merged_stats import combine_stats, get_dataset_step_from_idx
+from dataeval.metrics.stats.hashstats import HashStatsOutput, hashstats
+from dataeval.output import OutputMetadata, set_metadata
 
 DuplicateGroup = list[int]
 DatasetDuplicateGroupMap = dict[int, DuplicateGroup]
@@ -37,7 +39,7 @@ class DuplicatesOutput(Generic[TIndexCollection], OutputMetadata):
 
 class Duplicates:
     """
-    Finds the duplicate images in a dataset using xxhash for exact duplicates
+    Finds the duplicate images in a dataset using xxhash for exact :term:`duplicates<Duplicates>`
     and pchash for near duplicates
 
     Attributes
@@ -58,7 +60,7 @@ class Duplicates:
     >>> exact_dupes = Duplicates(only_exact=True)
     """
 
-    def __init__(self, only_exact: bool = False):
+    def __init__(self, only_exact: bool = False) -> None:
         self.stats: HashStatsOutput
         self.only_exact = only_exact
 
@@ -81,8 +83,16 @@ class Duplicates:
             "near": sorted(near),
         }
 
-    @set_metadata("dataeval.detectors", ["only_exact"])
-    def from_stats(self, hashes: HashStatsOutput | Sequence[HashStatsOutput]) -> DuplicatesOutput:
+    @overload
+    def from_stats(self, hashes: HashStatsOutput) -> DuplicatesOutput[DuplicateGroup]: ...
+
+    @overload
+    def from_stats(self, hashes: Sequence[HashStatsOutput]) -> DuplicatesOutput[DatasetDuplicateGroupMap]: ...
+
+    @set_metadata(["only_exact"])
+    def from_stats(
+        self, hashes: HashStatsOutput | Sequence[HashStatsOutput]
+    ) -> DuplicatesOutput[DuplicateGroup] | DuplicatesOutput[DatasetDuplicateGroupMap]:
         """
         Returns duplicate image indices for both exact matches and near matches
 
@@ -128,8 +138,8 @@ class Duplicates:
 
         return DuplicatesOutput(**duplicates)
 
-    @set_metadata("dataeval.detectors", ["only_exact"])
-    def evaluate(self, data: Iterable[ArrayLike]) -> DuplicatesOutput:
+    @set_metadata(["only_exact"])
+    def evaluate(self, data: Iterable[ArrayLike]) -> DuplicatesOutput[DuplicateGroup]:
         """
         Returns duplicate image indices for both exact matches and near matches
 
```
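The new `@overload` pair is the point of this change: the return type of `from_stats` now tracks the input, so a single `HashStatsOutput` yields flat duplicate groups while a sequence yields a per-dataset map. A hedged sketch; `hashstats` is assumed to accept an iterable of images, matching `evaluate`'s `Iterable[ArrayLike]` signature above:

```python
import numpy as np

from dataeval.detectors.linters import Duplicates
from dataeval.metrics.stats.hashstats import hashstats

images_a = np.random.rand(16, 1, 8, 8)  # illustrative image stacks
images_b = np.random.rand(16, 1, 8, 8)

dupes = Duplicates()

single = dupes.from_stats(hashstats(images_a))
# -> DuplicatesOutput[DuplicateGroup]: lists of duplicate indices

combined = dupes.from_stats([hashstats(images_a), hashstats(images_b)])
# -> DuplicatesOutput[DatasetDuplicateGroupMap]: dataset index -> groups
```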
dataeval/detectors/linters/merged_stats.py

```diff
@@ -1,11 +1,13 @@
 from __future__ import annotations
 
+__all__ = []
+
 from copy import deepcopy
 from typing import Sequence, TypeVar
 
 import numpy as np
 
-from dataeval._internal.metrics.stats.base import BaseStatsOutput
+from dataeval.metrics.stats.base import BaseStatsOutput
 
 TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput)
 
```