PyPI - dataeval - Versions diffs - 0.76.0__py3-none-any.whl → 0.81.0__py3-none-any.whl - Mend

dataeval 0.76.0__py3-none-any.whl → 0.81.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96) hide show

dataeval/__init__.py +3 -3
dataeval/{output.py → _output.py} +14 -0
dataeval/config.py +77 -0
dataeval/detectors/__init__.py +1 -1
dataeval/detectors/drift/__init__.py +6 -6
dataeval/detectors/drift/{base.py → _base.py} +41 -30
dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
dataeval/detectors/drift/{mmd.py → _mmd.py} +33 -19
dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +23 -7
dataeval/detectors/drift/updates.py +1 -1
dataeval/detectors/linters/__init__.py +0 -3
dataeval/detectors/linters/duplicates.py +17 -8
dataeval/detectors/linters/outliers.py +52 -43
dataeval/detectors/ood/ae.py +29 -8
dataeval/detectors/ood/base.py +5 -4
dataeval/detectors/ood/metadata_ks_compare.py +1 -1
dataeval/detectors/ood/mixin.py +20 -5
dataeval/detectors/ood/output.py +1 -1
dataeval/detectors/ood/vae.py +73 -0
dataeval/metadata/__init__.py +5 -0
dataeval/metadata/_ood.py +238 -0
dataeval/metrics/__init__.py +1 -1
dataeval/metrics/bias/__init__.py +5 -4
dataeval/metrics/bias/{balance.py → _balance.py} +67 -17
dataeval/metrics/bias/{coverage.py → _coverage.py} +41 -35
dataeval/metrics/bias/{diversity.py → _diversity.py} +17 -12
dataeval/metrics/bias/{parity.py → _parity.py} +89 -63
dataeval/metrics/estimators/__init__.py +14 -4
dataeval/metrics/estimators/{ber.py → _ber.py} +42 -11
dataeval/metrics/estimators/_clusterer.py +104 -0
dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -13
dataeval/metrics/estimators/{uap.py → _uap.py} +4 -4
dataeval/metrics/stats/__init__.py +7 -7
dataeval/metrics/stats/{base.py → _base.py} +52 -16
dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +6 -9
dataeval/metrics/stats/{datasetstats.py → _datasetstats.py} +10 -14
dataeval/metrics/stats/{dimensionstats.py → _dimensionstats.py} +6 -5
dataeval/metrics/stats/{hashstats.py → _hashstats.py} +6 -6
dataeval/metrics/stats/{labelstats.py → _labelstats.py} +25 -25
dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +5 -4
dataeval/metrics/stats/{visualstats.py → _visualstats.py} +9 -8
dataeval/typing.py +54 -0
dataeval/utils/__init__.py +2 -2
dataeval/utils/_array.py +169 -0
dataeval/utils/_bin.py +199 -0
dataeval/utils/_clusterer.py +144 -0
dataeval/utils/_fast_mst.py +189 -0
dataeval/utils/{image.py → _image.py} +6 -4
dataeval/utils/_method.py +18 -0
dataeval/utils/{shared.py → _mst.py} +3 -65
dataeval/utils/{plot.py → _plot.py} +4 -4
dataeval/utils/data/__init__.py +22 -0
dataeval/utils/data/_embeddings.py +105 -0
dataeval/utils/data/_images.py +65 -0
dataeval/utils/data/_metadata.py +352 -0
dataeval/utils/data/_selection.py +119 -0
dataeval/utils/{dataset/split.py → data/_split.py} +13 -14
dataeval/utils/data/_targets.py +73 -0
dataeval/utils/data/_types.py +58 -0
dataeval/utils/data/collate.py +103 -0
dataeval/utils/data/datasets/__init__.py +17 -0
dataeval/utils/data/datasets/_base.py +254 -0
dataeval/utils/data/datasets/_cifar10.py +134 -0
dataeval/utils/data/datasets/_fileio.py +168 -0
dataeval/utils/data/datasets/_milco.py +153 -0
dataeval/utils/data/datasets/_mixin.py +56 -0
dataeval/utils/data/datasets/_mnist.py +183 -0
dataeval/utils/data/datasets/_ships.py +123 -0
dataeval/utils/data/datasets/_voc.py +352 -0
dataeval/utils/data/selections/__init__.py +15 -0
dataeval/utils/data/selections/_classfilter.py +60 -0
dataeval/utils/data/selections/_indices.py +26 -0
dataeval/utils/data/selections/_limit.py +26 -0
dataeval/utils/data/selections/_reverse.py +18 -0
dataeval/utils/data/selections/_shuffle.py +29 -0
dataeval/utils/metadata.py +198 -376
dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
dataeval/utils/torch/{internal.py → _internal.py} +21 -51
dataeval/utils/torch/models.py +43 -2
dataeval/workflows/sufficiency.py +10 -9
{dataeval-0.76.0.dist-info → dataeval-0.81.0.dist-info}/METADATA +44 -15
dataeval-0.81.0.dist-info/RECORD +94 -0
dataeval/detectors/linters/clusterer.py +0 -512
dataeval/detectors/linters/merged_stats.py +0 -49
dataeval/detectors/ood/metadata_least_likely.py +0 -119
dataeval/interop.py +0 -69
dataeval/utils/dataset/__init__.py +0 -7
dataeval/utils/dataset/datasets.py +0 -412
dataeval/utils/dataset/read.py +0 -63
dataeval-0.76.0.dist-info/RECORD +0 -67
/dataeval/{log.py → _log.py} +0 -0
/dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
{dataeval-0.76.0.dist-info → dataeval-0.81.0.dist-info}/LICENSE.txt +0 -0
{dataeval-0.76.0.dist-info → dataeval-0.81.0.dist-info}/WHEEL +0 -0

dataeval/metrics/__init__.py CHANGED Viewed

@@ -5,4 +5,4 @@ can then be analyzed in the context of a given problem.
 __all__ = ["bias", "estimators", "stats"]
-from dataeval.metrics import bias, estimators, stats
+from . import bias, estimators, stats

dataeval/metrics/bias/__init__.py CHANGED Viewed

@@ -7,6 +7,7 @@ __all__ = [
     "BalanceOutput",
     "CoverageOutput",
     "DiversityOutput",
+    "LabelParityOutput",
     "ParityOutput",
     "balance",
     "coverage",
@@ -15,7 +16,7 @@ __all__ = [
     "parity",
 ]
-from dataeval.metrics.bias.balance import BalanceOutput, balance
-from dataeval.metrics.bias.coverage import CoverageOutput, coverage
-from dataeval.metrics.bias.diversity import DiversityOutput, diversity
-from dataeval.metrics.bias.parity import ParityOutput, label_parity, parity
+from dataeval.metrics.bias._balance import BalanceOutput, balance
+from dataeval.metrics.bias._coverage import CoverageOutput, coverage
+from dataeval.metrics.bias._diversity import DiversityOutput, diversity
+from dataeval.metrics.bias._parity import LabelParityOutput, ParityOutput, label_parity, parity

dataeval/metrics/bias/{balance.py → _balance.py} RENAMED Viewed

@@ -5,16 +5,17 @@ __all__ = []
 import contextlib
 import warnings
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, Literal, overload
 import numpy as np
 import scipy as sp
 from numpy.typing import NDArray
 from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
-from dataeval.output import Output, set_metadata
-from dataeval.utils.metadata import Metadata, get_counts
-from dataeval.utils.plot import heatmap
+from dataeval._output import Output, set_metadata
+from dataeval.utils._bin import get_counts
+from dataeval.utils._plot import heatmap
+from dataeval.utils.data import Metadata
 with contextlib.suppress(ImportError):
     from matplotlib.figure import Figure
@@ -23,8 +24,8 @@ with contextlib.suppress(ImportError):
 @dataclass(frozen=True)
 class BalanceOutput(Output):
     """
-    Output class for :func:`balance` :term:`bias<Bias>` metric.
+    Output class for :func:`.balance` :term:`bias<Bias>` metric.
     Attributes
     ----------
     balance : NDArray[np.float64]
@@ -35,21 +36,62 @@ class BalanceOutput(Output):
         Estimate of mutual information between metadata factors and individual class labels
     factor_names : list[str]
         Names of each metadata factor
-    class_list : NDArray
-        Array of the class labels present in the dataset
+    class_names : list[str]
+        List of the class labels present in the dataset
     """
     balance: NDArray[np.float64]
     factors: NDArray[np.float64]
     classwise: NDArray[np.float64]
     factor_names: list[str]
-    class_list: NDArray[Any]
+    class_names: list[str]
+    @overload
+    def _by_factor_type(
+        self,
+        attr: Literal["factor_names"],
+        factor_type: Literal["discrete", "continuous", "both"],
+    ) -> list[str]: ...
+    @overload
+    def _by_factor_type(
+        self,
+        attr: Literal["balance", "factors", "classwise"],
+        factor_type: Literal["discrete", "continuous", "both"],
+    ) -> NDArray[np.float64]: ...
+    def _by_factor_type(
+        self,
+        attr: Literal["balance", "factors", "classwise", "factor_names"],
+        factor_type: Literal["discrete", "continuous", "both"],
+    ) -> NDArray[np.float64] | list[str]:
+        # if not filtering by factor_type then just return the requested attribute without mask
+        if factor_type == "both":
+            return getattr(self, attr)
+        # create the mask for the selected factor_type
+        mask_lambda = (
+            (lambda x: "-continuous" not in x) if factor_type == "discrete" else (lambda x: "-discrete" not in x)
+        )
+        # return the masked attribute
+        if attr == "factor_names":
+            return [x.replace(f"-{factor_type}", "") for x in self.factor_names if mask_lambda(x)]
+        else:
+            factor_type_mask = [mask_lambda(x) for x in self.factor_names]
+            if attr == "factors":
+                return self.factors[factor_type_mask[1:]][:, factor_type_mask[1:]]
+            elif attr == "balance":
+                return self.balance[factor_type_mask]
+            elif attr == "classwise":
+                return self.classwise[:, factor_type_mask]
     def plot(
         self,
         row_labels: list[Any] | NDArray[Any] | None = None,
         col_labels: list[Any] | NDArray[Any] | None = None,
         plot_classwise: bool = False,
+        factor_type: Literal["discrete", "continuous", "both"] = "discrete",
     ) -> Figure:
         """
         Plot a heatmap of balance information
@@ -62,15 +104,17 @@ class BalanceOutput(Output):
             List/Array containing the labels for columns in the histogram
         plot_classwise : bool, default False
             Whether to plot per-class balance instead of global balance
+        factor_type : "discrete", "continuous", or "both", default "discrete"
+            Whether to plot discretized values, continuous values, or to include both
         """
         if plot_classwise:
             if row_labels is None:
-                row_labels = self.class_list
+                row_labels = self.class_names
             if col_labels is None:
-                col_labels = self.factor_names
+                col_labels = self._by_factor_type("factor_names", factor_type)
             fig = heatmap(
-                self.classwise,
+                self._by_factor_type("classwise", factor_type),
                 row_labels,
                 col_labels,
                 xlabel="Factors",
@@ -79,13 +123,19 @@ class BalanceOutput(Output):
             )
         else:
             # Combine balance and factors results
-            data = np.concatenate([self.balance[np.newaxis, 1:], self.factors], axis=0)
+            data = np.concatenate(
+                [
+                    self._by_factor_type("balance", factor_type)[np.newaxis, 1:],
+                    self._by_factor_type("factors", factor_type),
+                ],
+                axis=0,
+            )
             # Create a mask for the upper triangle of the symmetrical array, ignoring the diagonal
             mask = np.triu(data + 1, k=0) < 1
             # Finalize the data for the plot, last row is last factor x last factor so it gets dropped
             heat_data = np.where(mask, np.nan, data)[:-1]
             # Creating label array for heat map axes
-            heat_labels = self.factor_names
+            heat_labels = self._by_factor_type("factor_names", factor_type)
             if row_labels is None:
                 row_labels = heat_labels[:-1]
@@ -128,7 +178,7 @@ def balance(
     Parameters
     ----------
     metadata : Metadata
-        Preprocessed metadata from :func:`dataeval.utils.metadata.preprocess`
+        Preprocessed metadata
     num_neighbors : int, default 5
         Number of points to consider as neighbors
@@ -184,7 +234,7 @@ def balance(
     mi = np.full((num_factors, num_factors), np.nan, dtype=np.float32)
     data = np.hstack((metadata.class_labels[:, np.newaxis], metadata.discrete_data))
     discretized_data = data
-    if metadata.continuous_data is not None:
+    if len(metadata.continuous_data):
         data = np.hstack((data, metadata.continuous_data))
         discrete_idx = [metadata.discrete_factor_names.index(name) for name in metadata.continuous_factor_names]
         discretized_data = np.hstack((discretized_data, metadata.discrete_data[:, discrete_idx]))
@@ -218,7 +268,7 @@ def balance(
     factors = nmi[1:, 1:]
     # assume class is a factor
-    num_classes = metadata.class_names.size
+    num_classes = len(metadata.class_names)
     classwise_mi = np.full((num_classes, num_factors), np.nan, dtype=np.float32)
     # classwise targets

dataeval/metrics/bias/{coverage.py → _coverage.py} RENAMED Viewed

@@ -8,12 +8,12 @@ from dataclasses import dataclass
 from typing import Any, Literal
 import numpy as np
-from numpy.typing import ArrayLike, NDArray
+from numpy.typing import NDArray
 from scipy.spatial.distance import pdist, squareform
-from dataeval.interop import to_numpy
-from dataeval.output import Output, set_metadata
-from dataeval.utils.shared import flatten
+from dataeval._output import Output, set_metadata
+from dataeval.typing import ArrayLike
+from dataeval.utils._array import ensure_embeddings, flatten, to_numpy
 with contextlib.suppress(ImportError):
     from matplotlib.figure import Figure
@@ -71,21 +71,21 @@ def _plot(images: NDArray[Any], num_images: int) -> Figure:
 @dataclass(frozen=True)
 class CoverageOutput(Output):
     """
-    Output class for :func:`coverage` :term:`bias<Bias>` metric.
+    Output class for :func:`.coverage` :term:`bias<Bias>` metric.
     Attributes
     ----------
-    indices : NDArray[np.intp]
+    uncovered_indices : NDArray[np.intp]
         Array of uncovered indices
-    radii : NDArray[np.float64]
+    critical_value_radii : NDArray[np.float64]
         Array of critical value radii
-    critical_value : float
+    coverage_radius : float
         Radius for :term:`coverage<Coverage>`
     """
-    indices: NDArray[np.intp]
-    radii: NDArray[np.float64]
-    critical_value: float
+    uncovered_indices: NDArray[np.intp]
+    critical_value_radii: NDArray[np.float64]
+    coverage_radius: float
     def plot(self, images: ArrayLike, top_k: int = 6) -> Figure:
         """
@@ -102,8 +102,9 @@ class CoverageOutput(Output):
         -------
         matplotlib.figure.Figure
         """
         # Determine which images to plot
-        highest_uncovered_indices = self.indices[:top_k]
+        highest_uncovered_indices = self.uncovered_indices[:top_k]
         # Grab the images
         images = to_numpy(images)
@@ -119,7 +120,7 @@ class CoverageOutput(Output):
 def coverage(
     embeddings: ArrayLike,
     radius_type: Literal["adaptive", "naive"] = "adaptive",
-    k: int = 20,
+    num_observations: int = 20,
     percent: float = 0.01,
 ) -> CoverageOutput:
     """
@@ -128,11 +129,11 @@ def coverage(
     Parameters
     ----------
     embeddings : ArrayLike, shape - (N, P)
-        A dataset in an ArrayLike format.
-        Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
+        Dataset embeddings as unit interval [0, 1].
+        Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
     radius_type : {"adaptive", "naive"}, default "adaptive"
         The function used to determine radius.
-    k : int, default 20
+    num_observations : int, default 20
         Number of observations required in order to be covered.
         [1] suggests that a minimum of 20-50 samples is necessary.
     percent : float, default 0.01
@@ -146,7 +147,9 @@ def coverage(
     Raises
     ------
     ValueError
-        If length of :term:`embeddings<Embeddings>` is less than or equal to k
+        If embeddings are not unit interval [0-1]
+    ValueError
+        If length of :term:`embeddings<Embeddings>` is less than or equal to num_observations
     ValueError
         If radius_type is unknown
@@ -157,10 +160,10 @@ def coverage(
     Example
     -------
     >>> results = coverage(embeddings)
-    >>> results.indices
+    >>> results.uncovered_indices
     array([447, 412,   8,  32,  63])
-    >>> results.critical_value
-    0.8459038956941765
+    >>> results.coverage_radius
+    0.17592147193757596
     Reference
     ---------
@@ -169,26 +172,29 @@ def coverage(
     [1] Seymour Sudman. 1976. Applied sampling. Academic Press New York (1976).
     """
-    # Calculate distance matrix, look at the (k+1)th farthest neighbor for each image.
-    embeddings = to_numpy(embeddings)
-    n = len(embeddings)
-    if n <= k:
+    # Calculate distance matrix, look at the (num_observations + 1)th farthest neighbor for each image.
+    embeddings = ensure_embeddings(embeddings, dtype=np.float64, unit_interval=True)
+    len_embeddings = len(embeddings)
+    if len_embeddings <= num_observations:
         raise ValueError(
-            f"Number of observations n={n} is less than or equal to the specified number of neighbors k={k}."
+            f"Length of embeddings ({len_embeddings}) is less than or equal to the specified number of \
+                observations ({num_observations})."
         )
-    mat = squareform(pdist(flatten(embeddings))).astype(np.float64)
-    sorted_dists = np.sort(mat, axis=1)
-    crit = sorted_dists[:, k + 1]
+    embeddings_matrix = squareform(pdist(flatten(embeddings))).astype(np.float64)
+    sorted_dists = np.sort(embeddings_matrix, axis=1)
+    critical_value_radii = sorted_dists[:, num_observations + 1]
     d = embeddings.shape[1]
     if radius_type == "naive":
-        rho = (1 / math.sqrt(math.pi)) * ((2 * k * math.gamma(d / 2 + 1)) / (n)) ** (1 / d)
-        pvals = np.where(crit > rho)[0]
+        coverage_radius = (1 / math.sqrt(math.pi)) * (
+            (2 * num_observations * math.gamma(d / 2 + 1)) / (len_embeddings)
+        ) ** (1 / d)
+        uncovered_indices = np.where(critical_value_radii > coverage_radius)[0]
     elif radius_type == "adaptive":
-        # Use data adaptive cutoff as rho
-        selection = int(max(n * percent, 1))
-        pvals = np.argsort(crit)[::-1][:selection]
-        rho = float(np.mean(np.sort(crit)[::-1][selection - 1 : selection + 1]))
+        # Use data adaptive cutoff as coverage_radius
+        selection = int(max(len_embeddings * percent, 1))
+        uncovered_indices = np.argsort(critical_value_radii)[::-1][:selection]
+        coverage_radius = float(np.mean(np.sort(critical_value_radii)[::-1][selection - 1 : selection + 1]))
     else:
         raise ValueError(f"{radius_type} is an invalid radius type. Expected 'adaptive' or 'naive'")
-    return CoverageOutput(pvals, crit, rho)
+    return CoverageOutput(uncovered_indices, critical_value_radii, coverage_radius)

dataeval/metrics/bias/{diversity.py → _diversity.py} RENAMED Viewed

@@ -8,12 +8,14 @@ from typing import Any, Literal
 import numpy as np
 import scipy as sp
-from numpy.typing import ArrayLike, NDArray
+from numpy.typing import NDArray
-from dataeval.output import Output, set_metadata
-from dataeval.utils.metadata import Metadata, get_counts
-from dataeval.utils.plot import heatmap
-from dataeval.utils.shared import get_method
+from dataeval._output import Output, set_metadata
+from dataeval.typing import ArrayLike
+from dataeval.utils._bin import get_counts
+from dataeval.utils._method import get_method
+from dataeval.utils._plot import heatmap
+from dataeval.utils.data import Metadata
 with contextlib.suppress(ImportError):
     from matplotlib.figure import Figure
@@ -37,7 +39,7 @@ def _plot(labels: NDArray[Any], bar_heights: NDArray[Any]) -> Figure:
     """
     import matplotlib.pyplot as plt
-    fig, ax = plt.subplots(figsize=(10, 10))
+    fig, ax = plt.subplots(figsize=(8, 8))
     ax.bar(labels, bar_heights)
     ax.set_xlabel("Factors")
@@ -51,7 +53,7 @@ def _plot(labels: NDArray[Any], bar_heights: NDArray[Any]) -> Figure:
 @dataclass(frozen=True)
 class DiversityOutput(Output):
     """
-    Output class for :func:`diversity` :term:`bias<Bias>` metric.
+    Output class for :func:`.diversity` :term:`bias<Bias>` metric.
     Attributes
     ----------
@@ -61,14 +63,14 @@ class DiversityOutput(Output):
         Classwise diversity index [n_class x n_factor]
     factor_names : list[str]
         Names of each metadata factor
-    class_list : NDArray[Any]
+    class_names : list[str]
         Class labels for each value in the dataset
     """
     diversity_index: NDArray[np.double]
     classwise: NDArray[np.double]
     factor_names: list[str]
-    class_list: NDArray[Any]
+    class_names: list[str]
     def plot(
         self,
@@ -90,7 +92,7 @@ class DiversityOutput(Output):
         """
         if plot_classwise:
             if row_labels is None:
-                row_labels = self.class_list
+                row_labels = self.class_names
             if col_labels is None:
                 col_labels = self.factor_names
@@ -191,6 +193,9 @@ def diversity_simpson(
     return ev_index
+_DIVERSITY_FN_MAP = {"simpson": diversity_simpson, "shannon": diversity_shannon}
 @set_metadata
 def diversity(
     metadata: Metadata,
@@ -210,7 +215,7 @@ def diversity(
     Parameters
     ----------
     metadata : Metadata
-        Preprocessed metadata from :func:`dataeval.utils.metadata.preprocess`
+        Preprocessed metadata
     method : "simpson" or "shannon", default "simpson"
         The methodology used for defining diversity
@@ -251,7 +256,7 @@ def diversity(
     --------
     scipy.stats.entropy
     """
-    diversity_fn = get_method({"simpson": diversity_simpson, "shannon": diversity_shannon}, method)
+    diversity_fn = get_method(_DIVERSITY_FN_MAP, method)
     discretized_data = np.hstack((metadata.class_labels[:, np.newaxis], metadata.discrete_data))
     cnts = get_counts(discretized_data)
     num_bins = np.bincount(np.nonzero(cnts)[1])

dataeval 0.76.0__py3-none-any.whl → 0.81.0__py3-none-any.whl

dataeval 0.76.0__py3-none-any.whl → 0.81.0__py3-none-any.whl