PyPI - dataeval - Versions diffs - 0.72.2__py3-none-any.whl → 0.73.1__py3-none-any.whl - Mend

dataeval 0.72.2__py3-none-any.whl → 0.73.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

dataeval/__init__.py +3 -3
dataeval/detectors/__init__.py +1 -1
dataeval/detectors/drift/__init__.py +1 -1
dataeval/detectors/drift/base.py +2 -2
dataeval/detectors/linters/clusterer.py +1 -1
dataeval/detectors/ood/__init__.py +1 -1
dataeval/detectors/ood/ae.py +14 -6
dataeval/detectors/ood/aegmm.py +14 -6
dataeval/detectors/ood/base.py +9 -3
dataeval/detectors/ood/llr.py +22 -16
dataeval/detectors/ood/vae.py +14 -6
dataeval/detectors/ood/vaegmm.py +14 -6
dataeval/interop.py +9 -7
dataeval/metrics/bias/balance.py +50 -44
dataeval/metrics/bias/coverage.py +38 -6
dataeval/metrics/bias/diversity.py +117 -65
dataeval/metrics/bias/metadata.py +225 -60
dataeval/metrics/bias/parity.py +68 -54
dataeval/utils/__init__.py +4 -3
dataeval/utils/lazy.py +26 -0
dataeval/utils/metadata.py +258 -0
dataeval/utils/shared.py +1 -1
dataeval/utils/split_dataset.py +12 -6
dataeval/utils/tensorflow/_internal/gmm.py +8 -2
dataeval/utils/tensorflow/_internal/loss.py +20 -11
dataeval/utils/tensorflow/_internal/{pixelcnn.py → models.py} +371 -77
dataeval/utils/tensorflow/_internal/trainer.py +12 -5
dataeval/utils/tensorflow/_internal/utils.py +70 -71
dataeval/utils/torch/datasets.py +2 -2
dataeval/workflows/__init__.py +1 -1
{dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/METADATA +3 -3
{dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/RECORD +34 -33
dataeval/utils/tensorflow/_internal/autoencoder.py +0 -316
{dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/LICENSE.txt +0 -0
{dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/WHEEL +0 -0

dataeval/metrics/bias/coverage.py CHANGED Viewed

@@ -2,6 +2,7 @@ from __future__ import annotations
 __all__ = ["CoverageOutput", "coverage"]
+import contextlib
 import math
 from dataclasses import dataclass
 from typing import Literal
@@ -11,9 +12,13 @@ from numpy.typing import ArrayLike, NDArray
 from scipy.spatial.distance import pdist, squareform
 from dataeval.interop import to_numpy
+from dataeval.metrics.bias.metadata import coverage_plot
 from dataeval.output import OutputMetadata, set_metadata
 from dataeval.utils.shared import flatten
+with contextlib.suppress(ImportError):
+    from matplotlib.figure import Figure
 @dataclass(frozen=True)
 class CoverageOutput(OutputMetadata):
@@ -22,9 +27,9 @@ class CoverageOutput(OutputMetadata):
     Attributes
     ----------
-    indices : NDArray
+    indices : NDArray[np.intp]
         Array of uncovered indices
-    radii : NDArray
+    radii : NDArray[np.float64]
         Array of critical value radii
     critical_value : float
         Radius for :term:`coverage<Coverage>`
@@ -34,13 +39,40 @@ class CoverageOutput(OutputMetadata):
     radii: NDArray[np.float64]
     critical_value: float
+    def plot(self, images: ArrayLike, top_k: int = 6) -> Figure:
+        """
+        Plot the top k images together for visualization
+        Parameters
+        ----------
+        images : ArrayLike
+            Original images (not embeddings) in (N, C, H, W) or (N, H, W) format
+        top_k : int, default 6
+            Number of images to plot (plotting assumes groups of 3)
+        Returns
+        -------
+        matplotlib.figure.Figure
+        """
+        # Determine which images to plot
+        highest_uncovered_indices = self.indices[:top_k]
+        # Grab the images
+        images = to_numpy(images)
+        selected_images = images[highest_uncovered_indices]
+        # Plot the images
+        fig = coverage_plot(selected_images, top_k)
+        return fig
 @set_metadata()
 def coverage(
     embeddings: ArrayLike,
     radius_type: Literal["adaptive", "naive"] = "adaptive",
     k: int = 20,
-    percent: np.float64 = np.float64(0.01),
+    percent: float = 0.01,
 ) -> CoverageOutput:
     """
     Class for evaluating :term:`coverage<Coverage>` and identifying images/samples that are in undercovered regions.
@@ -50,12 +82,12 @@ def coverage(
     embeddings : ArrayLike, shape - (N, P)
         A dataset in an ArrayLike format.
         Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
-    radius_type : Literal["adaptive", "naive"], default "adaptive"
+    radius_type : {"adaptive", "naive"}, default "adaptive"
         The function used to determine radius.
-    k: int, default 20
+    k : int, default 20
         Number of observations required in order to be covered.
         [1] suggests that a minimum of 20-50 samples is necessary.
-    percent: np.float64, default np.float(0.01)
+    percent : float, default 0.01
         Percent of observations to be considered uncovered. Only applies to adaptive radius.
     Returns

dataeval/metrics/bias/diversity.py CHANGED Viewed

@@ -2,16 +2,28 @@ from __future__ import annotations
 __all__ = ["DiversityOutput", "diversity"]
+import contextlib
 from dataclasses import dataclass
 from typing import Any, Literal, Mapping
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
-from dataeval.metrics.bias.metadata import entropy, get_counts, get_num_bins, heatmap, preprocess_metadata
+from dataeval.metrics.bias.metadata import (
+    CLASS_LABEL,
+    diversity_bar_plot,
+    entropy,
+    get_counts,
+    get_num_bins,
+    heatmap,
+    preprocess_metadata,
+)
 from dataeval.output import OutputMetadata, set_metadata
 from dataeval.utils.shared import get_method
+with contextlib.suppress(ImportError):
+    from matplotlib.figure import Figure
 @dataclass(frozen=True)
 class DiversityOutput(OutputMetadata):
@@ -24,50 +36,63 @@ class DiversityOutput(OutputMetadata):
         :term:`Diversity` index for classes and factors
     classwise : NDArray[np.float64]
         Classwise diversity index [n_class x n_factor]
-    class_list: NDArray[np.int64]
+    class_list : NDArray[np.int64]
         Class labels for each value in the dataset
-    metadata_names: list[str]
+    metadata_names : list[str]
         Names of each metadata factor
     """
     diversity_index: NDArray[np.float64]
     classwise: NDArray[np.float64]
-    class_list: NDArray[np.int64]
+    class_list: NDArray[Any]
     metadata_names: list[str]
-    method: Literal["shannon", "simpson"]
-    def plot(self, row_labels: NDArray[Any] | None = None, col_labels: NDArray[Any] | None = None) -> None:
+    def plot(
+        self,
+        row_labels: ArrayLike | list[Any] | None = None,
+        col_labels: ArrayLike | list[Any] | None = None,
+        plot_classwise: bool = False,
+    ) -> Figure:
         """
         Plot a heatmap of diversity information
         Parameters
         ----------
-        row_labels: NDArray | None, default None
-            Array containing the labels for rows in the histogram
-        col_labels: NDArray | None, default None
-            Array containing the labels for columns in the histogram
+        row_labels : ArrayLike or None, default None
+            List/Array containing the labels for rows in the histogram
+        col_labels : ArrayLike or None, default None
+            List/Array containing the labels for columns in the histogram
+        plot_classwise : bool, default False
+            Whether to plot per-class diversity instead of global diversity
         """
-        if row_labels is None:
-            row_labels = np.unique(self.class_list)
-        if col_labels is None:
-            col_labels = np.array(self.metadata_names)
+        if plot_classwise:
+            if row_labels is None:
+                row_labels = self.class_list
+            if col_labels is None:
+                col_labels = self.metadata_names
+            fig = heatmap(
+                self.classwise,
+                row_labels,
+                col_labels,
+                xlabel="Factors",
+                ylabel="Class",
+                cbarlabel=f"Normalized {self.meta()['arguments']['method'].title()} Index",
+            )
-        heatmap(
-            self.classwise,
-            row_labels,
-            col_labels,
-            xlabel="Factors",
-            ylabel="Class",
-            cbarlabel=f"Normalized {self.method.title()} Index",
-        )
+        else:
+            # Creating label array for heat map axes
+            heat_labels = np.concatenate((["class"], self.metadata_names))
+            fig = diversity_bar_plot(heat_labels, self.diversity_index)
+        return fig
 def diversity_shannon(
     data: NDArray[Any],
     names: list[str],
-    is_categorical: list[bool],
+    continuous_factor_bincounts: Mapping[str, int] | None = None,
     subset_mask: NDArray[np.bool_] | None = None,
 ) -> NDArray[np.float64]:
     """
@@ -81,14 +106,16 @@ def diversity_shannon(
     Parameters
     ----------
-    data: NDArray
+    data : NDArray
         Array containing numerical values for metadata factors
-    names: list[str]
+    names : list[str]
         Names of metadata factors -- keys of the metadata dictionary
-    is_categorical: list[bool]
-        List of flags to identify whether variables are categorical (True) or
-        continuous (False)
-    subset_mask: NDArray[np.bool_] | None
+    continuous_factor_bincounts : Mapping[str, int] or None, default None
+        The factors in names that have continuous values and the array of bin counts to
+        discretize values into. All factors are treated as having discrete values unless they
+        are specified as keys in this dictionary. Each element of this array must occur as a key
+        in names.
+    subset_mask : NDArray[np.bool_] or None, default None
         Boolean mask of samples to bin (e.g. when computing per class).  True -> include in histogram counts
     Note
@@ -97,18 +124,32 @@ def diversity_shannon(
     Returns
     -------
-    diversity_index: NDArray
+    diversity_index : NDArray[np.float64]
         Diversity index per column of X
     See Also
     --------
     numpy.histogram
     """
+    hist_cache = {}
     # entropy computed using global auto bins so that we can properly normalize
-    ent_unnormalized = entropy(data, names, is_categorical, normalized=False, subset_mask=subset_mask)
+    ent_unnormalized = entropy(
+        data,
+        names,
+        continuous_factor_bincounts,
+        normalized=False,
+        subset_mask=subset_mask,
+        hist_cache=hist_cache,
+    )
     # normalize by global counts rather than classwise counts
-    num_bins = get_num_bins(data, names, is_categorical=is_categorical, subset_mask=subset_mask)
+    num_bins = get_num_bins(
+        data,
+        names,
+        continuous_factor_bincounts=continuous_factor_bincounts,
+        subset_mask=subset_mask,
+        hist_cache=hist_cache,
+    )
     ent_norm = np.empty(ent_unnormalized.shape)
     ent_norm[num_bins != 1] = ent_unnormalized[num_bins != 1] / np.log(num_bins[num_bins != 1])
     ent_norm[num_bins == 1] = 0
@@ -118,7 +159,7 @@ def diversity_shannon(
 def diversity_simpson(
     data: NDArray[Any],
     names: list[str],
-    is_categorical: list[bool],
+    continuous_factor_bincounts: Mapping[str, int] | None = None,
     subset_mask: NDArray[np.bool_] | None = None,
 ) -> NDArray[np.float64]:
     """
@@ -132,14 +173,16 @@ def diversity_simpson(
     Parameters
     ----------
-    data: NDArray
+    data : NDArray
         Array containing numerical values for metadata factors
-    names: list[str]
+    names : list[str]
         Names of metadata factors -- keys of the metadata dictionary
-    is_categorical: list[bool]
-        List of flags to identify whether variables are categorical (True) or
-        continuous (False)
-    subset_mask: NDArray[np.bool_] | None
+    continuous_factor_bincounts : Mapping[str, int] or None, default None
+        The factors in names that have continuous values and the array of bin counts to
+        discretize values into. All factors are treated as having discrete values unless they
+        are specified as keys in this dictionary. Each element of this array must occur as a key
+        in names.
+    subset_mask : NDArray[np.bool_] or None, default None
         Boolean mask of samples to bin (e.g. when computing per class).  True -> include in histogram counts
     Note
@@ -150,35 +193,39 @@ def diversity_simpson(
     Returns
     -------
-    NDArray
+    diversity_index : NDArray[np.float64]
         Diversity index per column of X
     See Also
     --------
     numpy.histogram
     """
+    hist_cache = {}
-    hist_counts, _ = get_counts(data, names, is_categorical, subset_mask)
+    hist_counts = get_counts(data, names, continuous_factor_bincounts, subset_mask, hist_cache=hist_cache)
     # normalize by global counts, not classwise counts
-    num_bins = get_num_bins(data, names, is_categorical)
+    num_bins = get_num_bins(data, names, continuous_factor_bincounts, hist_cache=hist_cache)
     ev_index = np.empty(len(names))
     # loop over columns for convenience
     for col, cnts in enumerate(hist_counts.values()):
         # relative frequencies
-        p_i = cnts / cnts.sum()
+        p_i = cnts / np.sum(cnts)
         # inverse Simpson index normalized by (number of bins)
-        s_0 = 1 / np.sum(p_i**2) / num_bins[col]
+        s_0 = 1 / np.sum(p_i**2)  # / num_bins[col]
         if num_bins[col] == 1:
             ev_index[col] = 0
         else:
-            ev_index[col] = (s_0 * num_bins[col] - 1) / (num_bins[col] - 1)
+            ev_index[col] = (s_0 - 1) / (num_bins[col] - 1)
     return ev_index
 @set_metadata()
 def diversity(
-    class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], method: Literal["shannon", "simpson"] = "simpson"
+    class_labels: ArrayLike,
+    metadata: Mapping[str, ArrayLike],
+    continuous_factor_bincounts: Mapping[str, int] | None = None,
+    method: Literal["simpson", "shannon"] = "simpson",
 ) -> DiversityOutput:
     """
     Compute :term:`diversity<Diversity>` and classwise diversity for discrete/categorical variables and,
@@ -191,11 +238,16 @@ def diversity(
     Parameters
     ----------
-    class_labels: ArrayLike
+    class_labels : ArrayLike
         List of class labels for each image
-    metadata: Mapping[str, ArrayLike]
+    metadata : Mapping[str, ArrayLike]
         Dict of list of metadata factors for each image
-    method: Literal["shannon", "simpson"], default "simpson"
+    continuous_factor_bincounts : Mapping[str, int] or None, default None
+        The factors in metadata that have continuous values and the array of bin counts to
+        discretize values into. All factors are treated as having discrete values unless they
+        are specified as keys in this dictionary. Each element of this array must occur as a key
+        in metadata.
+    method : {"simpson", "shannon"}, default "simpson"
         Indicates which diversity index should be computed
     Note
@@ -214,34 +266,34 @@ def diversity(
     -------
     Compute Simpson diversity index of metadata and class labels
-    >>> div_simp = diversity(class_labels, metadata, method="simpson")
+    >>> div_simp = diversity(class_labels, metadata, continuous_factor_bincounts, method="simpson")
     >>> div_simp.diversity_index
-    array([0.18103448, 0.18103448, 0.88636364])
+    array([0.72413793, 0.72413793, 0.88636364])
     >>> div_simp.classwise
-    array([[0.17241379, 0.39473684],
-           [0.2       , 0.2       ]])
+    array([[0.68965517, 0.69230769],
+           [0.8       , 1.        ]])
     Compute Shannon diversity index of metadata and class labels
-    >>> div_shan = diversity(class_labels, metadata, method="shannon")
+    >>> div_shan = diversity(class_labels, metadata, continuous_factor_bincounts, method="shannon")
     >>> div_shan.diversity_index
-    array([0.37955133, 0.37955133, 0.96748876])
+    array([0.8812909 , 0.8812909 , 0.96748876])
     >>> div_shan.classwise
-    array([[0.43156028, 0.83224889],
-           [0.57938016, 0.57938016]])
+    array([[0.86312057, 0.91651644],
+           [0.91829583, 1.        ]])
     See Also
     --------
     numpy.histogram
     """
     diversity_fn = get_method({"simpson": diversity_simpson, "shannon": diversity_shannon}, method)
-    data, names, is_categorical = preprocess_metadata(class_labels, metadata)
-    diversity_index = diversity_fn(data, names, is_categorical, None).astype(np.float64)
+    data, names, _, unique_labels = preprocess_metadata(class_labels, metadata)
+    diversity_index = diversity_fn(data, names, continuous_factor_bincounts)
-    class_idx = names.index("class_label")
-    class_lbl = np.array(data[:, class_idx], dtype=int)
+    class_idx = names.index(CLASS_LABEL)
+    class_lbl = data[:, class_idx]
     u_classes = np.unique(class_lbl)
     num_factors = len(names)
@@ -249,7 +301,7 @@ def diversity(
     diversity[:] = np.nan
     for idx, cls in enumerate(u_classes):
         subset_mask = class_lbl == cls
-        diversity[idx, :] = diversity_fn(data, names, is_categorical, subset_mask)
+        diversity[idx, :] = diversity_fn(data, names, continuous_factor_bincounts, subset_mask)
     div_no_class = np.concatenate((diversity[:, :class_idx], diversity[:, (class_idx + 1) :]), axis=1)
-    return DiversityOutput(diversity_index, div_no_class, class_lbl, list(metadata.keys()), method)
+    return DiversityOutput(diversity_index, div_no_class, unique_labels, list(metadata.keys()))

dataeval 0.72.2__py3-none-any.whl → 0.73.1__py3-none-any.whl

dataeval 0.72.2__py3-none-any.whl → 0.73.1__py3-none-any.whl