dataeval 0.72.2__py3-none-any.whl → 0.73.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
@@ -2,16 +2,27 @@ from __future__ import annotations
 
 __all__ = ["DiversityOutput", "diversity"]
 
+import contextlib
 from dataclasses import dataclass
 from typing import Any, Literal, Mapping
 
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 
-from dataeval.metrics.bias.metadata import entropy, get_counts, get_num_bins, heatmap, preprocess_metadata
+from dataeval.metrics.bias.metadata import (
+    diversity_bar_plot,
+    entropy,
+    get_counts,
+    get_num_bins,
+    heatmap,
+    preprocess_metadata,
+)
 from dataeval.output import OutputMetadata, set_metadata
 from dataeval.utils.shared import get_method
 
+with contextlib.suppress(ImportError):
+    from matplotlib.figure import Figure
+
 
 @dataclass(frozen=True)
 class DiversityOutput(OutputMetadata):
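The new `with contextlib.suppress(ImportError)` block makes matplotlib an optional dependency: `Figure` is bound only when matplotlib is installed, while the module itself still imports without it. Because the file starts with `from __future__ import annotations`, the `-> Figure` return annotations stay unevaluated strings and cannot raise `NameError` when matplotlib is absent. A minimal sketch of the same pattern (the function name is illustrative, not from the package):

    import contextlib

    with contextlib.suppress(ImportError):
        from matplotlib.figure import Figure  # bound only if matplotlib is present

    def make_figure() -> "Figure":  # string annotation: safe without matplotlib
        import matplotlib.pyplot as plt  # real failure deferred to first use
        fig, _ = plt.subplots()
        return fig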
@@ -32,36 +43,50 @@ class DiversityOutput(OutputMetadata):
 
     diversity_index: NDArray[np.float64]
     classwise: NDArray[np.float64]
-
-    class_list: NDArray[np.int64]
+    class_list: NDArray[Any]
     metadata_names: list[str]
-
     method: Literal["shannon", "simpson"]
 
-    def plot(self, row_labels: NDArray[Any] | None = None, col_labels: NDArray[Any] | None = None) -> None:
+    def plot(
+        self,
+        row_labels: list[Any] | NDArray[Any] | None = None,
+        col_labels: list[Any] | NDArray[Any] | None = None,
+        plot_classwise: bool = False,
+    ) -> Figure:
         """
         Plot a heatmap of diversity information
 
         Parameters
         ----------
-        row_labels: NDArray | None, default None
-            Array containing the labels for rows in the histogram
-        col_labels: NDArray | None, default None
-            Array containing the labels for columns in the histogram
+        row_labels : ArrayLike | None, default None
+            List/Array containing the labels for rows in the histogram
+        col_labels : ArrayLike | None, default None
+            List/Array containing the labels for columns in the histogram
+        plot_classwise : bool, default False
+            Whether to plot per-class balance instead of global balance
         """
-        if row_labels is None:
-            row_labels = np.unique(self.class_list)
-        if col_labels is None:
-            col_labels = np.array(self.metadata_names)
+        if plot_classwise:
+            if row_labels is None:
+                row_labels = self.class_list
+            if col_labels is None:
+                col_labels = self.metadata_names
+
+            fig = heatmap(
+                self.classwise,
+                row_labels,
+                col_labels,
+                xlabel="Factors",
+                ylabel="Class",
+                cbarlabel=f"Normalized {self.method.title()} Index",
+            )
 
-        heatmap(
-            self.classwise,
-            row_labels,
-            col_labels,
-            xlabel="Factors",
-            ylabel="Class",
-            cbarlabel=f"Normalized {self.method.title()} Index",
-        )
+        else:
+            # Creating label array for heat map axes
+            heat_labels = np.concatenate((["class"], self.metadata_names))
+
+            fig = diversity_bar_plot(heat_labels, self.diversity_index)
+
+        return fig
 
 
 def diversity_shannon(
@@ -237,19 +262,17 @@ def diversity(
     numpy.histogram
     """
     diversity_fn = get_method({"simpson": diversity_simpson, "shannon": diversity_shannon}, method)
-    data, names, is_categorical = preprocess_metadata(class_labels, metadata)
+    data, names, is_categorical, unique_labels = preprocess_metadata(class_labels, metadata)
     diversity_index = diversity_fn(data, names, is_categorical, None).astype(np.float64)
 
     class_idx = names.index("class_label")
-    class_lbl = np.array(data[:, class_idx], dtype=int)
-
-    u_classes = np.unique(class_lbl)
+    u_classes = np.unique(data[:, class_idx])
     num_factors = len(names)
     diversity = np.empty((len(u_classes), num_factors))
     diversity[:] = np.nan
     for idx, cls in enumerate(u_classes):
-        subset_mask = class_lbl == cls
+        subset_mask = data[:, class_idx] == cls
         diversity[idx, :] = diversity_fn(data, names, is_categorical, subset_mask)
     div_no_class = np.concatenate((diversity[:, :class_idx], diversity[:, (class_idx + 1) :]), axis=1)
 
-    return DiversityOutput(diversity_index, div_no_class, class_lbl, list(metadata.keys()), method)
+    return DiversityOutput(diversity_index, div_no_class, unique_labels, list(metadata.keys()), method)
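Net effect of these hunks: string class labels now flow through unchanged, `class_list` holds the unique labels rather than a per-image integer array, and plotting returns a `Figure` for the caller to render or save. A hedged usage sketch (labels and metadata are invented; the import path and `method` keyword are assumed from the code above):

    from dataeval.metrics.bias import diversity  # assumed public import path

    class_labels = ["cat", "dog", "cat", "bird", "dog", "bird"]
    metadata = {"lighting": ["day", "night", "day", "day", "night", "day"]}

    result = diversity(class_labels, metadata, method="simpson")
    print(result.class_list)                 # unique labels, e.g. ['bird' 'cat' 'dog']
    fig = result.plot(plot_classwise=True)   # classwise heatmap as a Figure
    fig.savefig("diversity_classwise.png")   # caller decides how to display it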
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 __all__ = []
 
+import contextlib
 from typing import Any, Mapping
 
 import numpy as np
@@ -10,6 +11,11 @@ from scipy.stats import entropy as sp_entropy
 from dataeval.interop import to_numpy
 
 
+with contextlib.suppress(ImportError):
+    from matplotlib.figure import Figure
+
+CLASS_LABEL = "class_label"
+
 
 def get_counts(
     data: NDArray[np.int_], names: list[str], is_categorical: list[bool], subset_mask: NDArray[np.bool_] | None = None
@@ -147,14 +153,24 @@ def infer_categorical(arr: NDArray[Any], threshold: float = 0.2) -> NDArray[Any]
 
 def preprocess_metadata(
     class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], cat_thresh: float = 0.2
-) -> tuple[NDArray[Any], list[str], list[bool]]:
+) -> tuple[NDArray[Any], list[str], list[bool], NDArray[np.str_]]:
+    # if class_labels is not numeric
+    class_array = to_numpy(class_labels)
+    if not np.issubdtype(class_array.dtype, np.number):
+        unique_classes, numerical_labels = np.unique(class_array, return_inverse=True)
+    else:
+        numerical_labels = np.asarray(class_array, dtype=int)
+        unique_classes = np.unique(class_array)
+
     # convert class_labels and dict of lists to matrix of metadata values
-    preprocessed_metadata = {"class_label": np.asarray(class_labels, dtype=int)}
+    preprocessed_metadata = {CLASS_LABEL: numerical_labels}
 
     # map columns of dict that are not numeric (e.g. string) to numeric values
     # that mutual information and diversity functions can accommodate. Each
     # unique string receives a unique integer value.
     for k, v in metadata.items():
+        if k == CLASS_LABEL:
+            k = "label_class"
         # if not numeric
         v = to_numpy(v)
         if not np.issubdtype(v.dtype, np.number):
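The new non-numeric branch leans on `np.unique(..., return_inverse=True)`, which returns the sorted unique values plus, for each element, the index of its unique value — a compact string-to-integer encoding:

    import numpy as np

    labels = np.array(["dog", "cat", "dog", "bird"])
    unique_classes, numerical_labels = np.unique(labels, return_inverse=True)
    print(unique_classes)    # ['bird' 'cat' 'dog']
    print(numerical_labels)  # [2 1 2 0]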
@@ -167,35 +183,34 @@ def preprocess_metadata(
     names = list(preprocessed_metadata.keys())
     is_categorical = [infer_categorical(preprocessed_metadata[var], cat_thresh)[0] for var in names]
 
-    return data, names, is_categorical
+    return data, names, is_categorical, unique_classes
 
 
 def heatmap(
     data: NDArray[Any],
-    row_labels: NDArray[Any],
-    col_labels: NDArray[Any],
+    row_labels: list[str] | NDArray[Any],
+    col_labels: list[str] | NDArray[Any],
     xlabel: str = "",
     ylabel: str = "",
     cbarlabel: str = "",
-) -> None:
+) -> Figure:
     """
     Plots a formatted heatmap
 
     Parameters
     ----------
-    data: NDArray
+    data : NDArray
         Array containing numerical values for factors to plot
-    row_labels: NDArray
-        Array containing the labels for rows in the histogram
-    col_labels: NDArray
-        Array containing the labels for columns in the histogram
-    xlabel: str, default ""
+    row_labels : ArrayLike
+        List/Array containing the labels for rows in the histogram
+    col_labels : ArrayLike
+        List/Array containing the labels for columns in the histogram
+    xlabel : str, default ""
         X-axis label
-    ylabel: str, default ""
+    ylabel : str, default ""
         Y-axis label
-    cbarlabel: str, default ""
+    cbarlabel : str, default ""
         Label for the colorbar
-
     """
     import matplotlib
     import matplotlib.pyplot as plt
@@ -252,7 +267,7 @@ def heatmap(
             texts.append(text)
 
     fig.tight_layout()
-    plt.show()
+    return fig
 
 
 # Function to define how the text is displayed in the heatmap
@@ -273,3 +288,71 @@ def format_text(*args: str) -> str:
     """
     x = args[0]
     return f"{x:.2f}".replace("0.00", "0").replace("0.", ".").replace("nan", "")
+
+
+def diversity_bar_plot(labels: NDArray[Any], bar_heights: NDArray[Any]) -> Figure:
+    """
+    Plots a formatted bar plot
+
+    Parameters
+    ----------
+    labels : NDArray
+        Array containing the labels for each bar
+    bar_heights : NDArray
+        Array containing the values for each bar
+    """
+    import matplotlib.pyplot as plt
+
+    fig, ax = plt.subplots(figsize=(10, 10))
+
+    ax.bar(labels, bar_heights)
+    ax.set_xlabel("Factors")
+
+    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
+
+    fig.tight_layout()
+    return fig
+
+
+def coverage_plot(images: NDArray[Any], num_images: int) -> Figure:
+    """
+    Creates a single plot of all of the provided images
+
+    Parameters
+    ----------
+    images : NDArray
+        Array containing only the desired images to plot
+    """
+    import matplotlib.pyplot as plt
+
+    num_images = min(num_images, len(images))
+
+    if images.ndim == 4:
+        images = np.moveaxis(images, 1, -1)
+    elif images.ndim == 3:
+        images = np.repeat(images[:, :, :, np.newaxis], 3, axis=-1)
+    else:
+        raise ValueError(
+            f"Expected a (N,C,H,W) or a (N, H, W) set of images, but got a {images.ndim}-dimensional set of images."
+        )
+
+    rows = np.ceil(num_images / 3).astype(int)
+    fig, axs = plt.subplots(rows, 3, figsize=(9, 3 * rows))
+
+    if rows == 1:
+        for j in range(3):
+            if j >= len(images):
+                continue
+            axs[j].imshow(images[j])
+            axs[j].axis("off")
+    else:
+        for i in range(rows):
+            for j in range(3):
+                i_j = i * 3 + j
+                if i_j >= len(images):
+                    continue
+                axs[i, j].imshow(images[i_j])
+                axs[i, j].axis("off")
+
+    fig.tight_layout()
+    return fig
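`coverage_plot` normalizes input shapes for `imshow` — channels-first `(N, C, H, W)` arrays are moved to channels-last, and grayscale `(N, H, W)` stacks are repeated to three channels — then tiles up to `num_images` images in a three-column grid. A small sketch with random data (shapes are illustrative; the import path is assumed):

    import numpy as np
    from dataeval.metrics.bias.metadata import coverage_plot  # assumed location

    rng = np.random.default_rng(0)
    images = rng.random((7, 3, 32, 32))        # (N, C, H, W)
    fig = coverage_plot(images, num_images=6)  # 6 images -> 2 rows x 3 columns
    fig.savefig("coverage.png")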
@@ -11,6 +11,7 @@ from numpy.typing import ArrayLike, NDArray
 from scipy.stats import chi2_contingency, chisquare
 
 from dataeval.interop import to_numpy
+from dataeval.metrics.bias.metadata import CLASS_LABEL, preprocess_metadata
 from dataeval.output import OutputMetadata, set_metadata
 
 TData = TypeVar("TData", np.float64, NDArray[np.float64])
@@ -27,10 +28,13 @@ class ParityOutput(Generic[TData], OutputMetadata):
         chi-squared score(s) of the test
     p_value : np.float64 | NDArray[np.float64]
         p-value(s) of the test
+    metadata_names : list[str] | None
+        Names of each metadata factor
     """
 
     score: TData
     p_value: TData
+    metadata_names: list[str] | None
 
 
 def digitize_factor_bins(continuous_values: NDArray[Any], bins: int, factor_name: str) -> NDArray[np.intp]:
@@ -66,7 +70,7 @@ def digitize_factor_bins(continuous_values: NDArray[Any], bins: int, factor_name
 
 
 def format_discretize_factors(
-    data_factors: Mapping[str, NDArray[Any]], continuous_factor_bincounts: Mapping[str, int]
+    data: NDArray[Any], names: list[str], is_categorical: list[bool], continuous_factor_bincounts: Mapping[str, int]
 ) -> dict[str, NDArray[Any]]:
     """
     Sets up the internal list of metadata factors.
@@ -89,30 +93,32 @@
         Each key is a metadata factor, whose value is the discrete per-image factor values.
     """
 
-    invalid_keys = set(continuous_factor_bincounts.keys()) - set(data_factors.keys())
+    invalid_keys = set(continuous_factor_bincounts.keys()) - set(names)
     if invalid_keys:
         raise KeyError(
             f"The continuous factor(s) {invalid_keys} do not exist in data_factors. Delete these "
             "keys from `continuous_factor_names` or add corresponding entries to `data_factors`."
         )
 
+    warn = []
     metadata_factors = {}
-
-    # make sure each factor has the same number of entries
-    lengths = []
-    for arr in data_factors.values():
-        lengths.append(arr.shape)
-
-    if lengths[1:] != lengths[:-1]:
-        raise ValueError("The lengths of each entry in the dictionary are not equal." f" Found lengths {lengths}")
-
-    metadata_factors = {
-        name: val
-        if name not in continuous_factor_bincounts
-        else digitize_factor_bins(val, continuous_factor_bincounts[name], name)
-        for name, val in data_factors.items()
-        if name != "class"
-    }
+    for i, name in enumerate(names):
+        if name == CLASS_LABEL:
+            continue
+        if name in continuous_factor_bincounts:
+            metadata_factors[name] = digitize_factor_bins(data[:, i], continuous_factor_bincounts[name], name)
+        elif not is_categorical[i]:
+            warn.append(name)
+            metadata_factors[name] = data[:, i]
+        else:
+            metadata_factors[name] = data[:, i]
+
+    if warn:
+        warnings.warn(
+            f"The following factors appear to be continuous but did not have the desired number of bins specified: \n\
+    {warn}",
+            UserWarning,
+        )
 
     return metadata_factors
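For factors named in `continuous_factor_bincounts`, `digitize_factor_bins` (unchanged, not shown in this diff) buckets the raw values into the requested number of bins; factors that look continuous per `is_categorical` but have no bincount now produce a single `UserWarning` instead of failing. Conceptually, the binning is plain numpy digitization (a rough equivalent, not the package's exact implementation):

    import numpy as np

    values = np.array([25, 30, 35, 45, 30, 25])
    edges = np.histogram_bin_edges(values, bins=4)  # equal-width edges over the range
    print(np.digitize(values, edges[:-1]))          # bin index per value: [1 2 3 4 2 1]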
@@ -247,7 +253,7 @@ def label_parity(
     >>> expected_labels = np_random_gen.choice([0, 1, 2, 3, 4], (100))
     >>> observed_labels = np_random_gen.choice([2, 3, 0, 4, 1], (100))
     >>> label_parity(expected_labels, observed_labels)
-    ParityOutput(score=14.007374204742625, p_value=0.0072715574616218)
+    ParityOutput(score=14.007374204742625, p_value=0.0072715574616218, metadata_names=None)
     """
 
     # Calculate
@@ -278,13 +284,13 @@
     )
 
     cs, p = chisquare(f_obs=observed_dist, f_exp=expected_dist)
-    return ParityOutput(cs, p)
+    return ParityOutput(cs, p, None)
 
 
 @set_metadata()
 def parity(
     class_labels: ArrayLike,
-    data_factors: Mapping[str, ArrayLike],
+    metadata: Mapping[str, ArrayLike],
     continuous_factor_bincounts: Mapping[str, int] | None = None,
 ) -> ParityOutput[NDArray[np.float64]]:
     """
@@ -299,12 +305,12 @@
     ----------
     class_labels: ArrayLike
         List of class labels for each image
-    data_factors: Mapping[str, ArrayLike]
+    metadata: Mapping[str, ArrayLike]
         The dataset factors, which are per-image metadata attributes.
         Each key of dataset_factors is a factor, whose value is the per-image factor values.
     continuous_factor_bincounts : Mapping[str, int] | None, default None
         A dictionary specifying the number of bins for discretizing the continuous factors.
-        The keys should correspond to the names of continuous factors in `data_factors`,
+        The keys should correspond to the names of continuous factors in `metadata`,
         and the values should be the number of bins to use for discretization.
         If not provided, no discretization is applied.
 
@@ -337,42 +343,44 @@
     Randomly creating some "continuous" and categorical variables using ``np.random.default_rng``
 
     >>> labels = np_random_gen.choice([0, 1, 2], (100))
-    >>> data_factors = {
+    >>> metadata = {
     ...     "age": np_random_gen.choice([25, 30, 35, 45], (100)),
     ...     "income": np_random_gen.choice([50000, 65000, 80000], (100)),
     ...     "gender": np_random_gen.choice(["M", "F"], (100)),
     ... }
     >>> continuous_factor_bincounts = {"age": 4, "income": 3}
-    >>> parity(labels, data_factors, continuous_factor_bincounts)
-    ParityOutput(score=array([7.35731943, 5.46711299, 0.51506212]), p_value=array([0.28906231, 0.24263543, 0.77295762]))
-    """
+    >>> parity(labels, metadata, continuous_factor_bincounts)
+    ParityOutput(score=array([7.35731943, 5.46711299, 0.51506212]), p_value=array([0.28906231, 0.24263543, 0.77295762]), metadata_names=['age', 'income', 'gender'])
+    """  # noqa: E501
     if len(np.shape(class_labels)) > 1:
         raise ValueError(
             f"Got class labels with {len(np.shape(class_labels))}-dimensional",
             f" shape {np.shape(class_labels)}, but expected a 1-dimensional array.",
         )
 
-    data_factors_np = {k: to_numpy(v) for k, v in data_factors.items()}
+    data, names, is_categorical, _ = preprocess_metadata(class_labels, metadata)
     continuous_factor_bincounts = continuous_factor_bincounts if continuous_factor_bincounts else {}
 
-    labels = to_numpy(class_labels)
-    factors = format_discretize_factors(data_factors_np, continuous_factor_bincounts)
+    factors = format_discretize_factors(data, names, is_categorical, continuous_factor_bincounts)
+
+    # unique class labels
+    class_idx = names.index(CLASS_LABEL)
+    u_cls = np.unique(data[:, class_idx])
 
     chi_scores = np.zeros(len(factors))
     p_values = np.zeros(len(factors))
-    n_cls = len(np.unique(labels))
     not_enough_data = {}
     for i, (current_factor_name, factor_values) in enumerate(factors.items()):
         unique_factor_values = np.unique(factor_values)
-        contingency_matrix = np.zeros((len(unique_factor_values), n_cls))
+        contingency_matrix = np.zeros((len(unique_factor_values), u_cls.size))
         # Builds a contingency matrix where entry at index (r,c) represents
         # the frequency of current_factor_name achieving value unique_factor_values[r]
         # at a data point with class c.
 
         # TODO: Vectorize this nested for loop
         for fi, factor_value in enumerate(unique_factor_values):
-            for label in range(n_cls):
-                with_both = np.bitwise_and((labels == label), factor_values == factor_value)
+            for label in u_cls:
+                with_both = np.bitwise_and((data[:, class_idx] == label), factor_values == factor_value)
                 contingency_matrix[fi, label] = np.sum(with_both)
                 if 0 < contingency_matrix[fi, label] < 5:
                     if current_factor_name not in not_enough_data:
@@ -414,4 +422,4 @@ def parity(
         UserWarning,
     )
 
-    return ParityOutput(chi_scores, p_values)
+    return ParityOutput(chi_scores, p_values, list(metadata.keys()))
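Each per-factor score and p-value comes from a chi-squared test of independence on the factor-value-by-class contingency matrix built above; with scipy's `chi2_contingency` (already imported by this module) the test itself is a one-liner. The matrix below is hypothetical:

    import numpy as np
    from scipy.stats import chi2_contingency

    # rows = factor values, columns = classes; counts are invented
    contingency_matrix = np.array([[10, 12], [8, 15], [20, 5]])
    chi2, p, dof, _ = chi2_contingency(contingency_matrix)
    print(f"chi2={chi2:.3f}, p={p:.4f}, dof={dof}")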
@@ -5,9 +5,10 @@ metrics. Currently DataEval supports both :term:`TensorFlow` and PyTorch backend
 """
 
 from dataeval import _IS_TENSORFLOW_AVAILABLE, _IS_TORCH_AVAILABLE
+from dataeval.utils.metadata import merge_metadata
 from dataeval.utils.split_dataset import split_dataset
 
-__all__ = ["split_dataset"]
+__all__ = ["split_dataset", "merge_metadata"]
 
 if _IS_TORCH_AVAILABLE:  # pragma: no cover
     from dataeval.utils import torch
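With this change `merge_metadata` joins `split_dataset` in the public utility namespace; its implementation lives in the new `dataeval.utils.metadata` module, which this diff does not show, so only the import surface is illustrated here:

    import dataeval.utils
    from dataeval.utils import merge_metadata, split_dataset  # both now public

    print(dataeval.utils.__all__)  # ['split_dataset', 'merge_metadata']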
dataeval/utils/lazy.py ADDED
@@ -0,0 +1,26 @@
+from __future__ import annotations
+
+from functools import cached_property
+from importlib import import_module
+from typing import Any
+
+
+class LazyModule:
+    def __init__(self, name: str) -> None:
+        self._name = name
+
+    def __getattr__(self, key: str) -> Any:
+        return getattr(self._module, key)
+
+    @cached_property
+    def _module(self):
+        return import_module(self._name)
+
+
+LAZY_MODULES: dict[str, LazyModule] = {}
+
+
+def lazyload(name: str) -> LazyModule:
+    if name not in LAZY_MODULES:
+        LAZY_MODULES[name] = LazyModule(name)
+    return LAZY_MODULES[name]
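`LazyModule` defers the real import until first attribute access: `__getattr__` forwards to `_module`, and `cached_property` runs `import_module` exactly once, caching the result on the instance. `lazyload` additionally memoizes one `LazyModule` per name. A usage sketch (tensorflow is just an example of a heavy optional dependency):

    from dataeval.utils.lazy import lazyload

    tf = lazyload("tensorflow")    # cheap: nothing is imported yet
    model = tf.keras.Sequential()  # first attribute access imports tensorflow, once

    assert lazyload("tensorflow") is tf  # the same LazyModule is reused per name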