dataeval 0.73.0__py3-none-any.whl → 0.73.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +3 -3
- dataeval/detectors/__init__.py +1 -1
- dataeval/detectors/drift/__init__.py +1 -1
- dataeval/detectors/drift/base.py +2 -2
- dataeval/detectors/linters/clusterer.py +1 -1
- dataeval/detectors/ood/__init__.py +1 -1
- dataeval/metrics/bias/balance.py +29 -19
- dataeval/metrics/bias/coverage.py +11 -11
- dataeval/metrics/bias/diversity.py +79 -50
- dataeval/metrics/bias/metadata.py +133 -51
- dataeval/metrics/bias/parity.py +30 -24
- dataeval/utils/__init__.py +2 -2
- dataeval/utils/shared.py +1 -1
- dataeval/utils/split_dataset.py +12 -6
- dataeval/utils/torch/datasets.py +2 -2
- dataeval/workflows/__init__.py +1 -1
- {dataeval-0.73.0.dist-info → dataeval-0.73.1.dist-info}/METADATA +1 -1
- {dataeval-0.73.0.dist-info → dataeval-0.73.1.dist-info}/RECORD +20 -20
- {dataeval-0.73.0.dist-info → dataeval-0.73.1.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.73.0.dist-info → dataeval-0.73.1.dist-info}/WHEEL +0 -0
dataeval/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-__version__ = "0.73.0"
+__version__ = "0.73.1"
 
 from importlib.util import find_spec
 
@@ -12,12 +12,12 @@ from dataeval import detectors, metrics  # noqa: E402
 
 __all__ = ["detectors", "metrics"]
 
-if _IS_TORCH_AVAILABLE:
+if _IS_TORCH_AVAILABLE:
     from dataeval import workflows
 
     __all__ += ["workflows"]
 
-if _IS_TENSORFLOW_AVAILABLE or _IS_TORCH_AVAILABLE:
+if _IS_TENSORFLOW_AVAILABLE or _IS_TORCH_AVAILABLE:
     from dataeval import utils
 
     __all__ += ["utils"]
dataeval/detectors/__init__.py
CHANGED
@@ -10,7 +10,7 @@ from dataeval.detectors.drift.ks import DriftKS
 
 __all__ = ["DriftCVM", "DriftKS", "DriftOutput", "updates"]
 
-if _IS_TORCH_AVAILABLE:
+if _IS_TORCH_AVAILABLE:
     from dataeval.detectors.drift.mmd import DriftMMD, DriftMMDOutput
     from dataeval.detectors.drift.torch import preprocess_drift
     from dataeval.detectors.drift.uncertainty import DriftUncertainty
dataeval/detectors/drift/base.py
CHANGED
@@ -18,7 +18,7 @@ from typing import Any, Callable, Literal, TypeVar
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 
-from dataeval.interop import as_numpy
+from dataeval.interop import as_numpy
 from dataeval.output import OutputMetadata, set_metadata
 
 R = TypeVar("R")
@@ -196,7 +196,7 @@ class BaseDrift:
         if correction not in ["bonferroni", "fdr"]:
             raise ValueError("`correction` must be `bonferroni` or `fdr`.")
 
-        self._x_ref =
+        self._x_ref = as_numpy(x_ref)
         self.x_ref_preprocessed: bool = x_ref_preprocessed
 
         # Other attributes
dataeval/detectors/linters/clusterer.py
CHANGED
@@ -480,7 +480,7 @@ class Clusterer:
                 samples = self.clusters[level][cluster_id].samples
                 if len(samples) >= self._min_num_samples_per_cluster:
                     duplicates_std.append(self.clusters[level][cluster_id].dist_std)
-        diag_mask = np.ones_like(self._sqdmat, dtype=
+        diag_mask = np.ones_like(self._sqdmat, dtype=np.bool_)
         np.fill_diagonal(diag_mask, 0)
         diag_mask = np.triu(diag_mask)
 
dataeval/detectors/ood/__init__.py
CHANGED
@@ -4,7 +4,7 @@ Out-of-distribution (OOD)` detectors identify data that is different from the da
 
 from dataeval import _IS_TENSORFLOW_AVAILABLE
 
-if _IS_TENSORFLOW_AVAILABLE:
+if _IS_TENSORFLOW_AVAILABLE:
    from dataeval.detectors.ood.ae import OOD_AE
    from dataeval.detectors.ood.aegmm import OOD_AEGMM
    from dataeval.detectors.ood.base import OODOutput, OODScoreOutput
dataeval/metrics/bias/balance.py
CHANGED
@@ -11,7 +11,7 @@ import numpy as np
 from numpy.typing import ArrayLike, NDArray
 from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
 
-from dataeval.metrics.bias.metadata import entropy, heatmap, preprocess_metadata
+from dataeval.metrics.bias.metadata import CLASS_LABEL, entropy, heatmap, preprocess_metadata
 from dataeval.output import OutputMetadata, set_metadata
 
 with contextlib.suppress(ImportError):
@@ -31,9 +31,9 @@ class BalanceOutput(OutputMetadata):
         Estimate of inter/intra-factor mutual information
     classwise : NDArray[np.float64]
         Estimate of mutual information between metadata factors and individual class labels
-    class_list: NDArray
+    class_list : NDArray
         Array of the class labels present in the dataset
-    metadata_names: list[str]
+    metadata_names : list[str]
         Names of each metadata factor
     """
 
@@ -54,9 +54,9 @@ class BalanceOutput(OutputMetadata):
 
         Parameters
         ----------
-        row_labels : ArrayLike
+        row_labels : ArrayLike or None, default None
            List/Array containing the labels for rows in the histogram
-        col_labels : ArrayLike
+        col_labels : ArrayLike or None, default None
            List/Array containing the labels for columns in the histogram
        plot_classwise : bool, default False
            Whether to plot per-class balance instead of global balance
@@ -116,19 +116,29 @@ def validate_num_neighbors(num_neighbors: int) -> int:
 
 
 @set_metadata("dataeval.metrics")
-def balance(
+def balance(
+    class_labels: ArrayLike,
+    metadata: Mapping[str, ArrayLike],
+    num_neighbors: int = 5,
+    continuous_factor_bincounts: Mapping[str, int] | None = None,
+) -> BalanceOutput:
     """
     Mutual information (MI) between factors (class label, metadata, label/image properties)
 
     Parameters
     ----------
-    class_labels: ArrayLike
+    class_labels : ArrayLike
         List of class labels for each image
-    metadata: Mapping[str, ArrayLike]
+    metadata : Mapping[str, ArrayLike]
         Dict of lists of metadata factors for each image
-    num_neighbors: int, default 5
+    num_neighbors : int, default 5
         Number of nearest neighbors to use for computing MI between discrete
         and continuous variables.
+    continuous_factor_bincounts : Mapping[str, int] or None, default None
+        The factors in metadata that have continuous values and the array of bin counts to
+        discretize values into. All factors are treated as having discrete values unless they
+        are specified as keys in this dictionary. Each element of this array must occur as a key
+        in metadata.
 
     Returns
     -------
@@ -148,7 +158,7 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
     -------
     Return balance (mutual information) of factors with class_labels
 
-    >>> bal = balance(class_labels, metadata)
+    >>> bal = balance(class_labels, metadata, continuous_factor_bincounts=continuous_factor_bincounts)
     >>> bal.balance
     array([0.99999822, 0.13363788, 0.04505382, 0.02994455])
 
@@ -165,6 +175,7 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
     array([[0.99999822, 0.13363788, 0.        , 0.        ],
            [0.99999822, 0.13363788, 0.        , 0.        ]])
 
+
     See Also
     --------
     sklearn.feature_selection.mutual_info_classif
@@ -178,9 +189,9 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
     mi[:] = np.nan
 
     for idx in range(num_factors):
-        tgt = data[:, idx].astype(
+        tgt = data[:, idx].astype(np.intp)
 
-        if
+        if continuous_factor_bincounts and names[idx] not in continuous_factor_bincounts:
             mi[idx, :] = mutual_info_classif(
                 data,
                 tgt,
@@ -197,7 +208,7 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
                 random_state=0,
             )
 
-    ent_all = entropy(data, names,
+    ent_all = entropy(data, names, continuous_factor_bincounts, normalized=False)
     norm_factor = 0.5 * np.add.outer(ent_all, ent_all) + 1e-6
     # in principle MI should be symmetric, but it is not in practice.
     nmi = 0.5 * (mi + mi.T) / norm_factor
@@ -205,7 +216,7 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
     factors = nmi[1:, 1:]
 
     # unique class labels
-    class_idx = names.index(
+    class_idx = names.index(CLASS_LABEL)
     u_cls = np.unique(data[:, class_idx])
     num_classes = len(u_cls)
 
@@ -214,12 +225,11 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
     classwise_mi[:] = np.nan
 
     # categorical variables, excluding class label
-    cat_mask = np.concatenate((is_categorical[:class_idx], is_categorical[(class_idx + 1) :]), axis=0).astype(
+    cat_mask = np.concatenate((is_categorical[:class_idx], is_categorical[(class_idx + 1) :]), axis=0).astype(np.intp)
 
-    tgt_bin = np.stack([data[:, class_idx] == cls for cls in u_cls]).T.astype(
-
-
-    )
+    tgt_bin = np.stack([data[:, class_idx] == cls for cls in u_cls]).T.astype(np.intp)
+    names = [str(idx) for idx in range(num_classes)]
+    ent_tgt_bin = entropy(tgt_bin, names, continuous_factor_bincounts)
 
     # classification MI for discrete/categorical features
     for idx in range(num_classes):
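The net effect of the balance.py changes is a new optional continuous_factor_bincounts argument. A minimal usage sketch under stated assumptions: the toy labels, factor names, values, and bin counts below are illustrative, and the public import path dataeval.metrics.bias is assumed from the package layout above:

import numpy as np

from dataeval.metrics.bias import balance  # assumed public re-export of balance()

rng = np.random.default_rng(0)
class_labels = rng.integers(0, 2, size=50)  # toy binary labels
metadata = {
    "angle": rng.uniform(0.0, 90.0, size=50),  # continuous factor
    "sensor": rng.integers(0, 3, size=50),     # discrete factor
}

# New in 0.73.1: declare "angle" continuous and discretize it into 5 bins;
# any factor not listed here keeps its discrete treatment.
bal = balance(class_labels, metadata, continuous_factor_bincounts={"angle": 5})
print(bal.balance)    # MI of each factor with the class label
print(bal.classwise)  # MI of each factor with each individual class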
dataeval/metrics/bias/coverage.py
CHANGED
@@ -5,7 +5,7 @@ __all__ = ["CoverageOutput", "coverage"]
 import contextlib
 import math
 from dataclasses import dataclass
-from typing import
+from typing import Literal
 
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
@@ -27,9 +27,9 @@ class CoverageOutput(OutputMetadata):
 
     Attributes
     ----------
-    indices : NDArray
+    indices : NDArray[np.intp]
         Array of uncovered indices
-    radii : NDArray
+    radii : NDArray[np.float64]
         Array of critical value radii
     critical_value : float
         Radius for :term:`coverage<Coverage>`
@@ -39,11 +39,7 @@ class CoverageOutput(OutputMetadata):
     radii: NDArray[np.float64]
     critical_value: float
 
-    def plot(
-        self,
-        images: NDArray[Any],
-        top_k: int = 6,
-    ) -> Figure:
+    def plot(self, images: ArrayLike, top_k: int = 6) -> Figure:
         """
         Plot the top k images together for visualization
 
@@ -53,6 +49,10 @@ class CoverageOutput(OutputMetadata):
             Original images (not embeddings) in (N, C, H, W) or (N, H, W) format
         top_k : int, default 6
             Number of images to plot (plotting assumes groups of 3)
+
+        Returns
+        -------
+        matplotlib.figure.Figure
         """
         # Determine which images to plot
         highest_uncovered_indices = self.indices[:top_k]
@@ -82,12 +82,12 @@ def coverage(
     embeddings : ArrayLike, shape - (N, P)
         A dataset in an ArrayLike format.
         Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
-    radius_type :
+    radius_type : {"adaptive", "naive"}, default "adaptive"
         The function used to determine radius.
-    k: int, default 20
+    k : int, default 20
         Number of observations required in order to be covered.
         [1] suggests that a minimum of 20-50 samples is necessary.
-    percent: float, default 0.01
+    percent : float, default 0.01
         Percent of observations to be considered uncovered. Only applies to adaptive radius.
 
     Returns
dataeval/metrics/bias/diversity.py
CHANGED
@@ -10,6 +10,7 @@ import numpy as np
 from numpy.typing import ArrayLike, NDArray
 
 from dataeval.metrics.bias.metadata import (
+    CLASS_LABEL,
     diversity_bar_plot,
     entropy,
     get_counts,
@@ -35,9 +36,9 @@ class DiversityOutput(OutputMetadata):
         :term:`Diversity` index for classes and factors
     classwise : NDArray[np.float64]
         Classwise diversity index [n_class x n_factor]
-    class_list: NDArray[np.int64]
+    class_list : NDArray[np.int64]
         Class labels for each value in the dataset
-    metadata_names: list[str]
+    metadata_names : list[str]
         Names of each metadata factor
     """
 
@@ -45,12 +46,11 @@ class DiversityOutput(OutputMetadata):
     classwise: NDArray[np.float64]
     class_list: NDArray[Any]
     metadata_names: list[str]
-    method: Literal["shannon", "simpson"]
 
     def plot(
         self,
-        row_labels:
-        col_labels:
+        row_labels: ArrayLike | list[Any] | None = None,
+        col_labels: ArrayLike | list[Any] | None = None,
         plot_classwise: bool = False,
     ) -> Figure:
         """
@@ -58,9 +58,9 @@ class DiversityOutput(OutputMetadata):
 
         Parameters
         ----------
-        row_labels : ArrayLike
+        row_labels : ArrayLike or None, default None
            List/Array containing the labels for rows in the histogram
-        col_labels : ArrayLike
+        col_labels : ArrayLike or None, default None
            List/Array containing the labels for columns in the histogram
        plot_classwise : bool, default False
            Whether to plot per-class balance instead of global balance
@@ -77,7 +77,7 @@ class DiversityOutput(OutputMetadata):
                 col_labels,
                 xlabel="Factors",
                 ylabel="Class",
-                cbarlabel=f"Normalized {self.method.title()} Index",
+                cbarlabel=f"Normalized {self.meta()['arguments']['method'].title()} Index",
             )
 
         else:
@@ -92,7 +92,7 @@ class DiversityOutput(OutputMetadata):
 def diversity_shannon(
     data: NDArray[Any],
     names: list[str],
-
+    continuous_factor_bincounts: Mapping[str, int] | None = None,
     subset_mask: NDArray[np.bool_] | None = None,
 ) -> NDArray[np.float64]:
     """
@@ -106,14 +106,16 @@ def diversity_shannon(
 
     Parameters
     ----------
-    data: NDArray
+    data : NDArray
         Array containing numerical values for metadata factors
-    names: list[str]
+    names : list[str]
         Names of metadata factors -- keys of the metadata dictionary
-
-
-
-
+    continuous_factor_bincounts : Mapping[str, int] or None, default None
+        The factors in names that have continuous values and the array of bin counts to
+        discretize values into. All factors are treated as having discrete values unless they
+        are specified as keys in this dictionary. Each element of this array must occur as a key
+        in names.
+    subset_mask : NDArray[np.bool_] or None, default None
         Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
 
     Note
@@ -122,18 +124,32 @@ def diversity_shannon(
 
     Returns
     -------
-    diversity_index: NDArray
+    diversity_index : NDArray[np.float64]
         Diversity index per column of X
 
     See Also
     --------
     numpy.histogram
     """
+    hist_cache = {}
 
     # entropy computed using global auto bins so that we can properly normalize
-    ent_unnormalized = entropy(
+    ent_unnormalized = entropy(
+        data,
+        names,
+        continuous_factor_bincounts,
+        normalized=False,
+        subset_mask=subset_mask,
+        hist_cache=hist_cache,
+    )
     # normalize by global counts rather than classwise counts
-    num_bins = get_num_bins(
+    num_bins = get_num_bins(
+        data,
+        names,
+        continuous_factor_bincounts=continuous_factor_bincounts,
+        subset_mask=subset_mask,
+        hist_cache=hist_cache,
+    )
     ent_norm = np.empty(ent_unnormalized.shape)
     ent_norm[num_bins != 1] = ent_unnormalized[num_bins != 1] / np.log(num_bins[num_bins != 1])
     ent_norm[num_bins == 1] = 0
@@ -143,7 +159,7 @@ def diversity_shannon(
 def diversity_simpson(
     data: NDArray[Any],
     names: list[str],
-
+    continuous_factor_bincounts: Mapping[str, int] | None = None,
     subset_mask: NDArray[np.bool_] | None = None,
 ) -> NDArray[np.float64]:
     """
@@ -157,14 +173,16 @@ def diversity_simpson(
 
     Parameters
     ----------
-    data: NDArray
+    data : NDArray
         Array containing numerical values for metadata factors
-    names: list[str]
+    names : list[str]
         Names of metadata factors -- keys of the metadata dictionary
-
-
-
-
+    continuous_factor_bincounts : Mapping[str, int] or None, default None
+        The factors in names that have continuous values and the array of bin counts to
+        discretize values into. All factors are treated as having discrete values unless they
+        are specified as keys in this dictionary. Each element of this array must occur as a key
+        in names.
+    subset_mask : NDArray[np.bool_] or None, default None
         Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
 
     Note
@@ -175,35 +193,39 @@ def diversity_simpson(
 
     Returns
     -------
-    NDArray
+    diversity_index : NDArray[np.float64]
         Diversity index per column of X
 
     See Also
    --------
     numpy.histogram
     """
+    hist_cache = {}
 
-    hist_counts
+    hist_counts = get_counts(data, names, continuous_factor_bincounts, subset_mask, hist_cache=hist_cache)
     # normalize by global counts, not classwise counts
-    num_bins = get_num_bins(data, names,
+    num_bins = get_num_bins(data, names, continuous_factor_bincounts, hist_cache=hist_cache)
 
     ev_index = np.empty(len(names))
     # loop over columns for convenience
     for col, cnts in enumerate(hist_counts.values()):
         # relative frequencies
-        p_i = cnts /
+        p_i = cnts / np.sum(cnts)
         # inverse Simpson index normalized by (number of bins)
-        s_0 = 1 / np.sum(p_i**2) / num_bins[col]
+        s_0 = 1 / np.sum(p_i**2)  # / num_bins[col]
         if num_bins[col] == 1:
             ev_index[col] = 0
         else:
-            ev_index[col] = (s_0
+            ev_index[col] = (s_0 - 1) / (num_bins[col] - 1)
     return ev_index
 
 
 @set_metadata()
 def diversity(
-    class_labels: ArrayLike,
+    class_labels: ArrayLike,
+    metadata: Mapping[str, ArrayLike],
+    continuous_factor_bincounts: Mapping[str, int] | None = None,
+    method: Literal["simpson", "shannon"] = "simpson",
 ) -> DiversityOutput:
     """
     Compute :term:`diversity<Diversity>` and classwise diversity for discrete/categorical variables and,
@@ -216,11 +238,16 @@ def diversity(
 
     Parameters
     ----------
-    class_labels: ArrayLike
+    class_labels : ArrayLike
         List of class labels for each image
-    metadata: Mapping[str, ArrayLike]
+    metadata : Mapping[str, ArrayLike]
         Dict of list of metadata factors for each image
-
+    continuous_factor_bincounts : Mapping[str, int] or None, default None
+        The factors in metadata that have continuous values and the array of bin counts to
+        discretize values into. All factors are treated as having discrete values unless they
+        are specified as keys in this dictionary. Each element of this array must occur as a key
+        in metadata.
+    method : {"simpson", "shannon"}, default "simpson"
         Indicates which diversity index should be computed
 
     Note
@@ -239,40 +266,42 @@ def diversity(
     -------
     Compute Simpson diversity index of metadata and class labels
 
-    >>> div_simp = diversity(class_labels, metadata, method="simpson")
+    >>> div_simp = diversity(class_labels, metadata, continuous_factor_bincounts, method="simpson")
     >>> div_simp.diversity_index
-    array([0.
+    array([0.72413793, 0.72413793, 0.88636364])
 
     >>> div_simp.classwise
-    array([[0.
-           [0.
+    array([[0.68965517, 0.69230769],
+           [0.8       , 1.        ]])
 
     Compute Shannon diversity index of metadata and class labels
 
-    >>> div_shan = diversity(class_labels, metadata, method="shannon")
+    >>> div_shan = diversity(class_labels, metadata, continuous_factor_bincounts, method="shannon")
     >>> div_shan.diversity_index
-    array([0.
+    array([0.8812909 , 0.8812909 , 0.96748876])
 
     >>> div_shan.classwise
-    array([[0.
-           [0.
+    array([[0.86312057, 0.91651644],
+           [0.91829583, 1.        ]])
 
     See Also
     --------
     numpy.histogram
     """
     diversity_fn = get_method({"simpson": diversity_simpson, "shannon": diversity_shannon}, method)
-    data, names,
-    diversity_index = diversity_fn(data, names,
+    data, names, _, unique_labels = preprocess_metadata(class_labels, metadata)
+    diversity_index = diversity_fn(data, names, continuous_factor_bincounts)
+
+    class_idx = names.index(CLASS_LABEL)
+    class_lbl = data[:, class_idx]
 
-
-    u_classes = np.unique(data[:, class_idx])
+    u_classes = np.unique(class_lbl)
     num_factors = len(names)
     diversity = np.empty((len(u_classes), num_factors))
     diversity[:] = np.nan
     for idx, cls in enumerate(u_classes):
-        subset_mask =
-        diversity[idx, :] = diversity_fn(data, names,
+        subset_mask = class_lbl == cls
+        diversity[idx, :] = diversity_fn(data, names, continuous_factor_bincounts, subset_mask)
     div_no_class = np.concatenate((diversity[:, :class_idx], diversity[:, (class_idx + 1) :]), axis=1)
 
-    return DiversityOutput(diversity_index, div_no_class, unique_labels, list(metadata.keys())
+    return DiversityOutput(diversity_index, div_no_class, unique_labels, list(metadata.keys()))
dataeval/metrics/bias/metadata.py
CHANGED
@@ -18,52 +18,80 @@ CLASS_LABEL = "class_label"
 
 
 def get_counts(
-    data: NDArray[
-
+    data: NDArray[Any],
+    names: list[str],
+    continuous_factor_bincounts: Mapping[str, int] | None = None,
+    subset_mask: NDArray[np.bool_] | None = None,
+    hist_cache: dict[str, NDArray[np.intp]] | None = None,
+) -> dict[str, NDArray[np.intp]]:
     """
     Initialize dictionary of histogram counts --- treat categorical values
     as histogram bins.
 
     Parameters
     ----------
-
+    data : NDArray
+        Array containing numerical values for metadata factors
+    names : list[str]
+        Names of metadata factors -- keys of the metadata dictionary
+    continuous_factor_bincounts : Mapping[str, int] or None, default None
+        The factors in names that have continuous values and the array of bin counts to
+        discretize values into. All factors are treated as having discrete values unless they
+        are specified as keys in this dictionary. Each element of this array must occur as a key
+        in names.
+        Names of metadata factors -- keys of the metadata dictionary
+    subset_mask : NDArray[np.bool_] or None, default None
         Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
+    hist_cache : dict[str, NDArray[np.intp]] or None, default None
+        Optional cache to store histogram counts
 
     Returns
     -------
-
+    dict[str, NDArray[np.intp]]
         histogram counts per metadata factor in `factors`. Each
         factor will have a different number of bins. Counts get reused
         across metrics, so hist_counts are cached but only if computed
         globally, i.e. without masked samples.
     """
 
-    hist_counts
-
-    mask =
+    hist_counts = {}
+
+    mask = subset_mask if subset_mask is not None else np.ones(data.shape[0], dtype=np.bool_)
 
     for cdx, fn in enumerate(names):
-
-
-        if is_categorical[cdx]:
-            # if discrete, use unique values as bins
-            bins, cnts = np.unique(col_data, return_counts=True)
+        if hist_cache is not None and fn in hist_cache:
+            cnts = hist_cache[fn]
         else:
-
-            cnts
+            hist_edges = np.array([-np.inf, np.inf])
+            cnts = np.array([len(data[:, cdx].squeeze())])
+            # linter doesn't like double indexing
+            col_data = np.array(data[mask, cdx].squeeze(), dtype=np.float64)
+
+            if continuous_factor_bincounts and fn in continuous_factor_bincounts:
+                num_bins = continuous_factor_bincounts[fn]
+                _, hist_edges = np.histogram(data[:, cdx].squeeze(), bins=num_bins, density=True)
+                hist_edges[-1] = np.inf
+                hist_edges[0] = -np.inf
+                disc_col_data = np.digitize(col_data, np.array(hist_edges))
+                _, cnts = np.unique(disc_col_data, return_counts=True)
+            else:
+                _, cnts = np.unique(col_data, return_counts=True)
+
+            if hist_cache is not None:
+                hist_cache[fn] = cnts
 
         hist_counts[fn] = cnts
-        hist_bins[fn] = bins
 
-    return hist_counts
+    return hist_counts
 
 
 def entropy(
     data: NDArray[Any],
     names: list[str],
-
+    continuous_factor_bincounts: Mapping[str, int] | None = None,
     normalized: bool = False,
     subset_mask: NDArray[np.bool_] | None = None,
+    hist_cache: dict[str, NDArray[np.intp]] | None = None,
 ) -> NDArray[np.float64]:
     """
     Meant for use with :term:`bias<Bias>` metrics, :term:`balance<Balance>`, :term:`diversity<Diversity>`,
@@ -74,19 +102,30 @@ def entropy(
 
     Parameters
     ----------
-
+    data : NDArray
+        Array containing numerical values for metadata factors
+    names : list[str]
+        Names of metadata factors -- keys of the metadata dictionary
+    continuous_factor_bincounts : Mapping[str, int] or None, default None
+        The factors in names that have continuous values and the array of bin counts to
+        discretize values into. All factors are treated as having discrete values unless they
+        are specified as keys in this dictionary. Each element of this array must occur as a key
+        in names.
+    normalized : bool, default False
         Flag that determines whether or not to normalize entropy by log(num_bins)
-    subset_mask: NDArray[np.bool_]
+    subset_mask : NDArray[np.bool_] or None, default None
         Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
+    hist_cache : dict[str, NDArray[np.intp]] or None, default None
+        Optional cache to store histogram counts
 
-
-
+    Notes
+    -----
     For continuous variables, histogram bins are chosen automatically. See
     numpy.histogram for details.
 
     Returns
     -------
-
+    NDArray[np.float64]
         Entropy estimate per column of X
 
     See Also
@@ -96,47 +135,64 @@ def entropy(
     """
 
     num_factors = len(names)
-    hist_counts
+    hist_counts = get_counts(data, names, continuous_factor_bincounts, subset_mask, hist_cache)
 
     ev_index = np.empty(num_factors)
     for col, cnts in enumerate(hist_counts.values()):
         # entropy in nats, normalizes counts
         ev_index[col] = sp_entropy(cnts)
         if normalized:
-
+            cnt_len = np.size(cnts, 0)
+            if cnt_len == 1:
                 # log(0)
                 ev_index[col] = 0
             else:
-                ev_index[col] /= np.log(
+                ev_index[col] /= np.log(cnt_len)
     return ev_index
 
 
 def get_num_bins(
-    data: NDArray[Any],
+    data: NDArray[Any],
+    names: list[str],
+    continuous_factor_bincounts: Mapping[str, int] | None = None,
+    subset_mask: NDArray[np.bool_] | None = None,
+    hist_cache: dict[str, NDArray[np.intp]] | None = None,
 ) -> NDArray[np.float64]:
     """
     Number of bins or unique values for each metadata factor, used to
-    normalize entropy
+    normalize entropy/diversity.
 
     Parameters
     ----------
-
+    data : NDArray
+        Array containing numerical values for metadata factors
+    names : list[str]
+        Names of metadata factors -- keys of the metadata dictionary
+    continuous_factor_bincounts : Mapping[str, int] or None, default None
+        The factors in names that have continuous values and the array of bin counts to
+        discretize values into. All factors are treated as having discrete values unless they
+        are specified as keys in this dictionary. Each element of this array must occur as a key
+        in names.
+    subset_mask : NDArray[np.bool_] or None, default None
        Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
+    hist_cache : dict[str, NDArray[np.intp]] or None, default None
+        Optional cache to store histogram counts
 
     Returns
     -------
     NDArray[np.float64]
+        Number of bins used in the discretization for each value in names.
     """
     # likely cached
-    hist_counts
+    hist_counts = get_counts(data, names, continuous_factor_bincounts, subset_mask, hist_cache)
     num_bins = np.empty(len(hist_counts))
     for idx, cnts in enumerate(hist_counts.values()):
-        num_bins[idx] =
+        num_bins[idx] = np.size(cnts, 0)
 
     return num_bins
 
 
-def infer_categorical(arr: NDArray[
+def infer_categorical(arr: NDArray[np.float64], threshold: float = 0.2) -> NDArray[np.bool_]:
     """
     Compute fraction of feature values that are unique --- intended to be used
     for inferring whether variables are categorical.
@@ -154,12 +210,16 @@ def infer_categorical(arr: NDArray[Any], threshold: float = 0.2) -> NDArray[Any]
 def preprocess_metadata(
     class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], cat_thresh: float = 0.2
 ) -> tuple[NDArray[Any], list[str], list[bool], NDArray[np.str_]]:
+    """
+    Formats metadata by organizing factor names, converting labels to numeric values,
+    adds class labels to the dataset structure, and marks which factors are categorical.
+    """
     # if class_labels is not numeric
     class_array = to_numpy(class_labels)
-    if not np.issubdtype(class_array.dtype, np.
+    if not np.issubdtype(class_array.dtype, np.integer):
         unique_classes, numerical_labels = np.unique(class_array, return_inverse=True)
     else:
-        numerical_labels = np.asarray(class_array, dtype=
+        numerical_labels = np.asarray(class_array, dtype=np.intp)
         unique_classes = np.unique(class_array)
 
     # convert class_labels and dict of lists to matrix of metadata values
@@ -170,7 +230,7 @@ def preprocess_metadata(
     # unique string receives a unique integer value.
     for k, v in metadata.items():
         if k == CLASS_LABEL:
-
+            continue
         # if not numeric
         v = to_numpy(v)
         if not np.issubdtype(v.dtype, np.number):
@@ -181,15 +241,18 @@ def preprocess_metadata(
 
     data = np.stack(list(preprocessed_metadata.values()), axis=-1)
     names = list(preprocessed_metadata.keys())
-    is_categorical = [
+    is_categorical = [
+        var == CLASS_LABEL or infer_categorical(preprocessed_metadata[var].astype(np.float64), cat_thresh)[0]
+        for var in names
+    ]
 
     return data, names, is_categorical, unique_classes
 
 
 def heatmap(
-    data:
-    row_labels: list[str] |
-    col_labels: list[str] |
+    data: ArrayLike,
+    row_labels: list[str] | ArrayLike,
+    col_labels: list[str] | ArrayLike,
     xlabel: str = "",
     ylabel: str = "",
     cbarlabel: str = "",
@@ -211,14 +274,23 @@ def heatmap(
         Y-axis label
     cbarlabel : str, default ""
         Label for the colorbar
+
+    Returns
+    -------
+    matplotlib.figure.Figure
+        Formatted heatmap
     """
-    import matplotlib
     import matplotlib.pyplot as plt
+    from matplotlib.ticker import FuncFormatter
+
+    np_data = to_numpy(data)
+    rows = row_labels if isinstance(row_labels, list) else to_numpy(row_labels)
+    cols = col_labels if isinstance(col_labels, list) else to_numpy(col_labels)
 
     fig, ax = plt.subplots(figsize=(10, 10))
 
     # Plot the heatmap
-    im = ax.imshow(
+    im = ax.imshow(np_data, vmin=0, vmax=1.0)
 
     # Create colorbar
     cbar = fig.colorbar(im, shrink=0.5)
@@ -227,8 +299,8 @@ def heatmap(
     cbar.set_label(cbarlabel, loc="center")
 
     # Show all ticks and label them with the respective list entries.
-    ax.set_xticks(np.arange(
-    ax.set_yticks(np.arange(
+    ax.set_xticks(np.arange(np_data.shape[1]), labels=cols)
+    ax.set_yticks(np.arange(np_data.shape[0]), labels=rows)
 
     ax.tick_params(top=False, bottom=True, labeltop=False, labelbottom=True)
     # Rotate the tick labels and set their alignment.
@@ -237,8 +309,8 @@ def heatmap(
     # Turn spines off and create white grid.
     ax.spines[:].set_visible(False)
 
-    ax.set_xticks(np.arange(
-    ax.set_yticks(np.arange(
+    ax.set_xticks(np.arange(np_data.shape[1] + 1) - 0.5, minor=True)
+    ax.set_yticks(np.arange(np_data.shape[0] + 1) - 0.5, minor=True)
     ax.grid(which="minor", color="w", linestyle="-", linewidth=3)
     ax.tick_params(which="minor", bottom=False, left=False)
 
@@ -247,7 +319,7 @@ def heatmap(
     if ylabel:
         ax.set_ylabel(ylabel)
 
-    valfmt =
+    valfmt = FuncFormatter(format_text)
 
     # Normalize the threshold to the images color range.
     threshold = im.norm(1.0) / 2.0
@@ -260,10 +332,10 @@ def heatmap(
     # Change the text's color depending on the data.
     textcolors = ("white", "black")
     texts = []
-    for i in range(
-        for j in range(
-            kw.update(color=textcolors[int(im.norm(
-            text = im.axes.text(j, i, valfmt(
+    for i in range(np_data.shape[0]):
+        for j in range(np_data.shape[1]):
+            kw.update(color=textcolors[int(im.norm(np_data[i, j]) > threshold)])
+            text = im.axes.text(j, i, valfmt(np_data[i, j], None), **kw)  # type: ignore
             texts.append(text)
 
     fig.tight_layout()
@@ -277,7 +349,7 @@ def format_text(*args: str) -> str:
 
     Parameters
     ----------
-    *args:
+    *args : tuple[str, str]
         Text to be formatted. Second element is ignored, but is a
         mandatory pass-through argument as per matplotlib.ticket.FuncFormatter
 
@@ -300,6 +372,11 @@ def diversity_bar_plot(labels: NDArray[Any], bar_heights: NDArray[Any]) -> Figur
         Array containing the labels for each bar
     bar_heights : NDArray
         Array containing the values for each bar
+
+    Returns
+    -------
+    matplotlib.figure.Figure
+        Bar plot figure
     """
     import matplotlib.pyplot as plt
 
@@ -322,6 +399,11 @@ def coverage_plot(images: NDArray[Any], num_images: int) -> Figure:
     ----------
     images : NDArray
         Array containing only the desired images to plot
+
+    Returns
+    -------
+    matplotlib.figure.Figure
+        Plot of all provided images
     """
     import matplotlib.pyplot as plt
 
@@ -336,7 +418,7 @@ def coverage_plot(images: NDArray[Any], num_images: int) -> Figure:
         f"Expected a (N,C,H,W) or a (N, H, W) set of images, but got a {images.ndim}-dimensional set of images."
     )
 
-    rows = np.ceil(num_images / 3)
+    rows = int(np.ceil(num_images / 3))
    fig, axs = plt.subplots(rows, 3, figsize=(9, 3 * rows))
 
    if rows == 1:
dataeval/metrics/bias/parity.py
CHANGED
@@ -28,7 +28,7 @@ class ParityOutput(Generic[TData], OutputMetadata):
         chi-squared score(s) of the test
     p_value : np.float64 | NDArray[np.float64]
         p-value(s) of the test
-    metadata_names: list[str] | None
+    metadata_names : list[str] | None
         Names of each metadata factor
     """
 
@@ -43,16 +43,16 @@ def digitize_factor_bins(continuous_values: NDArray[Any], bins: int, factor_name
 
     Parameters
     ----------
-    continuous_values: NDArray
+    continuous_values : NDArray
         The values to be digitized.
-    bins: int
+    bins : int
         The number of bins for the discrete values that continuous_values will be digitized into.
-    factor_name: str
+    factor_name : str
         The name of the factor to be digitized.
 
     Returns
     -------
-    NDArray
+    NDArray[np.intp]
         The digitized values
     """
 
@@ -70,17 +70,21 @@ def digitize_factor_bins(continuous_values: NDArray[Any], bins: int, factor_name
 
 
 def format_discretize_factors(
-    data: NDArray[Any],
+    data: NDArray[Any],
+    names: list[str],
+    is_categorical: list[bool],
+    continuous_factor_bincounts: Mapping[str, int] | None,
 ) -> dict[str, NDArray[Any]]:
     """
     Sets up the internal list of metadata factors.
 
     Parameters
     ----------
-
+    data : NDArray
         The dataset factors, which are per-image attributes including class label and metadata.
-
-
+    names : list[str]
+        The class label
+    continuous_factor_bincounts : Mapping[str, int] or None
         The factors in data_factors that have continuous values and the array of bin counts to
         discretize values into. All factors are treated as having discrete values unless they
         are specified as keys in this dictionary. Each element of this array must occur as a key
@@ -93,19 +97,20 @@ def format_discretize_factors(
         Each key is a metadata factor, whose value is the discrete per-image factor values.
     """
 
-
-
-
-
-
-
+    if continuous_factor_bincounts:
+        invalid_keys = set(continuous_factor_bincounts.keys()) - set(names)
+        if invalid_keys:
+            raise KeyError(
+                f"The continuous factor(s) {invalid_keys} do not exist in data_factors. Delete these "
+                "keys from `continuous_factor_names` or add corresponding entries to `data_factors`."
+            )
 
     warn = []
     metadata_factors = {}
     for i, name in enumerate(names):
         if name == CLASS_LABEL:
             continue
-        if name in continuous_factor_bincounts:
+        if continuous_factor_bincounts and name in continuous_factor_bincounts:
             metadata_factors[name] = digitize_factor_bins(data[:, i], continuous_factor_bincounts[name], name)
         elif not is_categorical[i]:
             warn.append(name)
@@ -132,14 +137,14 @@ def normalize_expected_dist(expected_dist: NDArray[Any], observed_dist: NDArray[
 
     Parameters
     ----------
-    expected_dist :
+    expected_dist : NDArray
         The expected label distribution. This array represents the anticipated distribution of labels.
-    observed_dist :
+    observed_dist : NDArray
         The observed label distribution. This array represents the actual distribution of labels in the dataset.
 
     Returns
     -------
-
+    NDArray
         The normalized expected distribution, scaled to have the same sum as the observed distribution.
 
     Raises
@@ -179,6 +184,8 @@ def validate_dist(label_dist: NDArray[Any], label_name: str) -> None:
     ----------
     label_dist : NDArray
         Array representing label distributions
+    label_name : str
+        String representing label name
 
     Raises
     ------
@@ -219,7 +226,7 @@ def label_parity(
         List of class labels in the expected dataset
     observed_labels : ArrayLike
         List of class labels in the observed dataset
-    num_classes : int
+    num_classes : int or None, default None
         The number of unique classes in the datasets. If not provided, the function will infer it
         from the set of unique labels in expected_labels and observed_labels
 
@@ -303,12 +310,12 @@ def parity(
 
     Parameters
     ----------
-    class_labels: ArrayLike
+    class_labels : ArrayLike
         List of class labels for each image
-    metadata: Mapping[str, ArrayLike]
+    metadata : Mapping[str, ArrayLike]
         The dataset factors, which are per-image metadata attributes.
         Each key of dataset_factors is a factor, whose value is the per-image factor values.
-    continuous_factor_bincounts : Mapping[str, int]
+    continuous_factor_bincounts : Mapping[str, int] or None, default None
         A dictionary specifying the number of bins for discretizing the continuous factors.
         The keys should correspond to the names of continuous factors in `metadata`,
        and the values should be the number of bins to use for discretization.
@@ -359,7 +366,6 @@ def parity(
     )
 
     data, names, is_categorical, _ = preprocess_metadata(class_labels, metadata)
-    continuous_factor_bincounts = continuous_factor_bincounts if continuous_factor_bincounts else {}
 
     factors = format_discretize_factors(data, names, is_categorical, continuous_factor_bincounts)
 
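With the removed fallback, a None value for continuous_factor_bincounts now flows through to format_discretize_factors unchanged, guarded by the truthiness checks above. A usage sketch with toy values, assuming the public import path dataeval.metrics.bias; the factor name and bin count below are illustrative:

import numpy as np

from dataeval.metrics.bias import parity  # assumed public re-export of parity()

rng = np.random.default_rng(1)
class_labels = rng.integers(0, 2, size=200)
metadata = {"time": rng.uniform(0.0, 24.0, size=200)}  # continuous factor

# "time" is continuous, so supply a bin count; passing
# continuous_factor_bincounts=None is also accepted.
out = parity(class_labels, metadata, continuous_factor_bincounts={"time": 4})
print(out.p_value, out.metadata_names)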
dataeval/utils/__init__.py
CHANGED
@@ -10,12 +10,12 @@ from dataeval.utils.split_dataset import split_dataset
 
 __all__ = ["split_dataset", "merge_metadata"]
 
-if _IS_TORCH_AVAILABLE:
+if _IS_TORCH_AVAILABLE:
     from dataeval.utils import torch
 
     __all__ += ["torch"]
 
-if _IS_TENSORFLOW_AVAILABLE:
+if _IS_TENSORFLOW_AVAILABLE:
     from dataeval.utils import tensorflow
 
     __all__ += ["tensorflow"]
dataeval/utils/shared.py
CHANGED
dataeval/utils/split_dataset.py
CHANGED
@@ -144,7 +144,7 @@ def check_groups(group_ids: NDArray[np.int_], num_partitions: int) -> bool:
     ----------
     group_ids : np.ndarray
         Identifies the group to which a sample at the same index belongs.
-    num_partitions: int
+    num_partitions : int
         How many total (train, val) folds will be generated (+1 if also specifying a test fold).
 
     Warns
@@ -242,12 +242,12 @@ def get_group_ids(metadata: dict[str, Any], group_names: list[str], num_samples:
 
     Returns
     -------
-    group_ids: np.ndarray
+    group_ids : np.ndarray
         group identifiers from metadata
     """
     features2group = {k: np.array(v) for k, v in metadata.items() if k in group_names}
     if not features2group:
-        return np.zeros(num_samples, dtype=
+        return np.zeros(num_samples, dtype=np.int_)
     for name, feature in features2group.items():
         if len(feature) != num_samples:
             raise IndexError(f"""Feature length does not match number of labels.
@@ -300,7 +300,13 @@ def make_splits(
     splits = splitter.split(index, labels)
     for train_idx, eval_idx in splits:
         test_ratio = len(eval_idx) / index.shape[0]
-        split_defs.append(
+        split_defs.append(
+            {
+                "train": train_idx.astype(np.int_),
+                "eval": eval_idx.astype(np.int_),
+                "eval_frac": test_ratio,
+            }
+        )
     return split_defs
 
 
@@ -318,9 +324,9 @@ def find_best_split(
     split_defs : list[dict]
         List of dictionaries, which specifying train index, validation index, and the ratio of
         validation to all data.
-    stratified: bool
+    stratified : bool
         If True, maintain dataset class balance within each train/val split
-    eval_frac: float
+    eval_frac : float
         Desired fraction of the dataset sequestered for evaluation
 
     Returns
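After this change, each fold record appended by make_splits is a plain dict with int-typed index arrays and the evaluation fraction. A sketch of consuming that structure (the fold contents below are illustrative, not produced by the package):

import numpy as np

# Shape of each entry appended by make_splits after this change.
split_defs = [
    {
        "train": np.array([0, 1, 2, 3], dtype=np.int_),
        "eval": np.array([4, 5], dtype=np.int_),
        "eval_frac": 2 / 6,
    }
]

for fold in split_defs:
    print(len(fold["train"]), len(fold["eval"]), f"{fold['eval_frac']:.2f}")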
dataeval/utils/torch/datasets.py
CHANGED
@@ -206,7 +206,7 @@ class MNIST(Dataset[tuple[NDArray[np.float64], int]]):
        Option to select specific classes from dataset.
    balance : bool, default True
        If True, returns equal number of samples for each class.
-    randomize : bool, default
+    randomize : bool, default True
        If True, shuffles the data prior to selection - uses a set seed for reproducibility.
    slice_back : bool, default False
        If True and size has a value greater than 0, then grabs selection starting at the last image.
@@ -251,7 +251,7 @@ class MNIST(Dataset[tuple[NDArray[np.float64], int]]):
        corruption: CorruptionStringMap | None = None,
        classes: TClassMap | None = None,
        balance: bool = True,
-       randomize: bool =
+       randomize: bool = True,
        slice_back: bool = False,
        verbose: bool = True,
    ) -> None:
dataeval/workflows/__init__.py
CHANGED
@@ -4,7 +4,7 @@ Workflows perform a sequence of actions to analyze the dataset and make predicti
 
 from dataeval import _IS_TORCH_AVAILABLE
 
-if _IS_TORCH_AVAILABLE:
+if _IS_TORCH_AVAILABLE:
     from dataeval.workflows.sufficiency import Sufficiency, SufficiencyOutput
 
 __all__ = ["Sufficiency", "SufficiencyOutput"]
{dataeval-0.73.0.dist-info → dataeval-0.73.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dataeval
-Version: 0.73.0
+Version: 0.73.1
 Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
 Home-page: https://dataeval.ai/
 License: MIT
{dataeval-0.73.0.dist-info → dataeval-0.73.1.dist-info}/RECORD
CHANGED
@@ -1,7 +1,7 @@
-dataeval/__init__.py,sha256=
-dataeval/detectors/__init__.py,sha256=
-dataeval/detectors/drift/__init__.py,sha256=
-dataeval/detectors/drift/base.py,sha256=
+dataeval/__init__.py,sha256=SdXxst_wmjSoQkYzGdR-JXSV-iJmKynWsiwkpmGDDPE,601
+dataeval/detectors/__init__.py,sha256=mwAyY54Hvp6N4D57cde3_besOinK8jVF43k0Mw4XZi8,363
+dataeval/detectors/drift/__init__.py,sha256=BSXm21y7cAawHep-ZldCJ5HOvzYjPzYGKGrmoEs3i0E,737
+dataeval/detectors/drift/base.py,sha256=xwI6C-PEH0ZjpSqP6No6WDZp42DnE16OHi_mXe2JSvI,14499
 dataeval/detectors/drift/cvm.py,sha256=kc59w2_wtxFGNnLcaJRvX5v_38gPXiebSGNiFVdunEQ,4142
 dataeval/detectors/drift/ks.py,sha256=gcpe1WIQeNeZdLYkdMZCFLXUp1bHMQUxwJE6-RLVOXs,4229
 dataeval/detectors/drift/mmd.py,sha256=TqGOnUNYKwpS0GQPV3dSl-_qRa0g2flmoQ-dxzW_JfY,7586
@@ -9,11 +9,11 @@ dataeval/detectors/drift/torch.py,sha256=D46J72OPW8-PpP3w9ODMBfcDSdailIgVjgHVFpb
 dataeval/detectors/drift/uncertainty.py,sha256=Xz2yzJjtJfw1vLag234jwRvaa_HK36nMajGx8bQaNRs,5322
 dataeval/detectors/drift/updates.py,sha256=UJ0z5hlunRi7twnkLABfdJG3tT2EqX4y9IGx8_USYvo,1780
 dataeval/detectors/linters/__init__.py,sha256=BvpaB1RUpkEhhXk3Mqi5NYoOcJKZRFSBOJCmQOIfYRU,483
-dataeval/detectors/linters/clusterer.py,sha256=
+dataeval/detectors/linters/clusterer.py,sha256=sau5A9YcQ6VDjbZGOIaCaRHW_63opaA31pqHo5Rm-hQ,21018
 dataeval/detectors/linters/duplicates.py,sha256=tOD43rJkvheIA3mznbUqHhft2yD3xRZQdCt61daIca4,5665
 dataeval/detectors/linters/merged_stats.py,sha256=X-bDTwjyR8RuVmzxLaHZmQ5nI3oOWvsqVlitdSncapk,1355
 dataeval/detectors/linters/outliers.py,sha256=BUVvtbKHo04KnRmrgb84MBr0l1gtcY3-xNCHjetFrEQ,10117
-dataeval/detectors/ood/__init__.py,sha256=
+dataeval/detectors/ood/__init__.py,sha256=yzvCszJ0KrX9Eu4S_ykC_jwC0uYGPjxY3Vyx9fU3zQk,641
 dataeval/detectors/ood/ae.py,sha256=XQ_rCsf0VWg_2YXt33XGe6ZgxEud1PfIl7TmBVP1GkM,2347
 dataeval/detectors/ood/aegmm.py,sha256=6UKv0uJYWAzu1F-cITFGly4w9y_t7wqg3OmVyCN365o,2041
 dataeval/detectors/ood/base.py,sha256=a_d52pJMWVmduSt8OvUWYwHE8mpCaI6pIAE4_ib_GOs,8841
@@ -26,11 +26,11 @@ dataeval/detectors/ood/vaegmm.py,sha256=_wwmT37URs0MyhbORk91XJExClv-4e15LH_Bj60P
 dataeval/interop.py,sha256=TZCkZo844DvzHoxuRo-YsBhT6GvKmyQTHtUEQZPly1M,1728
 dataeval/metrics/__init__.py,sha256=fPBNLd-T6mCErZBBJrxWmXIL0jCk7fNUYIcNEBkMa80,238
 dataeval/metrics/bias/__init__.py,sha256=puf645-hAO5hFHNHlZ239TPopqWIoN-uLGXFB8-hA_o,599
-dataeval/metrics/bias/balance.py,sha256=
-dataeval/metrics/bias/coverage.py,sha256=
-dataeval/metrics/bias/diversity.py,sha256=
-dataeval/metrics/bias/metadata.py,sha256=
-dataeval/metrics/bias/parity.py,sha256=
+dataeval/metrics/bias/balance.py,sha256=n4SM2Z46dzps_SPgHV8Q69msZ507AP9neebsQ45cNxc,9170
+dataeval/metrics/bias/coverage.py,sha256=7nDufCmQwZ8QG3Me5UiY0N5YoTByjcwK2zOYuMOHkJ0,4540
+dataeval/metrics/bias/diversity.py,sha256=BKGpyJ1K3S5RS_VxXN5DusB2gfRidOksL7r0L3SFa0Y,11018
+dataeval/metrics/bias/metadata.py,sha256=tPvyfFkfqWBFMX6v8i1ZLAA3DZfF6M4O7qXDdKzhQ6g,15040
+dataeval/metrics/bias/parity.py,sha256=_-WdKRWPlKHLNbjq-4mIhVdR1MI3NEabbMWblAmmVRM,17145
 dataeval/metrics/estimators/__init__.py,sha256=O6ocxJq8XDkfJWwXeJnnnzbOyRnFPKF4kTIVTTZYOA8,380
 dataeval/metrics/estimators/ber.py,sha256=SVT-BIC_GLs0l2l2NhWu4OpRbgn96w-OwTSoPHTnQbE,5037
 dataeval/metrics/estimators/divergence.py,sha256=pImaa216-YYTgGWDCSTcpJrC-dfl7150yVrPfW_TyGc,4293
@@ -46,12 +46,12 @@ dataeval/metrics/stats/pixelstats.py,sha256=x90O10IqVjEORtYwueFLvJnVYTxhPBOOx5HM
 dataeval/metrics/stats/visualstats.py,sha256=y0xIvst7epcajk8vz2jngiAiz0T7DZC-M97Rs1-vV9I,4950
 dataeval/output.py,sha256=jWXXNxFNBEaY1rN7Z-6LZl6bQT-I7z_wqr91Rhrdt_0,3061
 dataeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-dataeval/utils/__init__.py,sha256=
+dataeval/utils/__init__.py,sha256=FZLWDA7nMbHOcdg3701cVJpQmUp1Wxxk8h_qIrUQQjY,713
 dataeval/utils/image.py,sha256=KgC_1nW__nGN5q6bVZNvG4U_qIBdjcPATz9qe8f2XuA,1928
 dataeval/utils/lazy.py,sha256=M0iBHuJh4UPrSJPHZ0jhFwRSZhyjHJQx_KEf1OCkHD8,588
 dataeval/utils/metadata.py,sha256=A6VN7KbdiOA6rUQvUGKwDcvtOyjBer8bRW_wFxNhmW0,8556
-dataeval/utils/shared.py,sha256=
-dataeval/utils/split_dataset.py,sha256=
+dataeval/utils/shared.py,sha256=xvF3VLfyheVwJtdtDrneOobkKf7t-JTmf_w91FWXmqo,3616
+dataeval/utils/split_dataset.py,sha256=Ot1ZJhbIhVfcShYXF9MkWXak5odBXyuBdRh-noXh-MI,19555
 dataeval/utils/tensorflow/__init__.py,sha256=l4OjIA75JJXeNWDCkST1xtDMVYsw97lZ-9JXFBlyuYg,539
 dataeval/utils/tensorflow/_internal/gmm.py,sha256=RIFx8asEpi2kMf8JVzq9M3aAvNe9fjpJPf3BzWE-aeE,3787
 dataeval/utils/tensorflow/_internal/loss.py,sha256=TFhoNPgqeJtdpIHYobZPyzMpeWjzlFqzu5LCtthEUi4,4463
@@ -61,13 +61,13 @@ dataeval/utils/tensorflow/_internal/utils.py,sha256=lr5hKkAPbjMCUNIzMUIqbEddwbWQ
 dataeval/utils/tensorflow/loss/__init__.py,sha256=Q-66vt91Oe1ByYfo28tW32zXDq2MqQ2gngWgmIVmof8,227
 dataeval/utils/torch/__init__.py,sha256=lpkqfgyARUxgrV94cZESQv8PIP2p-UnwItZ_wIr0XzQ,675
 dataeval/utils/torch/blocks.py,sha256=HVhBTMMD5NA4qheMUgyol1KWiKZDIuc8k5j4RcMKmhk,1466
-dataeval/utils/torch/datasets.py,sha256=
+dataeval/utils/torch/datasets.py,sha256=10elNgLuH_FDX_CHE3y2Z215JN4-PQovQm5brcIJOeM,15021
 dataeval/utils/torch/models.py,sha256=0BsXmLK8W1OZ8nnEGb1f9LzIeCgtevQC37dvKS1v1vA,3236
 dataeval/utils/torch/trainer.py,sha256=EraOKiXxiMNiycStZNMR5yRz3ehgp87d9ewR9a9dV4w,5559
 dataeval/utils/torch/utils.py,sha256=FI4LJ6DvXFQJVff8fxSCP7LRkp8H9BIUgYX0kk7_Cuo,1537
-dataeval/workflows/__init__.py,sha256=
+dataeval/workflows/__init__.py,sha256=ef1MiVL5IuhlDXXbwsiAfafhnr7tD3TXF9GRusy9_O8,290
 dataeval/workflows/sufficiency.py,sha256=1jSYhH9i4oesmJYs5PZvWS1LGXf8ekOgNhpFtMPLPXk,18552
-dataeval-0.73.0.dist-info/LICENSE.txt,sha256=
-dataeval-0.73.0.dist-info/METADATA,sha256=
-dataeval-0.73.0.dist-info/WHEEL,sha256=
-dataeval-0.73.0.dist-info/RECORD,,
+dataeval-0.73.1.dist-info/LICENSE.txt,sha256=Kpzcfobf1HlqafF-EX6dQLw9TlJiaJzfgvLQFukyXYw,1060
+dataeval-0.73.1.dist-info/METADATA,sha256=C7xThIWgHNoZEdSiGEZr3VgDLRSzeT3TkFbn4nQgrK0,4714
+dataeval-0.73.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+dataeval-0.73.1.dist-info/RECORD,,
{dataeval-0.73.0.dist-info → dataeval-0.73.1.dist-info}/LICENSE.txt
File without changes
{dataeval-0.73.0.dist-info → dataeval-0.73.1.dist-info}/WHEEL
File without changes