dataeval 0.63.0__py3-none-any.whl → 0.65.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +4 -4
- dataeval/_internal/detectors/clusterer.py +47 -34
- dataeval/_internal/detectors/drift/base.py +53 -35
- dataeval/_internal/detectors/drift/cvm.py +5 -4
- dataeval/_internal/detectors/drift/ks.py +7 -6
- dataeval/_internal/detectors/drift/mmd.py +39 -19
- dataeval/_internal/detectors/drift/torch.py +6 -5
- dataeval/_internal/detectors/drift/uncertainty.py +7 -8
- dataeval/_internal/detectors/duplicates.py +57 -30
- dataeval/_internal/detectors/linter.py +40 -24
- dataeval/_internal/detectors/ood/ae.py +2 -1
- dataeval/_internal/detectors/ood/aegmm.py +2 -1
- dataeval/_internal/detectors/ood/base.py +37 -15
- dataeval/_internal/detectors/ood/llr.py +9 -8
- dataeval/_internal/detectors/ood/vae.py +2 -1
- dataeval/_internal/detectors/ood/vaegmm.py +2 -1
- dataeval/_internal/flags.py +42 -21
- dataeval/_internal/interop.py +3 -12
- dataeval/_internal/metrics/balance.py +188 -0
- dataeval/_internal/metrics/ber.py +123 -48
- dataeval/_internal/metrics/coverage.py +90 -74
- dataeval/_internal/metrics/divergence.py +101 -67
- dataeval/_internal/metrics/diversity.py +211 -0
- dataeval/_internal/metrics/parity.py +287 -155
- dataeval/_internal/metrics/stats.py +198 -317
- dataeval/_internal/metrics/uap.py +40 -29
- dataeval/_internal/metrics/utils.py +430 -0
- dataeval/_internal/models/tensorflow/losses.py +3 -3
- dataeval/_internal/models/tensorflow/trainer.py +3 -2
- dataeval/_internal/models/tensorflow/utils.py +4 -3
- dataeval/_internal/output.py +82 -0
- dataeval/_internal/utils.py +64 -0
- dataeval/_internal/workflows/sufficiency.py +96 -107
- dataeval/flags/__init__.py +2 -2
- dataeval/metrics/__init__.py +26 -7
- dataeval/utils/__init__.py +9 -0
- {dataeval-0.63.0.dist-info → dataeval-0.65.0.dist-info}/METADATA +1 -1
- dataeval-0.65.0.dist-info/RECORD +60 -0
- dataeval/_internal/functional/__init__.py +0 -0
- dataeval/_internal/functional/ber.py +0 -63
- dataeval/_internal/functional/coverage.py +0 -75
- dataeval/_internal/functional/divergence.py +0 -16
- dataeval/_internal/functional/hash.py +0 -79
- dataeval/_internal/functional/metadata.py +0 -136
- dataeval/_internal/functional/metadataparity.py +0 -190
- dataeval/_internal/functional/uap.py +0 -6
- dataeval/_internal/functional/utils.py +0 -158
- dataeval/_internal/maite/__init__.py +0 -0
- dataeval/_internal/maite/utils.py +0 -30
- dataeval/_internal/metrics/base.py +0 -92
- dataeval/_internal/metrics/metadata.py +0 -610
- dataeval/_internal/metrics/metadataparity.py +0 -67
- dataeval-0.63.0.dist-info/RECORD +0 -68
- {dataeval-0.63.0.dist-info → dataeval-0.65.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.63.0.dist-info → dataeval-0.65.0.dist-info}/WHEEL +0 -0
dataeval/_internal/metrics/balance.py
@@ -0,0 +1,188 @@
+import warnings
+from dataclasses import dataclass
+from typing import Dict, List, Sequence
+
+import numpy as np
+from numpy.typing import NDArray
+from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
+
+from dataeval._internal.metrics.utils import entropy, preprocess_metadata
+from dataeval._internal.output import OutputMetadata, set_metadata
+
+
+@dataclass(frozen=True)
+class BalanceOutput(OutputMetadata):
+    """
+    Attributes
+    ----------
+    mutual_information : NDArray[np.float64]
+        Estimate of mutual information between metadata factors and class label
+    """
+
+    mutual_information: NDArray[np.float64]
+
+
+def validate_num_neighbors(num_neighbors: int) -> int:
+    if not isinstance(num_neighbors, (int, float)):
+        raise TypeError(
+            f"Variable {num_neighbors} is not real-valued numeric type."
+            "num_neighbors should be an int, greater than 0 and less than"
+            "the number of samples in the dataset"
+        )
+    if num_neighbors < 1:
+        raise ValueError(
+            f"Invalid value for {num_neighbors}."
+            "Choose a value greater than 0 and less than number of samples"
+            "in the dataset."
+        )
+    if isinstance(num_neighbors, float):
+        num_neighbors = int(num_neighbors)
+        warnings.warn(f"Variable {num_neighbors} is currently type float and will be truncated to type int.")
+
+    return num_neighbors
+
+
+@set_metadata("dataeval.metrics")
+def balance(class_labels: Sequence[int], metadata: List[Dict], num_neighbors: int = 5) -> BalanceOutput:
+    """
+    Mutual information (MI) between factors (class label, metadata, label/image properties)
+
+    Parameters
+    ----------
+    class_labels: Sequence[int]
+        List of class labels for each image
+    metadata: List[Dict]
+        List of metadata factors for each image
+    num_neighbors: int, default 5
+        Number of nearest neighbors to use for computing MI between discrete
+        and continuous variables.
+
+    Returns
+    -------
+    BalanceOutput
+        (num_factors+1) x (num_factors+1) estimate of mutual information
+        between num_factors metadata factors and class label. Symmetry is enforced.
+
+    Notes
+    -----
+    We use `mutual_info_classif` from sklearn since class label is categorical.
+    `mutual_info_classif` outputs are consistent up to O(1e-4) and depend on a random
+    seed. MI is computed differently for categorical and continuous variables, and
+    we attempt to infer whether a variable is categorical by the fraction of unique
+    values in the dataset.
+
+    See Also
+    --------
+    sklearn.feature_selection.mutual_info_classif
+    sklearn.feature_selection.mutual_info_regression
+    sklearn.metrics.mutual_info_score
+    """
+    num_neighbors = validate_num_neighbors(num_neighbors)
+    data, names, is_categorical = preprocess_metadata(class_labels, metadata)
+    num_factors = len(names)
+    mi = np.empty((num_factors, num_factors))
+    mi[:] = np.nan
+
+    for idx in range(num_factors):
+        tgt = data[:, idx]
+
+        if is_categorical[idx]:
+            if tgt.dtype == float:
+                # map to unique integers if categorical
+                _, tgt = np.unique(tgt, return_inverse=True)
+            # categorical target
+            mi[idx, :] = mutual_info_classif(
+                data,
+                tgt,
+                discrete_features=is_categorical,  # type: ignore
+                n_neighbors=num_neighbors,
+            )
+        else:
+            # continuous variables
+            mi[idx, :] = mutual_info_regression(
+                data,
+                tgt,
+                discrete_features=is_categorical,  # type: ignore
+                n_neighbors=num_neighbors,
+            )
+
+    ent_all = entropy(data, names, is_categorical, normalized=False)
+    norm_factor = 0.5 * np.add.outer(ent_all, ent_all) + 1e-6
+    # in principle MI should be symmetric, but it is not in practice.
+    nmi = 0.5 * (mi + mi.T) / norm_factor
+
+    return BalanceOutput(nmi)
+
+
+@set_metadata("dataeval.metrics")
+def balance_classwise(class_labels: Sequence[int], metadata: List[Dict], num_neighbors: int = 5) -> BalanceOutput:
+    """
+    Compute mutual information (analogous to correlation) between metadata factors
+    (class label, metadata, label/image properties) with individual class labels.
+
+    Parameters
+    ----------
+    class_labels: Sequence[int]
+        List of class labels for each image
+    metadata: List[Dict]
+        List of metadata factors for each image
+    num_neighbors: int, default 5
+        Number of nearest neighbors to use for computing MI between discrete
+        and continuous variables.
+
+    Notes
+    -----
+    We use `mutual_info_classif` from sklearn since class label is categorical.
+    `mutual_info_classif` outputs are consistent up to O(1e-4) and depend on a random
+    seed. MI is computed differently for categorical and continuous variables, so we
+    have to specify with is_categorical.
+
+    Returns
+    -------
+    BalanceOutput
+        (num_classes x num_factors) estimate of mutual information between
+        num_factors metadata factors and individual class labels.
+
+    See Also
+    --------
+    sklearn.feature_selection.mutual_info_classif
+    sklearn.feature_selection.mutual_info_regression
+    sklearn.metrics.mutual_info_score
+    compute_mutual_information
+    """
+    num_neighbors = validate_num_neighbors(num_neighbors)
+    data, names, is_categorical = preprocess_metadata(class_labels, metadata)
+    num_factors = len(names)
+    # unique class labels
+    class_idx = names.index("class_label")
+    class_data = data[:, class_idx]
+    u_cls = np.unique(class_data)
+    num_classes = len(u_cls)
+
+    data_no_class = np.concatenate((data[:, :class_idx], data[:, (class_idx + 1) :]), axis=1)
+
+    # assume class is a factor
+    mi = np.empty((num_classes, num_factors - 1))
+    mi[:] = np.nan
+
+    # categorical variables, excluding class label
+    cat_mask = np.concatenate((is_categorical[:class_idx], is_categorical[(class_idx + 1) :]), axis=0).astype(int)
+
+    # classification MI for discrete/categorical features
+    for idx, cls in enumerate(u_cls):
+        tgt = class_data == cls
+        # units: nat
+        mi[idx, :] = mutual_info_classif(
+            data_no_class,
+            tgt,
+            discrete_features=cat_mask,  # type: ignore
+            n_neighbors=num_neighbors,
+        )
+
+    # let this recompute for all features including class label
+    ent_all = entropy(data, names, is_categorical)
+    ent_tgt = ent_all[class_idx]
+    ent_all = np.concatenate((ent_all[:class_idx], ent_all[(class_idx + 1) :]), axis=0)
+    norm_factor = 0.5 * np.add.outer(ent_tgt, ent_all) + 1e-6
+    nmi = mi / norm_factor
+    return BalanceOutput(nmi)
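For orientation, here is a minimal usage sketch of the new functional balance API above. This is an assumption-laden illustration, not from the package docs: the import path dataeval.metrics is inferred from the @set_metadata("dataeval.metrics") decorator and the updated dataeval/metrics/__init__.py, and the factor names ("altitude", "weather") and value encodings are hypothetical, since the accepted metadata formats depend on preprocess_metadata.

import numpy as np

from dataeval.metrics import balance, balance_classwise  # import path assumed

rng = np.random.default_rng(0)
n = 100
class_labels = rng.integers(0, 2, size=n).tolist()  # one class label per image
metadata = [
    # hypothetical factors: one continuous, one categorical (integer-encoded)
    {"altitude": float(rng.normal(1000.0, 100.0)), "weather": int(rng.integers(0, 3))}
    for _ in range(n)
]

# (num_factors+1) x (num_factors+1) normalized MI matrix, class label included
nmi = balance(class_labels, metadata, num_neighbors=5).mutual_information
print(nmi.shape)

# num_classes x num_factors MI against each individual class label
nmi_cw = balance_classwise(class_labels, metadata).mutual_information
print(nmi_cw.shape)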
dataeval/_internal/metrics/ber.py
@@ -7,68 +7,143 @@ Learning to Bound the Multi-class Bayes Error (Th. 3 and Th. 4)
 https://arxiv.org/abs/1811.06419
 """
 
-from
+from dataclasses import dataclass
+from typing import Literal, Tuple
 
 import numpy as np
+from numpy.typing import ArrayLike, NDArray
+from scipy.sparse import coo_matrix
+from scipy.stats import mode
 
-from dataeval._internal.
-from dataeval._internal.
-from dataeval._internal.
+from dataeval._internal.interop import to_numpy
+from dataeval._internal.metrics.utils import compute_neighbors, get_classes_counts, get_method, minimum_spanning_tree
+from dataeval._internal.output import OutputMetadata, set_metadata
 
-_METHODS = Literal["MST", "KNN"]
-_FUNCTION = Callable[[np.ndarray, np.ndarray, int], Tuple[float, float]]
 
+@dataclass(frozen=True)
+class BEROutput(OutputMetadata):
+    """
+    Attributes
+    ----------
+    ber : float
+        The upper bounds of the Bayes Error Rate
+    ber_lower : float
+        The lower bounds of the Bayes Error Rate
+    """
+
+    ber: float
+    ber_lower: float
+
+
+def ber_mst(X: NDArray, y: NDArray) -> Tuple[float, float]:
+    """Calculates the Bayes Error Rate using a minimum spanning tree
+
+    Parameters
+    ----------
+    X : NDArray, shape - (N, ... )
+        n_samples containing n_features
+    y : NDArray, shape - (N, 1)
+        Labels corresponding to each sample
+
+    Returns
+    -------
+    Tuple[float, float]
+        The upper and lower bounds of the bayes error rate
+    """
+    M, N = get_classes_counts(y)
+
+    tree = coo_matrix(minimum_spanning_tree(X))
+    matches = np.sum([y[tree.row[i]] != y[tree.col[i]] for i in range(N - 1)])
+    deltas = matches / (2 * N)
+    upper = 2 * deltas
+    lower = ((M - 1) / (M)) * (1 - max(1 - 2 * ((M) / (M - 1)) * deltas, 0) ** 0.5)
+    return upper, lower
+
+
+def ber_knn(X: NDArray, y: NDArray, k: int) -> Tuple[float, float]:
+    """Calculates the Bayes Error Rate using K-nearest neighbors
+
+    Parameters
+    ----------
+    X : NDArray, shape - (N, ... )
+        n_samples containing n_features
+    y : NDArray, shape - (N, 1)
+        Labels corresponding to each sample
+
+    Returns
+    -------
+    Tuple[float, float]
+        The upper and lower bounds of the bayes error rate
+    """
+    M, N = get_classes_counts(y)
+    nn_indices = compute_neighbors(X, X, k=k)
+    nn_indices = np.expand_dims(nn_indices, axis=1) if nn_indices.ndim == 1 else nn_indices
+    modal_class = mode(y[nn_indices], axis=1, keepdims=True).mode.squeeze()
+    upper = float(np.count_nonzero(modal_class - y) / N)
+    lower = knn_lowerbound(upper, M, k)
+    return upper, lower
+
+
+def knn_lowerbound(value: float, classes: int, k: int) -> float:
+    """Several cases for computing the BER lower bound"""
+    if value <= 1e-10:
+        return 0.0
+
+    if classes == 2 and k != 1:
+        if k > 5:
+            # Property 2 (Devroye, 1981) cited in Snoopy paper, not in snoopy repo
+            alpha = 0.3399
+            beta = 0.9749
+            a_k = alpha * np.sqrt(k) / (k - 3.25) * (1 + beta / (np.sqrt(k - 3)))
+            return value / (1 + a_k)
+        if k > 2:
+            return value / (1 + (1 / np.sqrt(k)))
+        # k == 2:
+        return value / 2
+
+    return ((classes - 1) / classes) * (1 - np.sqrt(max(0, 1 - ((classes / (classes - 1)) * value))))
+
+
+BER_FN_MAP = {"KNN": ber_knn, "MST": ber_mst}
 
-
+
+@set_metadata("dataeval.metrics")
+def ber(images: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN", "MST"] = "KNN") -> BEROutput:
     """
     An estimator for Multi-class Bayes Error Rate using FR or KNN test statistic basis
 
     Parameters
     ----------
-
-
+    images : ArrayLike (N, ... )
+        Array of images or image embeddings
+    labels : ArrayLike (N, 1)
+        Array of labels for each image or image embedding
     k : int, default 1
-
+        Number of nearest neighbors for KNN estimator -- ignored by MST estimator
+    method : Literal["KNN", "MST"], default "KNN"
+        Method to use when estimating the Bayes error rate
 
+    Returns
+    -------
+    BEROutput
+        The upper and lower bounds of the Bayes Error Rate
 
-
+    References
+    ----------
+    [1] `Learning to Bound the Multi-class Bayes Error (Th. 3 and Th. 4) <https://arxiv.org/abs/1811.06419>`_
+
+    Examples
     --------
-
+    >>> import sklearn.datasets as dsets
+    >>> from dataeval.metrics import ber
 
-
+    >>> images, labels = dsets.make_blobs(n_samples=50, centers=2, n_features=2, random_state=0)
 
-
-
-
-
-
-
-
-
-    def evaluate(self, images: ArrayLike, labels: ArrayLike) -> Dict[str, float]:
-        """
-        Calculates the Bayes Error Rate estimate using the provided method
-
-        Parameters
-        ----------
-        images : ArrayLike (N, : )
-            Array of images or image embeddings
-        labels : ArrayLike (N, 1)
-            Array of labels for each image or image embedding
-
-        Returns
-        -------
-        Dict[str, float]
-            ber : float
-                The estimated lower bounds of the Bayes Error Rate
-            ber_lower : float
-                The estimated upper bounds of the Bayes Error Rate
-
-        Raises
-        ------
-        ValueError
-            If unique classes M < 2
-        """
-
-        upper, lower = self._method(to_numpy(images), to_numpy(labels), self.k)
-        return {"ber": upper, "ber_lower": lower}
+    >>> ber(images, labels)
+    BEROutput(ber=0.04, ber_lower=0.020416847668728033)
+    """
+    ber_fn = get_method(BER_FN_MAP, method)
+    X = to_numpy(images)
+    y = to_numpy(labels)
+    upper, lower = ber_fn(X, y, k) if method == "KNN" else ber_fn(X, y)
+    return BEROutput(upper, lower)
dataeval/_internal/metrics/coverage.py
@@ -1,18 +1,49 @@
-
+import math
+from dataclasses import dataclass
+from typing import Literal
 
 import numpy as np
+from numpy.typing import ArrayLike, NDArray
+from scipy.spatial.distance import pdist, squareform
 
-from dataeval._internal.
-from dataeval._internal.
-from dataeval._internal.
+from dataeval._internal.interop import to_numpy
+from dataeval._internal.metrics.utils import flatten
+from dataeval._internal.output import OutputMetadata, set_metadata
 
 
-
+@dataclass(frozen=True)
+class CoverageOutput(OutputMetadata):
+    """
+    Attributes
+    ----------
+    indices : NDArray
+        Array of uncovered indices
+    radii : NDArray
+        Array of critical value radii
+    critical_value : float
+        Radius for coverage
+    """
+
+    indices: NDArray[np.intp]
+    radii: NDArray[np.float64]
+    critical_value: float
+
+
+@set_metadata("dataeval.metrics")
+def coverage(
+    embeddings: ArrayLike,
+    radius_type: Literal["adaptive", "naive"] = "adaptive",
+    k: int = 20,
+    percent: np.float64 = np.float64(0.01),
+) -> CoverageOutput:
     """
     Class for evaluating coverage and identifying images/samples that are in undercovered regions.
 
     Parameters
     ----------
+    embeddings : ArrayLike, shape - (N, P)
+        A dataset in an ArrayLike format.
+        Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
     radius_type : Literal["adaptive", "naive"], default "adaptive"
         The function used to determine radius.
     k: int, default 20
@@ -21,76 +52,61 @@ class Coverage(EvaluateMixin):
     percent: np.float64, default np.float(0.01)
         Percent of observations to be considered uncovered. Only applies to adaptive radius.
 
+    Returns
+    -------
+    CoverageOutput
+        Array of uncovered indices, critical value radii, and the radius for coverage
+
+    Raises
+    ------
+    ValueError
+        If length of embeddings is less than or equal to k
+    ValueError
+        If radius_type is unknown
+
+    Note
+    ----
+    Embeddings should be on the unit interval.
+
+    Example
+    -------
+    >>> coverage(embeddings)
+    CoverageOutput(indices=array([], dtype=int64), radii=array([0.59307666, 0.56956307, 0.56328616, 0.70660265, 0.57778087,
+           0.53738624, 0.58968217, 1.27721334, 0.84378694, 0.67767021,
+           0.69680335, 1.35532621, 0.59764166, 0.8691945 , 0.83627602,
+           0.84187303, 0.62212358, 1.09039732, 0.67956797, 0.60134383,
+           0.83713908, 0.91784263, 1.12901193, 0.73907618, 0.63943983,
+           0.61188447, 0.47872713, 0.57207771, 0.92885883, 0.54750511,
+           0.83015726, 1.20721778, 0.50421928, 0.98312246, 0.59764166,
+           0.61009202, 0.73864073, 1.0381061 , 0.77598609, 0.72984036,
+           0.67573006, 0.48056064, 1.00050879, 0.89532971, 0.58395529,
+           0.95954793, 0.60134383, 1.10096454, 0.51955314, 0.73038702]), critical_value=0)
+
     Reference
     ---------
     This implementation is based on https://dl.acm.org/doi/abs/10.1145/3448016.3457315.
     [1] Seymour Sudman. 1976. Applied sampling. Academic Press New York (1976).
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    k
-
-
-
-
-
-
-
-
-    Perform a one-way chi-squared test between observation frequencies and expected frequencies that
-    tests the null hypothesis that the observed data has the expected frequencies.
-
-    Parameters
-    ----------
-    embeddings : ArrayLike, shape - (N, P)
-        A dataset in an ArrayLike format.
-        Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
-
-    Returns
-    -------
-    np.ndarray
-        Array of uncovered indices
-    np.ndarray
-        Array of critical value radii
-    float
-        Radius for coverage
-
-    Raises
-    ------
-    ValueError
-        If length of embeddings is less than or equal to k
-    ValueError
-        If radius_type is unknown
-
-    Note
-    ----
-    Embeddings should be on the unit interval.
-
-    Example
-    -------
-    >>> cover.evaluate(embeddings)
-    (array([31,  7, 22, 37, 11]), array([0.35938604, 0.26462789, 0.20319609, 0.34140912, 0.31069921,
-           0.2308378 , 0.33300179, 0.69881025, 0.53587532, 0.35689803,
-           0.39333634, 0.67497874, 0.21788128, 0.43510162, 0.38601861,
-           0.34171868, 0.16941337, 0.66438044, 0.20319609, 0.19732733,
-           0.48660288, 0.5135814 , 0.69352653, 0.26946943, 0.31120605,
-           0.33067705, 0.30508271, 0.32802489, 0.51805702, 0.31120605,
-           0.40843265, 0.74996768, 0.31069921, 0.52263763, 0.26654013,
-           0.33113507, 0.40814838, 0.67723008, 0.48124375, 0.37243185,
-           0.29760001, 0.30907904, 0.59023236, 0.57778087, 0.21839853,
-           0.46067782, 0.31078966, 0.65199049, 0.26410603, 0.19542706]))
-    """
-
-    return coverage(to_numpy(embeddings), self.radius_type, self.k, self.percent)
+    """  # noqa: E501
+
+    # Calculate distance matrix, look at the (k+1)th farthest neighbor for each image.
+    embeddings = to_numpy(embeddings)
+    n = len(embeddings)
+    if n <= k:
+        raise ValueError(
+            f"Number of observations n={n} is less than or equal to the specified number of neighbors k={k}."
+        )
+    mat = squareform(pdist(flatten(embeddings))).astype(np.float64)
+    sorted_dists = np.sort(mat, axis=1)
+    crit = sorted_dists[:, k + 1]
+
+    d = embeddings.shape[1]
+    if radius_type == "naive":
+        rho = (1 / math.sqrt(math.pi)) * ((2 * k * math.gamma(d / 2 + 1)) / (n)) ** (1 / d)
+        pvals = np.where(crit > rho)[0]
+    elif radius_type == "adaptive":
+        # Use data adaptive cutoff as rho
+        rho = int(n * percent)
+        pvals = np.argsort(crit)[::-1][:rho]
+    else:
+        raise ValueError(f"{radius_type} is an invalid radius type. Expected 'adaptive' or 'naive'")
+    return CoverageOutput(pvals, crit, rho)