dataeval 0.63.0__tar.gz → 0.64.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. {dataeval-0.63.0 → dataeval-0.64.0}/PKG-INFO +1 -1
  2. {dataeval-0.63.0 → dataeval-0.64.0}/pyproject.toml +1 -1
  3. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/__init__.py +3 -3
  4. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/detectors/clusterer.py +2 -1
  5. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/detectors/drift/base.py +2 -1
  6. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/detectors/drift/cvm.py +2 -1
  7. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/detectors/drift/ks.py +2 -1
  8. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/detectors/drift/mmd.py +4 -3
  9. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/detectors/drift/uncertainty.py +1 -2
  10. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/detectors/duplicates.py +2 -1
  11. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/detectors/linter.py +1 -1
  12. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/detectors/ood/ae.py +2 -1
  13. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/detectors/ood/aegmm.py +2 -1
  14. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/detectors/ood/base.py +2 -1
  15. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/detectors/ood/llr.py +3 -2
  16. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/detectors/ood/vae.py +2 -1
  17. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/detectors/ood/vaegmm.py +2 -1
  18. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/interop.py +2 -11
  19. dataeval-0.64.0/src/dataeval/_internal/metrics/balance.py +180 -0
  20. dataeval-0.64.0/src/dataeval/_internal/metrics/base.py +10 -0
  21. dataeval-0.64.0/src/dataeval/_internal/metrics/ber.py +148 -0
  22. {dataeval-0.63.0/src/dataeval/_internal/functional → dataeval-0.64.0/src/dataeval/_internal/metrics}/coverage.py +44 -14
  23. dataeval-0.64.0/src/dataeval/_internal/metrics/divergence.py +102 -0
  24. dataeval-0.64.0/src/dataeval/_internal/metrics/diversity.py +206 -0
  25. dataeval-0.64.0/src/dataeval/_internal/metrics/parity.py +309 -0
  26. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/metrics/stats.py +7 -5
  27. dataeval-0.64.0/src/dataeval/_internal/metrics/uap.py +50 -0
  28. dataeval-0.64.0/src/dataeval/_internal/metrics/utils.py +393 -0
  29. dataeval-0.64.0/src/dataeval/_internal/utils.py +64 -0
  30. dataeval-0.64.0/src/dataeval/metrics/__init__.py +27 -0
  31. dataeval-0.64.0/src/dataeval/utils/__init__.py +9 -0
  32. dataeval-0.63.0/src/dataeval/_internal/functional/ber.py +0 -63
  33. dataeval-0.63.0/src/dataeval/_internal/functional/divergence.py +0 -16
  34. dataeval-0.63.0/src/dataeval/_internal/functional/hash.py +0 -79
  35. dataeval-0.63.0/src/dataeval/_internal/functional/metadata.py +0 -136
  36. dataeval-0.63.0/src/dataeval/_internal/functional/metadataparity.py +0 -190
  37. dataeval-0.63.0/src/dataeval/_internal/functional/uap.py +0 -6
  38. dataeval-0.63.0/src/dataeval/_internal/functional/utils.py +0 -158
  39. dataeval-0.63.0/src/dataeval/_internal/maite/utils.py +0 -30
  40. dataeval-0.63.0/src/dataeval/_internal/metrics/base.py +0 -92
  41. dataeval-0.63.0/src/dataeval/_internal/metrics/ber.py +0 -74
  42. dataeval-0.63.0/src/dataeval/_internal/metrics/coverage.py +0 -96
  43. dataeval-0.63.0/src/dataeval/_internal/metrics/divergence.py +0 -102
  44. dataeval-0.63.0/src/dataeval/_internal/metrics/metadata.py +0 -610
  45. dataeval-0.63.0/src/dataeval/_internal/metrics/metadataparity.py +0 -67
  46. dataeval-0.63.0/src/dataeval/_internal/metrics/parity.py +0 -164
  47. dataeval-0.63.0/src/dataeval/_internal/metrics/uap.py +0 -42
  48. dataeval-0.63.0/src/dataeval/_internal/models/tensorflow/__init__.py +0 -0
  49. dataeval-0.63.0/src/dataeval/_internal/workflows/__init__.py +0 -0
  50. dataeval-0.63.0/src/dataeval/metrics/__init__.py +0 -8
  51. {dataeval-0.63.0 → dataeval-0.64.0}/LICENSE.txt +0 -0
  52. {dataeval-0.63.0 → dataeval-0.64.0}/README.md +0 -0
  53. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/detectors/__init__.py +0 -0
  54. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/detectors/drift/__init__.py +0 -0
  55. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/detectors/drift/torch.py +0 -0
  56. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/detectors/ood/__init__.py +0 -0
  57. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/flags.py +0 -0
  58. {dataeval-0.63.0/src/dataeval/_internal/functional → dataeval-0.64.0/src/dataeval/_internal/metrics}/__init__.py +0 -0
  59. {dataeval-0.63.0/src/dataeval/_internal/maite → dataeval-0.64.0/src/dataeval/_internal/models}/__init__.py +0 -0
  60. {dataeval-0.63.0/src/dataeval/_internal/metrics → dataeval-0.64.0/src/dataeval/_internal/models/pytorch}/__init__.py +0 -0
  61. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/models/pytorch/autoencoder.py +0 -0
  62. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/models/pytorch/blocks.py +0 -0
  63. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/models/pytorch/utils.py +0 -0
  64. {dataeval-0.63.0/src/dataeval/_internal/models → dataeval-0.64.0/src/dataeval/_internal/models/tensorflow}/__init__.py +0 -0
  65. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/models/tensorflow/autoencoder.py +0 -0
  66. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/models/tensorflow/gmm.py +0 -0
  67. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/models/tensorflow/losses.py +0 -0
  68. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/models/tensorflow/pixelcnn.py +0 -0
  69. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/models/tensorflow/trainer.py +0 -0
  70. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/models/tensorflow/utils.py +0 -0
  71. {dataeval-0.63.0/src/dataeval/_internal/models/pytorch → dataeval-0.64.0/src/dataeval/_internal/workflows}/__init__.py +0 -0
  72. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/_internal/workflows/sufficiency.py +0 -0
  73. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/detectors/__init__.py +0 -0
  74. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/flags/__init__.py +0 -0
  75. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/models/__init__.py +0 -0
  76. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/models/tensorflow/__init__.py +0 -0
  77. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/models/torch/__init__.py +0 -0
  78. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/py.typed +0 -0
  79. {dataeval-0.63.0 → dataeval-0.64.0}/src/dataeval/workflows/__init__.py +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dataeval
-Version: 0.63.0
+Version: 0.64.0
 Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
 Home-page: https://dataeval.ai/
 License: MIT

pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dataeval"
-version = "0.63.0" # dynamic
+version = "0.64.0" # dynamic
 description = "DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks"
 license = "MIT"
 readme = "README.md"

src/dataeval/__init__.py
@@ -2,14 +2,14 @@ from importlib.util import find_spec

 from . import detectors, flags, metrics

-__version__ = "0.63.0"
+__version__ = "0.64.0"

 __all__ = ["detectors", "flags", "metrics"]

 if find_spec("torch") is not None:  # pragma: no cover
-    from . import models, workflows
+    from . import models, utils, workflows

-    __all__ += ["models", "workflows"]
+    __all__ += ["models", "utils", "workflows"]

 elif find_spec("tensorflow") is not None:  # pragma: no cover
     from . import models


src/dataeval/_internal/detectors/clusterer.py
@@ -1,10 +1,11 @@
 from typing import Dict, Iterable, List, NamedTuple, Tuple, Union, cast

 import numpy as np
+from numpy.typing import ArrayLike
 from scipy.cluster.hierarchy import linkage
 from scipy.spatial.distance import pdist, squareform

-from dataeval._internal.interop import ArrayLike, to_numpy
+from dataeval._internal.interop import to_numpy


 def extend_linkage(link_arr: np.ndarray) -> np.ndarray:

src/dataeval/_internal/detectors/drift/base.py
@@ -11,8 +11,9 @@ from functools import wraps
 from typing import Callable, Dict, Literal, Optional, Tuple, Union

 import numpy as np
+from numpy.typing import ArrayLike

-from dataeval._internal.interop import ArrayLike, to_numpy
+from dataeval._internal.interop import to_numpy


 def update_x_ref(fn):

src/dataeval/_internal/detectors/drift/cvm.py
@@ -9,9 +9,10 @@ Licensed under Apache Software License (Apache 2.0)
 from typing import Callable, Literal, Optional, Tuple

 import numpy as np
+from numpy.typing import ArrayLike
 from scipy.stats import cramervonmises_2samp

-from dataeval._internal.interop import ArrayLike, to_numpy
+from dataeval._internal.interop import to_numpy

 from .base import BaseUnivariateDrift, UpdateStrategy, preprocess_x


src/dataeval/_internal/detectors/drift/ks.py
@@ -9,9 +9,10 @@ Licensed under Apache Software License (Apache 2.0)
 from typing import Callable, Literal, Optional, Tuple

 import numpy as np
+from numpy.typing import ArrayLike
 from scipy.stats import ks_2samp

-from dataeval._internal.interop import ArrayLike, to_numpy
+from dataeval._internal.interop import to_numpy

 from .base import BaseUnivariateDrift, UpdateStrategy, preprocess_x


src/dataeval/_internal/detectors/drift/mmd.py
@@ -9,8 +9,9 @@ Licensed under Apache Software License (Apache 2.0)
 from typing import Callable, Dict, Optional, Tuple, Union

 import torch
+from numpy.typing import ArrayLike

-from dataeval._internal.interop import ArrayLike, to_numpy
+from dataeval._internal.interop import to_numpy

 from .base import BaseDrift, UpdateStrategy, preprocess_x, update_x_ref
 from .torch import GaussianRBF, get_device, mmd2_from_kernel_matrix
@@ -74,7 +75,7 @@ class DriftMMD(BaseDrift):
         super().__init__(x_ref, p_val, x_ref_preprocessed, update_x_ref, preprocess_fn)

         self.infer_sigma = configure_kernel_from_x_ref
-        if configure_kernel_from_x_ref and isinstance(sigma, ArrayLike):
+        if configure_kernel_from_x_ref and sigma is not None:
             self.infer_sigma = False

         self.n_permutations = n_permutations  # nb of iterations through permutation test
@@ -83,7 +84,7 @@ class DriftMMD(BaseDrift):
         self.device = get_device(device)

         # initialize kernel
-        sigma_tensor = torch.from_numpy(to_numpy(sigma)).to(self.device) if isinstance(sigma, ArrayLike) else None
+        sigma_tensor = torch.from_numpy(to_numpy(sigma)).to(self.device) if sigma is not None else None
         self.kernel = kernel(sigma_tensor).to(self.device) if kernel == GaussianRBF else kernel

         # compute kernel matrix for the reference data
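
The two mmd.py hunks above replace runtime `isinstance(sigma, ArrayLike)` checks with `sigma is not None`: `numpy.typing.ArrayLike` is a static typing construct, not a runtime-checkable protocol like the removed maite fallback. A minimal caller-side sketch (keyword names taken from the visible constructor body; the import path is an assumption about the public re-exports):

    import numpy as np
    from dataeval.detectors import DriftMMD  # assumed public re-export

    x_ref = np.random.randn(100, 8).astype(np.float32)

    # sigma=None (default): with configure_kernel_from_x_ref=True the kernel
    # bandwidth is inferred from x_ref, since sigma_tensor stays None
    detector = DriftMMD(x_ref)

    # explicit sigma: converted via to_numpy/torch.from_numpy, and bandwidth
    # inference is disabled (self.infer_sigma = False)
    detector = DriftMMD(x_ref, sigma=np.array([0.5]))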

src/dataeval/_internal/detectors/drift/uncertainty.py
@@ -10,11 +10,10 @@ from functools import partial
 from typing import Callable, Dict, Literal, Optional, Union

 import numpy as np
+from numpy.typing import ArrayLike
 from scipy.special import softmax
 from scipy.stats import entropy

-from dataeval._internal.interop import ArrayLike
-
 from .base import UpdateStrategy
 from .ks import DriftKS
 from .torch import get_device, preprocess_drift

src/dataeval/_internal/detectors/duplicates.py
@@ -1,7 +1,8 @@
 from typing import Dict, Iterable, List, Literal

+from numpy.typing import ArrayLike
+
 from dataeval._internal.flags import ImageHash
-from dataeval._internal.interop import ArrayLike
 from dataeval._internal.metrics.stats import ImageStats



src/dataeval/_internal/detectors/linter.py
@@ -1,9 +1,9 @@
 from typing import Iterable, Literal, Optional, Sequence, Union

 import numpy as np
+from numpy.typing import ArrayLike

 from dataeval._internal.flags import ImageProperty, ImageVisuals, LinterFlags
-from dataeval._internal.interop import ArrayLike
 from dataeval._internal.metrics.stats import ImageStats



src/dataeval/_internal/detectors/ood/ae.py
@@ -10,9 +10,10 @@ from typing import Callable

 import keras
 import numpy as np
+from numpy.typing import ArrayLike

 from dataeval._internal.detectors.ood.base import OODBase, OODScore
-from dataeval._internal.interop import ArrayLike, to_numpy
+from dataeval._internal.interop import to_numpy
 from dataeval._internal.models.tensorflow.autoencoder import AE
 from dataeval._internal.models.tensorflow.utils import predict_batch


src/dataeval/_internal/detectors/ood/aegmm.py
@@ -9,9 +9,10 @@ Licensed under Apache Software License (Apache 2.0)
 from typing import Callable

 import keras
+from numpy.typing import ArrayLike

 from dataeval._internal.detectors.ood.base import OODGMMBase, OODScore
-from dataeval._internal.interop import ArrayLike, to_numpy
+from dataeval._internal.interop import to_numpy
 from dataeval._internal.models.tensorflow.autoencoder import AEGMM
 from dataeval._internal.models.tensorflow.gmm import gmm_energy
 from dataeval._internal.models.tensorflow.losses import LossGMM

src/dataeval/_internal/detectors/ood/base.py
@@ -12,8 +12,9 @@ from typing import Callable, Dict, List, Literal, NamedTuple, Optional, Tuple, c
 import keras
 import numpy as np
 import tensorflow as tf
+from numpy.typing import ArrayLike

-from dataeval._internal.interop import ArrayLike, to_numpy
+from dataeval._internal.interop import to_numpy
 from dataeval._internal.models.tensorflow.gmm import GaussianMixtureModelParams, gmm_params
 from dataeval._internal.models.tensorflow.trainer import trainer


src/dataeval/_internal/detectors/ood/llr.py
@@ -14,9 +14,10 @@ import numpy as np
 import tensorflow as tf
 from keras.layers import Input
 from keras.models import Model
+from numpy.typing import ArrayLike

 from dataeval._internal.detectors.ood.base import OODBase, OODScore
-from dataeval._internal.interop import ArrayLike, to_numpy
+from dataeval._internal.interop import to_numpy
 from dataeval._internal.models.tensorflow.pixelcnn import PixelCNN
 from dataeval._internal.models.tensorflow.trainer import trainer
 from dataeval._internal.models.tensorflow.utils import predict_batch
@@ -180,7 +181,7 @@ class OOD_LLR(OODBase):

         # create background data
         mutate_fn = partial(mutate_fn, **mutate_fn_kwargs)
-        X_back = predict_batch(x_ref, mutate_fn, batch_size=mutate_batch_size, dtype=x_ref.dtype)
+        X_back = predict_batch(x_ref, mutate_fn, batch_size=mutate_batch_size, dtype=x_ref.dtype)  # type: ignore

         # prepare sequential data
         if self.sequential and not self.has_log_prob:

src/dataeval/_internal/detectors/ood/vae.py
@@ -10,9 +10,10 @@ from typing import Callable

 import keras
 import numpy as np
+from numpy.typing import ArrayLike

 from dataeval._internal.detectors.ood.base import OODBase, OODScore
-from dataeval._internal.interop import ArrayLike, to_numpy
+from dataeval._internal.interop import to_numpy
 from dataeval._internal.models.tensorflow.autoencoder import VAE
 from dataeval._internal.models.tensorflow.losses import Elbo
 from dataeval._internal.models.tensorflow.utils import predict_batch

src/dataeval/_internal/detectors/ood/vaegmm.py
@@ -10,9 +10,10 @@ from typing import Callable

 import keras
 import numpy as np
+from numpy.typing import ArrayLike

 from dataeval._internal.detectors.ood.base import OODGMMBase, OODScore
-from dataeval._internal.interop import ArrayLike, to_numpy
+from dataeval._internal.interop import to_numpy
 from dataeval._internal.models.tensorflow.autoencoder import VAEGMM
 from dataeval._internal.models.tensorflow.gmm import gmm_energy
 from dataeval._internal.models.tensorflow.losses import Elbo, LossGMM

src/dataeval/_internal/interop.py
@@ -1,7 +1,8 @@
 from importlib import import_module
-from typing import Any, Iterable, Optional, runtime_checkable
+from typing import Iterable, Optional

 import numpy as np
+from numpy.typing import ArrayLike

 module_cache = {}

@@ -19,16 +20,6 @@ def try_import(module_name):
     return module


-try:
-    from maite.protocols import ArrayLike  # type: ignore
-except ImportError:  # pragma: no cover - covered by test_mindeps.py
-    from typing import Protocol
-
-    @runtime_checkable
-    class ArrayLike(Protocol):
-        def __array__(self) -> Any: ...
-
-
 def to_numpy(array: Optional[ArrayLike]) -> np.ndarray:
     if array is None:
         return np.ndarray([])
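
With the maite protocol fallback removed, interop.py sources ArrayLike from numpy.typing. A standalone sketch of the practical difference: numpy's ArrayLike is a type-checker construct that already admits sequences, scalars, and ndarrays, so callers no longer need an object implementing `__array__` (the names below are illustrative, not dataeval code):

    import numpy as np
    from numpy.typing import ArrayLike

    def as_array(x: ArrayLike) -> np.ndarray:
        # np.asarray accepts anything numpy can coerce
        return np.asarray(x)

    as_array([1, 2, 3])         # a plain list is a valid ArrayLike
    as_array(np.zeros((2, 2)))  # so is an ndarray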

src/dataeval/_internal/metrics/balance.py (new)
@@ -0,0 +1,180 @@
+import warnings
+from typing import Dict, List, NamedTuple, Sequence
+
+import numpy as np
+from numpy.typing import NDArray
+from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
+
+from dataeval._internal.metrics.utils import entropy, preprocess_metadata
+
+
+class BalanceOutput(NamedTuple):
+    """
+    Attributes
+    ----------
+    mutual_information : NDArray[np.float64]
+        Estimate of mutual information between metadata factors and class label
+    """
+
+    mutual_information: NDArray[np.float64]
+
+
+def validate_num_neighbors(num_neighbors: int) -> int:
+    if not isinstance(num_neighbors, (int, float)):
+        raise TypeError(
+            f"Variable {num_neighbors} is not real-valued numeric type."
+            "num_neighbors should be an int, greater than 0 and less than"
+            "the number of samples in the dataset"
+        )
+    if num_neighbors < 1:
+        raise ValueError(
+            f"Invalid value for {num_neighbors}."
+            "Choose a value greater than 0 and less than number of samples"
+            "in the dataset."
+        )
+    if isinstance(num_neighbors, float):
+        num_neighbors = int(num_neighbors)
+        warnings.warn(f"Variable {num_neighbors} is currently type float and will be truncated to type int.")
+
+    return num_neighbors
+
+
+def balance(class_labels: Sequence[int], metadata: List[Dict], num_neighbors: int = 5) -> BalanceOutput:
+    """
+    Mutual information (MI) between factors (class label, metadata, label/image properties)
+
+    Parameters
+    ----------
+    class_labels: Sequence[int]
+        List of class labels for each image
+    metadata: List[Dict]
+        List of metadata factors for each image
+    num_neighbors: int, default 5
+        Number of nearest neighbors to use for computing MI between discrete
+        and continuous variables.
+
+    Returns
+    -------
+    BalanceOutput
+        (num_factors+1) x (num_factors+1) estimate of mutual information
+        between num_factors metadata factors and class label. Symmetry is enforced.
+
+    Notes
+    -----
+    We use `mutual_info_classif` from sklearn since class label is categorical.
+    `mutual_info_classif` outputs are consistent up to O(1e-4) and depend on a random
+    seed. MI is computed differently for categorical and continuous variables, and
+    we attempt to infer whether a variable is categorical by the fraction of unique
+    values in the dataset.
+
+    See Also
+    --------
+    sklearn.feature_selection.mutual_info_classif
+    sklearn.feature_selection.mutual_info_regression
+    sklearn.metrics.mutual_info_score
+    """
+    num_neighbors = validate_num_neighbors(num_neighbors)
+    data, names, is_categorical = preprocess_metadata(class_labels, metadata)
+    num_factors = len(names)
+    mi = np.empty((num_factors, num_factors))
+    mi[:] = np.nan
+
+    for idx in range(num_factors):
+        tgt = data[:, idx]
+
+        if is_categorical[idx]:
+            # categorical target
+            mi[idx, :] = mutual_info_classif(
+                data,
+                tgt,
+                discrete_features=is_categorical,  # type: ignore
+                n_neighbors=num_neighbors,
+            )
+        else:
+            # continuous variables
+            mi[idx, :] = mutual_info_regression(
+                data,
+                tgt,
+                discrete_features=is_categorical,  # type: ignore
+                n_neighbors=num_neighbors,
+            )
+
+    ent_all = entropy(data, names, is_categorical, normalized=False)
+    norm_factor = 0.5 * np.add.outer(ent_all, ent_all) + 1e-6
+    # in principle MI should be symmetric, but it is not in practice.
+    nmi = 0.5 * (mi + mi.T) / norm_factor
+
+    return BalanceOutput(nmi)
+
+
+def balance_classwise(class_labels: Sequence[int], metadata: List[Dict], num_neighbors: int = 5) -> BalanceOutput:
+    """
+    Compute mutual information (analogous to correlation) between metadata factors
+    (class label, metadata, label/image properties) with individual class labels.
+
+    Parameters
+    ----------
+    class_labels: Sequence[int]
+        List of class labels for each image
+    metadata: List[Dict]
+        List of metadata factors for each image
+    num_neighbors: int, default 5
+        Number of nearest neighbors to use for computing MI between discrete
+        and continuous variables.
+
+    Notes
+    -----
+    We use `mutual_info_classif` from sklearn since class label is categorical.
+    `mutual_info_classif` outputs are consistent up to O(1e-4) and depend on a random
+    seed. MI is computed differently for categorical and continuous variables, so we
+    have to specify with is_categorical.
+
+    Returns
+    -------
+    BalanceOutput
+        (num_classes x num_factors) estimate of mutual information between
+        num_factors metadata factors and individual class labels.
+
+    See Also
+    --------
+    sklearn.feature_selection.mutual_info_classif
+    sklearn.feature_selection.mutual_info_regression
+    sklearn.metrics.mutual_info_score
+    compute_mutual_information
+    """
+    num_neighbors = validate_num_neighbors(num_neighbors)
+    data, names, is_categorical = preprocess_metadata(class_labels, metadata)
+    num_factors = len(names)
+    # unique class labels
+    class_idx = names.index("class_label")
+    class_data = data[:, class_idx]
+    u_cls = np.unique(class_data)
+    num_classes = len(u_cls)
+
+    data_no_class = np.concatenate((data[:, :class_idx], data[:, (class_idx + 1) :]), axis=1)
+
+    # assume class is a factor
+    mi = np.empty((num_classes, num_factors - 1))
+    mi[:] = np.nan
+
+    # categorical variables, excluding class label
+    cat_mask = np.concatenate((is_categorical[:class_idx], is_categorical[(class_idx + 1) :]), axis=0).astype(int)
+
+    # classification MI for discrete/categorical features
+    for idx, cls in enumerate(u_cls):
+        tgt = class_data == cls
+        # units: nat
+        mi[idx, :] = mutual_info_classif(
+            data_no_class,
+            tgt,
+            discrete_features=cat_mask,  # type: ignore
+            n_neighbors=num_neighbors,
+        )
+
+    # let this recompute for all features including class label
+    ent_all = entropy(data, names, is_categorical)
+    ent_tgt = ent_all[class_idx]
+    ent_all = np.concatenate((ent_all[:class_idx], ent_all[(class_idx + 1) :]), axis=0)
+    norm_factor = 0.5 * np.add.outer(ent_tgt, ent_all) + 1e-6
+    nmi = mi / norm_factor
+    return BalanceOutput(nmi)
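
A usage sketch for the new balance metrics (the metadata keys and values are hypothetical, exact MI values vary with sklearn's randomized estimators, and the import path assumes the new metrics/__init__.py re-exports these functions):

    from dataeval.metrics import balance, balance_classwise

    class_labels = [0, 0, 1, 1, 0, 1]
    metadata = [
        {"altitude": 100, "time": 1.1},
        {"altitude": 150, "time": 2.3},
        {"altitude": 120, "time": 3.2},
        {"altitude": 400, "time": 4.1},
        {"altitude": 110, "time": 5.0},
        {"altitude": 380, "time": 6.2},
    ]

    # (num_factors + 1) x (num_factors + 1) normalized MI, class label included;
    # symmetry is enforced via 0.5 * (mi + mi.T)
    nmi = balance(class_labels, metadata).mutual_information

    # num_classes x num_factors MI against each class label individually
    nmi_cw = balance_classwise(class_labels, metadata).mutual_information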

src/dataeval/_internal/metrics/base.py (new)
@@ -0,0 +1,10 @@
+from abc import ABC, abstractmethod
+from typing import Generic, TypeVar
+
+TOutput = TypeVar("TOutput", bound=dict)
+
+
+class EvaluateMixin(ABC, Generic[TOutput]):
+    @abstractmethod
+    def evaluate(self, *args, **kwargs) -> TOutput:
+        """Abstract method to calculate metric based off of constructor parameters"""

src/dataeval/_internal/metrics/ber.py (new)
@@ -0,0 +1,148 @@
+"""
+This module contains the implementation of the
+FR Test Statistic based estimate and the
+KNN based estimate for the Bayes Error Rate
+
+Learning to Bound the Multi-class Bayes Error (Th. 3 and Th. 4)
+https://arxiv.org/abs/1811.06419
+"""
+
+from typing import Literal, NamedTuple, Tuple
+
+import numpy as np
+from numpy.typing import ArrayLike, NDArray
+from scipy.sparse import coo_matrix
+from scipy.stats import mode
+
+from dataeval._internal.interop import to_numpy
+from dataeval._internal.metrics.utils import compute_neighbors, get_classes_counts, get_method, minimum_spanning_tree
+
+
+class BEROutput(NamedTuple):
+    """
+    Attributes
+    ----------
+    ber : float
+        The upper bounds of the Bayes Error Rate
+    ber_lower : float
+        The lower bounds of the Bayes Error Rate
+    """
+
+    ber: float
+    ber_lower: float
+
+
+def ber_mst(X: NDArray, y: NDArray) -> Tuple[float, float]:
+    """Calculates the Bayes Error Rate using a minimum spanning tree
+
+    Parameters
+    ----------
+    X : NDArray, shape - (N, ... )
+        n_samples containing n_features
+    y : NDArray, shape - (N, 1)
+        Labels corresponding to each sample
+
+    Returns
+    -------
+    Tuple[float, float]
+        The upper and lower bounds of the bayes error rate
+    """
+    M, N = get_classes_counts(y)
+
+    tree = coo_matrix(minimum_spanning_tree(X))
+    matches = np.sum([y[tree.row[i]] != y[tree.col[i]] for i in range(N - 1)])
+    deltas = matches / (2 * N)
+    upper = 2 * deltas
+    lower = ((M - 1) / (M)) * (1 - max(1 - 2 * ((M) / (M - 1)) * deltas, 0) ** 0.5)
+    return upper, lower
+
+
+def ber_knn(X: NDArray, y: NDArray, k: int) -> Tuple[float, float]:
+    """Calculates the Bayes Error Rate using K-nearest neighbors
+
+    Parameters
+    ----------
+    X : NDArray, shape - (N, ... )
+        n_samples containing n_features
+    y : NDArray, shape - (N, 1)
+        Labels corresponding to each sample
+
+    Returns
+    -------
+    Tuple[float, float]
+        The upper and lower bounds of the bayes error rate
+    """
+    M, N = get_classes_counts(y)
+
+    # All features belong on second dimension
+    X = X.reshape((X.shape[0], -1))
+    nn_indices = compute_neighbors(X, X, k=k)
+    nn_indices = np.expand_dims(nn_indices, axis=1) if nn_indices.ndim == 1 else nn_indices
+    modal_class = mode(y[nn_indices], axis=1, keepdims=True).mode.squeeze()
+    upper = float(np.count_nonzero(modal_class - y) / N)
+    lower = knn_lowerbound(upper, M, k)
+    return upper, lower
+
+
+def knn_lowerbound(value: float, classes: int, k: int) -> float:
+    """Several cases for computing the BER lower bound"""
+    if value <= 1e-10:
+        return 0.0
+
+    if classes == 2 and k != 1:
+        if k > 5:
+            # Property 2 (Devroye, 1981) cited in Snoopy paper, not in snoopy repo
+            alpha = 0.3399
+            beta = 0.9749
+            a_k = alpha * np.sqrt(k) / (k - 3.25) * (1 + beta / (np.sqrt(k - 3)))
+            return value / (1 + a_k)
+        if k > 2:
+            return value / (1 + (1 / np.sqrt(k)))
+        # k == 2:
+        return value / 2
+
+    return ((classes - 1) / classes) * (1 - np.sqrt(max(0, 1 - ((classes / (classes - 1)) * value))))
+
+
+BER_FN_MAP = {"KNN": ber_knn, "MST": ber_mst}
+
+
+def ber(images: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN", "MST"] = "KNN") -> BEROutput:
+    """
+    An estimator for Multi-class Bayes Error Rate using FR or KNN test statistic basis
+
+    Parameters
+    ----------
+    images : ArrayLike (N, ... )
+        Array of images or image embeddings
+    labels : ArrayLike (N, 1)
+        Array of labels for each image or image embedding
+    k : int, default 1
+        Number of nearest neighbors for KNN estimator -- ignored by MST estimator
+    method : Literal["KNN", "MST"], default "KNN"
+        Method to use when estimating the Bayes error rate
+
+    Returns
+    -------
+    BEROutput
+        The upper and lower bounds of the Bayes Error Rate
+
+    References
+    ----------
+    [1] `Learning to Bound the Multi-class Bayes Error (Th. 3 and Th. 4) <https://arxiv.org/abs/1811.06419>`_
+
+    Examples
+    --------
+    >>> import sklearn.datasets as dsets
+    >>> from dataeval.metrics import ber
+
+    >>> images, labels = dsets.make_blobs(n_samples=50, centers=2, n_features=2, random_state=0)
+
+    >>> ber(images, labels)
+    BEROutput(ber=0.04, ber_lower=0.020416847668728033)
+    """
+    ber_fn = get_method(BER_FN_MAP, method)
+    X = to_numpy(images)
+    y = to_numpy(labels)
+    upper, lower = ber_fn(X, y, k) if method == "KNN" else ber_fn(X, y)
+    return BEROutput(upper, lower)
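
The docstring example above exercises the default KNN estimator; per the signature, the MST estimator is selected with method="MST" and ignores k (a sketch reusing the same blobs data, with output omitted since the MST bounds differ from the KNN ones):

    result = ber(images, labels, method="MST")
    result.ber, result.ber_lower  # upper/lower bounds from the MST test statistic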