dataeval-0.64.0-py3-none-any.whl → dataeval-0.66.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +13 -9
- dataeval/_internal/detectors/clusterer.py +63 -49
- dataeval/_internal/detectors/drift/base.py +248 -51
- dataeval/_internal/detectors/drift/cvm.py +28 -26
- dataeval/_internal/detectors/drift/ks.py +31 -28
- dataeval/_internal/detectors/drift/mmd.py +62 -42
- dataeval/_internal/detectors/drift/torch.py +69 -60
- dataeval/_internal/detectors/drift/uncertainty.py +32 -32
- dataeval/_internal/detectors/duplicates.py +67 -31
- dataeval/_internal/detectors/ood/ae.py +15 -29
- dataeval/_internal/detectors/ood/aegmm.py +33 -27
- dataeval/_internal/detectors/ood/base.py +86 -47
- dataeval/_internal/detectors/ood/llr.py +34 -31
- dataeval/_internal/detectors/ood/vae.py +32 -31
- dataeval/_internal/detectors/ood/vaegmm.py +34 -28
- dataeval/_internal/detectors/{linter.py → outliers.py} +60 -38
- dataeval/_internal/flags.py +44 -21
- dataeval/_internal/interop.py +5 -3
- dataeval/_internal/metrics/balance.py +42 -5
- dataeval/_internal/metrics/ber.py +11 -8
- dataeval/_internal/metrics/coverage.py +15 -8
- dataeval/_internal/metrics/divergence.py +41 -7
- dataeval/_internal/metrics/diversity.py +57 -19
- dataeval/_internal/metrics/parity.py +141 -66
- dataeval/_internal/metrics/stats.py +330 -313
- dataeval/_internal/metrics/uap.py +33 -4
- dataeval/_internal/metrics/utils.py +79 -40
- dataeval/_internal/models/pytorch/autoencoder.py +127 -22
- dataeval/_internal/models/tensorflow/autoencoder.py +33 -30
- dataeval/_internal/models/tensorflow/gmm.py +4 -2
- dataeval/_internal/models/tensorflow/losses.py +17 -13
- dataeval/_internal/models/tensorflow/pixelcnn.py +19 -18
- dataeval/_internal/models/tensorflow/trainer.py +10 -7
- dataeval/_internal/models/tensorflow/utils.py +23 -20
- dataeval/_internal/output.py +85 -0
- dataeval/_internal/utils.py +5 -3
- dataeval/_internal/workflows/sufficiency.py +122 -121
- dataeval/detectors/__init__.py +6 -25
- dataeval/detectors/drift/__init__.py +16 -0
- dataeval/detectors/drift/kernels/__init__.py +6 -0
- dataeval/detectors/drift/updates/__init__.py +3 -0
- dataeval/detectors/linters/__init__.py +5 -0
- dataeval/detectors/ood/__init__.py +11 -0
- dataeval/flags/__init__.py +2 -2
- dataeval/metrics/__init__.py +2 -26
- dataeval/metrics/bias/__init__.py +14 -0
- dataeval/metrics/estimators/__init__.py +9 -0
- dataeval/metrics/stats/__init__.py +6 -0
- dataeval/tensorflow/__init__.py +3 -0
- dataeval/tensorflow/loss/__init__.py +3 -0
- dataeval/tensorflow/models/__init__.py +5 -0
- dataeval/tensorflow/recon/__init__.py +3 -0
- dataeval/torch/__init__.py +3 -0
- dataeval/{models/torch → torch/models}/__init__.py +1 -2
- dataeval/torch/trainer/__init__.py +3 -0
- dataeval/utils/__init__.py +3 -6
- dataeval/workflows/__init__.py +2 -4
- {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/METADATA +1 -1
- dataeval-0.66.0.dist-info/RECORD +72 -0
- dataeval/_internal/metrics/base.py +0 -10
- dataeval/models/__init__.py +0 -15
- dataeval/models/tensorflow/__init__.py +0 -6
- dataeval-0.64.0.dist-info/RECORD +0 -60
- {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/WHEEL +0 -0
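The file list above amounts to a namespace reorganization: the flat detectors and metrics packages gain purpose-specific subpackages, linter becomes outliers under detectors/linters, and framework-specific code moves under dataeval.tensorflow and dataeval.torch. A minimal sketch of the new import surface, assuming dataeval 0.66.0 is installed; the per-subpackage groupings in the comments are inferred from the internal file names, not confirmed by this diff:

import dataeval.detectors.drift      # drift detectors (cvm, ks, mmd, uncertainty)
import dataeval.detectors.linters    # the former "linter" detector, renamed to outliers
import dataeval.detectors.ood        # out-of-distribution detectors
import dataeval.metrics.bias         # likely balance, coverage, diversity, parity
import dataeval.metrics.estimators   # likely ber, divergence, uap
import dataeval.metrics.stats        # image and dataset statistics
import dataeval.torch.models         # moved from dataeval.models.torch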
dataeval/_internal/metrics/uap.py

@@ -4,15 +4,17 @@ FR Test Statistic based estimate for the upperbound
 average precision using empirical mean precision
 """
 
-from typing import NamedTuple
+from dataclasses import dataclass
 
 from numpy.typing import ArrayLike
 from sklearn.metrics import average_precision_score
 
 from dataeval._internal.interop import to_numpy
+from dataeval._internal.output import OutputMetadata, set_metadata
 
 
-class UAPOutput(NamedTuple):
+@dataclass(frozen=True)
+class UAPOutput(OutputMetadata):
     """
     Attributes
     ----------
@@ -23,6 +25,7 @@ class UAPOutput(NamedTuple):
     uap: float
 
 
+@set_metadata("dataeval.metrics")
 def uap(labels: ArrayLike, scores: ArrayLike) -> UAPOutput:
     """
     FR Test Statistic based estimate of the empirical mean precision for
@@ -37,13 +40,39 @@ def uap(labels: ArrayLike, scores: ArrayLike) -> UAPOutput:
 
     Returns
     -------
-
-
+    UAPOutput
+        The empirical mean precision estimate, float
 
     Raises
     ------
     ValueError
         If unique classes M < 2
+
+    Notes
+    -----
+    This function calculates the empirical mean precision using the
+    ``average_precision_score`` from scikit-learn, weighted by the class distribution.
+
+    Examples
+    --------
+    >>> y_true = np.array([0, 0, 1, 1])
+    >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])
+    >>> uap(y_true, y_scores)
+    UAPOutput(uap=0.8333333333333333)
+
+    >>> y_true = np.array([0, 0, 1, 1, 2, 2])
+    >>> y_scores = np.array(
+    ...     [
+    ...         [0.7, 0.2, 0.1],
+    ...         [0.4, 0.3, 0.3],
+    ...         [0.1, 0.8, 0.1],
+    ...         [0.2, 0.3, 0.5],
+    ...         [0.4, 0.4, 0.2],
+    ...         [0.1, 0.2, 0.7],
+    ...     ]
+    ... )
+    >>> uap(y_true, y_scores)
+    UAPOutput(uap=0.7777777777777777)
     """
 
     precision = float(average_precision_score(to_numpy(labels), to_numpy(scores), average="weighted"))
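The switch from NamedTuple to a frozen dataclass deriving OutputMetadata, together with the @set_metadata("dataeval.metrics") decorator, makes uap return an immutable result that carries run metadata. A usage sketch built from the doctest above, assuming uap is re-exported through the new dataeval.metrics.estimators subpackage shown in the file list:

import numpy as np
from dataeval.metrics.estimators import uap  # assumed 0.66.0 re-export path

y_true = np.array([0, 0, 1, 1])
y_scores = np.array([0.1, 0.4, 0.35, 0.8])

result = uap(y_true, y_scores)
print(result.uap)  # 0.8333333333333333, per the doctest above

# frozen=True makes the output immutable:
# result.uap = 0.0  # would raise dataclasses.FrozenInstanceError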
dataeval/_internal/metrics/utils.py

@@ -1,7 +1,10 @@
-from
+from __future__ import annotations
+
+from typing import Any, Callable, Literal, NamedTuple, Sequence
 
 import numpy as np
 import xxhash as xxh
+from numpy.typing import NDArray
 from PIL import Image
 from scipy.fftpack import dct
 from scipy.signal import convolve2d
@@ -18,22 +21,22 @@ HASH_SIZE = 8
 MAX_FACTOR = 4
 
 
-def get_method(method_map:
+def get_method(method_map: dict[str, Callable], method: str) -> Callable:
     if method not in method_map:
         raise ValueError(f"Specified method {method} is not a valid method: {method_map}.")
     return method_map[method]
 
 
 def get_counts(
-    data:
-) -> tuple[
+    data: NDArray, names: list[str], is_categorical: list[bool], subset_mask: NDArray[np.bool_] | None = None
+) -> tuple[dict, dict]:
     """
     Initialize dictionary of histogram counts --- treat categorical values
     as histogram bins.
 
     Parameters
     ----------
-    subset_mask:
+    subset_mask: NDArray[np.bool_] | None
         Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
 
     Returns
@@ -66,24 +69,24 @@ def get_counts(
 
 
 def entropy(
-    data:
-    names:
-    is_categorical:
+    data: NDArray,
+    names: list[str],
+    is_categorical: list[bool],
     normalized: bool = False,
-    subset_mask:
-) -> np.
+    subset_mask: NDArray[np.bool_] | None = None,
+) -> NDArray[np.float64]:
     """
     Meant for use with Bias metrics, Balance, Diversity, ClasswiseBalance,
     and Classwise Diversity.
 
-    Compute entropy for discrete/categorical variables and
-    histogram binning
+    Compute entropy for discrete/categorical variables and for continuous variables through standard
+    histogram binning.
 
     Parameters
     ----------
     normalized: bool
         Flag that determines whether or not to normalize entropy by log(num_bins)
-    subset_mask:
+    subset_mask: NDArray[np.bool_] | None
         Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
 
     Notes
@@ -93,7 +96,7 @@ def entropy(
 
     Returns
     -------
-    ent: np.
+    ent: NDArray[np.float64]
         Entropy estimate per column of X
 
     See Also
@@ -119,16 +122,20 @@ def entropy(
 
 
 def get_num_bins(
-    data:
-) -> np.
+    data: NDArray, names: list[str], is_categorical: list[bool], subset_mask: NDArray[np.bool_] | None = None
+) -> NDArray[np.float64]:
     """
     Number of bins or unique values for each metadata factor, used to
     normalize entropy/diversity.
 
     Parameters
     ----------
-    subset_mask:
+    subset_mask: NDArray[np.bool_] | None
         Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
+
+    Returns
+    -------
+    NDArray[np.float64]
     """
     # likely cached
     hist_counts, _ = get_counts(data, names, is_categorical, subset_mask)
@@ -139,7 +146,7 @@ def get_num_bins(
     return num_bins
 
 
-def infer_categorical(X: np.ndarray, threshold: float = 0.5) -> np.ndarray:
+def infer_categorical(X: NDArray, threshold: float = 0.2) -> NDArray:
     """
     Compute fraction of feature values that are unique --- intended to be used
     for inferring whether variables are categorical.
@@ -154,9 +161,11 @@ def infer_categorical(X: np.ndarray, threshold: float = 0.5) -> np.ndarray:
     return pct_unique < threshold
 
 
-def preprocess_metadata(
+def preprocess_metadata(
+    class_labels: Sequence[int], metadata: list[dict], cat_thresh: float = 0.2
+) -> tuple[NDArray, list[str], list[bool]]:
     # convert class_labels and list of metadata dicts to dict of ndarrays
-    metadata_dict:
+    metadata_dict: dict[str, NDArray] = {
         "class_label": np.asarray(class_labels, dtype=int),
         **{k: np.array([d[k] for d in metadata]) for k in metadata[0]},
     }
@@ -172,18 +181,35 @@ def preprocess_metadata(class_labels: Sequence[int], metadata: List[Dict]) -> Tu
 
     data = np.stack(list(metadata_dict.values()), axis=-1)
     names = list(metadata_dict.keys())
-    is_categorical = [infer_categorical(metadata_dict[var],
+    is_categorical = [infer_categorical(metadata_dict[var], cat_thresh)[0] for var in names]
 
     return data, names, is_categorical
 
 
-def minimum_spanning_tree(X: np.ndarray) -> Any:
+def flatten(X: NDArray):
+    """
+    Flattens input array from (N, ... ) to (N, -1) where all samples N have all data in their last dimension
+
+    Parameters
+    ----------
+    X : NDArray, shape - (N, ... )
+        Input array
+
+    Returns
+    -------
+    NDArray, shape - (N, -1)
+    """
+
+    return X.reshape((X.shape[0], -1))
+
+
+def minimum_spanning_tree(X: NDArray) -> Any:
     """
     Returns the minimum spanning tree from a NumPy image array.
 
     Parameters
     ----------
-    X:
+    X : NDArray
         Numpy image array
 
     Returns
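With the annotations filled in, the metadata pipeline reads directly off the signatures: preprocess_metadata stacks class labels and per-sample metadata dicts into one factor matrix and infers which factors are categorical (note the infer_categorical default threshold change from 0.5 to 0.2), and entropy then histogram-bins each factor. A sketch of how these helpers compose, assuming direct use of the internal dataeval._internal.metrics.utils module (an internal API, so subject to change):

import numpy as np
from dataeval._internal.metrics.utils import entropy, preprocess_metadata

class_labels = [0, 0, 1, 1, 1, 0]
metadata = [{"lighting": i % 2, "altitude": i * 100} for i in range(6)]

# Stack labels and metadata factors into one (N, F) matrix
data, names, is_categorical = preprocess_metadata(class_labels, metadata)

# Per-factor entropy, normalized by log(num_bins) so values fall in [0, 1]
ent = entropy(data, names, is_categorical, normalized=True)
print(dict(zip(names, ent)))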
@@ -191,7 +217,7 @@ def minimum_spanning_tree(X: np.ndarray) -> Any:
         Data representing the minimum spanning tree
     """
     # All features belong on second dimension
-    X =
+    X = flatten(X)
     # We add a small constant to the distance matrix to ensure scipy interprets
     # the input graph as fully-connected.
     dense_eudist = squareform(pdist(X)) + EPSILON
@@ -199,13 +225,13 @@ def minimum_spanning_tree(X: np.ndarray) -> Any:
     return mst(eudist_csr)
 
 
-def get_classes_counts(labels:
+def get_classes_counts(labels: NDArray) -> tuple[int, int]:
     """
     Returns the classes and counts of from an array of labels
 
     Parameters
     ----------
-    label:
+    label : NDArray
         Numpy labels array
 
     Returns
@@ -226,17 +252,17 @@ def get_classes_counts(labels: np.ndarray) -> Tuple[int, int]:
 
 
 def compute_neighbors(
-    A:
-    B:
+    A: NDArray,
+    B: NDArray,
     k: int = 1,
     algorithm: Literal["auto", "ball_tree", "kd_tree"] = "auto",
-) ->
+) -> NDArray:
     """
     For each sample in A, compute the nearest neighbor in B
 
     Parameters
     ----------
-    A, B :
+    A, B : NDArray
         The n_samples and n_features respectively
     k : int
         The number of neighbors to find
@@ -252,11 +278,24 @@ def compute_neighbors(
     List:
         Closest points to each point in A and B
 
+    Raises
+    ------
+    ValueError
+        If algorithm is not "auto", "ball_tree", or "kd_tree"
+
     See Also
     --------
     sklearn.neighbors.NearestNeighbors
     """
 
+    if k < 1:
+        raise ValueError("k must be >= 1")
+    if algorithm not in ["auto", "ball_tree", "kd_tree"]:
+        raise ValueError("Algorithm must be 'auto', 'ball_tree', or 'kd_tree'")
+
+    A = flatten(A)
+    B = flatten(B)
+
     nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=algorithm).fit(B)
     nns = nbrs.kneighbors(A)[1]
     nns = nns[:, 1:].squeeze()
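The new guard clauses make compute_neighbors fail fast on a bad k or algorithm, and the added flatten calls mean A and B can now be passed as unflattened image stacks. A small sketch of the resulting behavior, again against the internal module:

import numpy as np
from dataeval._internal.metrics.utils import compute_neighbors

rng = np.random.default_rng(0)
A = rng.normal(size=(10, 3, 8, 8))   # ten images, flattened internally to (10, 192)
B = rng.normal(size=(25, 3, 8, 8))

nns = compute_neighbors(A, B, k=1)   # index into B of each A sample's nearest neighbor
print(nns.shape)                     # (10,)

# compute_neighbors(A, B, algorithm="brute")  # now raises ValueError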
@@ -266,11 +305,11 @@ def compute_neighbors(
 
 class BitDepth(NamedTuple):
     depth: int
-    pmin:
-    pmax:
+    pmin: float | int
+    pmax: float | int
 
 
-def get_bitdepth(image: np.ndarray) -> BitDepth:
+def get_bitdepth(image: NDArray) -> BitDepth:
     """
     Approximates the bit depth of the image using the
     min and max pixel values.
@@ -283,7 +322,7 @@ def get_bitdepth(image: np.ndarray) -> BitDepth:
     return BitDepth(depth, 0, 2**depth - 1)
 
 
-def rescale(image: np.ndarray, depth: int = 1) -> np.ndarray:
+def rescale(image: NDArray, depth: int = 1) -> NDArray:
     """
     Rescales the image using the bit depth provided.
     """
@@ -295,7 +334,7 @@ def rescale(image: np.ndarray, depth: int = 1) -> np.ndarray:
     return normalized * (2**depth - 1)
 
 
-def normalize_image_shape(image: np.ndarray) -> np.ndarray:
+def normalize_image_shape(image: NDArray) -> NDArray:
     """
     Normalizes the image shape into (C,H,W).
     """
@@ -311,7 +350,7 @@ def normalize_image_shape(image: np.ndarray) -> np.ndarray:
     raise ValueError("Images must have 2 or more dimensions.")
 
 
-def edge_filter(image: np.ndarray, offset: float = 0.5) -> np.ndarray:
+def edge_filter(image: NDArray, offset: float = 0.5) -> NDArray:
     """
     Returns the image filtered using a 3x3 edge detection kernel:
     [[ -1, -1, -1 ],
@@ -323,7 +362,7 @@ def edge_filter(image: np.ndarray, offset: float = 0.5) -> np.ndarray:
     return edges
 
 
-def pchash(image: np.ndarray) -> str:
+def pchash(image: NDArray) -> str:
     """
     Performs a perceptual hash on an image by resizing to a square NxN image
     using the Lanczos algorithm where N is 32x32 or the largest multiple of
@@ -334,7 +373,7 @@ def pchash(image: np.ndarray) -> str:
 
     Parameters
     ----------
-    image :
+    image : NDArray
         An image as a numpy array in CxHxW format
 
     Returns
@@ -374,7 +413,7 @@ def pchash(image: np.ndarray) -> str:
     return hash_hex if hash_hex else "0"
 
 
-def xxhash(image: np.ndarray) -> str:
+def xxhash(image: NDArray) -> str:
     """
     Performs a fast non-cryptographic hash using the xxhash algorithm
     (xxhash.com) against the image as a flattened bytearray. The hash
@@ -382,7 +421,7 @@ def xxhash(image: np.ndarray) -> str:
 
     Parameters
     ----------
-    image :
+    image : NDArray
         An image as a numpy array
 
     Returns
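The hashing helpers round out the module: xxhash fingerprints the exact bytes for exact-duplicate detection, while pchash produces a perceptual hash that tolerates resizing and minor perturbation. A usage sketch on a synthetic CxHxW image, per the signatures above:

import numpy as np
from dataeval._internal.metrics.utils import pchash, xxhash

rng = np.random.default_rng(0)
image = rng.integers(0, 256, size=(3, 64, 64), dtype=np.uint8)  # CxHxW

print(xxhash(image))  # fast non-cryptographic hash of the raw bytes
print(pchash(image))  # perceptual hash as a hex string

# Identical pixel data always shares both hashes
assert xxhash(image) == xxhash(image.copy())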
dataeval/_internal/models/pytorch/autoencoder.py

@@ -1,4 +1,6 @@
-from
+from __future__ import annotations
+
+from typing import Any
 
 import torch
 import torch.nn as nn
@@ -14,40 +16,52 @@ def get_images_from_batch(batch: Any) -> Any:
 
 
 class AETrainer:
+    """
+    A class to train and evaluate an autoencoder model.
+
+    Parameters
+    ----------
+    model : nn.Module
+        The model to be trained.
+    device : str or torch.device, default "auto"
+        The hardware device to use for training.
+        If "auto", the device will be set to "cuda" if available, otherwise "cpu".
+    batch_size : int, default 8
+        The number of images to process in a batch.
+    """
+
     def __init__(
         self,
         model: nn.Module,
-        device:
+        device: str | torch.device = "auto",
         batch_size: int = 8,
     ):
-        """
-        model : nn.Module
-            Model to be trained
-        device : str | torch.device, default "cpu"
-            Hardware device for model, optimizer, and data to run on
-        batch_size : int, default 8
-            Number of images to group together in `torch.utils.data.DataLoader`
-        """
         if device == "auto":
            device = "cuda" if torch.cuda.is_available() else "cpu"
         self.device = device
         self.model = model.to(device)
         self.batch_size = batch_size
 
-    def train(self, dataset: Dataset, epochs: int = 25) ->
+    def train(self, dataset: Dataset, epochs: int = 25) -> list[float]:
         """
-        Basic training function for Autoencoder models
+        Basic image reconstruction training function for Autoencoder models
 
         Uses `torch.optim.Adam` and `torch.nn.MSELoss` as default hyperparameters
 
         Parameters
         ----------
         dataset : Dataset
-
+            The dataset to train on.
+            Torch Dataset containing images in the first return position.
         epochs : int, default 25
             Number of full training loops
 
-
+        Returns
+        -------
+        List[float]
+            A list of average loss values for each epoch.
+
+        Notes
         ----
         To replace this function with a custom function, do
             AETrainer.train = custom_function
@@ -58,7 +72,7 @@ class AETrainer:
         opt = Adam(self.model.parameters(), lr=0.001)
         criterion = nn.MSELoss().to(self.device)
         # Record loss
-        loss_history:
+        loss_history: list[float] = []
 
         for _ in range(epochs):
             epoch_loss: float = 0
@@ -89,19 +103,20 @@ class AETrainer:
     @torch.no_grad
     def eval(self, dataset: Dataset) -> float:
         """
-        Basic evaluation function for Autoencoder models
+        Basic image reconstruction evaluation function for Autoencoder models
 
-        Uses `torch.
+        Uses `torch.nn.MSELoss` as default loss function.
 
         Parameters
         ----------
         dataset : Dataset
-
+            The dataset to evaluate on.
+            Torch Dataset containing images in the first return position.
 
         Returns
         -------
         float
-            Total reconstruction loss over
+            Total reconstruction loss over the entire dataset
 
         Note
         ----
@@ -124,18 +139,25 @@ class AETrainer:
     @torch.no_grad
     def encode(self, dataset: Dataset) -> torch.Tensor:
         """
-
-
+        Create image embeddings for the dataset using the model's encoder.
+
+        If the model has an `encode` method, it will be used; otherwise,
+        `model.forward` will be used.
 
         Parameters
         ----------
         dataset: Dataset
-
+            The dataset to encode.
+            Torch Dataset containing images in the first return position.
 
         Returns
         -------
         torch.Tensor
             Data encoded by the model
+
+        Notes
+        -----
+        This function should be run after the model has been trained and evaluated.
         """
         self.model.eval()
         dl = DataLoader(dataset, batch_size=self.batch_size)
@@ -155,21 +177,67 @@ class AETrainer:
 
 
 class AriaAutoencoder(nn.Module):
+    """
+    An autoencoder model with a separate encoder and decoder.
+
+    Parameters
+    ----------
+    channels : int, default 3
+        Number of input channels
+    """
+
     def __init__(self, channels=3):
         super().__init__()
         self.encoder = Encoder(channels)
         self.decoder = Decoder(channels)
 
     def forward(self, x):
+        """
+        Perform a forward pass through the encoder and decoder.
+
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input tensor
+
+        Returns
+        -------
+        torch.Tensor
+            The reconstructed output tensor.
+        """
         x = self.encoder(x)
         x = self.decoder(x)
         return x
 
     def encode(self, x):
+        """
+        Encode the input tensor using the encoder.
+
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input tensor
+
+        Returns
+        -------
+        torch.Tensor
+            The encoded representation of the input tensor.
+        """
         return self.encoder(x)
 
 
 class Encoder(nn.Module):
+    """
+    A simple encoder to be used in an autoencoder model.
+
+    This is the encoder used by the AriaAutoencoder model.
+
+    Parameters
+    ----------
+    channels : int, default 3
+        Number of input channels
+    """
+
     def __init__(self, channels=3):
         super().__init__()
         self.encoder = nn.Sequential(
@@ -183,10 +251,34 @@ class Encoder(nn.Module):
         )
 
     def forward(self, x):
+        """
+        Perform a forward pass through the encoder.
+
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input tensor
+
+        Returns
+        -------
+        torch.Tensor
+            The encoded representation of the input tensor.
+        """
         return self.encoder(x)
 
 
 class Decoder(nn.Module):
+    """
+    A simple decoder to be used in an autoencoder model.
+
+    This is the decoder used by the AriaAutoencoder model.
+
+    Parameters
+    ----------
+    channels : int
+        Number of output channels
+    """
+
     def __init__(self, channels):
         super().__init__()
         self.decoder = nn.Sequential(
@@ -199,4 +291,17 @@ class Decoder(nn.Module):
         )
 
     def forward(self, x):
+        """
+        Perform a forward pass through the decoder.
+
+        Parameters
+        ----------
+        x : torch.Tensor
+            The encoded tensor.
+
+        Returns
+        -------
+        torch.Tensor
+            The reconstructed output tensor.
+        """
         return self.decoder(x)