dataeval 0.82.1__py3-none-any.whl → 0.84.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +7 -2
- dataeval/config.py +13 -3
- dataeval/metadata/__init__.py +2 -2
- dataeval/metadata/_ood.py +144 -27
- dataeval/metrics/bias/__init__.py +11 -1
- dataeval/metrics/bias/_balance.py +3 -3
- dataeval/metrics/bias/_completeness.py +130 -0
- dataeval/metrics/estimators/_ber.py +2 -1
- dataeval/metrics/stats/_base.py +31 -36
- dataeval/metrics/stats/_dimensionstats.py +2 -2
- dataeval/metrics/stats/_hashstats.py +2 -2
- dataeval/metrics/stats/_imagestats.py +4 -4
- dataeval/metrics/stats/_labelstats.py +4 -45
- dataeval/metrics/stats/_pixelstats.py +2 -2
- dataeval/metrics/stats/_visualstats.py +2 -2
- dataeval/outputs/__init__.py +4 -2
- dataeval/outputs/_bias.py +31 -22
- dataeval/outputs/_metadata.py +7 -0
- dataeval/outputs/_stats.py +2 -3
- dataeval/typing.py +43 -12
- dataeval/utils/_array.py +26 -1
- dataeval/utils/_mst.py +1 -2
- dataeval/utils/data/_dataset.py +2 -0
- dataeval/utils/data/_embeddings.py +115 -32
- dataeval/utils/data/_images.py +38 -15
- dataeval/utils/data/_selection.py +7 -8
- dataeval/utils/data/_split.py +76 -129
- dataeval/utils/data/datasets/_base.py +4 -2
- dataeval/utils/data/datasets/_cifar10.py +17 -9
- dataeval/utils/data/datasets/_milco.py +18 -12
- dataeval/utils/data/datasets/_mnist.py +24 -8
- dataeval/utils/data/datasets/_ships.py +18 -8
- dataeval/utils/data/datasets/_types.py +1 -5
- dataeval/utils/data/datasets/_voc.py +47 -24
- dataeval/utils/data/selections/__init__.py +2 -0
- dataeval/utils/data/selections/_classfilter.py +1 -1
- dataeval/utils/data/selections/_prioritize.py +296 -0
- dataeval/utils/data/selections/_shuffle.py +13 -4
- dataeval/utils/metadata.py +1 -1
- dataeval/utils/torch/_gmm.py +3 -2
- {dataeval-0.82.1.dist-info → dataeval-0.84.0.dist-info}/METADATA +4 -4
- {dataeval-0.82.1.dist-info → dataeval-0.84.0.dist-info}/RECORD +44 -43
- dataeval/detectors/ood/metadata_ood_mi.py +0 -91
- {dataeval-0.82.1.dist-info → dataeval-0.84.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.82.1.dist-info → dataeval-0.84.0.dist-info}/WHEEL +0 -0
dataeval/__init__.py
CHANGED
@@ -8,7 +8,7 @@ shifts that impact performance of deployed models.
 from __future__ import annotations

 __all__ = ["config", "detectors", "log", "metrics", "typing", "utils", "workflows"]
-__version__ = "0.82.1"
+__version__ = "0.84.0"

 import logging

@@ -34,7 +34,12 @@ def log(level: int = logging.DEBUG, handler: logging.Handler | None = None) -> N
     logger = logging.getLogger(__name__)
     if handler is None:
         handler = logging.StreamHandler() if handler is None else handler
-    handler.setFormatter(…
+    handler.setFormatter(
+        logging.Formatter(
+            "%(asctime)s %(levelname)-8s %(name)s.%(filename)s:%(lineno)s - %(funcName)10s() | %(message)s"
+        )
+    )
     logger.addHandler(handler)
     logger.setLevel(level)
+    logging.DEBUG
     logger.debug(f"Added logging handler {handler} to logger: {__name__}")
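
For context, log is DataEval's opt-in debug-logging helper; a minimal usage sketch (the formatter string is the one added above, which 0.84.0 attaches automatically when no handler is supplied):

    import logging
    import dataeval

    # Route DataEval's internal logs to a default StreamHandler at INFO level;
    # 0.84.0 now attaches the multi-field Formatter shown in the diff above.
    dataeval.log(level=logging.INFO)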
dataeval/config.py
CHANGED
@@ -17,10 +17,18 @@ else:
 import numpy as np
 import torch

+### GLOBALS ###
+
 _device: torch.device | None = None
 _processes: int | None = None
 _seed: int | None = None

+### CONSTS ###
+
+EPSILON = 1e-10
+
+### TYPES ###
+
 DeviceLike: TypeAlias = Union[int, str, tuple[str, int], torch.device]
 """
 Type alias for types that are acceptable for specifying a torch.device.
@@ -30,18 +38,20 @@ See Also
 `torch.device <https://pytorch.org/docs/stable/tensor_attributes.html#torch.device>`_
 """

+### FUNCS ###
+

 def _todevice(device: DeviceLike) -> torch.device:
     return torch.device(*device) if isinstance(device, tuple) else torch.device(device)


-def set_device(device: DeviceLike) -> None:
+def set_device(device: DeviceLike | None) -> None:
     """
     Sets the default device to use when executing against a PyTorch backend.

     Parameters
     ----------
-    device : DeviceLike
+    device : DeviceLike or None
         The default device to use. See documentation for more information.

     See Also
@@ -49,7 +59,7 @@ def set_device(device: DeviceLike) -> None:
     `torch.device <https://pytorch.org/docs/stable/tensor_attributes.html#torch.device>`_
     """
     global _device
-    _device = _todevice(device)
+    _device = None if device is None else _todevice(device)


 def get_device(override: DeviceLike | None = None) -> torch.device:
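
A minimal sketch of the widened set_device signature; the None-clearing path is new in 0.84.0, and the exact fallback get_device uses once the default is cleared is not shown in this diff:

    import torch
    from dataeval.config import get_device, set_device

    set_device("cuda:0")   # pin DataEval's torch execution to GPU 0
    print(get_device())    # expected: device(type='cuda', index=0)

    set_device(None)       # clears the stored default; get_device then falls
                           # back to its own default resolution logic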
dataeval/metadata/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 """Explanatory functions using metadata and additional features such as ood or drift"""

-__all__ = ["…
+__all__ = ["find_ood_predictors", "metadata_distance", "find_most_deviated_factors"]

 from dataeval.metadata._distance import metadata_distance
-from dataeval.metadata._ood import …
+from dataeval.metadata._ood import find_most_deviated_factors, find_ood_predictors
dataeval/metadata/_ood.py
CHANGED
@@ -6,14 +6,44 @@ import warnings

 import numpy as np
 from numpy.typing import NDArray
+from sklearn.feature_selection import mutual_info_classif

+from dataeval.config import get_seed
 from dataeval.metadata._utils import _compare_keys, _validate_factors_and_data
-from dataeval.outputs import MostDeviatedFactorsOutput, OODOutput
+from dataeval.outputs import MostDeviatedFactorsOutput, OODOutput, OODPredictorOutput
 from dataeval.outputs._base import set_metadata
 from dataeval.utils.data import Metadata


-def _combine_metadata(metadata_1: Metadata, metadata_2: Metadata) -> tuple[list[…
+def _combine_discrete_continuous(metadata: Metadata) -> tuple[list[str], NDArray[np.float64]]:
+    """Combines the discrete and continuous data of a :class:`Metadata` object
+
+    Returns
+    -------
+    Tuple[list[str], NDArray]
+        The combined list of factor names and the combined discrete and continuous data
+
+    Note
+    ----
+    Discrete and continuous data must have the same number of samples
+    """
+    names = []
+    data = []
+
+    if metadata.discrete_factor_names and metadata.discrete_data.size != 0:
+        names.extend(metadata.discrete_factor_names)
+        data.append(metadata.discrete_data)
+
+    if metadata.continuous_factor_names and metadata.continuous_data.size != 0:
+        names.extend(metadata.continuous_factor_names)
+        data.append(metadata.continuous_data)
+
+    return names, np.hstack(data, dtype=np.float64) if data else np.array([], dtype=np.float64)
+
+
+def _combine_metadata(
+    metadata_1: Metadata, metadata_2: Metadata
+) -> tuple[list[str], list[NDArray[np.float64 | np.int64]], list[NDArray[np.int64 | np.float64]]]:
     """
     Combines the factor names and data arrays of metadata_1 and metadata_2 when the names
     match exactly and data has the same number of columns (factors).
@@ -42,8 +72,8 @@ def _combine_metadata(metadata_1: Metadata, metadata_2: Metadata) -> tuple[list[
         If the length of keys do not match the length of the data
     """
     factor_names: list[str] = []
-    m1_data: list[NDArray] = []
-    m2_data: list[NDArray] = []
+    m1_data: list[NDArray[np.int64 | np.float64]] = []
+    m2_data: list[NDArray[np.int64 | np.float64]] = []

     # Both metadata must have the same number of factors (cols), but not necessarily samples (row)
     if metadata_1.total_num_factors != metadata_2.total_num_factors:
@@ -121,36 +151,37 @@ def _calc_median_deviations(reference: NDArray, test: NDArray) -> NDArray:


 @set_metadata
-def most_deviated_factors(
-    …
-    …
+def find_most_deviated_factors(
+    metadata_ref: Metadata,
+    metadata_tst: Metadata,
     ood: OODOutput,
 ) -> MostDeviatedFactorsOutput:
     """
-    …
+    Determine greatest deviation in metadata features per out of distribution sample in test metadata.

     Parameters
     ----------
-    …
+    metadata_ref : Metadata
         A reference set of Metadata containing factor names and samples
         with discrete and/or continuous values per factor
-    …
+    metadata_tst : Metadata
         The set of Metadata that is tested against the reference metadata.
         This set must have the same number of features but does not require the same number of samples.
     ood : OODOutput
-        A class output by …
+        A class output by DataEval's OOD functions that contains which examples are OOD.

     Returns
     -------
-    …
-    An …
+    MostDeviatedFactorsOutput
+        An output class containing the factor name and deviation of the highest metadata deviations for each
+        OOD example in the test metadata.

     Notes
     -----
     1. Both :class:`.Metadata` inputs must have discrete and continuous data in the shape (samples, factors)
        and have equivalent factor names and lengths
     2. The flag at index `i` in :attr:`.OODOutput.is_ood` must correspond
-       directly to sample `i` of `…
+       directly to sample `i` of `metadata_tst` being out-of-distribution from `metadata_ref`

     Examples
     --------
@@ -160,13 +191,13 @@ def most_deviated_factors(
     All samples are out-of-distribution

     >>> is_ood = OODOutput(np.array([True, True, True]), np.array([]), np.array([]))
-    >>> …
+    >>> find_most_deviated_factors(metadata1, metadata2, is_ood)
     MostDeviatedFactorsOutput([('time', 2.0), ('time', 2.592), ('time', 3.51)])

-    …
+    No samples are out-of-distribution

     >>> is_ood = OODOutput(np.array([False, False, False]), np.array([]), np.array([]))
-    >>> …
+    >>> find_most_deviated_factors(metadata1, metadata2, is_ood)
     MostDeviatedFactorsOutput([])
     """

@@ -179,31 +210,30 @@ def most_deviated_factors(
     # Combines reference and test factor names and data if exists and match exactly
     # shape -> (samples, factors)
     factor_names, md_1, md_2 = _combine_metadata(
-        metadata_1=…
-        metadata_2=…
+        metadata_1=metadata_ref,
+        metadata_2=metadata_tst,
     )

     # Stack discrete and continuous factors as separate factors. Must have equal sample counts
-    …
-    …
+    ref_data = np.hstack(md_1) if md_1 else np.array([])  # (S, Fd + Fc)
+    tst_data = np.hstack(md_2) if md_2 else np.array([])  # (S, Fd + Fc)

-    if len(…
+    if len(ref_data) < 3:
         warnings.warn(
-            f"At least 3 reference metadata samples are needed, got {len(…
+            f"At least 3 reference metadata samples are needed, got {len(ref_data)}",
             UserWarning,
         )
         return MostDeviatedFactorsOutput([])

-    if len(…
+    if len(tst_data) != len(ood_mask):
         raise ValueError(
-            f"ood and test metadata must have the same length, "
-            f"got {len(ood_mask)} and {len(metadata_tst)} respectively."
+            f"ood and test metadata must have the same length, got {len(ood_mask)} and {len(tst_data)} respectively."
         )

     # Calculates deviations of all samples in m2_data
     # from the median values of the corresponding index in m1_data
     # Guaranteed for inputs to not be empty
-    deviations = _calc_median_deviations(…
+    deviations = _calc_median_deviations(ref_data, tst_data)

     # Get most impactful factor deviation of each sample for ood samples only
     deviation = np.max(deviations, axis=1)[ood_mask].astype(np.float16)
@@ -217,3 +247,90 @@ def most_deviated_factors(
     # List of tuples matching the factor name with its deviation

     return MostDeviatedFactorsOutput([(factor, dev) for factor, dev in zip(most_ood_factors, deviation)])
+
+
+_NATS2BITS = 1.442695
+"""
+_NATS2BITS is the reciprocal of natural log of 2. If you have an information/entropy-type quantity measured in nats,
+which is what many library functions return, multiply it by _NATS2BITS to get it in bits.
+"""
+
+
+def find_ood_predictors(
+    metadata: Metadata,
+    ood: OODOutput,
+) -> OODPredictorOutput:
+    """Computes mutual information between a set of metadata features and per sample out-of-distribution flags.
+
+    Given a set of metadata features per sample and a corresponding OODOutput that indicates whether a sample was
+    determined to be out of distribution, this function calculates the mutual information between each factor and
+    being out of distribution. In other words, it finds which metadata factors most likely correlate to an
+    out of distribution sample.
+
+    Note
+    ----
+    A high mutual information between a factor and ood samples is an indication of correlation, but not causation.
+    Additional analysis should be done to determine how to handle factors with a high mutual information.
+
+    Parameters
+    ----------
+    metadata : Metadata
+        A set of arrays of values, indexed by metadata feature names, with one value per data example per feature.
+    ood : OODOutput
+        A class output by DataEval's OOD functions that contains which examples are OOD.
+
+    Returns
+    -------
+    OODPredictorOutput
+        A dictionary with keys corresponding to metadata feature names, and values indicating the strength of
+        association between each named feature and the OOD flag, as mutual information measured in bits.
+
+    Examples
+    --------
+    >>> from dataeval.outputs import OODOutput
+
+    All samples are out-of-distribution
+
+    >>> is_ood = OODOutput(np.array([True, True, True]), np.array([]), np.array([]))
+    >>> find_ood_predictors(metadata1, is_ood)
+    OODPredictorOutput({'time': 8.008566032557951e-17, 'altitude': 8.008566032557951e-17})
+
+    No out-of-distribution samples
+
+    >>> is_ood = OODOutput(np.array([False, False, False]), np.array([]), np.array([]))
+    >>> find_ood_predictors(metadata1, is_ood)
+    OODPredictorOutput({})
+    """
+
+    ood_mask: NDArray[np.bool] = ood.is_ood
+
+    discrete_features_count = len(metadata.discrete_factor_names)
+    factors, data = _combine_discrete_continuous(metadata)  # (F, ), (S, F) => F = Fd + Fc
+
+    # No metadata correlated with out of distribution data, return 0.0 for all factors
+    if not any(ood_mask):
+        return OODPredictorOutput(dict.fromkeys(factors, 0.0))
+
+    if len(data) != len(ood_mask):
+        raise ValueError(
+            f"ood and metadata must have the same length, got {len(ood_mask)} and {len(data)} respectively."
+        )
+
+    # Calculate mean, std of each factor over all samples
+    scaled_data = (data - np.mean(data, axis=0)) / np.std(data, axis=0, ddof=1)  # (S, F)
+
+    discrete_features = np.zeros_like(factors, dtype=np.bool)
+    discrete_features[:discrete_features_count] = True
+
+    mutual_info_values = (
+        mutual_info_classif(
+            X=scaled_data,
+            y=ood_mask,
+            discrete_features=discrete_features,  # type: ignore -> sklearn issue - NDArray[bool] not of accepted type Union[ArrayLike, 'auto']
+            random_state=get_seed(),
+        )
+        * _NATS2BITS
+    )
+
+    return OODPredictorOutput({k: mutual_info_values[i] for i, k in enumerate(factors)})
dataeval/metrics/bias/__init__.py
CHANGED
@@ -6,10 +6,12 @@ representation which may impact model performance.
 __all__ = [
     "BalanceOutput",
     "CoverageOutput",
+    "CompletenessOutput",
     "DiversityOutput",
     "LabelParityOutput",
     "ParityOutput",
     "balance",
+    "completeness",
     "coverage",
     "diversity",
     "label_parity",
@@ -17,7 +19,15 @@ __all__ = [
 ]

 from dataeval.metrics.bias._balance import balance
+from dataeval.metrics.bias._completeness import completeness
 from dataeval.metrics.bias._coverage import coverage
 from dataeval.metrics.bias._diversity import diversity
 from dataeval.metrics.bias._parity import label_parity, parity
-from dataeval.outputs._bias import …
+from dataeval.outputs._bias import (
+    BalanceOutput,
+    CompletenessOutput,
+    CoverageOutput,
+    DiversityOutput,
+    LabelParityOutput,
+    ParityOutput,
+)
dataeval/metrics/bias/_balance.py
CHANGED
@@ -8,7 +8,7 @@ import numpy as np
 import scipy as sp
 from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

-from dataeval.config import get_seed
+from dataeval.config import EPSILON, get_seed
 from dataeval.outputs import BalanceOutput
 from dataeval.outputs._base import set_metadata
 from dataeval.utils._bin import get_counts
@@ -128,7 +128,7 @@ def balance(
     # Normalization via entropy
     bin_cnts = get_counts(discretized_data)
     ent_factor = sp.stats.entropy(bin_cnts, axis=0)
-    norm_factor = 0.5 * np.add.outer(ent_factor, ent_factor) + …
+    norm_factor = 0.5 * np.add.outer(ent_factor, ent_factor) + EPSILON

     # in principle MI should be symmetric, but it is not in practice.
     nmi = 0.5 * (mi + mi.T) / norm_factor
@@ -157,7 +157,7 @@ def balance(
     # Classwise normalization via entropy
     classwise_bin_cnts = get_counts(tgt_bin)
     ent_tgt_bin = sp.stats.entropy(classwise_bin_cnts, axis=0)
-    norm_factor = 0.5 * np.add.outer(ent_tgt_bin, ent_factor) + …
+    norm_factor = 0.5 * np.add.outer(ent_tgt_bin, ent_factor) + EPSILON
     classwise = classwise_mi / norm_factor

     # Grabbing factor names for plotting function
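
To illustrate what the EPSILON guard buys in the normalization above, here is a standalone sketch of that step; the mi matrix and entropies are made-up values, not DataEval output:

    import numpy as np

    EPSILON = 1e-10  # same constant balance() now imports from dataeval.config

    # Hypothetical pairwise mutual information and per-factor entropies
    mi = np.array([[0.90, 0.20], [0.25, 0.90]])
    ent_factor = np.array([1.0, 0.0])  # second factor is constant (zero entropy)

    # Without EPSILON the (1, 1) cell would divide by zero; with it, the
    # symmetrized, entropy-normalized MI stays finite, mirroring balance()
    norm_factor = 0.5 * np.add.outer(ent_factor, ent_factor) + EPSILON
    nmi = 0.5 * (mi + mi.T) / norm_factor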
dataeval/metrics/bias/_completeness.py
ADDED
@@ -0,0 +1,130 @@
+from __future__ import annotations
+
+import itertools
+
+__all__ = []
+
+
+import numpy as np
+
+from dataeval.config import EPSILON
+from dataeval.outputs import CompletenessOutput
+from dataeval.typing import ArrayLike
+from dataeval.utils._array import ensure_embeddings
+
+
+def completeness(embeddings: ArrayLike, quantiles: int) -> CompletenessOutput:
+    """
+    Calculate the fraction of boxes in a grid defined by quantiles that
+    contain at least one data point.
+    Also returns the center coordinates of each empty box.
+
+    Parameters
+    ----------
+    embeddings : ArrayLike
+        Embedded dataset (or other low-dimensional data) (nxp)
+    quantiles : int
+        number of quantile values to use for partitioning each dimension
+        e.g., 1 would create a grid of 2^p boxes, 2, 3^p etc..
+
+    Returns
+    -------
+    CompletenessOutput
+        - fraction_filled: float - Fraction of boxes that contain at least one
+          data point
+        - empty_box_centers: List[np.ndarray] - List of coordinates for centers of empty
+          boxes
+
+    Raises
+    ------
+    ValueError
+        If embeddings are too high-dimensional (>10)
+    ValueError
+        If there are too many quantiles (>2)
+    ValueError
+        If embedding is invalid shape
+
+    Example
+    -------
+    >>> embs = np.array([[1, 0], [0, 1], [1, 1]])
+    >>> quantiles = 1
+    >>> result = completeness(embs, quantiles)
+    >>> result.fraction_filled
+    0.75
+
+    Reference
+    ---------
+    This implementation is based on https://arxiv.org/abs/2002.03147.
+
+    [1] Byun, Taejoon, and Sanjai Rayadurgam. “Manifold for Machine Learning Assurance.”
+    Proceedings of the ACM/IEEE 42nd International Conference on Software Engineering
+    """
+    # Ensure proper data format
+    embeddings = ensure_embeddings(embeddings, dtype=np.float64, unit_interval=False)
+
+    # Get data dimensions
+    n, p = embeddings.shape
+    if quantiles > 2 or quantiles <= 0:
+        raise ValueError(
+            f"Number of quantiles ({quantiles}) is greater than 2 or is nonpositive. \
+            The metric scales exponentially in this value. Please use 1 or 2 quantiles."
+        )
+    if p > 10:
+        raise ValueError(
+            f"Dimension of embeddings ({p}) is greater than 10. \
+            The metric scales exponentially in this value. Please reduce the embedding dimension."
+        )
+    if n == 0 or p == 0:
+        raise ValueError("Your provided embeddings do not contain any data!")
+    # quantiles + 2 edges partition each embedding dimension (e.g. [0, 0.5, 1] for quantiles = 1)
+    quantile_vec = np.linspace(0, 1, quantiles + 2)
+
+    # Calculate the bin edges for each dimension based on quantiles
+    bin_edges = []
+    for dim in range(p):
+        # Calculate the quantile values for this feature
+        edges = np.array(np.quantile(embeddings[:, dim], quantile_vec))
+        # Make sure the last bin contains all the remaining points
+        edges[-1] += EPSILON
+        bin_edges.append(edges)
+    # Convert each data point into its corresponding grid cell indices
+    grid_indices = []
+    for dim in range(p):
+        # For each dimension, find which bin each data point belongs to
+        # Digitize is 1 indexed so we subtract 1
+        indices = np.digitize(embeddings[:, dim], bin_edges[dim]) - 1
+        grid_indices.append(indices)
+
+    # Make the rows the data point and the column the grid index
+    grid_coords = np.array(grid_indices).T
+
+    # Use set to find unique tuple of grid coordinates
+    occupied_cells = set(map(tuple, grid_coords))
+
+    # For the fraction
+    num_occupied_cells = len(occupied_cells)
+
+    # Calculate total possible cells in the grid
+    num_bins_per_dim = [len(edges) - 1 for edges in bin_edges]
+    total_possible_cells = np.prod(num_bins_per_dim)
+
+    # Generate all possible grid cells
+    all_cells = set(itertools.product(*[range(bins) for bins in num_bins_per_dim]))
+
+    # Find the empty cells (cells with no data points)
+    empty_cells = all_cells - occupied_cells
+
+    # Calculate center points of empty boxes
+    empty_box_centers = []
+    for cell in empty_cells:
+        center_coords = []
+        for dim, idx in enumerate(cell):
+            # Calculate center of the bin as midpoint between edges
+            center = (bin_edges[dim][idx] + bin_edges[dim][idx + 1]) / 2
+            center_coords.append(center)
+        empty_box_centers.append(np.array(center_coords))
+
+    # Calculate the fraction
+    fraction = float(num_occupied_cells / total_possible_cells)
+    empty_box_centers = np.array(empty_box_centers)
+    return CompletenessOutput(fraction, empty_box_centers)
dataeval/metrics/estimators/_ber.py
CHANGED
@@ -19,6 +19,7 @@ from numpy.typing import NDArray
 from scipy.sparse import coo_matrix
 from scipy.stats import mode

+from dataeval.config import EPSILON
 from dataeval.outputs import BEROutput
 from dataeval.outputs._base import set_metadata
 from dataeval.typing import ArrayLike
@@ -82,7 +83,7 @@ def ber_knn(images: NDArray[np.float64], labels: NDArray[np.int_], k: int) -> tu

 def knn_lowerbound(value: float, classes: int, k: int) -> float:
     """Several cases for computing the BER lower bound"""
-    if value <= …
+    if value <= EPSILON:
         return 0.0

     if classes == 2 and k != 1: