dataeval 0.66.0__py3-none-any.whl → 0.68.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dataeval/__init__.py CHANGED
@@ -1,4 +1,4 @@
- __version__ = "0.66.0"
+ __version__ = "0.68.0"
  
  from importlib.util import find_spec
  
dataeval/_internal/detectors/duplicates.py CHANGED
@@ -1,28 +1,37 @@
  from __future__ import annotations
  
  from dataclasses import dataclass
- from typing import Iterable
+ from typing import Generic, Iterable, Sequence, TypeVar, cast
  
  from numpy.typing import ArrayLike
  
+ from dataeval._internal.detectors.merged_stats import combine_stats, get_dataset_step_from_idx
  from dataeval._internal.flags import ImageStat
  from dataeval._internal.metrics.stats import StatsOutput, imagestats
  from dataeval._internal.output import OutputMetadata, set_metadata
  
+ DuplicateGroup = list[int]
+ DatasetDuplicateGroupMap = dict[int, DuplicateGroup]
+ TIndexCollection = TypeVar("TIndexCollection", DuplicateGroup, DatasetDuplicateGroupMap)
+ 
  
  @dataclass(frozen=True)
- class DuplicatesOutput(OutputMetadata):
+ class DuplicatesOutput(Generic[TIndexCollection], OutputMetadata):
      """
      Attributes
      ----------
-     exact : List[List[int]]
+     exact : list[list[int] | dict[int, list[int]]]
          Indices of images that are exact matches
-     near: List[List[int]]
+     near: list[list[int] | dict[int, list[int]]]
          Indices of images that are near matches
+ 
+     - For a single dataset, indices are returned as a list of index groups.
+     - For multiple datasets, indices are returned as dictionaries where the key is the
+       index of the dataset, and the value is the list index groups from that dataset.
      """
  
-     exact: list[list[int]]
-     near: list[list[int]]
+     exact: list[TIndexCollection]
+     near: list[TIndexCollection]
  
  
  class Duplicates:
@@ -54,18 +63,18 @@ class Duplicates:
      def _get_duplicates(self) -> dict[str, list[list[int]]]:
          stats_dict = self.stats.dict()
          if "xxhash" in stats_dict:
-             exact = {}
+             exact_dict: dict[int, list] = {}
              for i, value in enumerate(stats_dict["xxhash"]):
-                 exact.setdefault(value, []).append(i)
-             exact = [v for v in exact.values() if len(v) > 1]
+                 exact_dict.setdefault(value, []).append(i)
+             exact = [sorted(v) for v in exact_dict.values() if len(v) > 1]
          else:
              exact = []
  
          if "pchash" in stats_dict and not self.only_exact:
-             near = {}
+             near_dict: dict[int, list] = {}
              for i, value in enumerate(stats_dict["pchash"]):
-                 near.setdefault(value, []).append(i)
-             near = [v for v in near.values() if len(v) > 1 and not any(set(v).issubset(x) for x in exact)]
+                 near_dict.setdefault(value, []).append(i)
+             near = [sorted(v) for v in near_dict.values() if len(v) > 1 and not any(set(v).issubset(x) for x in exact)]
          else:
              near = []
  
@@ -75,14 +84,14 @@ class Duplicates:
          }
  
      @set_metadata("dataeval.detectors", ["only_exact"])
-     def evaluate(self, data: Iterable[ArrayLike] | StatsOutput) -> DuplicatesOutput:
+     def evaluate(self, data: Iterable[ArrayLike] | StatsOutput | Sequence[StatsOutput]) -> DuplicatesOutput:
          """
          Returns duplicate image indices for both exact matches and near matches
  
          Parameters
          ----------
-         data : Iterable[ArrayLike], shape - (N, C, H, W) | StatsOutput
-             A dataset of images in an ArrayLike format or the output from an imagestats metric analysis
+         data : Iterable[ArrayLike], shape - (N, C, H, W) | StatsOutput | Sequence[StatsOutput]
+             A dataset of images in an ArrayLike format or the output(s) from an imagestats metric analysis
  
          Returns
          -------
@@ -98,12 +107,32 @@ class Duplicates:
          >>> dups.evaluate(images)
          DuplicatesOutput(exact=[[3, 20], [16, 37]], near=[[3, 20, 22], [12, 18], [13, 36], [14, 31], [17, 27], [19, 38, 47]])
          """  # noqa: E501
-         if isinstance(data, StatsOutput):
-             if not data.xxhash:
+ 
+         stats, dataset_steps = combine_stats(data)
+ 
+         if isinstance(stats, StatsOutput):
+             if not stats.xxhash:
                  raise ValueError("StatsOutput must include xxhash information of the images.")
-             if not self.only_exact and not data.pchash:
+             if not self.only_exact and not stats.pchash:
                  raise ValueError("StatsOutput must include pchash information of the images for near matches.")
-             self.stats = data
+             self.stats = stats
          else:
-             self.stats = imagestats(data, ImageStat.XXHASH | (ImageStat(0) if self.only_exact else ImageStat.PCHASH))
-         return DuplicatesOutput(**self._get_duplicates())
+             flags = ImageStat.XXHASH | (ImageStat(0) if self.only_exact else ImageStat.PCHASH)
+             self.stats = imagestats(cast(Iterable[ArrayLike], data), flags)
+ 
+         duplicates = self._get_duplicates()
+ 
+         # split up results from combined dataset into individual dataset buckets
+         if dataset_steps:
+             dup_list: list[list[int]]
+             for dup_type, dup_list in duplicates.items():
+                 dup_list_dict = []
+                 for idxs in dup_list:
+                     dup_dict = {}
+                     for idx in idxs:
+                         k, v = get_dataset_step_from_idx(idx, dataset_steps)
+                         dup_dict.setdefault(k, []).append(v)
+                     dup_list_dict.append(dup_dict)
+                 duplicates[dup_type] = dup_list_dict
+ 
+         return DuplicatesOutput(**duplicates)
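
A usage sketch of the new multi-dataset path: hash each dataset separately with imagestats, then pass the sequence of StatsOutput objects to Duplicates.evaluate. The public import paths and the printed grouping are assumptions based on this diff, not verified output:

import numpy as np

from dataeval.detectors.linters import Duplicates  # assumed public re-export
from dataeval.flags import ImageStat
from dataeval.metrics.stats import imagestats  # assumed public re-export

rng = np.random.default_rng(0)
ds_a = rng.integers(0, 256, size=(8, 1, 16, 16), dtype=np.uint8)
# dataset B reuses two images from dataset A, so cross-dataset duplicates exist
ds_b = np.concatenate([ds_a[:2], rng.integers(0, 256, size=(4, 1, 16, 16), dtype=np.uint8)])

stats_a = imagestats(ds_a, ImageStat.XXHASH | ImageStat.PCHASH)
stats_b = imagestats(ds_b, ImageStat.XXHASH | ImageStat.PCHASH)

# With a Sequence[StatsOutput], each duplicate group comes back as a
# {dataset_index: [image_indices]} dict instead of a flat index list.
result = Duplicates().evaluate([stats_a, stats_b])
print(result.exact)  # e.g. [{0: [0], 1: [0]}, {0: [1], 1: [1]}]
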
dataeval/_internal/detectors/merged_stats.py ADDED
@@ -0,0 +1,78 @@
+ from __future__ import annotations
+ 
+ from typing import Sequence, cast
+ from warnings import warn
+ 
+ import numpy as np
+ 
+ from dataeval._internal.metrics.stats import StatsOutput
+ from dataeval._internal.output import populate_defaults
+ 
+ 
+ def add_stats(a: StatsOutput, b: StatsOutput) -> StatsOutput:
+     if not isinstance(a, StatsOutput) or not isinstance(b, StatsOutput):
+         raise TypeError(f"Cannot add object of type {type(a)} and type {type(b)}.")
+ 
+     a_dict = a.dict()
+     b_dict = b.dict()
+     a_keys = set(a_dict)
+     b_keys = set(b_dict)
+ 
+     missing_keys = a_keys - b_keys
+     if missing_keys:
+         raise ValueError(f"Required keys are missing: {missing_keys}.")
+ 
+     extra_keys = b_keys - a_keys
+     if extra_keys:
+         warn(f"Extraneous keys will be dropped: {extra_keys}.")
+ 
+     # perform add of multi-channel stats
+     if "ch_idx_map" in a_dict:
+         for k, v in a_dict.items():
+             if k == "ch_idx_map":
+                 offset = sum([len(idxs) for idxs in v.values()])
+                 for ch_k, ch_v in b_dict[k].items():
+                     if ch_k not in v:
+                         v[ch_k] = []
+                     a_dict[k][ch_k].extend([idx + offset for idx in ch_v])
+             else:
+                 for ch_k in b_dict[k]:
+                     if ch_k not in v:
+                         v[ch_k] = b_dict[k][ch_k]
+                     else:
+                         v[ch_k] = np.concatenate((v[ch_k], b_dict[k][ch_k]), axis=1)
+     else:
+         for k in a_dict:
+             if isinstance(a_dict[k], list):
+                 a_dict[k].extend(b_dict[k])
+             else:
+                 a_dict[k] = np.concatenate((a_dict[k], b_dict[k]))
+ 
+     return StatsOutput(**populate_defaults(a_dict, StatsOutput))
+ 
+ 
+ def combine_stats(stats) -> tuple[StatsOutput | None, list[int]]:
+     dataset_steps = []
+ 
+     if isinstance(stats, StatsOutput):
+         return stats, dataset_steps
+ 
+     output = None
+     if isinstance(stats, Sequence) and isinstance(stats[0], StatsOutput):
+         stats = cast(Sequence[StatsOutput], stats)
+         cur_len = 0
+         for s in stats:
+             output = s if output is None else add_stats(output, s)
+             cur_len += len(s)
+             dataset_steps.append(cur_len)
+ 
+     return output, dataset_steps
+ 
+ 
+ def get_dataset_step_from_idx(idx: int, dataset_steps: list[int]) -> tuple[int, int]:
+     last_step = 0
+     for i, step in enumerate(dataset_steps):
+         if idx < step:
+             return i, idx - last_step
+         last_step = step
+     return -1, idx
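
A worked example of the index bookkeeping above: combine_stats records cumulative dataset lengths in dataset_steps, and get_dataset_step_from_idx maps a flat index in the merged stats back to a (dataset, local index) pair. The lengths below are illustrative:

from dataeval._internal.detectors.merged_stats import get_dataset_step_from_idx

dataset_steps = [8, 14]  # dataset 0 contributed 8 images, dataset 1 contributed 6

assert get_dataset_step_from_idx(3, dataset_steps) == (0, 3)     # inside dataset 0
assert get_dataset_step_from_idx(8, dataset_steps) == (1, 0)     # first image of dataset 1
assert get_dataset_step_from_idx(13, dataset_steps) == (1, 5)    # last image of dataset 1
assert get_dataset_step_from_idx(99, dataset_steps) == (-1, 99)  # past the end
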
dataeval/_internal/detectors/outliers.py CHANGED
@@ -1,27 +1,39 @@
  from __future__ import annotations
  
  from dataclasses import dataclass
- from typing import Iterable, Literal
+ from typing import Iterable, Literal, Sequence, cast
+ from warnings import warn
  
  import numpy as np
  from numpy.typing import ArrayLike, NDArray
  
+ from dataeval._internal.detectors.merged_stats import combine_stats, get_dataset_step_from_idx
  from dataeval._internal.flags import ImageStat, to_distinct, verify_supported
  from dataeval._internal.metrics.stats import StatsOutput, imagestats
  from dataeval._internal.output import OutputMetadata, set_metadata
  
+ IndexIssueMap = dict[int, dict[str, float]]
+ DatasetIndexIssueMap = dict[int, IndexIssueMap]
+ """
+ Mapping of image indices to a dictionary of issue types and calculated values
+ """
+ 
  
  @dataclass(frozen=True)
  class OutliersOutput(OutputMetadata):
      """
      Attributes
      ----------
-     issues : Dict[int, Dict[str, float]]
-         Dictionary containing the indices of outliers and a dictionary showing
-         the issues and calculated values for the given index.
+     issues : dict[int, dict[str, float]] | dict[int, dict[int, dict[str, float]]]
+         Indices of image outliers with their associated issue type and calculated values.
+ 
+         - For a single dataset, a dictionary containing the indices of outliers and
+           a dictionary showing the issues and calculated values for the given index.
+         - For multiple datasets, a map of dataset indices to the indices of outliers
+           and their associated issues and calculated values.
      """
  
-     issues: dict[int, dict[str, float]]
+     issues: IndexIssueMap | DatasetIndexIssueMap
  
  
  def _get_outlier_mask(
@@ -64,7 +76,7 @@ class Outliers:
  
      Attributes
      ----------
-     stats : Dict[str, Any]
+     stats : dict[str, Any]
          Dictionary to hold the value of each metric for each image
  
      See Also
@@ -135,14 +147,14 @@ class Outliers:
          return dict(sorted(flagged_images.items()))
  
      @set_metadata("dataeval.detectors", ["flags", "outlier_method", "outlier_threshold"])
-     def evaluate(self, data: Iterable[ArrayLike] | StatsOutput) -> OutliersOutput:
+     def evaluate(self, data: Iterable[ArrayLike] | StatsOutput | Sequence[StatsOutput]) -> OutliersOutput:
          """
          Returns indices of outliers with the issues identified for each
  
          Parameters
          ----------
-         data : Iterable[ArrayLike], shape - (C, H, W) | StatsOutput
-             A dataset of images in an ArrayLike format or the output from an imagestats metric analysis
+         data : Iterable[ArrayLike], shape - (C, H, W) | StatsOutput | Sequence[StatsOutput]
+             A dataset of images in an ArrayLike format or the output(s) from an imagestats metric analysis
  
          Returns
          -------
@@ -157,13 +169,29 @@ class Outliers:
          >>> outliers.evaluate(images)
          OutliersOutput(issues={18: {'brightness': 0.78}, 25: {'brightness': 0.98}})
          """
-         if isinstance(data, StatsOutput):
-             flags = set(to_distinct(self.flags).values())
-             stats = set(data.dict())
-             missing = flags - stats
+         stats, dataset_steps = combine_stats(data)
+ 
+         if isinstance(stats, StatsOutput):
+             selected_flags = set(to_distinct(self.flags).values())
+             provided = set(stats.dict())
+             missing = selected_flags - provided
              if missing:
-                 raise ValueError(f"StatsOutput is missing {missing} from the required stats: {flags}.")
-             self.stats = data
+                 warn(
+                     f"StatsOutput provided {provided} and is missing {missing} \
+                     from the selected stat flags: {selected_flags}."
+                 )
+             self.stats = stats
          else:
-             self.stats = imagestats(data, self.flags)
-         return OutliersOutput(self._get_outliers())
+             self.stats = imagestats(cast(Iterable[ArrayLike], data), self.flags)
+ 
+         outliers = self._get_outliers()
+ 
+         # split up results from combined dataset into individual dataset buckets
+         if dataset_steps:
+             out_dict = {}
+             for idx, issue in outliers.items():
+                 k, v = get_dataset_step_from_idx(idx, dataset_steps)
+                 out_dict.setdefault(k, {})[v] = issue
+             outliers = out_dict
+ 
+         return OutliersOutput(outliers)
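
The same flat-to-per-dataset remapping drives the new OutliersOutput shape. A self-contained sketch of the rebucketing loop in evaluate(), using the helper from the merged_stats module above and made-up issue values:

from dataeval._internal.detectors.merged_stats import get_dataset_step_from_idx

outliers = {3: {"brightness": 0.78}, 9: {"blurriness": 0.51}}
dataset_steps = [8, 14]  # two datasets of 8 and 6 images

out_dict = {}
for idx, issue in outliers.items():
    k, v = get_dataset_step_from_idx(idx, dataset_steps)
    out_dict.setdefault(k, {})[v] = issue

print(out_dict)  # {0: {3: {'brightness': 0.78}}, 1: {1: {'blurriness': 0.51}}}
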
dataeval/_internal/metrics/balance.py CHANGED
@@ -17,11 +17,17 @@ class BalanceOutput(OutputMetadata):
      """
      Attributes
      ----------
-     mutual_information : NDArray[np.float64]
+     balance : NDArray[np.float64]
          Estimate of mutual information between metadata factors and class label
+     factors : NDArray[np.float64]
+         Estimate of inter/intra-factor mutual information
+     classwise : NDArray[np.float64]
+         Estimate of mutual information between metadata factors and individual class labels
      """
  
-     mutual_information: NDArray[np.float64]
+     balance: NDArray[np.float64]
+     factors: NDArray[np.float64]
+     classwise: NDArray[np.float64]
  
  
  def validate_num_neighbors(num_neighbors: int) -> int:
@@ -77,17 +83,22 @@ def balance(class_labels: Sequence[int], metadata: list[dict], num_neighbors: in
      -------
      Return balance (mutual information) of factors with class_labels
  
-     >>> balance(class_labels, metadata).mutual_information[0]
-     array([0.99999822, 0.13363788, 0.        , 0.02994455])
+     >>> bal = balance(class_labels, metadata)
+     >>> bal.balance
+     array([0.99999822, 0.13363788, 0.04505382, 0.02994455])
  
-     Return balance (mutual information) of metadata factors with class_labels
-     and each other
+     Return intra/interfactor balance (mutual information)
  
-     >>> balance(class_labels, metadata).mutual_information
-     array([[0.99999822, 0.13363788, 0.        , 0.02994455],
-            [0.13363788, 0.99999843, 0.01389763, 0.09725766],
-            [0.        , 0.01389763, 0.48549233, 0.15314612],
-            [0.02994455, 0.09725766, 0.15314612, 0.99999856]])
+     >>> bal.factors
+     array([[0.99999843, 0.03510422, 0.09725766],
+            [0.03510422, 0.08433558, 0.15621459],
+            [0.09725766, 0.15621459, 0.99999856]])
+ 
+     Return classwise balance (mutual information) of factors with individual class_labels
+ 
+     >>> bal.classwise
+     array([[0.99999822, 0.13363788, 0.        , 0.        ],
+            [0.99999822, 0.13363788, 0.        , 0.        ]])
  
      See Also
      --------
@@ -102,13 +113,9 @@ def balance(class_labels: Sequence[int], metadata: list[dict], num_neighbors: in
      mi[:] = np.nan
  
      for idx in range(num_factors):
-         tgt = data[:, idx]
+         tgt = data[:, idx].astype(int)
  
          if is_categorical[idx]:
-             if tgt.dtype == float:
-                 # map to unique integers if categorical
-                 _, tgt = np.unique(tgt, return_inverse=True)
-             # categorical target
              mi[idx, :] = mutual_info_classif(
                  data,
                  tgt,
@@ -129,89 +136,40 @@ def balance(class_labels: Sequence[int], metadata: list[dict], num_neighbors: in
      norm_factor = 0.5 * np.add.outer(ent_all, ent_all) + 1e-6
      # in principle MI should be symmetric, but it is not in practice.
      nmi = 0.5 * (mi + mi.T) / norm_factor
+     balance = nmi[0]
+     factors = nmi[1:, 1:]
  
-     return BalanceOutput(nmi)
- 
- 
- @set_metadata("dataeval.metrics")
- def balance_classwise(class_labels: Sequence[int], metadata: list[dict], num_neighbors: int = 5) -> BalanceOutput:
-     """
-     Compute mutual information (analogous to correlation) between metadata factors
-     (class label, metadata, label/image properties) with individual class labels.
- 
-     Parameters
-     ----------
-     class_labels: Sequence[int]
-         List of class labels for each image
-     metadata: List[Dict]
-         List of metadata factors for each image
-     num_neighbors: int, default 5
-         Number of nearest neighbors to use for computing MI between discrete
-         and continuous variables.
- 
-     Notes
-     -----
-     We use `mutual_info_classif` from sklearn since class label is categorical.
-     `mutual_info_classif` outputs are consistent up to O(1e-4) and depend on a random
-     seed. MI is computed differently for categorical and continuous variables, so we
-     have to specify with is_categorical.
- 
-     Returns
-     -------
-     BalanceOutput
-         (num_classes x num_factors) estimate of mutual information between
-         num_factors metadata factors and individual class labels.
- 
-     Example
-     -------
-     Return classwise balance (mutual information) of factors with individual class_labels
- 
-     >>> balance_classwise(class_labels, metadata).mutual_information
-     array([[0.13363788, 0.54085156, 0.        ],
-            [0.13363788, 0.54085156, 0.        ]])
- 
- 
-     See Also
-     --------
-     sklearn.feature_selection.mutual_info_classif
-     sklearn.feature_selection.mutual_info_regression
-     sklearn.metrics.mutual_info_score
-     compute_mutual_information
-     """
-     num_neighbors = validate_num_neighbors(num_neighbors)
-     data, names, is_categorical = preprocess_metadata(class_labels, metadata)
-     num_factors = len(names)
  
      # unique class labels
      class_idx = names.index("class_label")
-     class_data = data[:, class_idx]
+     class_data = data[:, class_idx].astype(int)
      u_cls = np.unique(class_data)
      num_classes = len(u_cls)
  
-     data_no_class = np.concatenate((data[:, :class_idx], data[:, (class_idx + 1) :]), axis=1)
- 
      # assume class is a factor
-     mi = np.empty((num_classes, num_factors - 1))
-     mi[:] = np.nan
+     classwise_mi = np.empty((num_classes, num_factors))
+     classwise_mi[:] = np.nan
  
      # categorical variables, excluding class label
      cat_mask = np.concatenate((is_categorical[:class_idx], is_categorical[(class_idx + 1) :]), axis=0).astype(int)
  
+     tgt_bin = np.stack([class_data == cls for cls in u_cls]).T.astype(int)
+     ent_tgt_bin = entropy(
+         tgt_bin, names=[str(idx) for idx in range(num_classes)], is_categorical=[True for idx in range(num_classes)]
+     )
+ 
      # classification MI for discrete/categorical features
-     for idx, cls in enumerate(u_cls):
-         tgt = class_data == cls
+     for idx in range(num_classes):
+         # tgt = class_data == cls
          # units: nat
-         mi[idx, :] = mutual_info_classif(
-             data_no_class,
-             tgt,
+         classwise_mi[idx, :] = mutual_info_classif(
+             data,
+             tgt_bin[:, idx],
              discrete_features=cat_mask,  # type: ignore
             n_neighbors=num_neighbors,
             random_state=0,
         )
  
-     # let this recompute for all features including class label
-     ent_all = entropy(data, names, is_categorical)
-     ent_tgt = ent_all[class_idx]
-     ent_all = np.concatenate((ent_all[:class_idx], ent_all[(class_idx + 1) :]), axis=0)
-     norm_factor = 0.5 * np.add.outer(ent_tgt, ent_all) + 1e-6
-     nmi = mi / norm_factor
-     return BalanceOutput(nmi)
+     norm_factor = 0.5 * np.add.outer(ent_tgt_bin, ent_all) + 1e-6
+     classwise = classwise_mi / norm_factor
+ 
+     return BalanceOutput(balance, factors, classwise)
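
Under the merged API, balance() computes one normalized mutual-information matrix and slices it into the BalanceOutput fields. A small numpy sketch with illustrative values, where row and column 0 correspond to the class label:

import numpy as np

nmi = np.array(
    [
        [1.00, 0.13, 0.05, 0.03],  # class_label vs. itself and each factor
        [0.13, 1.00, 0.04, 0.10],
        [0.05, 0.04, 0.08, 0.16],
        [0.03, 0.10, 0.16, 1.00],
    ]
)

balance = nmi[0]       # class label vs. every factor: shape (num_factors,)
factors = nmi[1:, 1:]  # factor-to-factor block with the class label removed
print(balance.shape, factors.shape)  # (4,) (3, 3)
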
dataeval/_internal/metrics/coverage.py CHANGED
@@ -66,27 +66,22 @@ def coverage(
  
      Note
      ----
-     Embeddings should be on the unit interval.
+     Embeddings should be on the unit interval [0-1].
  
      Example
      -------
-     >>> coverage(embeddings)
-     CoverageOutput(indices=array([], dtype=int64), radii=array([0.59307666, 0.56956307, 0.56328616, 0.70660265, 0.57778087,
-            0.53738624, 0.58968217, 1.27721334, 0.84378694, 0.67767021,
-            0.69680335, 1.35532621, 0.59764166, 0.8691945 , 0.83627602,
-            0.84187303, 0.62212358, 1.09039732, 0.67956797, 0.60134383,
-            0.83713908, 0.91784263, 1.12901193, 0.73907618, 0.63943983,
-            0.61188447, 0.47872713, 0.57207771, 0.92885883, 0.54750511,
-            0.83015726, 1.20721778, 0.50421928, 0.98312246, 0.59764166,
-            0.61009202, 0.73864073, 1.0381061 , 0.77598609, 0.72984036,
-            0.67573006, 0.48056064, 1.00050879, 0.89532971, 0.58395529,
-            0.95954793, 0.60134383, 1.10096454, 0.51955314, 0.73038702]), critical_value=0)
+     >>> results = coverage(embeddings)
+     >>> results.indices
+     array([447, 412,   8,  32,  63])
+     >>> results.critical_value
+     0.8459038956941765
  
      Reference
      ---------
      This implementation is based on https://dl.acm.org/doi/abs/10.1145/3448016.3457315.
+ 
      [1] Seymour Sudman. 1976. Applied sampling. Academic Press New York (1976).
-     """  # noqa: E501
+     """
  
      # Calculate distance matrix, look at the (k+1)th farthest neighbor for each image.
      embeddings = to_numpy(embeddings)
@@ -105,8 +100,9 @@ def coverage(
          pvals = np.where(crit > rho)[0]
      elif radius_type == "adaptive":
          # Use data adaptive cutoff as rho
-         rho = int(n * percent)
-         pvals = np.argsort(crit)[::-1][:rho]
+         selection = int(max(n * percent, 1))
+         pvals = np.argsort(crit)[::-1][:selection]
+         rho = float(np.mean(np.sort(crit)[::-1][selection - 1 : selection + 1]))
      else:
          raise ValueError(f"{radius_type} is an invalid radius type. Expected 'adaptive' or 'naive'")
      return CoverageOutput(pvals, crit, rho)
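
The reworked "adaptive" branch changes what rho means: instead of reusing the selection count itself, it flags the largest percent of k-NN radii as uncovered and takes rho as the mean of the two sorted radii straddling the cutoff. A self-contained sketch on illustrative radii:

import numpy as np

crit = np.array([0.2, 0.9, 0.4, 0.8, 0.3])  # illustrative k-NN radii
n, percent = len(crit), 0.4

selection = int(max(n * percent, 1))        # at least one index is always flagged
pvals = np.argsort(crit)[::-1][:selection]  # indices of the largest radii
rho = float(np.mean(np.sort(crit)[::-1][selection - 1 : selection + 1]))
print(pvals, rho)  # [1 3] and rho ~ 0.6 (mean of 0.8 and 0.4)
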
dataeval/_internal/metrics/diversity.py CHANGED
@@ -17,9 +17,12 @@ class DiversityOutput(OutputMetadata):
      ----------
      diversity_index : NDArray[np.float64]
          Diversity index for classes and factors
+     classwise : NDArray[np.float64]
+         Classwise diversity index [n_class x n_factor]
      """
  
      diversity_index: NDArray[np.float64]
+     classwise: NDArray[np.float64]
  
  
  def diversity_shannon(
@@ -39,6 +42,13 @@ def diversity_shannon(
  
      Parameters
      ----------
+     data: NDArray
+         Array containing numerical values for metadata factors
+     names: list[str]
+         Names of metadata factors -- keys of the metadata dictionary
+     is_categorical: list[bool]
+         List of flags to identify whether variables are categorical (True) or
+         continuous (False)
      subset_mask: NDArray[np.bool_] | None
          Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
  
@@ -76,14 +86,20 @@ def diversity_simpson(
      Compute diversity for discrete/categorical variables and, through standard
      histogram binning, for continuous variables.
  
-     We define diversity as a normalized form of the inverse Simpson diversity
-     index.
+     We define diversity as the inverse Simpson diversity index linearly rescaled to the unit interval.
  
      diversity = 1 implies that samples are evenly distributed across a particular factor
-     diversity = 1/num_categories implies that all samples belong to one category/bin
+     diversity = 0 implies that all samples belong to one category/bin
  
      Parameters
      ----------
+     data: NDArray
+         Array containing numerical values for metadata factors
+     names: list[str]
+         Names of metadata factors -- keys of the metadata dictionary
+     is_categorical: list[bool]
+         List of flags to identify whether variables are categorical (True) or
+         continuous (False)
      subset_mask: NDArray[np.bool_] | None
          Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
  
@@ -91,10 +107,7 @@ def diversity_simpson(
      -----
      For continuous variables, histogram bins are chosen automatically. See
      numpy.histogram for details.
-     The expression is undefined for q=1, but it approaches the Shannon entropy
-     in the limit.
-     If there is only one category, the diversity index takes a value of 1 =
-     1/N = 1/1. Entropy will take a value of 0.
+     If there is only one category, the diversity index takes a value of 0.
  
      Returns
      -------
@@ -116,8 +129,8 @@ def diversity_simpson(
          # relative frequencies
          p_i = cnts / cnts.sum()
          # inverse Simpson index normalized by (number of bins)
-         ev_index[col] = 1 / np.sum(p_i**2) / num_bins[col]
- 
+         s_0 = 1 / np.sum(p_i**2) / num_bins[col]
+         ev_index[col] = (s_0 * num_bins[col] - 1) / (num_bins[col] - 1)
      return ev_index
  
  
@@ -129,9 +142,11 @@ def diversity(
      class_labels: Sequence[int], metadata: list[dict], method: Literal["shannon", "simpson"] = "simpson"
  ) -> DiversityOutput:
      """
-     Compute diversity for discrete/categorical variables and, through standard
+     Compute diversity and classwise diversity for discrete/categorical variables and, through standard
      histogram binning, for continuous variables.
  
+     We define diversity as a normalized form of the inverse Simpson diversity index.
+ 
      diversity = 1 implies that samples are evenly distributed across a particular factor
      diversity = 0 implies that all samples belong to one category/bin
  
@@ -141,95 +156,51 @@ def diversity(
          List of class labels for each image
      metadata: List[Dict]
          List of metadata factors for each image
-     metric: Literal["shannon", "simpson"], default "simpson"
-         string variable indicating which diversity index should be used.
-         Permissible values include "simpson" and "shannon"
+     method: Literal["shannon", "simpson"], default "simpson"
+         Indicates which diversity index should be computed
  
      Notes
      -----
      - For continuous variables, histogram bins are chosen automatically. See numpy.histogram for details.
+     - The expression is undefined for q=1, but it approaches the Shannon entropy in the limit.
+     - If there is only one category, the diversity index takes a value of 1 = 1/N = 1/1. Entropy will take a value of 0.
  
      Returns
      -------
      DiversityOutput
-         Diversity index per column of self.data or each factor in self.names
+         Diversity index per column of self.data or each factor in self.names and
+         classwise diversity [n_class x n_factor]
  
      Example
      -------
      Compute Simpson diversity index of metadata and class labels
  
-     >>> diversity(class_labels, metadata, method="simpson").diversity_index
-     array([0.34482759, 0.34482759, 0.90909091])
+     >>> div_simp = diversity(class_labels, metadata, method="simpson")
+     >>> div_simp.diversity_index
+     array([0.18103448, 0.18103448, 0.88636364])
+ 
+     >>> div_simp.classwise
+     array([[0.17241379, 0.39473684],
+            [0.2       , 0.2       ]])
  
      Compute Shannon diversity index of metadata and class labels
  
-     >>> diversity(class_labels, metadata, method="shannon").diversity_index
+     >>> div_shan = diversity(class_labels, metadata, method="shannon")
+     >>> div_shan.diversity_index
      array([0.37955133, 0.37955133, 0.96748876])
  
- 
-     See Also
-     --------
-     numpy.histogram
-     """
-     diversity_fn = get_method(DIVERSITY_FN_MAP, method)
-     data, names, is_categorical = preprocess_metadata(class_labels, metadata)
-     diversity_index = diversity_fn(data, names, is_categorical, None).astype(np.float64)
-     return DiversityOutput(diversity_index)
- 
- 
- @set_metadata("dataeval.metrics")
- def diversity_classwise(
-     class_labels: Sequence[int], metadata: list[dict], method: Literal["shannon", "simpson"] = "simpson"
- ) -> DiversityOutput:
-     """
-     Compute diversity for discrete/categorical variables and, through standard
-     histogram binning, for continuous variables.
- 
-     We define diversity as a normalized form of the inverse Simpson diversity
-     index.
- 
-     diversity = 1 implies that samples are evenly distributed across a particular factor
-     diversity = 1/num_categories implies that all samples belong to one category/bin
- 
-     Parameters
-     ----------
-     class_labels: Sequence[int]
-         List of class labels for each image
-     metadata: List[Dict]
-         List of metadata factors for each image
- 
-     Notes
-     -----
-     - For continuous variables, histogram bins are chosen automatically. See numpy.histogram for details.
-     - The expression is undefined for q=1, but it approaches the Shannon entropy in the limit.
-     - If there is only one category, the diversity index takes a value of 1 = 1/N = 1/1. Entropy will take a value of 0.
- 
-     Returns
-     -------
-     DiversityOutput
-         Diversity index [n_class x n_factor]
- 
-     Example
-     -------
-     Compute classwise Simpson diversity index of metadata and class labels
- 
-     >>> diversity_classwise(class_labels, metadata, method="simpson").diversity_index
-     array([[0.33793103, 0.51578947],
-            [0.36      , 0.36      ]])
- 
-     Compute classwise Shannon diversity index of metadata and class labels
- 
-     >>> diversity_classwise(class_labels, metadata, method="shannon").diversity_index
+     >>> div_shan.classwise
      array([[0.43156028, 0.83224889],
             [0.57938016, 0.57938016]])
  
- 
      See Also
      --------
      numpy.histogram
      """
      diversity_fn = get_method(DIVERSITY_FN_MAP, method)
      data, names, is_categorical = preprocess_metadata(class_labels, metadata)
+     diversity_index = diversity_fn(data, names, is_categorical, None).astype(np.float64)
+ 
      class_idx = names.index("class_label")
      class_lbl = data[:, class_idx]
  
@@ -241,4 +212,5 @@
          subset_mask = class_lbl == cls
          diversity[idx, :] = diversity_fn(data, names, is_categorical, subset_mask)
      div_no_class = np.concatenate((diversity[:, :class_idx], diversity[:, (class_idx + 1) :]), axis=1)
-     return DiversityOutput(div_no_class)
+ 
+     return DiversityOutput(diversity_index, div_no_class)
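
The diversity_simpson change in arithmetic form: the bin-normalized inverse Simpson index s_0 lies on [1/num_bins, 1], and the new expression rescales it linearly onto [0, 1], so a single-category factor now scores 0 rather than 1/num_categories:

import numpy as np

p_i = np.array([1.0, 0.0, 0.0])  # all samples fall into one of 3 bins
num_bins = len(p_i)

s_0 = 1 / np.sum(p_i**2) / num_bins               # old index: 1/3
rescaled = (s_0 * num_bins - 1) / (num_bins - 1)  # new index: 0.0
print(s_0, rescaled)
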
dataeval/_internal/metrics/stats.py CHANGED
@@ -89,6 +89,16 @@ class StatsOutput(OutputMetadata):
      def dict(self):
          return {k: v for k, v in self.__dict__.items() if not k.startswith("_") and len(v) > 0}
  
+     def __len__(self) -> int:
+         if self.ch_idx_map:
+             return sum([len(idxs) for idxs in self.ch_idx_map.values()])
+         else:
+             for a in self.__annotations__:
+                 attr = getattr(self, a, None)
+                 if attr is not None and hasattr(a, "__len__") and len(attr) > 0:
+                     return len(attr)
+             return 0
+ 
  
  QUARTILES = (0, 25, 50, 75, 100)
  
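StatsOutput.__len__ is what lets combine_stats accumulate per-dataset lengths into dataset_steps. A minimal sketch, assuming the public imagestats/ImageStat re-exports and that an ImageStat.MEAN flag exists (both assumptions, not confirmed by this diff):

import numpy as np

from dataeval.flags import ImageStat
from dataeval.metrics.stats import imagestats  # assumed public re-export

images = np.zeros((5, 1, 16, 16), dtype=np.uint8)
stats = imagestats(images, ImageStat.MEAN)  # ImageStat.MEAN assumed to exist
assert len(stats) == 5  # one entry per image
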
dataeval/_internal/output.py CHANGED
@@ -11,7 +11,7 @@ from dataeval import __version__
  
  class OutputMetadata:
      _name: str
-     _execution_time: str
+     _execution_time: datetime
      _execution_duration: float
      _arguments: dict[str, str]
      _state: dict[str, str]
dataeval/metrics/bias/__init__.py CHANGED
@@ -1,14 +1,12 @@
- from dataeval._internal.metrics.balance import balance, balance_classwise
+ from dataeval._internal.metrics.balance import balance
  from dataeval._internal.metrics.coverage import coverage
- from dataeval._internal.metrics.diversity import diversity, diversity_classwise
+ from dataeval._internal.metrics.diversity import diversity
  from dataeval._internal.metrics.parity import label_parity, parity
  
  __all__ = [
      "balance",
-     "balance_classwise",
      "coverage",
      "diversity",
-     "diversity_classwise",
      "label_parity",
      "parity",
  ]
dataeval-0.66.0.dist-info/METADATA → dataeval-0.68.0.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: dataeval
- Version: 0.66.0
+ Version: 0.68.0
  Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
  Home-page: https://dataeval.ai/
  License: MIT
dataeval-0.66.0.dist-info/RECORD → dataeval-0.68.0.dist-info/RECORD RENAMED
@@ -1,4 +1,4 @@
- dataeval/__init__.py,sha256=dshMbJco8lxfbbIg0DO5fSDsvgu4DKPGE5PzA7pwvPQ,590
+ dataeval/__init__.py,sha256=fV-lc8AokA2hnkUSOdX-Bxy0xmEfPTXVFB3VcYAoiA8,590
  dataeval/_internal/detectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  dataeval/_internal/detectors/clusterer.py,sha256=hJwELUeAdZZ3OVLIfwalw2P7Zz13q2ZqrV6gx90s44E,20695
  dataeval/_internal/detectors/drift/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -8,7 +8,8 @@ dataeval/_internal/detectors/drift/ks.py,sha256=aoDx7ps-5vrSI8Q9ii6cwmKnAyaD8tjG
  dataeval/_internal/detectors/drift/mmd.py,sha256=xUMQDaLOcqc3Uq2xDvNR7hbt3WnmCR2etZlGCwYlu2c,7489
  dataeval/_internal/detectors/drift/torch.py,sha256=YhIN85MbUV3C4IJcRvqYdXSWLj5lUeEOb05T5DgB3xo,11552
  dataeval/_internal/detectors/drift/uncertainty.py,sha256=Ot8L42AnFbkij4J3Tis7VzXLv3hfBxoOWBP4UoCEnVs,5125
- dataeval/_internal/detectors/duplicates.py,sha256=BQMWHT4j3zMuzD-S9hUXuQjZDFsSrtG1GQiTjPEIJSI,3421
+ dataeval/_internal/detectors/duplicates.py,sha256=qkzbdWuJuUozFLqpnD6CYAGXQb7-aWw2mHr_cxXAfPo,4922
+ dataeval/_internal/detectors/merged_stats.py,sha256=WVPxz7n5fUkFKW3kobD_TkKkof51YjfIz4M_4CHh-1s,2517
  dataeval/_internal/detectors/ood/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  dataeval/_internal/detectors/ood/ae.py,sha256=k8pZP7oPwVyQlv6YcoacNMzpmQZy7W222yYrdXGTYZI,2031
  dataeval/_internal/detectors/ood/aegmm.py,sha256=pffThqXRoLx3GuZXEQBd-xEy5DjAZHV7WSeP2HgM_TI,2403
@@ -16,17 +17,17 @@ dataeval/_internal/detectors/ood/base.py,sha256=Pw34uFEWOJZiG4ciM0ArUkqhiM8WCGl2
  dataeval/_internal/detectors/ood/llr.py,sha256=tCo8G7V8VaVuIZ09rg0ZXZmdE0N_zGm7vCfFUnGbGvo,10102
  dataeval/_internal/detectors/ood/vae.py,sha256=WbQugS-bBUTTqQ9PRLHBmSUtk7O2_PN4PBLJE9ieMjw,2921
  dataeval/_internal/detectors/ood/vaegmm.py,sha256=pVUSlVF2jo8uokyks2QzfBJnNtcFWmcF8EQl-azs2Bg,2832
- dataeval/_internal/detectors/outliers.py,sha256=e5Hr-MpRfCj96AknqN3Lizz4QoQPcEeY0ZofMVguKOg,6304
+ dataeval/_internal/detectors/outliers.py,sha256=tzIraHkooPA4gSb8lG0O3koVK-9fOQg8EPo3xvgL1Y4,7533
  dataeval/_internal/flags.py,sha256=FHRgm8NKB9AjQgPcAESYeSbqIszgxbSGfF0Xd_tSkyk,2169
  dataeval/_internal/interop.py,sha256=x4qj4EiBt5NthSxe8prSLrPDAEcipAdyyLwbNyCBaFk,1059
  dataeval/_internal/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- dataeval/_internal/metrics/balance.py,sha256=g-YYFpq0qy2xq4iHjBKZDMjOn5R9Rit6sSb53anBeis,7744
+ dataeval/_internal/metrics/balance.py,sha256=eAHvgjiGCH893XSQLqh9j9wgvAECoNPVT8k0u_9Ijzg,6097
  dataeval/_internal/metrics/ber.py,sha256=Onsi47AbT9rMvng-Pbu8LIrYRfLpI13En1FxkFoMKQs,4668
- dataeval/_internal/metrics/coverage.py,sha256=9ZvcNjItE9rEyA2UHPE1K9zpTbbib4xqk8WpPpDN8ok,4037
+ dataeval/_internal/metrics/coverage.py,sha256=EZVES1rbZW2j_CtQv1VFfSO-UmWcrt5nmqxDErtrG14,3473
  dataeval/_internal/metrics/divergence.py,sha256=nmMUfr9FGnH798eb6xzEiMj4C42rQVthh5HeexiY6EE,4119
- dataeval/_internal/metrics/diversity.py,sha256=2xEkLnaRhPOvsd2DCTDT-dVvPPEZOH4PKm0vufrgBq4,8207
+ dataeval/_internal/metrics/diversity.py,sha256=nGjYQ-NLjb8mPt1PAYnvkWH4D58kjM39IPs2FULfis4,7503
  dataeval/_internal/metrics/parity.py,sha256=suv1Pf7gPj0_NxsS0_M6ewfUndsFJyEhbt5NPp6ktMI,15457
- dataeval/_internal/metrics/stats.py,sha256=Xbm7lLB0OZtsoxClMIrfULSqT8VymQiQmohJFtN7oz8,16332
+ dataeval/_internal/metrics/stats.py,sha256=-gLGn8Yy-Xx0kkaF-Z_3RitqPLZJhhbflksSjBRN3iY,16702
  dataeval/_internal/metrics/uap.py,sha256=w-wvXXnX16kUq-weaZD2SrJi22LJ8EjOFbOhPxeGejI,2043
  dataeval/_internal/metrics/utils.py,sha256=mSYa-3cHGcsQwPr7zbdpzrnK_8jIXCiAcu2HCcvrtaY,13007
  dataeval/_internal/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -41,7 +42,7 @@ dataeval/_internal/models/tensorflow/losses.py,sha256=pZH5RnlM9R0RrBde9Lgq32muwA
  dataeval/_internal/models/tensorflow/pixelcnn.py,sha256=lRpRNebMgkCJUnEk1xouVaTfS_YGMQgQhI01wNKAjeM,48420
  dataeval/_internal/models/tensorflow/trainer.py,sha256=xNY0Iw7Qa1TnCuy9N1b77_VduFoW_BhbZjfQCxOVby4,4082
  dataeval/_internal/models/tensorflow/utils.py,sha256=l6jXKMWyQAEI4LpAONq95Xwr7CPgrs408ypf9TuNxkY,8732
- dataeval/_internal/output.py,sha256=7JEmbrbsDs6jgzqXgKNN9h1dMdfcB2iOP2wBsGCwA1c,3044
+ dataeval/_internal/output.py,sha256=bFC2qJxXUc_daQwJHHa9KfFNLuxZANGb7Dpget_TXYs,3049
  dataeval/_internal/utils.py,sha256=gK0z4buuQoUYblkrCiRV9pIESzyikcY-3a08XsQkD7E,1585
  dataeval/_internal/workflows/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  dataeval/_internal/workflows/sufficiency.py,sha256=0k7Dbk3QmEGkZp2IW4OcZBcrxb4zAp9hC9nXGN1v1cY,18199
@@ -53,7 +54,7 @@ dataeval/detectors/linters/__init__.py,sha256=1yxsJw8CFpHsZwn_YUlWpb-4YBet5U6uB-
  dataeval/detectors/ood/__init__.py,sha256=ybWhwbMmWygIwE1A-nYihDfugrj3j0GiuABmVvD7264,583
  dataeval/flags/__init__.py,sha256=qo06_Tk0ul4lOhKSEs0HE2G6WBFvMwNJq77vRX1ynww,72
  dataeval/metrics/__init__.py,sha256=42szGyZrLekNU-T-rwJu-pUoDBdOoStuScB-mnGzjw4,81
- dataeval/metrics/bias/__init__.py,sha256=IV34GPYPOdpy3PtcCZYWaV9M9C8h_oYP56DliQcAYr0,427
+ dataeval/metrics/bias/__init__.py,sha256=xqpxCttgzz-hMZQI7_IlaNn4OGZaGVz3KKRd26GbSKE,335
  dataeval/metrics/estimators/__init__.py,sha256=fWQZUIxu88u5POYXN1yoFc-Hxx5B1fveEiiSXmK5kPk,210
  dataeval/metrics/stats/__init__.py,sha256=N5UvO7reDkYX1xFdAQjwALyJwcC2FAbruzd7ZYYW_4I,123
  dataeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -66,7 +67,7 @@ dataeval/torch/models/__init__.py,sha256=YnDnePYpRIKHyYn3F5qR1OObMSb-g0FGvI8X-uT
  dataeval/torch/trainer/__init__.py,sha256=Te-qElt8h-Zv8NN0r-VJOEdCPHTQ2yO3rd2MhRiZGZs,93
  dataeval/utils/__init__.py,sha256=ExQ1xj62MjcM9uIu1-g1P2fW0EPJpcIofnvxjQ908c4,172
  dataeval/workflows/__init__.py,sha256=gkU2B6yUiefexcYrBwqfZKNl8BvX8abUjfeNvVBXF4E,186
- dataeval-0.66.0.dist-info/LICENSE.txt,sha256=Kpzcfobf1HlqafF-EX6dQLw9TlJiaJzfgvLQFukyXYw,1060
- dataeval-0.66.0.dist-info/METADATA,sha256=P04dHyQOp4_6lg0IkoUEXTGJAPPpgRwf5ZAwdYpuatc,4217
- dataeval-0.66.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- dataeval-0.66.0.dist-info/RECORD,,
+ dataeval-0.68.0.dist-info/LICENSE.txt,sha256=Kpzcfobf1HlqafF-EX6dQLw9TlJiaJzfgvLQFukyXYw,1060
+ dataeval-0.68.0.dist-info/METADATA,sha256=XWLDiMY9JE2dxIDnRnJMQMLS8GPWFH2mbMDXkeP7Y5Q,4217
+ dataeval-0.68.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ dataeval-0.68.0.dist-info/RECORD,,