PyPI - dataeval - Versions diffs - 0.76.0__py3-none-any.whl → 0.76.1__py3-none-any.whl - Mend

dataeval 0.76.0-py3-none-any.whl → 0.76.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

dataeval/__init__.py +1 -1
dataeval/detectors/linters/outliers.py +30 -30
dataeval/metrics/bias/parity.py +2 -4
dataeval/metrics/stats/labelstats.py +21 -21
dataeval/utils/metadata.py +223 -76
{dataeval-0.76.0.dist-info → dataeval-0.76.1.dist-info}/METADATA +41 -15
{dataeval-0.76.0.dist-info → dataeval-0.76.1.dist-info}/RECORD +9 -9
{dataeval-0.76.0.dist-info → dataeval-0.76.1.dist-info}/LICENSE.txt +0 -0
{dataeval-0.76.0.dist-info → dataeval-0.76.1.dist-info}/WHEEL +0 -0

dataeval/__init__.py CHANGED Viewed

@@ -8,7 +8,7 @@ shifts that impact performance of deployed models.
 from __future__ import annotations
 __all__ = ["detectors", "log", "metrics", "utils", "workflows"]
-__version__ = "0.76.0"
+__version__ = "0.76.1"
 import logging

dataeval/detectors/linters/outliers.py CHANGED Viewed

@@ -2,7 +2,7 @@ from __future__ import annotations
 __all__ = []
-# import contextlib
+import contextlib
 from dataclasses import dataclass
 from typing import Generic, Iterable, Literal, Sequence, TypeVar, Union, overload
@@ -18,8 +18,8 @@ from dataeval.metrics.stats.pixelstats import PixelStatsOutput
 from dataeval.metrics.stats.visualstats import VisualStatsOutput
 from dataeval.output import Output, set_metadata
-# with contextlib.suppress(ImportError):
-#     import pandas as pd
+with contextlib.suppress(ImportError):
+    import pandas as pd
 IndexIssueMap = dict[int, dict[str, float]]
@@ -69,16 +69,16 @@ def _create_table(metrics, class_wise):
     return table
-# def _create_pandas_dataframe(class_wise):
-#     """Create data for pandas dataframe"""
-#     data = []
-#     for label, metrics_dict in class_wise.items():
-#         row = {"Class": label}
-#         total = sum(metrics_dict.values())
-#         row.update(metrics_dict)  # Add metric counts
-#         row["Total"] = total
-#         data.append(row)
-#     return data
+def _create_pandas_dataframe(class_wise):
+    """Create data for pandas dataframe"""
+    data = []
+    for label, metrics_dict in class_wise.items():
+        row = {"Class": label}
+        total = sum(metrics_dict.values())
+        row.update(metrics_dict)  # Add metric counts
+        row["Total"] = total
+        data.append(row)
+    return data
 @dataclass(frozen=True)
@@ -120,23 +120,23 @@ class OutliersOutput(Generic[TIndexIssueMap], Output):
             table = "\n\n".join(outertable)
         return table
-    # def to_dataframe(self, labelstats: LabelStatsOutput) -> pd.DataFrame:
-    #     import pandas as pd
-    #     if isinstance(self.issues, dict):
-    #         _, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
-    #         data = _create_pandas_dataframe(classwise)
-    #         df = pd.DataFrame(data)
-    #     else:
-    #         df_list = []
-    #         for i, d in enumerate(self.issues):
-    #             _, classwise = _reorganize_by_class_and_metric(d, labelstats)
-    #             data = _create_pandas_dataframe(classwise)
-    #             single_df = pd.DataFrame(data)
-    #             single_df["Dataset"] = i
-    #             df_list.append(single_df)
-    #         df = pd.concat(df_list)
-    #     return df
+    def to_dataframe(self, labelstats: LabelStatsOutput) -> pd.DataFrame:
+        import pandas as pd
+        if isinstance(self.issues, dict):
+            _, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
+            data = _create_pandas_dataframe(classwise)
+            df = pd.DataFrame(data)
+        else:
+            df_list = []
+            for i, d in enumerate(self.issues):
+                _, classwise = _reorganize_by_class_and_metric(d, labelstats)
+                data = _create_pandas_dataframe(classwise)
+                single_df = pd.DataFrame(data)
+                single_df["Dataset"] = i
+                df_list.append(single_df)
+            df = pd.concat(df_list)
+        return df
 def _get_outlier_mask(

dataeval/metrics/bias/parity.py CHANGED Viewed

@@ -253,13 +253,11 @@ def parity(metadata: Metadata) -> ParityOutput[NDArray[np.float64]]:
     >>> from dataeval.utils.metadata import preprocess
     >>> rng = np.random.default_rng(175)
     >>> labels = rng.choice([0, 1, 2], (100))
-    >>> metadata_dict = [
-    ...     {
+    >>> metadata_dict = {
     ...         "age": list(rng.choice([25, 30, 35, 45], (100))),
     ...         "income": list(rng.choice([50000, 65000, 80000], (100))),
     ...         "gender": list(rng.choice(["M", "F"], (100))),
-    ...     }
-    ... ]
+    ... }
     >>> continuous_factor_bincounts = {"age": 4, "income": 3}
     >>> metadata = preprocess(metadata_dict, labels, continuous_factor_bincounts)
     >>> parity(metadata)

dataeval/metrics/stats/labelstats.py CHANGED Viewed

@@ -2,7 +2,7 @@ from __future__ import annotations
 __all__ = []
-# import contextlib
+import contextlib
 from collections import Counter, defaultdict
 from dataclasses import dataclass
 from typing import Any, Iterable, Mapping, TypeVar
@@ -13,8 +13,8 @@ from numpy.typing import ArrayLike
 from dataeval.interop import as_numpy
 from dataeval.output import Output, set_metadata
-# with contextlib.suppress(ImportError):
-#     import pandas as pd
+with contextlib.suppress(ImportError):
+    import pandas as pd
 @dataclass(frozen=True)
@@ -73,24 +73,24 @@ class LabelStatsOutput(Output):
         return table_str
-    # def to_dataframe(self) -> pd.DataFrame:
-    #     import pandas as pd
-    #     class_list = []
-    #     total_count = []
-    #     image_count = []
-    #     for cls in self.label_counts_per_class:
-    #         class_list.append(cls)
-    #         total_count.append(self.label_counts_per_class[cls])
-    #         image_count.append(self.image_counts_per_label[cls])
-    #     return pd.DataFrame(
-    #         {
-    #             "Label": class_list,
-    #             "Total Count": total_count,
-    #             "Image Count": image_count,
-    #         }
-    #     )
+    def to_dataframe(self) -> pd.DataFrame:
+        import pandas as pd
+        class_list = []
+        total_count = []
+        image_count = []
+        for cls in self.label_counts_per_class:
+            class_list.append(cls)
+            total_count.append(self.label_counts_per_class[cls])
+            image_count.append(self.image_counts_per_label[cls])
+        return pd.DataFrame(
+            {
+                "Label": class_list,
+                "Total Count": total_count,
+                "Image Count": image_count,
+            }
+        )
 TKey = TypeVar("TKey", int, str)

dataeval/utils/metadata.py CHANGED Viewed

@@ -9,6 +9,7 @@ __all__ = ["Metadata", "preprocess", "merge", "flatten"]
 import warnings
 from dataclasses import dataclass
+from enum import Enum
 from typing import Any, Iterable, Literal, Mapping, TypeVar, overload
 import numpy as np
@@ -20,6 +21,13 @@ from dataeval.output import Output, set_metadata
 DISCRETE_MIN_WD = 0.054
 CONTINUOUS_MIN_SAMPLE_SIZE = 20
+DEFAULT_IMAGE_INDEX_KEY = "_image_index"
+class DropReason(Enum):
+    INCONSISTENT_KEY = "inconsistent_key"
+    INCONSISTENT_SIZE = "inconsistent_size"
+    NESTED_LIST = "nested_list"
 T = TypeVar("T")
@@ -41,8 +49,8 @@ def _convert_type(data: str) -> int | float | str: ...
 def _convert_type(data: list[str] | str) -> list[int] | list[float] | list[str] | int | float | str:
     """
-    Converts a value or a list of values to the simplest form possible, in preferred order of `int`,
-    `float`, or `string`.
+    Converts a value or a list of values to the simplest form possible,
+    in preferred order of `int`, `float`, or `string`.
     Parameters
     ----------
@@ -99,8 +107,16 @@ def _get_key_indices(keys: Iterable[tuple[str, ...]]) -> dict[tuple[str, ...], i
     return indices
+def _sorted_drop_reasons(d: dict[str, set[DropReason]]) -> dict[str, list[str]]:
+    return {k: sorted({vv.value for vv in v}) for k, v in sorted(d.items(), key=lambda item: item[1])}
 def _flatten_dict_inner(
-    d: Mapping[str, Any], parent_keys: tuple[str, ...], size: int | None = None, nested: bool = False
+    d: Mapping[str, Any],
+    dropped: dict[tuple[str, ...], set[DropReason]],
+    parent_keys: tuple[str, ...],
+    size: int | None = None,
+    nested: bool = False,
 ) -> tuple[dict[tuple[str, ...], Any], int | None]:
     """
     Recursive internal function for flattening a dictionary.
@@ -109,6 +125,8 @@ def _flatten_dict_inner(
     ----------
     d : dict[str, Any]
         Dictionary to flatten
+    dropped: set[tuple[str, ...]]
+        Reference to set of dropped keys from the dictionary
     parent_keys : tuple[str, ...]
         Parent keys to the current dictionary being flattened
     size : int or None, default None
@@ -119,33 +137,62 @@ def _flatten_dict_inner(
     Returns
     -------
     tuple[dict[tuple[str, ...], Any], int | None]
-        - [0]: Dictionary of flattened values with the keys reformatted as a hierarchical tuple of strings
+        - [0]: Dictionary of flattened values with the keys reformatted as a
+               hierarchical tuple of strings
         - [1]: Size, if any, of the current list of values
     """
     items: dict[tuple[str, ...], Any] = {}
     for k, v in d.items():
         new_keys: tuple[str, ...] = parent_keys + (k,)
         if isinstance(v, dict):
-            fd, size = _flatten_dict_inner(v, new_keys, size=size, nested=nested)
+            fd, size = _flatten_dict_inner(v, dropped, new_keys, size=size, nested=nested)
             items.update(fd)
         elif isinstance(v, (list, tuple)):
-            if not nested and (size is None or size == len(v)):
+            if nested:
+                dropped.setdefault(parent_keys + (k,), set()).add(DropReason.NESTED_LIST)
+            elif size is not None and size != len(v):
+                dropped.setdefault(parent_keys + (k,), set()).add(DropReason.INCONSISTENT_SIZE)
+            else:
                 size = len(v)
                 if all(isinstance(i, dict) for i in v):
                     for sub_dict in v:
-                        fd, size = _flatten_dict_inner(sub_dict, new_keys, size=size, nested=True)
+                        fd, size = _flatten_dict_inner(sub_dict, dropped, new_keys, size=size, nested=True)
                         for fk, fv in fd.items():
                             items.setdefault(fk, []).append(fv)
                 else:
                     items[new_keys] = v
-            else:
-                warnings.warn(f"Dropping nested list found in '{parent_keys + (k, )}'.")
         else:
             items[new_keys] = v
     return items, size
-def flatten(d: Mapping[str, Any], sep: str, ignore_lists: bool, fully_qualified: bool) -> tuple[dict[str, Any], int]:
+@overload
+def flatten(
+    d: Mapping[str, Any],
+    return_dropped: Literal[True],
+    sep: str = "_",
+    ignore_lists: bool = False,
+    fully_qualified: bool = False,
+) -> tuple[dict[str, Any], int, dict[str, list[str]]]: ...
+@overload
+def flatten(
+    d: Mapping[str, Any],
+    return_dropped: Literal[False] = False,
+    sep: str = "_",
+    ignore_lists: bool = False,
+    fully_qualified: bool = False,
+) -> tuple[dict[str, Any], int]: ...
+def flatten(
+    d: Mapping[str, Any],
+    return_dropped: bool = False,
+    sep: str = "_",
+    ignore_lists: bool = False,
+    fully_qualified: bool = False,
+):
     """
     Flattens a dictionary and converts values to numeric values when possible.
@@ -153,33 +200,53 @@ def flatten(d: Mapping[str, Any], sep: str, ignore_lists: bool, fully_qualified:
     ----------
     d : dict[str, Any]
         Dictionary to flatten
-    sep : str
+    return_dropped: bool, default False
+        Option to return a dictionary of dropped keys and the reason(s) for dropping
+    sep : str, default "_"
         String separator to use when concatenating key names
-    ignore_lists : bool
+    ignore_lists : bool, default False
         Option to skip expanding lists within metadata
-    fully_qualified : bool
-        Option to return dictionary keys full qualified instead of reduced
+    fully_qualified : bool, default False
+        Option to return dictionary keys fully qualified instead of reduced
     Returns
     -------
-    tuple[dict[str, Any], int]
-        A tuple of the flattened dictionary and the length of detected lists in metadata
+    dict[str, Any]
+        Dictionary of flattened values with the keys reformatted as a hierarchical tuple of strings
+    int
+        Size of the values in the flattened dictionary
+    dict[str, list[str]], Optional
+        Dictionary containing dropped keys and reason(s) for dropping
     """
-    expanded, size = _flatten_dict_inner(d, parent_keys=(), nested=ignore_lists)
+    dropped_inner: dict[tuple[str, ...], set[DropReason]] = {}
+    expanded, size = _flatten_dict_inner(d, dropped=dropped_inner, parent_keys=(), nested=ignore_lists)
     output = {}
-    if fully_qualified:
-        expanded = {sep.join(k): v for k, v in expanded.items()}
-    else:
-        keys = _get_key_indices(expanded)
-        expanded = {sep.join(k[keys[k] :]): v for k, v in expanded.items()}
     for k, v in expanded.items():
         cv = _convert_type(v)
-        if isinstance(cv, list) and len(cv) == size:
-            output[k] = cv
+        if isinstance(cv, list):
+            if len(cv) == size:
+                output[k] = cv
+            else:
+                dropped_inner.setdefault(k, set()).add(DropReason.INCONSISTENT_KEY)
         elif not isinstance(cv, list):
             output[k] = cv if not size else [cv] * size
-    return output, size if size is not None else 1
+    if fully_qualified:
+        output = {sep.join(k): v for k, v in output.items()}
+    else:
+        keys = _get_key_indices(output)
+        output = {sep.join(k[keys[k] :]): v for k, v in output.items()}
+    size = size if size is not None else 1
+    dropped = {sep.join(k): v for k, v in dropped_inner.items()}
+    if return_dropped:
+        return output, size, _sorted_drop_reasons(dropped)
+    else:
+        if dropped:
+            warnings.warn(f"Metadata keys {list(dropped)} were dropped.")
+        return output, size
 def _is_metadata_dict_of_dicts(metadata: Mapping) -> bool:
@@ -197,48 +264,75 @@ def _is_metadata_dict_of_dicts(metadata: Mapping) -> bool:
     return set(metadata[keys[0]]) == set(metadata[keys[1]])
+@overload
+def merge(
+    metadata: Iterable[Mapping[str, Any]],
+    return_dropped: Literal[True],
+    ignore_lists: bool = False,
+    fully_qualified: bool = False,
+    return_numpy: bool = False,
+) -> tuple[dict[str, list[Any]] | dict[str, NDArray[Any]], dict[str, list[str]]]: ...
+@overload
+def merge(
+    metadata: Iterable[Mapping[str, Any]],
+    return_dropped: Literal[False] = False,
+    ignore_lists: bool = False,
+    fully_qualified: bool = False,
+    return_numpy: bool = False,
+) -> dict[str, list[Any]] | dict[str, NDArray[Any]]: ...
 def merge(
     metadata: Iterable[Mapping[str, Any]],
+    return_dropped: bool = False,
     ignore_lists: bool = False,
     fully_qualified: bool = False,
-    as_numpy: bool = False,
-) -> tuple[dict[str, list[Any]] | dict[str, NDArray[Any]], NDArray[np.int_]]:
+    return_numpy: bool = False,
+):
     """
-    Merges a collection of metadata dictionaries into a single flattened dictionary of keys and values.
+    Merges a collection of metadata dictionaries into a single flattened
+    dictionary of keys and values.
-    Nested dictionaries are flattened, and lists are expanded. Nested lists are dropped as the
-    expanding into multiple hierarchical trees is not supported.
+    Nested dictionaries are flattened, and lists are expanded. Nested lists are
+    dropped as the expanding into multiple hierarchical trees is not supported.
+    The function adds an internal "_image_index" key to the metadata dictionary
+    for consumption by the preprocess function.
     Parameters
     ----------
     metadata : Iterable[Mapping[str, Any]]
         Iterable collection of metadata dictionaries to flatten and merge
+    return_dropped: bool, default False
+        Option to return a dictionary of dropped keys and the reason(s) for dropping
     ignore_lists : bool, default False
         Option to skip expanding lists within metadata
     fully_qualified : bool, default False
         Option to return dictionary keys full qualified instead of minimized
-    as_numpy : bool, default False
+    return_numpy : bool, default False
         Option to return results as lists or NumPy arrays
     Returns
     -------
-    dict[str, list[Any]] or dict[str, NDArray[Any]]
+    dict[str, list[Any]] | dict[str, NDArray[Any]]
         A single dictionary containing the flattened data as lists or NumPy arrays
-    NDArray[np.int_]
-        Array defining where individual images start, helpful when working with object detection metadata
+    dict[str, list[str]], Optional
+        Dictionary containing dropped keys and reason(s) for dropping
     Note
     ----
-    Nested lists of values and inconsistent keys are dropped in the merged metadata dictionary
+    Nested lists of values and inconsistent keys are dropped in the merged
+    metadata dictionary
     Example
     -------
     >>> list_metadata = [{"common": 1, "target": [{"a": 1, "b": 3, "c": 5}, {"a": 2, "b": 4}], "source": "example"}]
-    >>> reorganized_metadata, image_indicies = merge(list_metadata)
+    >>> reorganized_metadata, dropped_keys = merge(list_metadata, return_dropped=True)
     >>> reorganized_metadata
-    {'common': [1, 1], 'a': [1, 2], 'b': [3, 4], 'source': ['example', 'example']}
-    >>> image_indicies
-    array([0])
+    {'common': [1, 1], 'a': [1, 2], 'b': [3, 4], 'source': ['example', 'example'], '_image_index': [0, 0]}
+    >>> dropped_keys
+    {'target_c': ['inconsistent_key']}
     """
     merged: dict[str, list[Any]] = {}
     isect: set[str] = set()
@@ -255,37 +349,51 @@ def merge(
     else:
         dicts = list(metadata)
-    image_repeats = np.zeros(len(dicts))
+    image_repeats = np.zeros(len(dicts), dtype=np.int_)
+    dropped: dict[str, set[DropReason]] = {}
     for i, d in enumerate(dicts):
-        flattened, image_repeats[i] = flatten(d, sep="_", ignore_lists=ignore_lists, fully_qualified=fully_qualified)
+        flattened, image_repeats[i], dropped_inner = flatten(
+            d,
+            return_dropped=True,
+            ignore_lists=ignore_lists,
+            fully_qualified=fully_qualified,
+        )
         isect = isect.intersection(flattened.keys()) if isect else set(flattened.keys())
-        union = union.union(flattened.keys())
+        union.update(flattened.keys())
+        for k, v in dropped_inner.items():
+            dropped.setdefault(k, set()).update({DropReason(vv) for vv in v})
         for k, v in flattened.items():
             merged.setdefault(k, []).extend(flattened[k]) if isinstance(v, list) else merged.setdefault(k, []).append(v)
-    if len(union) > len(isect):
-        warnings.warn(f"Inconsistent metadata keys found. Dropping {union - isect} from metadata.")
-    output: dict[str, Any] = {}
+    for k in union - isect:
+        dropped.setdefault(k, set()).add(DropReason.INCONSISTENT_KEY)
     if image_repeats.sum() == image_repeats.size:
-        image_indicies = np.arange(image_repeats.size)
+        image_indices = np.arange(image_repeats.size)
     else:
         image_ids = np.arange(image_repeats.size)
         image_data = np.concatenate(
             [np.repeat(image_ids[i], image_repeats[i]) for i in range(image_ids.size)], dtype=np.int_
         )
-        _, image_unsorted = np.unique(image_data, return_index=True)
-        image_indicies = np.sort(image_unsorted)
+        _, image_unsorted = np.unique(image_data, return_inverse=True)
+        image_indices = np.sort(image_unsorted)
+    output: dict[str, Any] = {}
     if keys:
-        output["keys"] = np.array(keys) if as_numpy else keys
+        output["keys"] = np.array(keys) if return_numpy else keys
     for k in (key for key in merged if key in isect):
         cv = _convert_type(merged[k])
-        output[k] = np.array(cv) if as_numpy else cv
+        output[k] = np.array(cv) if return_numpy else cv
+    output[DEFAULT_IMAGE_INDEX_KEY] = np.array(image_indices) if return_numpy else list(image_indices)
-    return output, image_indicies
+    if return_dropped:
+        return output, _sorted_drop_reasons(dropped)
+    else:
+        if dropped:
+            warnings.warn(f"Metadata keys {list(dropped)} were dropped.")
+        return output
 @dataclass(frozen=True)
@@ -296,13 +404,16 @@ class Metadata(Output):
     Attributes
     ----------
     discrete_factor_names : list[str]
-        List containing factor names for the original data that was discrete and the binned continuous data
+        List containing factor names for the original data that was discrete and
+        the binned continuous data
     discrete_data : NDArray[np.int]
-        Array containing values for the original data that was discrete and the binned continuous data
+        Array containing values for the original data that was discrete and the
+        binned continuous data
     continuous_factor_names : list[str]
         List containing factor names for the original continuous data
     continuous_data : NDArray[np.int or np.double] | None
-        Array containing values for the original continuous data or None if there was no continuous data
+        Array containing values for the original continuous data or None if there
+        was no continuous data
     class_labels : NDArray[np.int]
         Numerical class labels for the images/objects
     class_names : NDArray[Any]
@@ -322,11 +433,12 @@ class Metadata(Output):
 @set_metadata
 def preprocess(
-    raw_metadata: Iterable[Mapping[str, Any]],
+    metadata: dict[str, list[Any]] | dict[str, NDArray[Any]],
     class_labels: ArrayLike | str,
     continuous_factor_bins: Mapping[str, int | Iterable[float]] | None = None,
     auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = "uniform_width",
     exclude: Iterable[str] | None = None,
+    image_index_key: str = "_image_index",
 ) -> Metadata:
     """
     Restructures the metadata to be in the correct format for the bias functions.
@@ -338,28 +450,54 @@ def preprocess(
     Parameters
     ----------
-    raw_metadata : Iterable[Mapping[str, Any]]
-        Iterable collection of metadata dictionaries to flatten and merge.
+    metadata : dict[str, list[Any] | NDArray[Any]]
+        A flat dictionary which contains all of the metadata on a per image (classification)
+        or per object (object detection) basis. Length of lists/array should match the length
+        of the label list/array.
     class_labels : ArrayLike or string
-        If arraylike, expects the labels for each image (image classification) or each object (object detection).
-        If the labels are included in the metadata dictionary, pass in the key value.
+        If arraylike, expects the labels for each image (image classification)
+        or each object (object detection). If the labels are included in the
+        metadata dictionary, pass in the key value.
     continuous_factor_bins : Mapping[str, int or Iterable[float]] or None, default None
-        User provided dictionary specifying how to bin the continuous metadata factors where the value is either
-        an int to represent the number of bins, or a list of floats representing the edges for each bin.
+        User provided dictionary specifying how to bin the continuous metadata
+        factors where the value is either an int to represent the number of bins,
+        or a list of floats representing the edges for each bin.
     auto_bin_method : "uniform_width" or "uniform_count" or "clusters", default "uniform_width"
-        Method by which the function will automatically bin continuous metadata factors. It is recommended
-        that the user provide the bins through the `continuous_factor_bins`.
+        Method by which the function will automatically bin continuous metadata factors.
+        It is recommended that the user provide the bins through the `continuous_factor_bins`.
     exclude : Iterable[str] or None, default None
         User provided collection of metadata keys to exclude when processing metadata.
+    image_index_key : str, default "_image_index"
+        User provided metadata key which maps the metadata entry to the source image.
     Returns
     -------
     Metadata
         Output class containing the binned metadata
+    See Also
+    --------
+    merge
     """
-    # Transform metadata into single, flattened dictionary
-    metadata, image_repeats = merge(raw_metadata)
+    # Check that metadata is a single, flattened dictionary with uniform array lengths
+    check_length = -1
+    for k, v in metadata.items():
+        if not isinstance(v, (list, tuple, np.ndarray)):
+            raise TypeError(
+                "Metadata dictionary needs to be a single dictionary whose values "
+                "are arraylike containing the metadata on a per image or per object basis."
+            )
+        else:
+            if check_length == -1:
+                check_length = len(v)
+            else:
+                if check_length != len(v):
+                    raise ValueError(
+                        "The lists/arrays in the metadata dict have varying lengths. "
+                        "Preprocess needs them to be uniform in length."
+                    )
+    # Grab continuous factors if supplied
     continuous_factor_bins = dict(continuous_factor_bins) if continuous_factor_bins else None
     # Drop any excluded metadata keys
@@ -368,19 +506,28 @@ def preprocess(
         if continuous_factor_bins:
             continuous_factor_bins.pop(k, None)
-    # Get the class label array in numeric form
+    # Get the class label array in numeric form and check its dimensions
     class_array = as_numpy(metadata.pop(class_labels)) if isinstance(class_labels, str) else as_numpy(class_labels)
     if class_array.ndim > 1:
         raise ValueError(
             f"Got class labels with {class_array.ndim}-dimensional "
             f"shape {class_array.shape}, but expected a 1-dimensional array."
         )
+    # Check if the label array is the same length as the metadata arrays
+    elif len(class_array) != check_length:
+        raise ValueError(
+            f"The length of the label array {len(class_array)} is not the same as "
+            f"the length of the metadata arrays {check_length}."
+        )
     if not np.issubdtype(class_array.dtype, np.int_):
         unique_classes, numerical_labels = np.unique(class_array, return_inverse=True)
     else:
         numerical_labels = class_array
         unique_classes = np.unique(class_array)
+    # Determine if _image_index is given
+    image_indices = as_numpy(metadata[image_index_key]) if image_index_key in metadata else np.arange(check_length)
     # Bin according to user supplied bins
     continuous_metadata = {}
     discrete_metadata = {}
@@ -401,7 +548,7 @@ def preprocess(
     for key in remaining_keys:
         data = to_numpy(metadata[key])
         if np.issubdtype(data.dtype, np.number):
-            result = _is_continuous(data, image_repeats)
+            result = _is_continuous(data, image_indices)
             if result:
                 continuous_metadata[key] = data
             unique_samples, ordinal_data = np.unique(data, return_inverse=True)
@@ -419,7 +566,7 @@ def preprocess(
         else:
             _, discrete_metadata[key] = np.unique(data, return_inverse=True)
-    # splitting out the dictionaries into the keys and values
+    # Split out the dictionaries into the keys and values
     discrete_factor_names = list(discrete_metadata.keys())
     discrete_data = np.stack(list(discrete_metadata.values()), axis=-1)
     continuous_factor_names = list(continuous_metadata.keys())
@@ -499,7 +646,7 @@ def _bin_data(data: NDArray[Any], bin_method: str) -> NDArray[np.int_]:
     return np.digitize(data, bin_edges)  # type: ignore
-def _is_continuous(data: NDArray[np.number], image_indicies: NDArray[np.number]) -> bool:
+def _is_continuous(data: NDArray[np.number], image_indices: NDArray[np.number]) -> bool:
     """
     Determines whether the data is continuous or discrete using the Wasserstein distance.
@@ -518,11 +665,11 @@ def _is_continuous(data: NDArray[np.number], image_indicies: NDArray[np.number])
     measured from a uniform distribution is greater or less than 0.054, respectively.
     """
     # Check if the metadata is image specific
-    _, data_indicies_unsorted = np.unique(data, return_index=True)
-    if data_indicies_unsorted.size == image_indicies.size:
-        data_indicies = np.sort(data_indicies_unsorted)
-        if (data_indicies == image_indicies).all():
-            data = data[data_indicies]
+    _, data_indices_unsorted = np.unique(data, return_index=True)
+    if data_indices_unsorted.size == image_indices.size:
+        data_indices = np.sort(data_indices_unsorted)
+        if (data_indices == image_indices).all():
+            data = data[data_indices]
     # OLD METHOD
     # uvals = np.unique(data)
@@ -570,7 +717,7 @@ def get_counts(data: NDArray[np.int_], min_num_bins: int | None = None) -> NDArr
     Returns
     -------
-    NDArray[np.int_]
+    NDArray[np.int]
         Bin counts per column of data.
     """
     max_value = data.max() + 1 if min_num_bins is None else min_num_bins

{dataeval-0.76.0.dist-info → dataeval-0.76.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dataeval
-Version: 0.76.0
+Version: 0.76.1
 Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
 Home-page: https://dataeval.ai/
 License: MIT
@@ -21,8 +21,9 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Classifier: Topic :: Scientific/Engineering
 Provides-Extra: all
-Requires-Dist: matplotlib ; extra == "all"
+Requires-Dist: matplotlib (>=3.7.1) ; extra == "all"
 Requires-Dist: numpy (>=1.24.2)
+Requires-Dist: pandas (>=2.0) ; extra == "all"
 Requires-Dist: pillow (>=10.3.0)
 Requires-Dist: requests
 Requires-Dist: scikit-learn (>=1.5.0)
@@ -38,13 +39,17 @@ Description-Content-Type: text/markdown
 # DataEval
-To view our extensive collection of tutorials, how-to's, explanation guides, and reference material, please visit our documentation on **[Read the Docs](https://dataeval.readthedocs.io/)**
+To view our extensive collection of tutorials, how-to's, explanation guides,
+and reference material, please visit our documentation on
+**[Read the Docs](https://dataeval.readthedocs.io/)**
 ## About DataEval
 <!-- start tagline -->
-DataEval curates datasets to train and test performant, robust, unbiased and reliable AI models and monitors for data shifts that impact performance of deployed models.
+DataEval curates datasets to train and test performant, robust, unbiased and
+reliable AI models and monitors for data shifts that impact performance of
+deployed models.
 <!-- end tagline -->
@@ -52,22 +57,33 @@ DataEval curates datasets to train and test performant, robust, unbiased and rel
 <!-- start needs -->
-DataEval is an effective, powerful, and reliable set of tools for any T&E engineer. Throughout all stages of the machine learning lifecycle, DataEval supports model development, data analysis, and monitoring with state-of-the-art algorithms to help you solve difficult problems. With a focus on computer vision tasks, DataEval provides simple, but effective metrics for performance estimation, bias detection, and dataset linting.
+DataEval is an effective, powerful, and reliable set of tools for any T&E
+engineer. Throughout all stages of the machine learning lifecycle, DataEval
+supports model development, data analysis, and monitoring with state-of-the-art
+algorithms to help you solve difficult problems. With a focus on computer
+vision tasks, DataEval provides simple, but effective metrics for performance
+estimation, bias detection, and dataset linting.
 <!-- end needs -->
 <!-- start JATIC interop -->
-DataEval is easy to install, supports a wide range of Python versions, and is compatible with many of the most popular packages in the scientific and T&E communities.
-DataEval also has native interopability between JATIC's suite of tools when using MAITE-compliant datasets and models.
+DataEval is easy to install, supports a wide range of Python versions, and is
+compatible with many of the most popular packages in the scientific and T&E
+communities.
+DataEval also has native interoperability with JATIC's suite of tools when
+using MAITE-compliant datasets and models.
 <!-- end JATIC interop -->
 ## Getting Started
 **Python versions:** 3.9 - 3.12
-**Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*, *Gradient*
+**Supported packages**: *NumPy*, *Pandas*, *scikit-learn*, *MAITE*, *NRTK*,
+*Gradient*
-Choose your preferred method of installation below or follow our [installation guide](https://dataeval.readthedocs.io/en/v0.74.2/installation.html).
+Choose your preferred method of installation below or follow our
+[installation guide](https://dataeval.readthedocs.io/en/v0.74.2/installation.html).
 * [Installing with pip](#installing-with-pip)
 * [Installing with conda/mamba](#installing-with-conda)
@@ -75,7 +91,8 @@ Choose your preferred method of installation below or follow our [installation g
 ### **Installing with pip**
-You can install DataEval directly from pypi.org using the following command.  The optional dependencies of DataEval are `all`.
+You can install DataEval directly from pypi.org using the following command.
+The optional dependencies of DataEval are `all`.
 ```bash
 pip install dataeval[all]
@@ -83,8 +100,9 @@ pip install dataeval[all]
 ### **Installing with conda**
-DataEval can be installed in a Conda/Mamba environment using the provided `environment.yaml` file.  As some dependencies
-are installed from the `pytorch` channel, the channel is specified in the below example.
+DataEval can be installed in a Conda/Mamba environment using the provided
+`environment.yaml` file.  As some dependencies are installed from the `pytorch`
+channel, the channel is specified in the below example.
 ```bash
 micromamba create -f environment/environment.yaml -c pytorch
@@ -92,7 +110,9 @@ micromamba create -f environment/environment.yaml -c pytorch
 ### **Installing from GitHub**
-To install DataEval from source locally on Ubuntu, you will need `git-lfs` to download larger, binary source files and `poetry` for project dependency management.
+To install DataEval from source locally on Ubuntu, you will need `git-lfs` to
+download larger, binary source files and `poetry` for project dependency
+management.
 ```bash
 sudo apt-get install git-lfs
@@ -112,7 +132,9 @@ Install DataEval with optional dependencies for development.
 poetry install --all-extras --with dev
 ```
-Now that DataEval is installed, you can run commands in the poetry virtual environment by prefixing shell commands with `poetry run`, or activate the virtual environment directly in the shell.
+Now that DataEval is installed, you can run commands in the poetry virtual
+environment by prefixing shell commands with `poetry run`, or activate the
+virtual environment directly in the shell.
 ```bash
 poetry shell
@@ -131,7 +153,11 @@ If you have any questions, feel free to reach out to the people below:
 ### CDAO Funding Acknowledgement
-This material is based upon work supported by the Chief Digital and Artificial Intelligence Office under Contract No. W519TC-23-9-2033. The views and conclusions contained herein are those of the author(s) and should not be interpreted as necessarily representing the official policies or endorsements, either expressed or implied, of the U.S. Government.
+This material is based upon work supported by the Chief Digital and Artificial
+Intelligence Office under Contract No. W519TC-23-9-2033. The views and
+conclusions contained herein are those of the author(s) and should not be
+interpreted as necessarily representing the official policies or endorsements,
+either expressed or implied, of the U.S. Government.
 <!-- end acknowledgement -->

{dataeval-0.76.0.dist-info → dataeval-0.76.1.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-dataeval/__init__.py,sha256=TSINwIPlGIGiYd66kY8gnBnEpBhcgWm7_029htFBgv8,1474
+dataeval/__init__.py,sha256=vqyenyxYGE0OXW3C8PC1YDZRak1uLFIYd45-vh9qafQ,1474
 dataeval/detectors/__init__.py,sha256=iifG-Z08mH5B4QhkKtAieDGJBKldKvmCXpDQJD9qVY8,206
 dataeval/detectors/drift/__init__.py,sha256=wO294Oz--l0GuZTAkBpyGwZphbQsot57HoiEX6kjNOc,652
 dataeval/detectors/drift/base.py,sha256=8zHUnUpmgpWMzDv5C-tUX61lbpDjhJ-eAIiNxaNvWP8,14469
@@ -12,7 +12,7 @@ dataeval/detectors/linters/__init__.py,sha256=CZV5naeYQYL3sHXO_CXB26AXkyTeKHI-TM
 dataeval/detectors/linters/clusterer.py,sha256=V-bNs4ut2E6SIqU4MR1Y96WBZcs4cavQhvXBB0vFZPw,20937
 dataeval/detectors/linters/duplicates.py,sha256=Ba-Nmbjqg_HDMlEBqlWW1aFO_BA-HSc-uWHc3cmI394,5620
 dataeval/detectors/linters/merged_stats.py,sha256=X-bDTwjyR8RuVmzxLaHZmQ5nI3oOWvsqVlitdSncapk,1355
-dataeval/detectors/linters/outliers.py,sha256=aGGGOJKs0FTObQtj1m-ench0MHADOhrhC8idf1wRB0s,13786
+dataeval/detectors/linters/outliers.py,sha256=o0LtAHdazLfj5GM2HcVDjVY_AfSU5GpBUjxHPC9VfIc,13728
 dataeval/detectors/ood/__init__.py,sha256=Ws6_un4pFWNknki7Bp7qjrslZVB9pYNE-K72u2lF65k,291
 dataeval/detectors/ood/ae.py,sha256=SL8oKTERhMwaZTQWwDhQQ6H07UKj8ozXqEWO3TaOAos,2151
 dataeval/detectors/ood/base.py,sha256=-ApcC9lyZJAgk-joMpLXF20sJqtvlAugg-W18TcAsEw,3010
@@ -28,7 +28,7 @@ dataeval/metrics/bias/__init__.py,sha256=SIg4Qxza9BqXyKNQLIY0bpqoFvZfK5-GaejpTH6
 dataeval/metrics/bias/balance.py,sha256=B1sPackyodiBct9Hs88BR4nJde_R61JyjwSBIG_CFug,9171
 dataeval/metrics/bias/coverage.py,sha256=igVDWJSrO2MvaTEiDUhVzVWPGNB1QOZvngCi8UF0RwA,5746
 dataeval/metrics/bias/diversity.py,sha256=nF1y2FaQIU0yHQtckoddjqoty2hsVVMqwaXWHRdGfqA,8521
-dataeval/metrics/bias/parity.py,sha256=rzi7Z0Z6injCaj2vkbSsZvbKMfk1EN648oKinv5y5Dk,12760
+dataeval/metrics/bias/parity.py,sha256=2gSpXkg6ASnkywRTqqx3b3k1T5Qg1Jm-ihMKNZgEwys,12732
 dataeval/metrics/estimators/__init__.py,sha256=oY_9jX7V-Kg7-4KpvMNB4rUhsk8QTA0DIoM8d2VtVIg,380
 dataeval/metrics/estimators/ber.py,sha256=vcndXr0PNLRlYz7u7K74f-B5g3DnUkaTO_WigGdj0cg,5012
 dataeval/metrics/estimators/divergence.py,sha256=joqqlH0AQFibJkHCCb7i7dMJIGF28fmZIR-tGupQQJQ,4247
@@ -39,7 +39,7 @@ dataeval/metrics/stats/boxratiostats.py,sha256=PS1wvWwhTCMJX56erfPW-BZymXrevvXnK
 dataeval/metrics/stats/datasetstats.py,sha256=mt5t5WhlVI7mo56dmhqgnk1eH8oBV7dahgmqkFDcKo0,7387
 dataeval/metrics/stats/dimensionstats.py,sha256=AlPor23dUH718jFNiVNedHQVaQzN-6OKQEVDQbnGE50,4027
 dataeval/metrics/stats/hashstats.py,sha256=5nNSJ3Tl8gPqpYlWpxl7EHfW6pJd1BtbXYUiuGgH4Eo,5070
-dataeval/metrics/stats/labelstats.py,sha256=v9EAg-9h0OtuoU0r3K5TJbHj87fjmnWnNdtg0EPp8co,7030
+dataeval/metrics/stats/labelstats.py,sha256=MW6kB7V8pdIc7yHdXzRwlD6xSl6SYZonNsLUPKAVILI,6992
 dataeval/metrics/stats/pixelstats.py,sha256=tfvu0tYPgDS0jCCSY2sZ2Ice5r1nNuKx-LYXxZQCw7s,4220
 dataeval/metrics/stats/visualstats.py,sha256=pEQnAPFg-zQ1U5orwF0-U7kfHuZGjMJDsdEMAoDZd4I,4634
 dataeval/output.py,sha256=Dyfv1xlrwSbCe7HdDyq8t-kiIRJbBeaMEmMROr1FrVQ,4034
@@ -50,7 +50,7 @@ dataeval/utils/dataset/datasets.py,sha256=7tSqN3d8UncqmXh4eiEwarXgVxc4sMuIKPTqBC
 dataeval/utils/dataset/read.py,sha256=Q_RaNTFXhkMsx3PrgJEIySdHAA-QxGuih6eq6mnJv-4,1524
 dataeval/utils/dataset/split.py,sha256=1vNy5I1zZx-LIf8B0y57dUaO_UdVd1hyJggUANkwNtM,18958
 dataeval/utils/image.py,sha256=AQljELyMFkYsf2AoNOH5dZG8DYE4hPw0MCk85eIXqAw,1926
-dataeval/utils/metadata.py,sha256=SjYPXvM7x_3OyQbdfn4WsViqMplEjRxTdz8tjSJEP3E,22497
+dataeval/utils/metadata.py,sha256=tRcXgJsM1l7vt_naNJj8g8_EHD_AB5MGi1uWxqZsA6M,27431
 dataeval/utils/plot.py,sha256=YyFL1KoJgnl2Bip7m73WVBJa6zbsBnn5c1b3skFfUrA,7068
 dataeval/utils/shared.py,sha256=xvF3VLfyheVwJtdtDrneOobkKf7t-JTmf_w91FWXmqo,3616
 dataeval/utils/torch/__init__.py,sha256=dn5mjCrFp0b1aL_UEURhONU0Ag0cmXoTOBSGagpkTiA,325
@@ -61,7 +61,7 @@ dataeval/utils/torch/models.py,sha256=Df3B_9x5uu-Y5ZOyhRZYpKJnDvxt0hgMeJLy1E4oxp
 dataeval/utils/torch/trainer.py,sha256=Qay0LK63RuyoGYiJ5zI2C5BVym309ORvp6shhpcrIU4,5589
 dataeval/workflows/__init__.py,sha256=L9yfBipNFGnYuN2JbMknIHDvziwfa2XAGFnOwifZbls,216
 dataeval/workflows/sufficiency.py,sha256=jf53J1PAlfRHSjGpMCWRJzImitLtCQvTMCaMm28ZuPM,18675
-dataeval-0.76.0.dist-info/LICENSE.txt,sha256=uAooygKWvX6NbU9Ran9oG2msttoG8aeTeHSTe5JeCnY,1061
-dataeval-0.76.0.dist-info/METADATA,sha256=zk12Bkp0R6Glx-VSrG7ip45aTU4y6i_P_mPw2c_SQ6w,5140
-dataeval-0.76.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-dataeval-0.76.0.dist-info/RECORD,,
+dataeval-0.76.1.dist-info/LICENSE.txt,sha256=uAooygKWvX6NbU9Ran9oG2msttoG8aeTeHSTe5JeCnY,1061
+dataeval-0.76.1.dist-info/METADATA,sha256=w02IzEy_S5kgRZFRGbWayMg98uFdn3jJT4Gl6MOQzek,5196
+dataeval-0.76.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+dataeval-0.76.1.dist-info/RECORD,,

{dataeval-0.76.0.dist-info → dataeval-0.76.1.dist-info}/LICENSE.txt RENAMED Viewed

File without changes

{dataeval-0.76.0.dist-info → dataeval-0.76.1.dist-info}/WHEEL RENAMED Viewed

File without changes

dataeval 0.76.0__py3-none-any.whl → 0.76.1__py3-none-any.whl

dataeval 0.76.0py3-none-any.whl → 0.76.1py3-none-any.whl