dataeval 0.75.0__py3-none-any.whl → 0.76.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. dataeval/__init__.py +3 -3
  2. dataeval/detectors/drift/base.py +2 -2
  3. dataeval/detectors/drift/ks.py +2 -1
  4. dataeval/detectors/drift/mmd.py +3 -2
  5. dataeval/detectors/drift/uncertainty.py +2 -2
  6. dataeval/detectors/drift/updates.py +1 -1
  7. dataeval/detectors/linters/clusterer.py +3 -2
  8. dataeval/detectors/linters/duplicates.py +4 -4
  9. dataeval/detectors/linters/outliers.py +96 -3
  10. dataeval/detectors/ood/__init__.py +1 -1
  11. dataeval/detectors/ood/base.py +1 -17
  12. dataeval/detectors/ood/output.py +1 -1
  13. dataeval/interop.py +1 -1
  14. dataeval/metrics/__init__.py +1 -1
  15. dataeval/metrics/bias/__init__.py +1 -1
  16. dataeval/metrics/bias/balance.py +3 -3
  17. dataeval/metrics/bias/coverage.py +1 -1
  18. dataeval/metrics/bias/diversity.py +14 -10
  19. dataeval/metrics/bias/parity.py +5 -5
  20. dataeval/metrics/estimators/ber.py +4 -3
  21. dataeval/metrics/estimators/divergence.py +3 -3
  22. dataeval/metrics/estimators/uap.py +3 -3
  23. dataeval/metrics/stats/__init__.py +1 -1
  24. dataeval/metrics/stats/base.py +24 -8
  25. dataeval/metrics/stats/boxratiostats.py +5 -5
  26. dataeval/metrics/stats/datasetstats.py +39 -6
  27. dataeval/metrics/stats/dimensionstats.py +4 -4
  28. dataeval/metrics/stats/hashstats.py +2 -2
  29. dataeval/metrics/stats/labelstats.py +89 -6
  30. dataeval/metrics/stats/pixelstats.py +7 -5
  31. dataeval/metrics/stats/visualstats.py +6 -4
  32. dataeval/output.py +23 -14
  33. dataeval/utils/__init__.py +2 -2
  34. dataeval/utils/dataset/read.py +1 -1
  35. dataeval/utils/dataset/split.py +1 -1
  36. dataeval/utils/metadata.py +42 -44
  37. dataeval/utils/plot.py +129 -6
  38. dataeval/workflows/sufficiency.py +2 -2
  39. {dataeval-0.75.0.dist-info → dataeval-0.76.0.dist-info}/LICENSE.txt +2 -2
  40. {dataeval-0.75.0.dist-info → dataeval-0.76.0.dist-info}/METADATA +18 -17
  41. dataeval-0.76.0.dist-info/RECORD +67 -0
  42. dataeval-0.75.0.dist-info/RECORD +0 -67
  43. {dataeval-0.75.0.dist-info → dataeval-0.76.0.dist-info}/WHEEL +0 -0
dataeval/metrics/stats/datasetstats.py CHANGED
@@ -7,7 +7,7 @@ from typing import Any, Iterable
 
 from numpy.typing import ArrayLike
 
-from dataeval.metrics.stats.base import BaseStatsOutput, run_stats
+from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, _is_plottable, run_stats
 from dataeval.metrics.stats.dimensionstats import (
     DimensionStatsOutput,
     DimensionStatsProcessor,
@@ -16,12 +16,13 @@ from dataeval.metrics.stats.labelstats import LabelStatsOutput, labelstats
 from dataeval.metrics.stats.pixelstats import PixelStatsOutput, PixelStatsProcessor
 from dataeval.metrics.stats.visualstats import VisualStatsOutput, VisualStatsProcessor
 from dataeval.output import Output, set_metadata
+from dataeval.utils.plot import channel_histogram_plot
 
 
 @dataclass(frozen=True)
-class DatasetStatsOutput(Output):
+class DatasetStatsOutput(Output, HistogramPlotMixin):
     """
-    Output class for :func:`datasetstats` stats metric
+    Output class for :func:`datasetstats` stats metric.
 
     This class represents the outputs of various stats functions against a single
     dataset, such that each index across all stat outputs are representative of
@@ -41,6 +42,8 @@ class DatasetStatsOutput(Output):
     visualstats: VisualStatsOutput
     labelstats: LabelStatsOutput | None = None
 
+    _excluded_keys = ["histogram", "percentiles"]
+
     def _outputs(self) -> list[Output]:
         return [s for s in (self.dimensionstats, self.pixelstats, self.visualstats, self.labelstats) if s is not None]
 
@@ -53,10 +56,33 @@ class DatasetStatsOutput(Output):
         raise ValueError("All StatsOutput classes must contain the same number of image sources.")
 
 
+def _get_channels(cls, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None):
+    raw_channels = max([si.channel for si in cls.dict()["source_index"]]) + 1
+    if isinstance(channel_index, int):
+        max_channels = 1 if channel_index < raw_channels else raw_channels
+        ch_mask = cls.pixelstats.get_channel_mask(channel_index)
+    elif isinstance(channel_index, Iterable) and all(isinstance(val, int) for val in list(channel_index)):
+        max_channels = len(list(channel_index))
+        ch_mask = cls.pixelstats.get_channel_mask(channel_index)
+    elif isinstance(channel_limit, int):
+        max_channels = channel_limit
+        ch_mask = cls.pixelstats.get_channel_mask(None, channel_limit)
+    else:
+        max_channels = raw_channels
+        ch_mask = None
+
+    if max_channels > raw_channels:
+        max_channels = raw_channels
+    if ch_mask is not None and not any(ch_mask):
+        ch_mask = None
+
+    return max_channels, ch_mask
+
+
 @dataclass(frozen=True)
 class ChannelStatsOutput(Output):
     """
-    Output class for :func:`channelstats` stats metric
+    Output class for :func:`channelstats` stats metric.
 
     This class represents the outputs of various per-channel stats functions against
     a single dataset, such that each index across all stat outputs are representative
@@ -83,6 +109,13 @@ class ChannelStatsOutput(Output):
         if not all(length == lengths[0] for length in lengths):
             raise ValueError("All StatsOutput classes must contain the same number of image sources.")
 
+    def plot(
+        self, log: bool, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None
+    ) -> None:
+        max_channels, ch_mask = _get_channels(self, channel_limit, channel_index)
+        data_dict = {k: v for k, v in self.dict().items() if _is_plottable(k, v, ("histogram", "percentiles"))}
+        channel_histogram_plot(data_dict, log, max_channels, ch_mask)
+
 
 @set_metadata
 def datasetstats(
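The `plot` method added above ties `_get_channels` to the new `channel_histogram_plot` utility. A minimal usage sketch (the image data is hypothetical, and matplotlib is assumed to be available as the plotting backend):

    import numpy as np
    from dataeval.metrics.stats.datasetstats import channelstats

    # hypothetical batch of eight 3-channel images in CHW layout
    images = np.random.randint(0, 256, (8, 3, 64, 64), dtype=np.uint8)

    stats = channelstats(images)
    stats.plot(log=True, channel_limit=3)  # per-channel histograms, log-scaled, capped at 3 channels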
@@ -91,7 +124,7 @@ def datasetstats(
     labels: Iterable[ArrayLike] | None = None,
 ) -> DatasetStatsOutput:
     """
-    Calculates various :term:`statistics<Statistics>` for each image
+    Calculates various :term:`statistics<Statistics>` for each image.
 
     This function computes dimension, pixel and visual metrics
     on the images or individual bounding boxes for each image as
@@ -135,7 +168,7 @@ def channelstats(
     bboxes: Iterable[ArrayLike] | None = None,
 ) -> ChannelStatsOutput:
     """
-    Calculates various per-channel statistics for each image
+    Calculates various per-channel :term:`statistics` for each image.
 
     This function computes pixel and visual metrics on the images
     or individual bounding boxes for each image.
dataeval/metrics/stats/dimensionstats.py CHANGED
@@ -8,15 +8,15 @@ from typing import Any, Callable, Iterable
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 
-from dataeval.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
+from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
 from dataeval.output import set_metadata
 from dataeval.utils.image import get_bitdepth
 
 
 @dataclass(frozen=True)
-class DimensionStatsOutput(BaseStatsOutput):
+class DimensionStatsOutput(BaseStatsOutput, HistogramPlotMixin):
     """
-    Output class for :func:`dimensionstats` stats metric
+    Output class for :func:`dimensionstats` stats metric.
 
     Attributes
     ----------
@@ -79,7 +79,7 @@ def dimensionstats(
     bboxes: Iterable[ArrayLike] | None = None,
 ) -> DimensionStatsOutput:
     """
-    Calculates dimension :term:`statistics<Statistics>` for each image
+    Calculates dimension :term:`statistics<Statistics>` for each image.
 
     This function computes various dimensional metrics (e.g., width, height, channels)
     on the images or individual bounding boxes for each image.
dataeval/metrics/stats/hashstats.py CHANGED
@@ -25,7 +25,7 @@ MAX_FACTOR = 4
 @dataclass(frozen=True)
 class HashStatsOutput(BaseStatsOutput):
     """
-    Output class for :func:`hashstats` stats metric
+    Output class for :func:`hashstats` stats metric.
 
     Attributes
     ----------
@@ -126,7 +126,7 @@ def hashstats(
     bboxes: Iterable[ArrayLike] | None = None,
 ) -> HashStatsOutput:
     """
-    Calculates hashes for each image
+    Calculates hashes for each image.
 
     This function computes hashes from the images including exact hashes and perception-based
     hashes. These hash values can be used to determine if images are exact or near matches.
dataeval/metrics/stats/labelstats.py CHANGED
@@ -2,20 +2,25 @@ from __future__ import annotations
 
 __all__ = []
 
+# import contextlib
 from collections import Counter, defaultdict
 from dataclasses import dataclass
 from typing import Any, Iterable, Mapping, TypeVar
 
+import numpy as np
 from numpy.typing import ArrayLike
 
-from dataeval.interop import to_numpy
+from dataeval.interop import as_numpy
 from dataeval.output import Output, set_metadata
 
+# with contextlib.suppress(ImportError):
+#     import pandas as pd
+
 
 @dataclass(frozen=True)
 class LabelStatsOutput(Output):
     """
-    Output class for :func:`labelstats` stats metric
+    Output class for :func:`labelstats` stats metric.
 
     Attributes
     ----------
@@ -46,6 +51,47 @@ class LabelStatsOutput(Output):
     class_count: int
     label_count: int
 
+    def to_table(self) -> str:
+        max_char = max(len(key) if isinstance(key, str) else key // 10 + 1 for key in self.label_counts_per_class)
+        max_char = max(max_char, 5)
+        max_label = max(list(self.label_counts_per_class.values()))
+        max_img = max(list(self.image_counts_per_label.values()))
+        max_num = int(np.ceil(np.log10(max(max_label, max_img))))
+        max_num = max(max_num, 11)
+
+        # Display basic counts
+        table_str = f"Class Count: {self.class_count}\n"
+        table_str += f"Label Count: {self.label_count}\n"
+        table_str += f"Average # Labels per Image: {round(np.mean(self.label_counts_per_image), 2)}\n"
+        table_str += "--------------------------------------\n"
+
+        # Display counts per class
+        table_str += f"{'Label':>{max_char}}: Total Count - Image Count\n"
+        for cls in self.label_counts_per_class:
+            table_str += f"{cls:>{max_char}}: {self.label_counts_per_class[cls]:^{max_num}} "
+            table_str += f"- {self.image_counts_per_label[cls]:^{max_num}}\n"
+
+        return table_str
+
+    # def to_dataframe(self) -> pd.DataFrame:
+    #     import pandas as pd
+
+    #     class_list = []
+    #     total_count = []
+    #     image_count = []
+    #     for cls in self.label_counts_per_class:
+    #         class_list.append(cls)
+    #         total_count.append(self.label_counts_per_class[cls])
+    #         image_count.append(self.image_counts_per_label[cls])
+
+    #     return pd.DataFrame(
+    #         {
+    #             "Label": class_list,
+    #             "Total Count": total_count,
+    #             "Image Count": image_count,
+    #         }
+    #     )
+
 
 TKey = TypeVar("TKey", int, str)
 
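The new `to_table` method gives a plain-text summary without optional dependencies (the pandas `to_dataframe` variant above ships commented out). A quick sketch with hypothetical labels:

    from dataeval.metrics.stats.labelstats import labelstats

    stats = labelstats([["cat", "dog"], ["cat"]])  # two hypothetical images
    print(stats.to_table())
    # prints the Class Count / Label Count / Average # Labels per Image header,
    # then one "<label>: <total count> - <image count>" row per class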
@@ -57,12 +103,47 @@ def sort(d: Mapping[TKey, Any]) -> dict[TKey, Any]:
     return dict(sorted(d.items(), key=lambda x: x[0]))
 
 
+def _ensure_2d(labels: Iterable[ArrayLike]) -> Iterable[ArrayLike]:
+    if isinstance(labels, np.ndarray):
+        return labels[:, None]
+    else:
+        return [[lbl] for lbl in labels]  # type: ignore
+
+
+def _get_list_depth(lst):
+    if isinstance(lst, list) and lst:
+        return 1 + max(_get_list_depth(item) for item in lst)
+    return 0
+
+
+def _check_labels_dimension(labels: Iterable[ArrayLike]) -> Iterable[ArrayLike]:
+    # Check for nested lists beyond 2 levels
+
+    if isinstance(labels, np.ndarray):
+        if labels.ndim == 1:
+            return _ensure_2d(labels)
+        elif labels.ndim == 2:
+            return labels
+        else:
+            raise ValueError("The label array must not have more than 2 dimensions.")
+    elif isinstance(labels, list):
+        depth = _get_list_depth(labels)
+        if depth == 1:
+            return _ensure_2d(labels)
+        elif depth == 2:
+            return labels
+        else:
+            raise ValueError("The label list must not be empty or have more than 2 levels of nesting.")
+    else:
+        raise TypeError("Labels must be either a NumPy array or a list.")
+
+
 @set_metadata
 def labelstats(
     labels: Iterable[ArrayLike],
 ) -> LabelStatsOutput:
     """
-    Calculates :term:`statistics<Statistics>` for data labels
+    Calculates :term:`statistics<Statistics>` for data labels.
 
     This function computes counting metrics (e.g., total per class, total per image)
     on the labels.
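These private helpers normalize label input before counting; illustratively (hypothetical inputs, evaluated in the module's scope):

    import numpy as np

    _check_labels_dimension(np.array([0, 1, 2]))  # 1-D array -> column of shape (3, 1)
    _check_labels_dimension([[0, 1], [2]])        # depth-2 list -> returned unchanged
    _check_labels_dimension([[[0]]])              # depth-3 list -> ValueError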
@@ -99,10 +180,12 @@ def labelstats(
     index_location = defaultdict(list[int])
     label_per_image: list[int] = []
 
-    for i, group in enumerate(labels):
-        # Count occurrences of each label in all sublists
-        group = to_numpy(group)
+    labels_2d = _check_labels_dimension(labels)
+
+    for i, group in enumerate(labels_2d):
+        group = as_numpy(group)
 
+        # Count occurrences of each label in all sublists
         label_counts.update(group)
 
         # Get the number of labels per image
dataeval/metrics/stats/pixelstats.py CHANGED
@@ -9,14 +9,14 @@ import numpy as np
 from numpy.typing import ArrayLike, NDArray
 from scipy.stats import entropy, kurtosis, skew
 
-from dataeval.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
+from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
 from dataeval.output import set_metadata
 
 
 @dataclass(frozen=True)
-class PixelStatsOutput(BaseStatsOutput):
+class PixelStatsOutput(BaseStatsOutput, HistogramPlotMixin):
     """
-    Output class for :func:`pixelstats` stats metric
+    Output class for :func:`pixelstats` stats metric.
 
     Attributes
     ----------
@@ -44,11 +44,13 @@ class PixelStatsOutput(BaseStatsOutput):
     histogram: NDArray[np.uint32]
     entropy: NDArray[np.float16]
 
+    _excluded_keys = ["histogram"]
+
 
 class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
     output_class: type = PixelStatsOutput
     image_function_map: dict[str, Callable[[StatsProcessor[PixelStatsOutput]], Any]] = {
-        "mean": lambda self: np.mean(self.scaled),
+        "mean": lambda x: np.mean(x.scaled),
         "std": lambda x: np.std(x.scaled),
         "var": lambda x: np.var(x.scaled),
         "skew": lambda x: np.nan_to_num(skew(x.scaled.ravel())),
@@ -74,7 +76,7 @@ def pixelstats(
     per_channel: bool = False,
 ) -> PixelStatsOutput:
     """
-    Calculates pixel :term:`statistics<Statistics>` for each image
+    Calculates pixel :term:`statistics<Statistics>` for each image.
 
     This function computes various statistical metrics (e.g., mean, standard deviation, entropy)
     on the images as a whole.
dataeval/metrics/stats/visualstats.py CHANGED
@@ -8,7 +8,7 @@ from typing import Any, Callable, Iterable
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 
-from dataeval.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
+from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
 from dataeval.output import set_metadata
 from dataeval.utils.image import edge_filter
 
@@ -16,9 +16,9 @@ QUARTILES = (0, 25, 50, 75, 100)
 
 
 @dataclass(frozen=True)
-class VisualStatsOutput(BaseStatsOutput):
+class VisualStatsOutput(BaseStatsOutput, HistogramPlotMixin):
     """
-    Output class for :func:`visualstats` stats metric
+    Output class for :func:`visualstats` stats metric.
 
     Attributes
     ----------
@@ -46,6 +46,8 @@ class VisualStatsOutput(BaseStatsOutput):
     zeros: NDArray[np.float16]
     percentiles: NDArray[np.float16]
 
+    _excluded_keys = ["percentiles"]
+
 
 class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
     output_class: type = VisualStatsOutput
@@ -81,7 +83,7 @@ def visualstats(
     per_channel: bool = False,
 ) -> VisualStatsOutput:
     """
-    Calculates visual statistics for each image
+    Calculates visual :term:`statistics` for each image.
 
     This function computes various visual metrics (e.g., :term:`brightness<Brightness>`, darkness, contrast, blurriness)
     on the images as a whole.
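Between them, `HistogramPlotMixin` and `_excluded_keys` give the dimension, pixel, and visual outputs a shared histogram plot while keeping the raw `histogram`/`percentiles` arrays out of the grid. A rough sketch, assuming the mixin exposes a `plot(log)` method (its definition lives in base.py, which is not shown in this diff):

    import numpy as np
    from dataeval.metrics.stats.pixelstats import pixelstats

    images = np.random.randint(0, 256, (8, 3, 64, 64), dtype=np.uint8)  # hypothetical data
    pixelstats(images).plot(log=True)  # one histogram per retained statistic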
dataeval/output.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
 __all__ = []
 
 import inspect
+import logging
 import sys
 from collections.abc import Mapping
 from datetime import datetime, timezone
@@ -81,29 +82,37 @@ def set_metadata(fn: Callable[P, R] | None = None, *, state: list[str] | None =
                 return f"{v.__class__.__name__}: len={len(v)}"
             return f"{v.__class__.__name__}"
 
-        time = datetime.now(timezone.utc)
-        result = fn(*args, **kwargs)
-        duration = (datetime.now(timezone.utc) - time).total_seconds()
-        fn_params = inspect.signature(fn).parameters
-
+        # Collect function metadata
         # set all params with defaults then update params with mapped arguments and explicit keyword args
+        fn_params = inspect.signature(fn).parameters
         arguments = {k: None if v.default is inspect.Parameter.empty else v.default for k, v in fn_params.items()}
         arguments.update(zip(fn_params, args))
         arguments.update(kwargs)
         arguments = {k: fmt(v) for k, v in arguments.items()}
-        state_attrs = (
-            {k: fmt(getattr(args[0], k)) for k in state if "self" in arguments} if "self" in arguments and state else {}
-        )
-        name = (
-            f"{args[0].__class__.__module__}.{args[0].__class__.__name__}.{fn.__name__}"
-            if "self" in arguments
-            else f"{fn.__module__}.{fn.__qualname__}"
-        )
+        is_method = "self" in arguments
+        state_attrs = {k: fmt(getattr(args[0], k)) for k in state or []} if is_method else {}
+        module = args[0].__class__.__module__ if is_method else fn.__module__.removeprefix("src.")
+        class_prefix = f".{args[0].__class__.__name__}." if is_method else "."
+        name = f"{module}{class_prefix}{fn.__name__}"
+        arguments = {k: v for k, v in arguments.items() if k != "self"}
+
+        _logger = logging.getLogger(module)
+        time = datetime.now(timezone.utc)
+        _logger.log(logging.INFO, f">>> Executing '{name}': args={arguments} state={state} <<<")
+
+        ##### EXECUTE FUNCTION #####
+        result = fn(*args, **kwargs)
+        ############################
+
+        duration = (datetime.now(timezone.utc) - time).total_seconds()
+        _logger.log(logging.INFO, f">>> Completed '{name}': args={arguments} state={state} duration={duration} <<<")
+
+        # Update output with recorded metadata
         metadata = {
             "_name": name,
             "_execution_time": time,
             "_execution_duration": duration,
-            "_arguments": {k: v for k, v in arguments.items() if k != "self"},
+            "_arguments": arguments,
             "_state": state_attrs,
             "_version": __version__,
         }
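Because the decorator now routes through the standard `logging` module, the new execution records can be surfaced without any DataEval-specific hooks; a minimal sketch:

    import logging

    logging.basicConfig(level=logging.INFO)  # show the ">>> Executing/Completed ..." records

    from dataeval.metrics.stats.labelstats import labelstats
    labelstats([[0, 1], [1]])  # INFO log lines include name, args, state, and duration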
dataeval/utils/__init__.py CHANGED
@@ -1,6 +1,6 @@
 """
-The utility classes and functions are provided by DataEval to assist users
-in setting up data and architectures that are guaranteed to work with applicable
+The utility classes and functions are provided by DataEval to assist users \
+in setting up data and architectures that are guaranteed to work with applicable \
 DataEval metrics.
 """
 
dataeval/utils/dataset/read.py CHANGED
@@ -10,7 +10,7 @@ from torch.utils.data import Dataset
 
 def read_dataset(dataset: Dataset[Any]) -> list[list[Any]]:
     """
-    Extract information from a dataset at each index into individual lists of each information position
+    Extract information from a dataset at each index into individual lists of each information position.
 
     Parameters
     ----------
dataeval/utils/dataset/split.py CHANGED
@@ -26,7 +26,7 @@ class TrainValSplit(NamedTuple):
 @dataclass(frozen=True)
 class SplitDatasetOutput(Output):
     """
-    Output class containing test indices and a list of TrainValSplits
+    Output class containing test indices and a list of TrainValSplits.
 
     Attributes
     ----------
dataeval/utils/metadata.py CHANGED
@@ -1,11 +1,11 @@
 """
-Metadata related utility functions that help organize raw metadata into :class:`Metadata` objects
-for use within `DataEval`.
+Metadata related utility functions that help organize raw metadata into \
+:class:`Metadata` objects for use within `DataEval`.
 """
 
 from __future__ import annotations
 
-__all__ = ["Metadata", "preprocess", "merge"]
+__all__ = ["Metadata", "preprocess", "merge", "flatten"]
 
 import warnings
 from dataclasses import dataclass
@@ -18,7 +18,6 @@ from scipy.stats import wasserstein_distance as wd
 from dataeval.interop import as_numpy, to_numpy
 from dataeval.output import Output, set_metadata
 
-TNum = TypeVar("TNum", int, float)
 DISCRETE_MIN_WD = 0.054
 CONTINUOUS_MIN_SAMPLE_SIZE = 20
 
@@ -146,9 +145,7 @@ def _flatten_dict_inner(
     return items, size
 
 
-def _flatten_dict(
-    d: Mapping[str, Any], sep: str, ignore_lists: bool, fully_qualified: bool
-) -> tuple[dict[str, Any], int]:
+def flatten(d: Mapping[str, Any], sep: str, ignore_lists: bool, fully_qualified: bool) -> tuple[dict[str, Any], int]:
     """
     Flattens a dictionary and converts values to numeric values when possible.
 
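With the rename, the helper joins the public `__all__`; a sketch with a hypothetical record:

    from dataeval.utils.metadata import flatten

    record = {"image": {"width": 640, "height": 480}, "tags": ["a", "b"]}
    flat, size = flatten(record, sep="_", ignore_lists=False, fully_qualified=False)
    # flat: flattened key/value dict; size: length of lists detected while expanding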
@@ -161,12 +158,12 @@
     ignore_lists : bool
         Option to skip expanding lists within metadata
     fully_qualified : bool
-        Option to return dictionary keys full qualified instead of minimized
+        Option to return dictionary keys full qualified instead of reduced
 
     Returns
     -------
-    dict[str, Any]
-        A flattened dictionary
+    tuple[dict[str, Any], int]
+        A tuple of the flattened dictionary and the length of detected lists in metadata
     """
     expanded, size = _flatten_dict_inner(d, parent_keys=(), nested=ignore_lists)
 
@@ -260,9 +257,7 @@ def merge(
 
     image_repeats = np.zeros(len(dicts))
     for i, d in enumerate(dicts):
-        flattened, image_repeats[i] = _flatten_dict(
-            d, sep="_", ignore_lists=ignore_lists, fully_qualified=fully_qualified
-        )
+        flattened, image_repeats[i] = flatten(d, sep="_", ignore_lists=ignore_lists, fully_qualified=fully_qualified)
         isect = isect.intersection(flattened.keys()) if isect else set(flattened.keys())
         union = union.union(flattened.keys())
         for k, v in flattened.items():
@@ -296,7 +291,7 @@ def merge(
 @dataclass(frozen=True)
 class Metadata(Output):
     """
-    Dataclass containing binned metadata from the :func:`preprocess` function
+    Dataclass containing binned metadata from the :func:`preprocess` function.
 
     Attributes
     ----------
@@ -329,7 +324,7 @@ class Metadata(Output):
 def preprocess(
     raw_metadata: Iterable[Mapping[str, Any]],
     class_labels: ArrayLike | str,
-    continuous_factor_bins: Mapping[str, int | list[tuple[TNum, TNum]]] | None = None,
+    continuous_factor_bins: Mapping[str, int | Iterable[float]] | None = None,
     auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = "uniform_width",
     exclude: Iterable[str] | None = None,
 ) -> Metadata:
@@ -348,8 +343,9 @@
     class_labels : ArrayLike or string
         If arraylike, expects the labels for each image (image classification) or each object (object detection).
         If the labels are included in the metadata dictionary, pass in the key value.
-    continuous_factor_bins : Mapping[str, int] or Mapping[str, list[tuple[TNum, TNum]]] or None, default None
-        User provided dictionary specifying how to bin the continuous metadata factors
+    continuous_factor_bins : Mapping[str, int or Iterable[float]] or None, default None
+        User provided dictionary specifying how to bin the continuous metadata factors where the value is either
+        an int to represent the number of bins, or a list of floats representing the edges for each bin.
     auto_bin_method : "uniform_width" or "uniform_count" or "clusters", default "uniform_width"
         Method by which the function will automatically bin continuous metadata factors. It is recommended
         that the user provide the bins through the `continuous_factor_bins`.
@@ -364,11 +360,13 @@ def preprocess(
     # Transform metadata into single, flattened dictionary
     metadata, image_repeats = merge(raw_metadata)
 
+    continuous_factor_bins = dict(continuous_factor_bins) if continuous_factor_bins else None
+
     # Drop any excluded metadata keys
-    if exclude:
-        for k in list(metadata):
-            if k in exclude:
-                metadata.pop(k)
+    for k in exclude or ():
+        metadata.pop(k, None)
+        if continuous_factor_bins:
+            continuous_factor_bins.pop(k, None)
 
     # Get the class label array in numeric form
     class_array = as_numpy(metadata.pop(class_labels)) if isinstance(class_labels, str) else as_numpy(class_labels)
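The reworked signature and exclude handling together look roughly like this in use (keys and values are hypothetical):

    from dataeval.utils.metadata import preprocess

    raw = [{"altitude": a, "sensor": "EO"} for a in range(100)]
    md = preprocess(
        raw,
        class_labels=[i % 2 for i in range(100)],
        continuous_factor_bins={"altitude": 5},  # an int bin count or explicit float edges
        exclude=["sensor"],  # now dropped from both metadata and continuous_factor_bins
    )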
@@ -394,8 +392,8 @@
                 "but are not keys in the `metadata` dictionary. Delete these keys from `continuous_factor_bins` "
                 "or add corresponding entries to the `metadata` dictionary."
             )
-        for factor, grouping in continuous_factor_bins.items():
-            discrete_metadata[factor] = _user_defined_bin(metadata[factor], grouping)
+        for factor, bins in continuous_factor_bins.items():
+            discrete_metadata[factor] = _digitize_data(metadata[factor], bins)
             continuous_metadata[factor] = metadata[factor]
 
     # Determine category of the rest of the keys
@@ -417,7 +415,7 @@
                 "bins using the continuous_factor_bins parameter.",
                 UserWarning,
             )
            discrete_metadata[key] = _bin_data(data, auto_bin_method)
        else:
            _, discrete_metadata[key] = np.unique(data, return_inverse=True)
 
@@ -439,7 +437,7 @@
     )
 
 
-def _user_defined_bin(data: list[Any] | NDArray[Any], binning: int | list[tuple[TNum, TNum]]) -> NDArray[np.intp]:
+def _digitize_data(data: list[Any] | NDArray[Any], bins: int | Iterable[float]) -> NDArray[np.intp]:
     """
     Digitizes a list of values into a given number of bins.
 
@@ -447,8 +445,8 @@ def _user_defined_bin(data: list[Any] | NDArray[Any], binning: int | list[tuple[
     ----------
     data : list | NDArray
         The values to be digitized.
-    binning : int | list[tuple[TNum, TNum]]
-        The number of bins for the discrete values that data will be digitized into.
+    bins : int | Iterable[float]
+        The number of bins or list of bin edges for the discrete values that data will be digitized into.
 
     Returns
     -------
@@ -461,16 +459,16 @@ def _user_defined_bin(data: list[Any] | NDArray[Any], binning: int | list[tuple[
             "Encountered a data value with non-numeric type when digitizing a factor. "
             "Ensure all occurrences of continuous factors are numeric types."
         )
-    if type(binning) is int:
-        _, bin_edges = np.histogram(data, bins=binning)
+    if isinstance(bins, int):
+        _, bin_edges = np.histogram(data, bins=bins)
         bin_edges[-1] = np.inf
         bin_edges[0] = -np.inf
     else:
-        bin_edges = binning
+        bin_edges = list(bins)
     return np.digitize(data, bin_edges)
 
 
-def _binning_function(data: NDArray[Any], bin_method: str) -> NDArray[np.int_]:
+def _bin_data(data: NDArray[Any], bin_method: str) -> NDArray[np.int_]:
     """
     Bins continuous data through either equal width bins, equal amounts in each bin, or by clusters.
     """
@@ -482,19 +480,19 @@ def _binning_function(data: NDArray[Any], bin_method: str) -> NDArray[np.int_]:
         )
         bin_method = "uniform_width"
 
-    if bin_method != "clusters":
-        counts, bin_edges = np.histogram(data, bins="auto")
-        n_bins = counts.size
-        if counts[counts > 0].min() < 10:
-            for _ in range(20):
-                n_bins -= 1
-                counts, bin_edges = np.histogram(data, bins=n_bins)
-                if counts[counts > 0].min() >= 10 or n_bins < 2:
-                    break
-
-        if bin_method == "uniform_count":
-            quantiles = np.linspace(0, 100, n_bins + 1)
-            bin_edges = np.asarray(np.percentile(data, quantiles))
+    # if bin_method != "clusters":  # restore this when clusters bin_method is available
+    counts, bin_edges = np.histogram(data, bins="auto")
+    n_bins = counts.size
+    if counts[counts > 0].min() < 10:
+        counter = 20
+        while counts[counts > 0].min() < 10 and n_bins >= 2 and counter > 0:
+            counter -= 1
+            n_bins -= 1
+            counts, bin_edges = np.histogram(data, bins=n_bins)
+
+    if bin_method == "uniform_count":
+        quantiles = np.linspace(0, 100, n_bins + 1)
+        bin_edges = np.asarray(np.percentile(data, quantiles))
 
     bin_edges[0] = -np.inf  # type: ignore # until the clusters speed up is merged
     bin_edges[-1] = np.inf  # type: ignore # and the _binning_by_clusters can be uncommented
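The rewritten shrink loop is behaviorally close to the old `for`/`break` form: drop one bin at a time, at most 20 times, until every populated bin holds at least 10 samples or only one bin would remain. Extracted as a standalone, runnable sketch (hypothetical data):

    import numpy as np

    data = np.random.default_rng(0).normal(size=60)  # hypothetical continuous factor
    counts, bin_edges = np.histogram(data, bins="auto")
    n_bins, counter = counts.size, 20
    while counts[counts > 0].min() < 10 and n_bins >= 2 and counter > 0:
        counter -= 1
        n_bins -= 1
        counts, bin_edges = np.histogram(data, bins=n_bins)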