PyPI - dataeval - Versions diffs - 0.74.2__py3-none-any.whl → 0.76.0__py3-none-any.whl - Mend

dataeval 0.74.2__py3-none-any.whl → 0.76.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68)

dataeval/__init__.py +27 -23
dataeval/detectors/__init__.py +2 -2
dataeval/detectors/drift/__init__.py +14 -12
dataeval/detectors/drift/base.py +3 -3
dataeval/detectors/drift/cvm.py +1 -1
dataeval/detectors/drift/ks.py +3 -2
dataeval/detectors/drift/mmd.py +9 -7
dataeval/detectors/drift/torch.py +12 -12
dataeval/detectors/drift/uncertainty.py +5 -4
dataeval/detectors/drift/updates.py +1 -1
dataeval/detectors/linters/__init__.py +4 -4
dataeval/detectors/linters/clusterer.py +5 -9
dataeval/detectors/linters/duplicates.py +10 -14
dataeval/detectors/linters/outliers.py +100 -5
dataeval/detectors/ood/__init__.py +4 -11
dataeval/detectors/ood/{ae_torch.py → ae.py} +6 -4
dataeval/detectors/ood/base.py +47 -160
dataeval/detectors/ood/metadata_ks_compare.py +34 -42
dataeval/detectors/ood/metadata_least_likely.py +3 -3
dataeval/detectors/ood/metadata_ood_mi.py +6 -5
dataeval/detectors/ood/mixin.py +146 -0
dataeval/detectors/ood/output.py +63 -0
dataeval/interop.py +7 -6
dataeval/{logging.py → log.py} +2 -0
dataeval/metrics/__init__.py +3 -3
dataeval/metrics/bias/__init__.py +10 -13
dataeval/metrics/bias/balance.py +13 -11
dataeval/metrics/bias/coverage.py +53 -5
dataeval/metrics/bias/diversity.py +56 -24
dataeval/metrics/bias/parity.py +20 -17
dataeval/metrics/estimators/__init__.py +2 -2
dataeval/metrics/estimators/ber.py +7 -4
dataeval/metrics/estimators/divergence.py +4 -4
dataeval/metrics/estimators/uap.py +4 -4
dataeval/metrics/stats/__init__.py +19 -19
dataeval/metrics/stats/base.py +28 -12
dataeval/metrics/stats/boxratiostats.py +13 -14
dataeval/metrics/stats/datasetstats.py +49 -20
dataeval/metrics/stats/dimensionstats.py +8 -8
dataeval/metrics/stats/hashstats.py +14 -10
dataeval/metrics/stats/labelstats.py +94 -11
dataeval/metrics/stats/pixelstats.py +11 -14
dataeval/metrics/stats/visualstats.py +10 -13
dataeval/output.py +23 -14
dataeval/utils/__init__.py +5 -14
dataeval/utils/dataset/__init__.py +7 -0
dataeval/utils/{torch → dataset}/datasets.py +2 -0
dataeval/utils/dataset/read.py +63 -0
dataeval/utils/{split_dataset.py → dataset/split.py} +38 -30
dataeval/utils/image.py +2 -2
dataeval/utils/metadata.py +317 -14
dataeval/{metrics/bias/metadata_utils.py → utils/plot.py} +91 -71
dataeval/utils/torch/__init__.py +2 -17
dataeval/utils/torch/gmm.py +29 -6
dataeval/utils/torch/{utils.py → internal.py} +82 -58
dataeval/utils/torch/models.py +10 -8
dataeval/utils/torch/trainer.py +6 -85
dataeval/workflows/__init__.py +2 -5
dataeval/workflows/sufficiency.py +18 -8
{dataeval-0.74.2.dist-info → dataeval-0.76.0.dist-info}/LICENSE.txt +2 -2
dataeval-0.76.0.dist-info/METADATA +137 -0
dataeval-0.76.0.dist-info/RECORD +67 -0
dataeval/detectors/ood/base_torch.py +0 -109
dataeval/metrics/bias/metadata_preprocessing.py +0 -285
dataeval/utils/gmm.py +0 -26
dataeval-0.74.2.dist-info/METADATA +0 -120
dataeval-0.74.2.dist-info/RECORD +0 -66
{dataeval-0.74.2.dist-info → dataeval-0.76.0.dist-info}/WHEEL +0 -0

dataeval/metrics/stats/base.py CHANGED Viewed

@@ -1,5 +1,7 @@
 from __future__ import annotations
+from dataeval.utils.plot import histogram_plot
 __all__ = []
 import re
@@ -100,19 +102,33 @@ class BaseStatsOutput(Output):
         for source_index in list(self.source_index) + [None]:
             if source_index is None or source_index.image > cur_image:
                 mask.extend(cur_mask if matches(cur_max_channel + 1, channel_count) else [False for _ in cur_mask])
-                if source_index is None:
-                    break
-                cur_image = source_index.image
-                cur_max_channel = 0
-                cur_mask.clear()
-            cur_mask.append(matches(source_index.channel, channel_index))
-            cur_max_channel = max(cur_max_channel, source_index.channel or 0)
+                if source_index is not None:
+                    cur_image = source_index.image
+                    cur_max_channel = 0
+                    cur_mask.clear()
+            if source_index is not None:
+                cur_mask.append(matches(source_index.channel, channel_index))
+                cur_max_channel = max(cur_max_channel, source_index.channel or 0)
         return mask
     def __len__(self) -> int:
         return len(self.source_index)
+def _is_plottable(k: str, v: Any, excluded_keys: Iterable[str]) -> bool:
+    return isinstance(v, np.ndarray) and v[v != 0].size > 0 and all(k != x for x in excluded_keys)
+class HistogramPlotMixin:
+    _excluded_keys: Iterable[str] = []
+    def dict(self) -> dict[str, Any]: ...
+    def plot(self, log: bool) -> None:
+        data_dict = {k: v for k, v in self.dict().items() if _is_plottable(k, v, self._excluded_keys)}
+        histogram_plot(data_dict, log)
 TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput, covariant=True)
@@ -126,7 +142,7 @@ class StatsProcessor(Generic[TStatsOutput]):
         self.raw = image
         self.width: int = image.shape[-1]
         self.height: int = image.shape[-2]
-        self.box: NDArray[Any] = np.array([0, 0, self.width, self.height]) if box is None else box
+        self.box: NDArray[np.int64] = np.array([0, 0, self.width, self.height]) if box is None else box.astype(np.int64)
         self._per_channel = per_channel
         self._image = None
         self._shape = None
@@ -193,7 +209,7 @@ class StatsProcessorOutput(NamedTuple):
     results: list[dict[str, Any]]
     source_indices: list[SourceIndex]
     box_counts: list[int]
-    warnings_list: list[tuple[int, int, NDArray[np.float64], tuple[int, ...]]]
+    warnings_list: list[str]
 def process_stats(
@@ -206,13 +222,13 @@ def process_stats(
     results_list: list[dict[str, Any]] = []
     source_indices: list[SourceIndex] = []
     box_counts: list[int] = []
-    warnings_list: list[tuple[int, int, NDArray[np.float64], tuple[int, ...]]] = []
+    warnings_list: list[str] = []
     nboxes = [None] if boxes is None else normalize_box_shape(boxes)
     for i_b, box in enumerate(nboxes):
         i_b = None if box is None else i_b
         processor_list = [p(image, box, per_channel) for p in stats_processor_cls]
         if any(not p._is_valid_slice for p in processor_list) and i_b is not None and box is not None:
-            warnings_list.append((i, i_b, box, image.shape))
+            warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} is out of bounds of {image.shape}.")
         results_list.append({k: v for p in processor_list for k, v in p.process().items()})
         if per_channel:
             source_indices.extend([SourceIndex(i, i_b, c) for c in range(image_boxes[0].shape[-3])])
@@ -302,7 +318,7 @@ def run_stats(
     # warnings are not emitted while in multiprocessing pools so we emit after gathering all warnings
     for w in warning_list:
-        warnings.warn(f"Bounding box [{w[0]}][{w[1]}]: {w[2]} is out of bounds of {w[3]}.", UserWarning)
+        warnings.warn(w, UserWarning)
     output = {}
     for results in results_list:

dataeval/metrics/stats/boxratiostats.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from __future__ import annotations
-__all__ = ["boxratiostats"]
+__all__ = []
 import copy
 from typing import Any, Callable, Generic, TypeVar, cast
@@ -26,7 +26,7 @@ class BoxImageStatsOutputSlice(Generic[TStatOutput]):
         def __getitem__(self, key: str) -> NDArray[np.float64]:
             _stat = cast(np.ndarray, getattr(self._stats, key)).astype(np.float64)
             _shape = _stat[0].shape
-            _slice = _stat[self._slice[0] : self._slice[1]]
+            _slice = _stat[int(self._slice[0]) : int(self._slice[1])]
             return _slice.reshape(-1, self._channels, *_shape) if self._channels else _slice.reshape(-1, *_shape)
     box: StatSlicer
@@ -102,7 +102,7 @@ def boxratiostats(
     imgstats: TStatOutput,
 ) -> TStatOutput:
     """
-    Calculates ratio :term:`statistics<Statistics>` of box outputs over image outputs
+    Calculates ratio :term:`statistics<Statistics>` of box outputs over image outputs.
     Parameters
     ----------
@@ -130,17 +130,16 @@ def boxratiostats(
     --------
     Calculating the box ratio statistics using the dimension stats of the boxes and images
-    >>> imagestats = dimensionstats(images)
-    >>> boxstats = dimensionstats(images, bboxes)
+    >>> from dataeval.metrics.stats import dimensionstats
+    >>> imagestats = dimensionstats(stats_images)
+    >>> boxstats = dimensionstats(stats_images, bboxes)
     >>> ratiostats = boxratiostats(boxstats, imagestats)
     >>> print(ratiostats.aspect_ratio)
-    [ 1.15169271  0.78450521 21.33333333  1.5234375   2.25651042  0.77799479
-      0.88867188  3.40625     1.73307292  1.11132812  0.75018315  0.45018315
-      0.69596354 20.          5.11197917  2.33333333  0.75        0.70019531]
+    [ 0.86376953  0.58837891 16.          0.85714286  1.26959707  0.43772894
+      0.66650391  3.83296703  1.95018315]
     >>> print(ratiostats.size)
-    [0.03401693 0.01383464 0.00130208 0.01822917 0.02327474 0.00683594
-     0.01220703 0.0168457  0.01057943 0.00976562 0.00130208 0.01098633
-     0.02246094 0.0012207  0.01123047 0.00911458 0.02636719 0.06835938]
+    [0.0255127  0.01037598 0.00097656 0.01822917 0.02327474 0.00683594
+     0.00915527 0.03369141 0.02115885]
     """
     output_cls = type(boxstats)
     if type(boxstats) is not type(imgstats):
@@ -148,13 +147,13 @@ def boxratiostats(
     if boxstats.source_index[-1].image != imgstats.source_index[-1].image:
         raise ValueError("Stats index_map length mismatch. Check if the correct box and image stats were provided.")
     if all(count == 0 for count in boxstats.box_count):
-        raise TypeError("Input for boxstats must contain box information.")
+        raise ValueError("Input for boxstats must contain box information.")
     if any(count != 0 for count in imgstats.box_count):
-        raise TypeError("Input for imgstats must not contain box information.")
+        raise ValueError("Input for imgstats must not contain box information.")
     boxstats_has_channels = any(si.channel is None for si in boxstats.source_index)
     imgstats_has_channels = any(si.channel is None for si in imgstats.source_index)
     if boxstats_has_channels != imgstats_has_channels:
-        raise TypeError("Input for boxstats and imgstats must have matching channel information.")
+        raise ValueError("Input for boxstats and imgstats must have matching channel information.")
     output_dict = {}
     for key in boxstats.dict():

dataeval/metrics/stats/datasetstats.py CHANGED Viewed

@@ -1,13 +1,13 @@
 from __future__ import annotations
-__all__ = ["DatasetStatsOutput", "ChannelStatsOutput", "datasetstats", "channelstats"]
+__all__ = []
 from dataclasses import dataclass
 from typing import Any, Iterable
 from numpy.typing import ArrayLike
-from dataeval.metrics.stats.base import BaseStatsOutput, run_stats
+from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, _is_plottable, run_stats
 from dataeval.metrics.stats.dimensionstats import (
     DimensionStatsOutput,
     DimensionStatsProcessor,
@@ -16,16 +16,17 @@ from dataeval.metrics.stats.labelstats import LabelStatsOutput, labelstats
 from dataeval.metrics.stats.pixelstats import PixelStatsOutput, PixelStatsProcessor
 from dataeval.metrics.stats.visualstats import VisualStatsOutput, VisualStatsProcessor
 from dataeval.output import Output, set_metadata
+from dataeval.utils.plot import channel_histogram_plot
 @dataclass(frozen=True)
-class DatasetStatsOutput(Output):
+class DatasetStatsOutput(Output, HistogramPlotMixin):
     """
-    Output class for :func:`datasetstats` stats metric
+    Output class for :func:`datasetstats` stats metric.
     This class represents the outputs of various stats functions against a single
     dataset, such that each index across all stat outputs are representative of
-    the same source image.  Modifying or mixing outputs will result in inaccurate
+    the same source image. Modifying or mixing outputs will result in inaccurate
     outlier calculations if not created correctly.
     Attributes
@@ -41,6 +42,8 @@ class DatasetStatsOutput(Output):
     visualstats: VisualStatsOutput
     labelstats: LabelStatsOutput | None = None
+    _excluded_keys = ["histogram", "percentiles"]
     def _outputs(self) -> list[Output]:
         return [s for s in (self.dimensionstats, self.pixelstats, self.visualstats, self.labelstats) if s is not None]
@@ -53,14 +56,37 @@ class DatasetStatsOutput(Output):
             raise ValueError("All StatsOutput classes must contain the same number of image sources.")
+def _get_channels(cls, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None):
+    raw_channels = max([si.channel for si in cls.dict()["source_index"]]) + 1
+    if isinstance(channel_index, int):
+        max_channels = 1 if channel_index < raw_channels else raw_channels
+        ch_mask = cls.pixelstats.get_channel_mask(channel_index)
+    elif isinstance(channel_index, Iterable) and all(isinstance(val, int) for val in list(channel_index)):
+        max_channels = len(list(channel_index))
+        ch_mask = cls.pixelstats.get_channel_mask(channel_index)
+    elif isinstance(channel_limit, int):
+        max_channels = channel_limit
+        ch_mask = cls.pixelstats.get_channel_mask(None, channel_limit)
+    else:
+        max_channels = raw_channels
+        ch_mask = None
+    if max_channels > raw_channels:
+        max_channels = raw_channels
+    if ch_mask is not None and not any(ch_mask):
+        ch_mask = None
+    return max_channels, ch_mask
 @dataclass(frozen=True)
 class ChannelStatsOutput(Output):
     """
-    Output class for :func:`channelstats` stats metric
+    Output class for :func:`channelstats` stats metric.
     This class represents the outputs of various per-channel stats functions against
     a single dataset, such that each index across all stat outputs are representative
-    of the same source image.  Modifying or mixing outputs will result in inaccurate
+    of the same source image. Modifying or mixing outputs will result in inaccurate
     outlier calculations if not created correctly.
     Attributes
@@ -83,6 +109,13 @@ class ChannelStatsOutput(Output):
         if not all(length == lengths[0] for length in lengths):
             raise ValueError("All StatsOutput classes must contain the same number of image sources.")
+    def plot(
+        self, log: bool, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None
+    ) -> None:
+        max_channels, ch_mask = _get_channels(self, channel_limit, channel_index)
+        data_dict = {k: v for k, v in self.dict().items() if _is_plottable(k, v, ("histogram", "percentiles"))}
+        channel_histogram_plot(data_dict, log, max_channels, ch_mask)
 @set_metadata
 def datasetstats(
@@ -91,7 +124,7 @@ def datasetstats(
     labels: Iterable[ArrayLike] | None = None,
 ) -> DatasetStatsOutput:
     """
-    Calculates various :term:`statistics<Statistics>` for each image
+    Calculates various :term:`statistics<Statistics>` for each image.
     This function computes dimension, pixel and visual metrics
     on the images or individual bounding boxes for each image as
@@ -119,13 +152,11 @@ def datasetstats(
     --------
     Calculating the dimension, pixel and visual stats for a dataset with bounding boxes
-    >>> stats = datasetstats(images, bboxes)
+    >>> stats = datasetstats(stats_images, bboxes)
     >>> print(stats.dimensionstats.aspect_ratio)
-    [ 0.864   0.5884 16.      1.143   1.692   0.5835  0.6665  2.555   1.3
-      0.8335  1.      0.6     0.522  15.      3.834   1.75    0.75    0.7   ]
-    >>> print(stats.visualstats.contrast)
-    [1.744   1.946   0.1164  0.0635  0.0633  0.06274 0.0429  0.0317  0.0317
-     0.02576 0.02081 0.02171 0.01915 0.01767 0.01799 0.01595 0.01433 0.01478]
+    [ 0.864   0.5884 16.      1.143   1.692   0.5835  0.6665  2.555   1.3   ]
+    >>> print(stats.visualstats.sharpness)
+    [4.04   4.434  0.2778 4.957  5.145  5.22   4.957  3.076  2.855 ]
     """
     outputs = run_stats(images, bboxes, False, [DimensionStatsProcessor, PixelStatsProcessor, VisualStatsProcessor])
     return DatasetStatsOutput(*outputs, labelstats=labelstats(labels) if labels else None)  # type: ignore
@@ -137,7 +168,7 @@ def channelstats(
     bboxes: Iterable[ArrayLike] | None = None,
 ) -> ChannelStatsOutput:
     """
-    Calculates various per-channel statistics for each image
+    Calculates various per-channel :term:`statistics` for each image.
     This function computes pixel and visual metrics on the images
     or individual bounding boxes for each image.
@@ -162,12 +193,10 @@ def channelstats(
     --------
     Calculating the per-channel pixel and visual stats for a dataset
-    >>> stats = channelstats(images)
+    >>> stats = channelstats(stats_images)
     >>> print(stats.visualstats.darkness)
-    [0.07495 0.1748  0.275   0.1047  0.11096 0.1172  0.2047  0.2109  0.2172
-     0.3047  0.311   0.3171  0.4048  0.411   0.4172  0.505   0.5107  0.517
-     0.6045  0.611   0.617   0.7046  0.711   0.7173  0.8047  0.811   0.8174
-     0.905   0.911   0.917  ]
+    [0.1499 0.3499 0.55   0.2094 0.2219 0.2344 0.4194 0.6094 0.622  0.6343
+     0.8154]
     """
     outputs = run_stats(images, bboxes, True, [PixelStatsProcessor, VisualStatsProcessor])
     return ChannelStatsOutput(*outputs)  # type: ignore

dataeval/metrics/stats/dimensionstats.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from __future__ import annotations
-__all__ = ["DimensionStatsOutput", "dimensionstats"]
+__all__ = []
 from dataclasses import dataclass
 from typing import Any, Callable, Iterable
@@ -8,15 +8,15 @@ from typing import Any, Callable, Iterable
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
-from dataeval.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
+from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
 from dataeval.output import set_metadata
 from dataeval.utils.image import get_bitdepth
 @dataclass(frozen=True)
-class DimensionStatsOutput(BaseStatsOutput):
+class DimensionStatsOutput(BaseStatsOutput, HistogramPlotMixin):
     """
-    Output class for :func:`dimensionstats` stats metric
+    Output class for :func:`dimensionstats` stats metric.
     Attributes
     ----------
@@ -79,7 +79,7 @@ def dimensionstats(
     bboxes: Iterable[ArrayLike] | None = None,
 ) -> DimensionStatsOutput:
     """
-    Calculates dimension :term:`statistics<Statistics>` for each image
+    Calculates dimension :term:`statistics<Statistics>` for each image.
     This function computes various dimensional metrics (e.g., width, height, channels)
     on the images or individual bounding boxes for each image.
@@ -106,10 +106,10 @@ def dimensionstats(
     --------
     Calculating the dimension statistics on the images, whose shape is (C, H, W)
-    >>> results = dimensionstats(images)
+    >>> results = dimensionstats(stats_images)
     >>> print(results.aspect_ratio)
-    [0.75  0.75  0.75  0.75  0.75  0.75  1.333 0.75  0.75  1.   ]
+    [1.     1.     1.333  1.     0.6665]
     >>> print(results.channels)
-    [1 1 1 1 1 1 3 1 1 3]
+    [3 3 1 3 1]
     """
     return run_stats(images, bboxes, False, [DimensionStatsProcessor])[0]

dataeval/metrics/stats/hashstats.py CHANGED Viewed

@@ -1,6 +1,8 @@
 from __future__ import annotations
-__all__ = ["HashStatsOutput", "hashstats"]
+import warnings
+__all__ = []
 from dataclasses import dataclass
 from typing import Callable, Iterable
@@ -23,7 +25,7 @@ MAX_FACTOR = 4
 @dataclass(frozen=True)
 class HashStatsOutput(BaseStatsOutput):
     """
-    Output class for :func:`hashstats` stats metric
+    Output class for :func:`hashstats` stats metric.
     Attributes
     ----------
@@ -41,7 +43,7 @@ def pchash(image: ArrayLike) -> str:
     """
     Performs a perceptual hash on an image by resizing to a square NxN image
     using the Lanczos algorithm where N is 32x32 or the largest multiple of
-    8 that is smaller than the input image dimensions.  The resampled image
+    8 that is smaller than the input image dimensions. The resampled image
     is compressed using a discrete cosine transform and the lowest frequency
     component is encoded as a bit array of greater or less than median value
     and returned as a hex string.
@@ -54,13 +56,15 @@ def pchash(image: ArrayLike) -> str:
     Returns
     -------
     str
-        The hex string hash of the image using perceptual hashing
+        The hex string hash of the image using perceptual hashing, or empty
+        string if the image is too small to be hashed
     """
     # Verify that the image is at least larger than an 8x8 image
     arr = as_numpy(image)
     min_dim = min(arr.shape[-2:])
     if min_dim < HASH_SIZE + 1:
-        raise ValueError(f"Image must be larger than {HASH_SIZE}x{HASH_SIZE} for fuzzy hashing.")
+        warnings.warn(f"Image must be larger than {HASH_SIZE}x{HASH_SIZE} for fuzzy hashing.")
+        return ""
     # Calculates the dimensions of the resized square image
     resize_dim = HASH_SIZE * min((min_dim - 1) // HASH_SIZE, MAX_FACTOR)
@@ -92,7 +96,7 @@ def pchash(image: ArrayLike) -> str:
 def xxhash(image: ArrayLike) -> str:
     """
     Performs a fast non-cryptographic hash using the xxhash algorithm
-    (xxhash.com) against the image as a flattened bytearray.  The hash
+    (xxhash.com) against the image as a flattened bytearray. The hash
     is returned as a hex string.
     Parameters
@@ -122,7 +126,7 @@ def hashstats(
     bboxes: Iterable[ArrayLike] | None = None,
 ) -> HashStatsOutput:
     """
-    Calculates hashes for each image
+    Calculates hashes for each image.
     This function computes hashes from the images including exact hashes and perception-based
     hashes. These hash values can be used to determine if images are exact or near matches.
@@ -147,10 +151,10 @@ def hashstats(
     --------
     Calculating the statistics on the images, whose shape is (C, H, W)
-    >>> results = hashstats(images)
+    >>> results = hashstats(stats_images)
     >>> print(results.xxhash)
-    ['a72434443d6e7336', 'efc12c2f14581d79', '4a1e03483a27d674', '3a3ecedbcf814226']
+    ['6274f837b34ed9f0', '256504fdb6e3d2a4', '7dd0c56ca8474fb0', '50956ad4592f5bbc', '5ba2354079d42aa5']
     >>> print(results.pchash)
-    ['8f25506af46a7c6a', '8000808000008080', '8e71f18e0ef18e0e', 'a956d6a956d6a928']
+    ['a666999999666666', 'e666999999266666', 'e666999966663299', 'e666999999266666', '96e91656e91616e9']
     """
     return run_stats(images, bboxes, False, [HashStatsProcessor])[0]

dataeval/metrics/stats/labelstats.py CHANGED Viewed

@@ -1,21 +1,26 @@
 from __future__ import annotations
-__all__ = ["LabelStatsOutput", "labelstats"]
+__all__ = []
+# import contextlib
 from collections import Counter, defaultdict
 from dataclasses import dataclass
 from typing import Any, Iterable, Mapping, TypeVar
+import numpy as np
 from numpy.typing import ArrayLike
-from dataeval.interop import to_numpy
+from dataeval.interop import as_numpy
 from dataeval.output import Output, set_metadata
+# with contextlib.suppress(ImportError):
+#     import pandas as pd
 @dataclass(frozen=True)
 class LabelStatsOutput(Output):
     """
-    Output class for :func:`labelstats` stats metric
+    Output class for :func:`labelstats` stats metric.
     Attributes
     ----------
@@ -46,6 +51,47 @@ class LabelStatsOutput(Output):
     class_count: int
     label_count: int
+    def to_table(self) -> str:
+        max_char = max(len(key) if isinstance(key, str) else key // 10 + 1 for key in self.label_counts_per_class)
+        max_char = max(max_char, 5)
+        max_label = max(list(self.label_counts_per_class.values()))
+        max_img = max(list(self.image_counts_per_label.values()))
+        max_num = int(np.ceil(np.log10(max(max_label, max_img))))
+        max_num = max(max_num, 11)
+        # Display basic counts
+        table_str = f"Class Count: {self.class_count}\n"
+        table_str += f"Label Count: {self.label_count}\n"
+        table_str += f"Average # Labels per Image: {round(np.mean(self.label_counts_per_image), 2)}\n"
+        table_str += "--------------------------------------\n"
+        # Display counts per class
+        table_str += f"{'Label':>{max_char}}: Total Count - Image Count\n"
+        for cls in self.label_counts_per_class:
+            table_str += f"{cls:>{max_char}}: {self.label_counts_per_class[cls]:^{max_num}} "
+            table_str += f"- {self.image_counts_per_label[cls]:^{max_num}}\n"
+        return table_str
+    # def to_dataframe(self) -> pd.DataFrame:
+    #     import pandas as pd
+    #     class_list = []
+    #     total_count = []
+    #     image_count = []
+    #     for cls in self.label_counts_per_class:
+    #         class_list.append(cls)
+    #         total_count.append(self.label_counts_per_class[cls])
+    #         image_count.append(self.image_counts_per_label[cls])
+    #     return pd.DataFrame(
+    #         {
+    #             "Label": class_list,
+    #             "Total Count": total_count,
+    #             "Image Count": image_count,
+    #         }
+    #     )
 TKey = TypeVar("TKey", int, str)
@@ -57,12 +103,47 @@ def sort(d: Mapping[TKey, Any]) -> dict[TKey, Any]:
     return dict(sorted(d.items(), key=lambda x: x[0]))
+def _ensure_2d(labels: Iterable[ArrayLike]) -> Iterable[ArrayLike]:
+    if isinstance(labels, np.ndarray):
+        return labels[:, None]
+    else:
+        return [[lbl] for lbl in labels]  # type: ignore
+def _get_list_depth(lst):
+    if isinstance(lst, list) and lst:
+        return 1 + max(_get_list_depth(item) for item in lst)
+    return 0
+def _check_labels_dimension(labels: Iterable[ArrayLike]) -> Iterable[ArrayLike]:
+    # Check for nested lists beyond 2 levels
+    if isinstance(labels, np.ndarray):
+        if labels.ndim == 1:
+            return _ensure_2d(labels)
+        elif labels.ndim == 2:
+            return labels
+        else:
+            raise ValueError("The label array must not have more than 2 dimensions.")
+    elif isinstance(labels, list):
+        depth = _get_list_depth(labels)
+        if depth == 1:
+            return _ensure_2d(labels)
+        elif depth == 2:
+            return labels
+        else:
+            raise ValueError("The label list must not be empty or have more than 2 levels of nesting.")
+    else:
+        raise TypeError("Labels must be either a NumPy array or a list.")
 @set_metadata
 def labelstats(
     labels: Iterable[ArrayLike],
 ) -> LabelStatsOutput:
     """
-    Calculates :term:`statistics<Statistics>` for data labels
+    Calculates :term:`statistics<Statistics>` for data labels.
     This function computes counting metrics (e.g., total per class, total per image)
     on the labels.
@@ -86,23 +167,25 @@ def labelstats(
     >>> stats = labelstats(labels)
     >>> stats.label_counts_per_class
-    {'chicken': 3, 'cow': 8, 'horse': 9, 'pig': 7, 'sheep': 7}
+    {'chicken': 12, 'cow': 5, 'horse': 4, 'pig': 7, 'sheep': 4}
     >>> stats.label_counts_per_image
-    [3, 2, 3, 4, 1, 5, 4, 4, 4, 4]
+    [3, 3, 5, 3, 2, 5, 5, 2, 2, 2]
     >>> stats.image_counts_per_label
-    {'chicken': 2, 'cow': 6, 'horse': 7, 'pig': 5, 'sheep': 7}
+    {'chicken': 8, 'cow': 4, 'horse': 4, 'pig': 7, 'sheep': 4}
     >>> (stats.image_count, stats.class_count, stats.label_count)
-    (10, 5, 34)
+    (10, 5, 32)
     """
     label_counts = Counter()
     image_counts = Counter()
     index_location = defaultdict(list[int])
     label_per_image: list[int] = []
-    for i, group in enumerate(labels):
-        # Count occurrences of each label in all sublists
-        group = to_numpy(group)
+    labels_2d = _check_labels_dimension(labels)
+    for i, group in enumerate(labels_2d):
+        group = as_numpy(group)
+        # Count occurrences of each label in all sublists
         label_counts.update(group)
         # Get the number of labels per image

dataeval/metrics/stats/pixelstats.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from __future__ import annotations
-__all__ = ["PixelStatsOutput", "pixelstats"]
+__all__ = []
 from dataclasses import dataclass
 from typing import Any, Callable, Iterable
@@ -9,14 +9,14 @@ import numpy as np
 from numpy.typing import ArrayLike, NDArray
 from scipy.stats import entropy, kurtosis, skew
-from dataeval.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
+from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
 from dataeval.output import set_metadata
 @dataclass(frozen=True)
-class PixelStatsOutput(BaseStatsOutput):
+class PixelStatsOutput(BaseStatsOutput, HistogramPlotMixin):
     """
-    Output class for :func:`pixelstats` stats metric
+    Output class for :func:`pixelstats` stats metric.
     Attributes
     ----------
@@ -44,11 +44,13 @@ class PixelStatsOutput(BaseStatsOutput):
     histogram: NDArray[np.uint32]
     entropy: NDArray[np.float16]
+    _excluded_keys = ["histogram"]
 class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
     output_class: type = PixelStatsOutput
     image_function_map: dict[str, Callable[[StatsProcessor[PixelStatsOutput]], Any]] = {
-        "mean": lambda self: np.mean(self.scaled),
+        "mean": lambda x: np.mean(x.scaled),
         "std": lambda x: np.std(x.scaled),
         "var": lambda x: np.var(x.scaled),
         "skew": lambda x: np.nan_to_num(skew(x.scaled.ravel())),
@@ -74,7 +76,7 @@ def pixelstats(
     per_channel: bool = False,
 ) -> PixelStatsOutput:
     """
-    Calculates pixel :term:`statistics<Statistics>` for each image
+    Calculates pixel :term:`statistics<Statistics>` for each image.
     This function computes various statistical metrics (e.g., mean, standard deviation, entropy)
     on the images as a whole.
@@ -106,15 +108,10 @@ def pixelstats(
     --------
     Calculating the statistics on the images, whose shape is (C, H, W)
-    >>> results = pixelstats(images)
+    >>> results = pixelstats(stats_images)
     >>> print(results.mean)
-    [0.04828 0.562   0.06726 0.09937 0.1315  0.1636  0.1957  0.2278  0.26
-     0.292   0.3242  0.3562  0.3884  0.4204  0.4526  0.4846  0.5166  0.549
-     0.581   0.6133  0.6455  0.6772  0.7095  0.7417  0.774   0.8057  0.838
-     0.87    0.9023  0.934  ]
+    [0.2903 0.2108 0.397  0.596  0.743 ]
     >>> print(results.entropy)
-    [3.238  3.303  0.8125 1.028  0.8223 1.046  0.8247 1.041  0.8203 1.012
-     0.812  0.9883 0.795  0.9243 0.9243 0.795  0.9907 0.8125 1.028  0.8223
-     1.046  0.8247 1.041  0.8203 1.012  0.812  0.9883 0.795  0.9243 0.9243]
+    [4.99  2.371 1.179 2.406 0.668]
     """
     return run_stats(images, bboxes, per_channel, [PixelStatsProcessor])[0]

dataeval 0.74.2__py3-none-any.whl → 0.76.0__py3-none-any.whl

dataeval 0.74.2__py3-none-any.whl → 0.76.0__py3-none-any.whl