PyPI - dataeval - Versions diffs - 0.81.0__py3-none-any.whl → 0.82.1__py3-none-any.whl - Mend

dataeval 0.81.0__py3-none-any.whl → 0.82.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (86) hide show

dataeval/__init__.py +1 -1
dataeval/config.py +68 -11
dataeval/detectors/drift/__init__.py +2 -2
dataeval/detectors/drift/_base.py +8 -64
dataeval/detectors/drift/_mmd.py +12 -38
dataeval/detectors/drift/_torch.py +7 -7
dataeval/detectors/drift/_uncertainty.py +6 -5
dataeval/detectors/drift/updates.py +20 -3
dataeval/detectors/linters/__init__.py +3 -2
dataeval/detectors/linters/duplicates.py +14 -46
dataeval/detectors/linters/outliers.py +25 -159
dataeval/detectors/ood/__init__.py +1 -1
dataeval/detectors/ood/ae.py +6 -5
dataeval/detectors/ood/base.py +2 -2
dataeval/detectors/ood/metadata_ood_mi.py +4 -6
dataeval/detectors/ood/mixin.py +3 -4
dataeval/detectors/ood/vae.py +3 -2
dataeval/metadata/__init__.py +2 -1
dataeval/metadata/_distance.py +134 -0
dataeval/metadata/_ood.py +30 -49
dataeval/metadata/_utils.py +44 -0
dataeval/metrics/bias/__init__.py +5 -4
dataeval/metrics/bias/_balance.py +17 -149
dataeval/metrics/bias/_coverage.py +4 -106
dataeval/metrics/bias/_diversity.py +12 -107
dataeval/metrics/bias/_parity.py +7 -71
dataeval/metrics/estimators/__init__.py +5 -4
dataeval/metrics/estimators/_ber.py +2 -20
dataeval/metrics/estimators/_clusterer.py +1 -61
dataeval/metrics/estimators/_divergence.py +2 -19
dataeval/metrics/estimators/_uap.py +2 -16
dataeval/metrics/stats/__init__.py +15 -12
dataeval/metrics/stats/_base.py +41 -128
dataeval/metrics/stats/_boxratiostats.py +13 -13
dataeval/metrics/stats/_dimensionstats.py +17 -58
dataeval/metrics/stats/_hashstats.py +19 -35
dataeval/metrics/stats/_imagestats.py +94 -0
dataeval/metrics/stats/_labelstats.py +42 -121
dataeval/metrics/stats/_pixelstats.py +19 -51
dataeval/metrics/stats/_visualstats.py +19 -51
dataeval/outputs/__init__.py +57 -0
dataeval/outputs/_base.py +182 -0
dataeval/outputs/_bias.py +381 -0
dataeval/outputs/_drift.py +83 -0
dataeval/outputs/_estimators.py +114 -0
dataeval/outputs/_linters.py +186 -0
dataeval/outputs/_metadata.py +54 -0
dataeval/{detectors/ood/output.py → outputs/_ood.py} +22 -22
dataeval/outputs/_stats.py +393 -0
dataeval/outputs/_utils.py +44 -0
dataeval/outputs/_workflows.py +364 -0
dataeval/typing.py +187 -7
dataeval/utils/_method.py +1 -5
dataeval/utils/_plot.py +2 -2
dataeval/utils/data/__init__.py +5 -1
dataeval/utils/data/_dataset.py +217 -0
dataeval/utils/data/_embeddings.py +12 -14
dataeval/utils/data/_images.py +30 -27
dataeval/utils/data/_metadata.py +28 -11
dataeval/utils/data/_selection.py +25 -22
dataeval/utils/data/_split.py +5 -29
dataeval/utils/data/_targets.py +14 -2
dataeval/utils/data/datasets/_base.py +5 -5
dataeval/utils/data/datasets/_cifar10.py +1 -1
dataeval/utils/data/datasets/_milco.py +1 -1
dataeval/utils/data/datasets/_mnist.py +1 -1
dataeval/utils/data/datasets/_ships.py +1 -1
dataeval/utils/data/{_types.py → datasets/_types.py} +10 -16
dataeval/utils/data/datasets/_voc.py +1 -1
dataeval/utils/data/selections/_classfilter.py +4 -5
dataeval/utils/data/selections/_indices.py +2 -2
dataeval/utils/data/selections/_limit.py +2 -2
dataeval/utils/data/selections/_reverse.py +2 -2
dataeval/utils/data/selections/_shuffle.py +2 -2
dataeval/utils/torch/_internal.py +5 -5
dataeval/utils/torch/trainer.py +8 -8
dataeval/workflows/__init__.py +2 -1
dataeval/workflows/sufficiency.py +6 -342
{dataeval-0.81.0.dist-info → dataeval-0.82.1.dist-info}/METADATA +2 -2
dataeval-0.82.1.dist-info/RECORD +105 -0
dataeval/_output.py +0 -137
dataeval/detectors/ood/metadata_ks_compare.py +0 -129
dataeval/metrics/stats/_datasetstats.py +0 -198
dataeval-0.81.0.dist-info/RECORD +0 -94
{dataeval-0.81.0.dist-info → dataeval-0.82.1.dist-info}/LICENSE.txt +0 -0
{dataeval-0.81.0.dist-info → dataeval-0.82.1.dist-info}/WHEEL +0 -0

dataeval/metrics/stats/_base.py CHANGED Viewed

@@ -4,35 +4,24 @@ __all__ = []
 import re
 import warnings
+from collections import ChainMap
 from copy import deepcopy
 from dataclasses import dataclass
 from functools import partial
-from itertools import repeat
 from multiprocessing import Pool
-from typing import Any, Callable, Generic, Iterable, Optional, Sequence, Sized, TypeVar, Union
+from typing import Any, Callable, Generic, Iterable, Sequence, TypeVar, cast
 import numpy as np
 import tqdm
 from numpy.typing import NDArray
-from dataeval._output import Output
 from dataeval.config import get_max_processes
-from dataeval.typing import ArrayLike
-from dataeval.utils._array import to_numpy_iter
+from dataeval.outputs._stats import BaseStatsOutput, SourceIndex
+from dataeval.typing import ArrayLike, Dataset, ObjectDetectionTarget
+from dataeval.utils._array import to_numpy
 from dataeval.utils._image import normalize_image_shape, rescale
-from dataeval.utils._plot import histogram_plot
 DTYPE_REGEX = re.compile(r"NDArray\[np\.(.*?)\]")
-SOURCE_INDEX = "source_index"
-BOX_COUNT = "box_count"
-OptionalRange = Optional[Union[int, Iterable[int]]]
-def matches(index: int | None, opt_range: OptionalRange) -> bool:
-    if index is None or opt_range is None:
-        return True
-    return index in opt_range if isinstance(opt_range, Iterable) else index == opt_range
 def normalize_box_shape(bounding_box: NDArray[Any]) -> NDArray[Any]:
@@ -48,87 +37,6 @@ def normalize_box_shape(bounding_box: NDArray[Any]) -> NDArray[Any]:
         return bounding_box
-@dataclass
-class SourceIndex:
-    """
-    Attributes
-    ----------
-    image: int
-        Index of the source image
-    box : int | None
-        Index of the box of the source image
-    channel : int | None
-        Index of the channel of the source image
-    """
-    image: int
-    box: int | None
-    channel: int | None
-@dataclass(frozen=True)
-class BaseStatsOutput(Output):
-    """
-    Attributes
-    ----------
-    source_index : List[SourceIndex]
-        Mapping from statistic to source image, box and channel index
-    box_count : NDArray[np.uint16]
-    """
-    source_index: list[SourceIndex]
-    box_count: NDArray[np.uint16]
-    def get_channel_mask(
-        self,
-        channel_index: OptionalRange,
-        channel_count: OptionalRange = None,
-    ) -> list[bool]:
-        """
-        Boolean mask for results filtered to specified channel index and optionally the count
-        of the channels per image.
-        Parameters
-        ----------
-        channel_index : int | Iterable[int] | None
-            Index or indices of channel(s) to filter for
-        channel_count : int | Iterable[int] | None
-            Optional count(s) of channels to filter for
-        """
-        mask: list[bool] = []
-        cur_mask: list[bool] = []
-        cur_image = 0
-        cur_max_channel = 0
-        for source_index in list(self.source_index) + [None]:
-            if source_index is None or source_index.image > cur_image:
-                mask.extend(cur_mask if matches(cur_max_channel + 1, channel_count) else [False for _ in cur_mask])
-                if source_index is not None:
-                    cur_image = source_index.image
-                    cur_max_channel = 0
-                    cur_mask.clear()
-            if source_index is not None:
-                cur_mask.append(matches(source_index.channel, channel_index))
-                cur_max_channel = max(cur_max_channel, source_index.channel or 0)
-        return mask
-    def __len__(self) -> int:
-        return len(self.source_index)
-def _is_plottable(k: str, v: Any, excluded_keys: Iterable[str]) -> bool:
-    return isinstance(v, np.ndarray) and v[v != 0].size > 0 and all(k != x for x in excluded_keys)
-class HistogramPlotMixin:
-    _excluded_keys: Iterable[str] = []
-    def dict(self) -> dict[str, Any]: ...
-    def plot(self, log: bool) -> None:
-        data_dict = {k: v for k, v in self.dict().items() if _is_plottable(k, v, self._excluded_keys)}
-        histogram_plot(data_dict, log)
 TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput, covariant=True)
@@ -193,10 +101,9 @@ class StatsProcessor(Generic[TStatsOutput]):
         cls, source: dict[str, Any], source_index: list[SourceIndex], box_count: list[int]
     ) -> TStatsOutput:
         output = {}
-        for key in source:
-            if key not in cls.output_class.__annotations__:
-                continue
-            stat_type: str = cls.output_class.__annotations__[key]
+        attrs = dict(ChainMap(*(getattr(c, "__annotations__", {}) for c in cls.output_class.__mro__)))
+        for key in (key for key in source if key in attrs):
+            stat_type: str = attrs[key]
             dtype_match = re.match(DTYPE_REGEX, stat_type)
             if dtype_match is not None:
                 output[key] = np.asarray(source[key], dtype=np.dtype(dtype_match.group(1)))
@@ -215,16 +122,20 @@ class StatsProcessorOutput:
 def process_stats(
     i: int,
-    image_boxes: tuple[NDArray[Any], NDArray[Any] | None],
+    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    per_box: bool,
     per_channel: bool,
     stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
 ) -> StatsProcessorOutput:
-    image, boxes = image_boxes
+    data = dataset[i]
+    image, target = (to_numpy(cast(ArrayLike, data[0])), data[1]) if isinstance(data, tuple) else (to_numpy(data), None)
+    target = None if not isinstance(target, ObjectDetectionTarget) else target
+    boxes = to_numpy(target.boxes) if target is not None else None
     results_list: list[dict[str, Any]] = []
     source_indices: list[SourceIndex] = []
     box_counts: list[int] = []
     warnings_list: list[str] = []
-    nboxes = [None] if boxes is None else normalize_box_shape(boxes)
+    nboxes = [None] if boxes is None or not per_box else normalize_box_shape(boxes)
     for i_b, box in enumerate(nboxes):
         i_b = None if box is None else i_b
         processor_list = [p(image, box, per_channel) for p in stats_processor_cls]
@@ -232,7 +143,7 @@ def process_stats(
             warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} is out of bounds of {image.shape}.")
         results_list.append({k: v for p in processor_list for k, v in p.process().items()})
         if per_channel:
-            source_indices.extend([SourceIndex(i, i_b, c) for c in range(image_boxes[0].shape[-3])])
+            source_indices.extend([SourceIndex(i, i_b, c) for c in range(image.shape[-3])])
         else:
             source_indices.append(SourceIndex(i, i_b, None))
     box_counts.append(0 if boxes is None else len(boxes))
@@ -240,16 +151,18 @@ def process_stats(
 def process_stats_unpack(
-    args: tuple[int, tuple[NDArray[Any], NDArray[Any] | None]],
+    i: int,
+    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    per_box: bool,
     per_channel: bool,
     stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
 ) -> StatsProcessorOutput:
-    return process_stats(*args, per_channel=per_channel, stats_processor_cls=stats_processor_cls)
+    return process_stats(i, dataset, per_box=per_box, per_channel=per_channel, stats_processor_cls=stats_processor_cls)
 def run_stats(
-    images: Iterable[ArrayLike],
-    bboxes: Iterable[ArrayLike] | None,
+    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    per_box: bool,
     per_channel: bool,
     stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
 ) -> list[TStatsOutput]:
@@ -262,13 +175,11 @@ def run_stats(
     Parameters
     ----------
-    images : Iterable[ArrayLike]
-        An iterable of images (e.g., list of arrays), where each image is represented as an
-        array-like structure (e.g., :term:`NumPy` arrays).
-    bboxes : Iterable[ArrayLike]
-        An iterable of bounding boxes (e.g. list of arrays) where each bounding box is represented
-        as an array-like structure in the format of (X0, Y0, X1, Y1). The length of the bounding boxes
-        iterable should match the length of the input images.
+    data : Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]]
+        A dataset of images and targets to compute statistics on.
+    per_box : bool
+        A flag which determines if the statistics should be evaluated on a per-box basis or not.
+        If the dataset does not include bounding boxes, this flag is ignored.
     per_channel : bool
         A flag which determines if the states should be evaluated on a per-channel basis or not.
     stats_processor_cls : Iterable[type[StatsProcessor]]
@@ -276,10 +187,8 @@ def run_stats(
     Returns
     -------
-    dict[str, NDArray]]
-        A dictionary containing the computed statistics for each image.
-        The dictionary keys correspond to the names of the statistics, and the values are :term:`NumPy` arrays
-        with the results of the computations.
+    list[TStatsOutput]
+        A list of output classes containing the computed statistics
     Note
     ----
@@ -293,20 +202,24 @@ def run_stats(
     results_list: list[dict[str, NDArray[np.float64]]] = []
     source_index: list[SourceIndex] = []
     box_count: list[int] = []
-    bbox_iter = repeat(None) if bboxes is None else to_numpy_iter(bboxes)
     warning_list = []
-    total_for_status = len(images) if isinstance(images, Sized) else None
     stats_processor_cls = stats_processor_cls if isinstance(stats_processor_cls, Iterable) else [stats_processor_cls]
     # TODO: Introduce global controls for CPU job parallelism and GPU configurations
     with Pool(processes=get_max_processes()) as p:
         for r in tqdm.tqdm(
             p.imap(
-                partial(process_stats_unpack, per_channel=per_channel, stats_processor_cls=stats_processor_cls),
-                enumerate(zip(to_numpy_iter(images), bbox_iter)),
+                partial(
+                    process_stats_unpack,
+                    dataset=dataset,
+                    per_box=per_box,
+                    per_channel=per_channel,
+                    stats_processor_cls=stats_processor_cls,
+                ),
+                range(len(dataset)),
             ),
-            total=total_for_status,
+            total=len(dataset),
         ):
             results_list.extend(r.results)
             source_index.extend(r.source_indices)
@@ -335,13 +248,13 @@ def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
     if type(a) is not type(b):
         raise TypeError(f"Types {type(a)} and {type(b)} cannot be added.")
-    sum_dict = deepcopy(a.dict())
+    sum_dict = deepcopy(a.data())
     for k in sum_dict:
         if isinstance(sum_dict[k], list):
-            sum_dict[k].extend(b.dict()[k])
+            sum_dict[k].extend(b.data()[k])
         else:
-            sum_dict[k] = np.concatenate((sum_dict[k], b.dict()[k]))
+            sum_dict[k] = np.concatenate((sum_dict[k], b.data()[k]))
     return type(a)(**sum_dict)

dataeval/metrics/stats/_boxratiostats.py CHANGED Viewed

@@ -8,9 +8,8 @@ from typing import Any, Callable, Generic, TypeVar, cast
 import numpy as np
 from numpy.typing import NDArray
-from dataeval._output import set_metadata
-from dataeval.metrics.stats._base import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput
-from dataeval.metrics.stats._dimensionstats import DimensionStatsOutput
+from dataeval.outputs._base import set_metadata
+from dataeval.outputs._stats import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput, DimensionStatsOutput
 TStatOutput = TypeVar("TStatOutput", bound=BaseStatsOutput, contravariant=True)
 ArraySlice = tuple[int, int]
@@ -125,27 +124,28 @@ def boxratiostats(
     Examples
     --------
-    Calculating the box ratio statistics using the dimension stats of the boxes and images
+    Calculate the box ratio statistics using the dimension stats of the images and boxes
+    on a dataset containing 15 targets.
     >>> from dataeval.metrics.stats import dimensionstats
-    >>> imagestats = dimensionstats(stats_images)
-    >>> boxstats = dimensionstats(stats_images, bboxes)
+    >>> imagestats = dimensionstats(dataset, per_box=False)
+    >>> boxstats = dimensionstats(dataset, per_box=True)
     >>> ratiostats = boxratiostats(boxstats, imagestats)
     >>> print(ratiostats.aspect_ratio)
-    [ 0.86376953  0.58837891 16.          0.85714286  1.26959707  0.43772894
-      0.66650391  3.83296703  1.95018315]
+    [ 0.864  0.588 16.     0.857  1.27   0.438  0.667  3.833  1.95   0.833
+      1.     0.6    0.522 15.     3.834]
     >>> print(ratiostats.size)
-    [0.0255127  0.01037598 0.00097656 0.01822917 0.02327474 0.00683594
-     0.00915527 0.03369141 0.02115885]
+    [0.026 0.01  0.001 0.018 0.023 0.007 0.009 0.034 0.021 0.007 0.001 0.008
+     0.017 0.001 0.008]
     """
     output_cls = type(boxstats)
     if type(boxstats) is not type(imgstats):
         raise TypeError("Must provide stats outputs of the same type.")
     if boxstats.source_index[-1].image != imgstats.source_index[-1].image:
         raise ValueError("Stats index_map length mismatch. Check if the correct box and image stats were provided.")
-    if all(count == 0 for count in boxstats.box_count):
+    if any(src_idx.box is None for src_idx in boxstats.source_index):
         raise ValueError("Input for boxstats must contain box information.")
-    if any(count != 0 for count in imgstats.box_count):
+    if any(src_idx.box is not None for src_idx in imgstats.source_index):
         raise ValueError("Input for imgstats must not contain box information.")
     boxstats_has_channels = any(si.channel is None for si in boxstats.source_index)
     imgstats_has_channels = any(si.channel is None for si in imgstats.source_index)
@@ -153,7 +153,7 @@ def boxratiostats(
         raise ValueError("Input for boxstats and imgstats must have matching channel information.")
     output_dict = {}
-    for key in boxstats.dict():
+    for key in boxstats.data():
         output_dict[key] = calculate_ratios(key, boxstats, imgstats)
     return output_cls(**output_dict)

dataeval/metrics/stats/_dimensionstats.py CHANGED Viewed

@@ -2,59 +2,17 @@ from __future__ import annotations
 __all__ = []
-from dataclasses import dataclass
-from typing import Any, Callable, Iterable
+from typing import Any, Callable
 import numpy as np
-from numpy.typing import NDArray
-from dataeval._output import set_metadata
-from dataeval.metrics.stats._base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
-from dataeval.typing import ArrayLike
+from dataeval.metrics.stats._base import StatsProcessor, run_stats
+from dataeval.outputs import DimensionStatsOutput
+from dataeval.outputs._base import set_metadata
+from dataeval.typing import ArrayLike, Dataset
 from dataeval.utils._image import get_bitdepth
-@dataclass(frozen=True)
-class DimensionStatsOutput(BaseStatsOutput, HistogramPlotMixin):
-    """
-    Output class for :func:`.dimensionstats` stats metric.
-    Attributes
-    ----------
-    left : NDArray[np.int32]
-        Offsets from the left edge of images in pixels
-    top : NDArray[np.int32]
-        Offsets from the top edge of images in pixels
-    width : NDArray[np.uint32]
-        Width of the images in pixels
-    height : NDArray[np.uint32]
-        Height of the images in pixels
-    channels : NDArray[np.uint8]
-        Channel count of the images in pixels
-    size : NDArray[np.uint32]
-        Size of the images in pixels
-    aspect_ratio : NDArray[np.float16]
-        :term:`ASspect Ratio<Aspect Ratio>` of the images (width/height)
-    depth : NDArray[np.uint8]
-        Color depth of the images in bits
-    center : NDArray[np.uint16]
-        Offset from center in [x,y] coordinates of the images in pixels
-    distance : NDArray[np.float16]
-        Distance in pixels from center
-    """
-    left: NDArray[np.int32]
-    top: NDArray[np.int32]
-    width: NDArray[np.uint32]
-    height: NDArray[np.uint32]
-    channels: NDArray[np.uint8]
-    size: NDArray[np.uint32]
-    aspect_ratio: NDArray[np.float16]
-    depth: NDArray[np.uint8]
-    center: NDArray[np.int16]
-    distance: NDArray[np.float16]
 class DimensionStatsProcessor(StatsProcessor[DimensionStatsOutput]):
     output_class: type = DimensionStatsOutput
     image_function_map: dict[str, Callable[[StatsProcessor[DimensionStatsOutput]], Any]] = {
@@ -76,8 +34,9 @@ class DimensionStatsProcessor(StatsProcessor[DimensionStatsOutput]):
 @set_metadata
 def dimensionstats(
-    images: Iterable[ArrayLike],
-    bboxes: Iterable[ArrayLike] | None = None,
+    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    *,
+    per_box: bool = False,
 ) -> DimensionStatsOutput:
     """
     Calculates dimension :term:`statistics<Statistics>` for each image.
@@ -87,10 +46,10 @@ def dimensionstats(
     Parameters
     ----------
-    images : Iterable[ArrayLike]
-        Images to perform calculations on
-    bboxes : Iterable[ArrayLike] or None
-        Bounding boxes in `xyxy` format for each image to perform calculations on
+    dataset : Dataset
+        Dataset to perform calculations on.
+    per_box : bool, default False
+        If True, perform calculations on each bounding box.
     Returns
     -------
@@ -105,12 +64,12 @@ def dimensionstats(
     Examples
     --------
-    Calculating the dimension statistics on the images, whose shape is (C, H, W)
+    Calculate the dimension statistics of a dataset of 8 images, whose shape is (C, H, W).
-    >>> results = dimensionstats(stats_images)
+    >>> results = dimensionstats(dataset)
     >>> print(results.aspect_ratio)
-    [1.     1.     1.333  1.     0.6665]
+    [1.    1.    1.333 1.    0.667 1.    1.    1.   ]
     >>> print(results.channels)
-    [3 3 1 3 1]
+    [3 3 1 3 1 3 3 3]
     """
-    return run_stats(images, bboxes, False, [DimensionStatsProcessor])[0]
+    return run_stats(dataset, per_box, False, [DimensionStatsProcessor])[0]

dataeval/metrics/stats/_hashstats.py CHANGED Viewed

@@ -4,17 +4,17 @@ import warnings
 __all__ = []
-from dataclasses import dataclass
-from typing import Callable, Iterable
+from typing import Any, Callable
 import numpy as np
 import xxhash as xxh
 from PIL import Image
 from scipy.fftpack import dct
-from dataeval._output import set_metadata
-from dataeval.metrics.stats._base import BaseStatsOutput, StatsProcessor, run_stats
-from dataeval.typing import ArrayLike
+from dataeval.metrics.stats._base import StatsProcessor, run_stats
+from dataeval.outputs import HashStatsOutput
+from dataeval.outputs._base import set_metadata
+from dataeval.typing import ArrayLike, Dataset
 from dataeval.utils._array import as_numpy
 from dataeval.utils._image import normalize_image_shape, rescale
@@ -22,23 +22,6 @@ HASH_SIZE = 8
 MAX_FACTOR = 4
-@dataclass(frozen=True)
-class HashStatsOutput(BaseStatsOutput):
-    """
-    Output class for :func:`.hashstats` stats metric.
-    Attributes
-    ----------
-    xxhash : List[str]
-        xxHash hash of the images as a hex string
-    pchash : List[str]
-        :term:`Perception-based Hash` of the images as a hex string
-    """
-    xxhash: list[str]
-    pchash: list[str]
 def pchash(image: ArrayLike) -> str:
     """
     Performs a perceptual hash on an image by resizing to a square NxN image
@@ -122,8 +105,9 @@ class HashStatsProcessor(StatsProcessor[HashStatsOutput]):
 @set_metadata
 def hashstats(
-    images: Iterable[ArrayLike],
-    bboxes: Iterable[ArrayLike] | None = None,
+    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    *,
+    per_box: bool = False,
 ) -> HashStatsOutput:
     """
     Calculates hashes for each image.
@@ -133,10 +117,10 @@ def hashstats(
     Parameters
     ----------
-    images : ArrayLike
-        Images to hashing
-    bboxes : Iterable[ArrayLike] or None
-        Bounding boxes in `xyxy` format for each image
+    dataset : Dataset
+        Dataset to perform calculations on.
+    per_box : bool, default False
+        If True, perform calculations on each bounding box.
     Returns
     -------
@@ -149,12 +133,12 @@ def hashstats(
     Examples
     --------
-    Calculating the statistics on the images, whose shape is (C, H, W)
+    Calculate the hashes of a dataset of images, whose shape is (C, H, W)
-    >>> results = hashstats(stats_images)
-    >>> print(results.xxhash)
-    ['6274f837b34ed9f0', '256504fdb6e3d2a4', '7dd0c56ca8474fb0', '50956ad4592f5bbc', '5ba2354079d42aa5']
-    >>> print(results.pchash)
-    ['a666999999666666', 'e666999999266666', 'e666999966663299', 'e666999999266666', '96e91656e91616e9']
+    >>> results = hashstats(dataset)
+    >>> print(results.xxhash[:5])
+    ['66a93f556577c086', 'd8b686fb405c4105', '7ffdb4990ad44ac6', '42cd4c34c80f6006', 'c5519e36ac1f8839']
+    >>> print(results.pchash[:5])
+    ['e666999999266666', 'e666999999266666', 'e666999966666299', 'e666999999266666', '96e91656e91616e9']
     """
-    return run_stats(images, bboxes, False, [HashStatsProcessor])[0]
+    return run_stats(dataset, per_box, False, [HashStatsProcessor])[0]

dataeval/metrics/stats/_imagestats.py ADDED Viewed

@@ -0,0 +1,94 @@
+from __future__ import annotations
+__all__ = []
+from typing import Any, Literal, overload
+from dataeval.metrics.stats._base import run_stats
+from dataeval.metrics.stats._dimensionstats import DimensionStatsProcessor
+from dataeval.metrics.stats._pixelstats import PixelStatsProcessor
+from dataeval.metrics.stats._visualstats import VisualStatsProcessor
+from dataeval.outputs import ChannelStatsOutput, ImageStatsOutput
+from dataeval.outputs._base import set_metadata
+from dataeval.typing import ArrayLike, Dataset
+@overload
+def imagestats(
+    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    *,
+    per_box: bool = False,
+    per_channel: Literal[True],
+) -> ChannelStatsOutput: ...
+@overload
+def imagestats(
+    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    *,
+    per_box: bool = False,
+    per_channel: Literal[False] = False,
+) -> ImageStatsOutput: ...
+@set_metadata
+def imagestats(
+    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    *,
+    per_box: bool = False,
+    per_channel: bool = False,
+) -> ImageStatsOutput | ChannelStatsOutput:
+    """
+    Calculates various :term:`statistics<Statistics>` for each image.
+    This function computes dimension, pixel and visual metrics
+    on the images or individual bounding boxes for each image. If
+    performing calculations per channel dimension stats are excluded.
+    Parameters
+    ----------
+    dataset : Dataset
+        Dataset to perform calculations on.
+    per_box : bool, default False
+        If True, perform calculations on each bounding box.
+    per_channel : bool, default False
+        If True, perform calculations on each channel.
+    Returns
+    -------
+    ImageStatsOutput or ChannelStatsOutput
+        Output class containing the outputs of various stats functions
+    See Also
+    --------
+    dimensionstats, pixelstats, visualstats
+    Examples
+    --------
+    Calculate dimension, pixel and visual statistics for a dataset containing 8
+    images.
+    >>> stats = imagestats(dataset)
+    >>> print(stats.aspect_ratio)
+    [1.    1.    1.333 1.    0.667 1.    1.    1.   ]
+    >>> print(stats.sharpness)
+    [20.23 20.23 23.33 20.23 77.06 20.23 20.23 20.23]
+    Calculate the pixel and visual stats for a dataset containing 6 3-channel
+    images and 2 1-channel images for a total of 20 channels.
+    >>> ch_stats = imagestats(dataset, per_channel=True)
+    >>> print(ch_stats.brightness)
+    [0.027 0.152 0.277 0.127 0.135 0.142 0.259 0.377 0.385 0.392 0.508 0.626
+     0.634 0.642 0.751 0.759 0.767 0.876 0.884 0.892]
+    """
+    if per_channel:
+        processors = [PixelStatsProcessor, VisualStatsProcessor]
+        output_cls = ChannelStatsOutput
+    else:
+        processors = [DimensionStatsProcessor, PixelStatsProcessor, VisualStatsProcessor]
+        output_cls = ImageStatsOutput
+    outputs = run_stats(dataset, per_box, per_channel, processors)
+    return output_cls(**{k: v for d in outputs for k, v in d.data().items()})

dataeval 0.81.0__py3-none-any.whl → 0.82.1__py3-none-any.whl

dataeval 0.81.0__py3-none-any.whl → 0.82.1__py3-none-any.whl