dataeval 0.76.1__py3-none-any.whl → 0.82.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. dataeval/__init__.py +3 -3
  2. dataeval/config.py +77 -0
  3. dataeval/detectors/__init__.py +1 -1
  4. dataeval/detectors/drift/__init__.py +6 -6
  5. dataeval/detectors/drift/{base.py → _base.py} +40 -85
  6. dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
  7. dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
  8. dataeval/detectors/drift/{mmd.py → _mmd.py} +31 -43
  9. dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
  10. dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +24 -7
  11. dataeval/detectors/drift/updates.py +20 -3
  12. dataeval/detectors/linters/__init__.py +3 -5
  13. dataeval/detectors/linters/duplicates.py +13 -36
  14. dataeval/detectors/linters/outliers.py +23 -148
  15. dataeval/detectors/ood/__init__.py +1 -1
  16. dataeval/detectors/ood/ae.py +30 -9
  17. dataeval/detectors/ood/base.py +5 -4
  18. dataeval/detectors/ood/mixin.py +21 -7
  19. dataeval/detectors/ood/vae.py +73 -0
  20. dataeval/metadata/__init__.py +6 -0
  21. dataeval/metadata/_distance.py +167 -0
  22. dataeval/metadata/_ood.py +217 -0
  23. dataeval/metadata/_utils.py +44 -0
  24. dataeval/metrics/__init__.py +1 -1
  25. dataeval/metrics/bias/__init__.py +6 -4
  26. dataeval/metrics/bias/{balance.py → _balance.py} +15 -101
  27. dataeval/metrics/bias/_coverage.py +98 -0
  28. dataeval/metrics/bias/{diversity.py → _diversity.py} +18 -111
  29. dataeval/metrics/bias/{parity.py → _parity.py} +39 -77
  30. dataeval/metrics/estimators/__init__.py +15 -4
  31. dataeval/metrics/estimators/{ber.py → _ber.py} +42 -29
  32. dataeval/metrics/estimators/_clusterer.py +44 -0
  33. dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -30
  34. dataeval/metrics/estimators/{uap.py → _uap.py} +4 -18
  35. dataeval/metrics/stats/__init__.py +16 -13
  36. dataeval/metrics/stats/{base.py → _base.py} +82 -133
  37. dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +15 -18
  38. dataeval/metrics/stats/_dimensionstats.py +75 -0
  39. dataeval/metrics/stats/{hashstats.py → _hashstats.py} +21 -37
  40. dataeval/metrics/stats/_imagestats.py +94 -0
  41. dataeval/metrics/stats/_labelstats.py +131 -0
  42. dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +19 -50
  43. dataeval/metrics/stats/{visualstats.py → _visualstats.py} +23 -54
  44. dataeval/outputs/__init__.py +53 -0
  45. dataeval/{output.py → outputs/_base.py} +55 -25
  46. dataeval/outputs/_bias.py +381 -0
  47. dataeval/outputs/_drift.py +83 -0
  48. dataeval/outputs/_estimators.py +114 -0
  49. dataeval/outputs/_linters.py +184 -0
  50. dataeval/{detectors/ood/output.py → outputs/_ood.py} +22 -22
  51. dataeval/outputs/_stats.py +387 -0
  52. dataeval/outputs/_utils.py +44 -0
  53. dataeval/outputs/_workflows.py +364 -0
  54. dataeval/typing.py +234 -0
  55. dataeval/utils/__init__.py +2 -2
  56. dataeval/utils/_array.py +169 -0
  57. dataeval/utils/_bin.py +199 -0
  58. dataeval/utils/_clusterer.py +144 -0
  59. dataeval/utils/_fast_mst.py +189 -0
  60. dataeval/utils/{image.py → _image.py} +6 -4
  61. dataeval/utils/_method.py +14 -0
  62. dataeval/utils/{shared.py → _mst.py} +3 -65
  63. dataeval/utils/{plot.py → _plot.py} +6 -6
  64. dataeval/utils/data/__init__.py +26 -0
  65. dataeval/utils/data/_dataset.py +217 -0
  66. dataeval/utils/data/_embeddings.py +104 -0
  67. dataeval/utils/data/_images.py +68 -0
  68. dataeval/utils/data/_metadata.py +360 -0
  69. dataeval/utils/data/_selection.py +126 -0
  70. dataeval/utils/{dataset/split.py → data/_split.py} +12 -38
  71. dataeval/utils/data/_targets.py +85 -0
  72. dataeval/utils/data/collate.py +103 -0
  73. dataeval/utils/data/datasets/__init__.py +17 -0
  74. dataeval/utils/data/datasets/_base.py +254 -0
  75. dataeval/utils/data/datasets/_cifar10.py +134 -0
  76. dataeval/utils/data/datasets/_fileio.py +168 -0
  77. dataeval/utils/data/datasets/_milco.py +153 -0
  78. dataeval/utils/data/datasets/_mixin.py +56 -0
  79. dataeval/utils/data/datasets/_mnist.py +183 -0
  80. dataeval/utils/data/datasets/_ships.py +123 -0
  81. dataeval/utils/data/datasets/_types.py +52 -0
  82. dataeval/utils/data/datasets/_voc.py +352 -0
  83. dataeval/utils/data/selections/__init__.py +15 -0
  84. dataeval/utils/data/selections/_classfilter.py +57 -0
  85. dataeval/utils/data/selections/_indices.py +26 -0
  86. dataeval/utils/data/selections/_limit.py +26 -0
  87. dataeval/utils/data/selections/_reverse.py +18 -0
  88. dataeval/utils/data/selections/_shuffle.py +29 -0
  89. dataeval/utils/metadata.py +51 -376
  90. dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
  91. dataeval/utils/torch/{internal.py → _internal.py} +21 -51
  92. dataeval/utils/torch/models.py +43 -2
  93. dataeval/workflows/__init__.py +2 -1
  94. dataeval/workflows/sufficiency.py +11 -346
  95. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/METADATA +5 -2
  96. dataeval-0.82.0.dist-info/RECORD +104 -0
  97. dataeval/detectors/linters/clusterer.py +0 -512
  98. dataeval/detectors/linters/merged_stats.py +0 -49
  99. dataeval/detectors/ood/metadata_ks_compare.py +0 -129
  100. dataeval/detectors/ood/metadata_least_likely.py +0 -119
  101. dataeval/interop.py +0 -69
  102. dataeval/metrics/bias/coverage.py +0 -194
  103. dataeval/metrics/stats/datasetstats.py +0 -202
  104. dataeval/metrics/stats/dimensionstats.py +0 -115
  105. dataeval/metrics/stats/labelstats.py +0 -210
  106. dataeval/utils/dataset/__init__.py +0 -7
  107. dataeval/utils/dataset/datasets.py +0 -412
  108. dataeval/utils/dataset/read.py +0 -63
  109. dataeval-0.76.1.dist-info/RECORD +0 -67
  110. /dataeval/{log.py → _log.py} +0 -0
  111. /dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
  112. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/LICENSE.txt +0 -0
  113. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/WHEEL +0 -0
dataeval/metrics/estimators/{divergence.py → _divergence.py}

@@ -7,32 +7,17 @@ from __future__ import annotations
 
 __all__ = []
 
-from dataclasses import dataclass
 from typing import Literal
 
 import numpy as np
-from numpy.typing import ArrayLike, NDArray
+from numpy.typing import NDArray
 
-from dataeval.interop import as_numpy
-from dataeval.output import Output, set_metadata
-from dataeval.utils.shared import compute_neighbors, get_method, minimum_spanning_tree
-
-
-@dataclass(frozen=True)
-class DivergenceOutput(Output):
-    """
-    Output class for :func:`divergence` estimator metric.
-
-    Attributes
-    ----------
-    divergence : float
-        :term:`Divergence` value calculated between 2 datasets ranging between 0.0 and 1.0
-    errors : int
-        The number of differing edges between the datasets
-    """
-
-    divergence: float
-    errors: int
+from dataeval.outputs import DivergenceOutput
+from dataeval.outputs._base import set_metadata
+from dataeval.typing import ArrayLike
+from dataeval.utils._array import ensure_embeddings
+from dataeval.utils._method import get_method
+from dataeval.utils._mst import compute_neighbors, minimum_spanning_tree
 
 
 def divergence_mst(data: NDArray[np.float64], labels: NDArray[np.int_]) -> int:
@@ -78,18 +63,21 @@ def divergence_fnn(data: NDArray[np.float64], labels: NDArray[np.int_]) -> int:
     return errors
 
 
+_DIVERGENCE_FN_MAP = {"FNN": divergence_fnn, "MST": divergence_mst}
+
+
 @set_metadata
-def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST"] = "FNN") -> DivergenceOutput:
+def divergence(emb_a: ArrayLike, emb_b: ArrayLike, method: Literal["FNN", "MST"] = "FNN") -> DivergenceOutput:
     """
     Calculates the :term:`divergence` and any errors between the datasets.
 
     Parameters
     ----------
-    data_a : ArrayLike, shape - (N, P)
-        A dataset in an ArrayLike format to compare.
+    emb_a : ArrayLike, shape - (N, P)
+        Image embeddings in an ArrayLike format to compare.
         Function expects the data to have 2 dimensions, N number of observations in a P-dimensionial space.
-    data_b : ArrayLike, shape - (N, P)
-        A dataset in an ArrayLike format to compare.
+    emb_b : ArrayLike, shape - (N, P)
+        Image embeddings in an ArrayLike format to compare.
        Function expects the data to have 2 dimensions, N number of observations in a P-dimensionial space.
     method : Literal["MST, "FNN"], default "FNN"
        Method used to estimate dataset :term:`divergence<Divergence>`
@@ -125,9 +113,9 @@ def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST
     >>> divergence(datasetA, datasetB)
     DivergenceOutput(divergence=0.28, errors=36)
     """
-    div_fn = get_method({"FNN": divergence_fnn, "MST": divergence_mst}, method)
-    a = as_numpy(data_a)
-    b = as_numpy(data_b)
+    div_fn = get_method(_DIVERGENCE_FN_MAP, method)
+    a = ensure_embeddings(emb_a, dtype=np.float64)
+    b = ensure_embeddings(emb_b, dtype=np.float64)
     N = a.shape[0]
     M = b.shape[0]
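For orientation, a minimal sketch of calling the renamed API, assuming `divergence` is still re-exported from `dataeval.metrics.estimators` (the module itself is now the private `_divergence.py`) and that `ensure_embeddings` accepts plain float arrays; the embeddings below are placeholders:

import numpy as np

from dataeval.metrics.estimators import divergence  # assumed re-export path

# Placeholder embeddings: N observations in a P-dimensional space
emb_a = np.random.default_rng(0).normal(size=(128, 16))
emb_b = np.random.default_rng(1).normal(size=(128, 16))

result = divergence(emb_a, emb_b, method="FNN")
print(result.divergence, result.errors)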
 
dataeval/metrics/estimators/{uap.py → _uap.py}

@@ -8,27 +8,13 @@ from __future__ import annotations
 
 __all__ = []
 
-from dataclasses import dataclass
 
-from numpy.typing import ArrayLike
 from sklearn.metrics import average_precision_score
 
-from dataeval.interop import as_numpy
-from dataeval.output import Output, set_metadata
-
-
-@dataclass(frozen=True)
-class UAPOutput(Output):
-    """
-    Output class for :func:`uap` estimator metric.
-
-    Attributes
-    ----------
-    uap : float
-        The empirical mean precision estimate
-    """
-
-    uap: float
+from dataeval.outputs import UAPOutput
+from dataeval.outputs._base import set_metadata
+from dataeval.typing import ArrayLike
+from dataeval.utils._array import as_numpy
 
 
 @set_metadata
dataeval/metrics/stats/__init__.py

@@ -5,15 +5,14 @@ and label statistics against the images and labels of a dataset.
 
 __all__ = [
     "ChannelStatsOutput",
-    "DatasetStatsOutput",
+    "ImageStatsOutput",
     "DimensionStatsOutput",
     "HashStatsOutput",
     "LabelStatsOutput",
     "PixelStatsOutput",
     "VisualStatsOutput",
     "boxratiostats",
-    "channelstats",
-    "datasetstats",
+    "imagestats",
     "dimensionstats",
     "hashstats",
     "labelstats",
@@ -21,15 +20,19 @@ __all__ = [
     "visualstats",
 ]
 
-from dataeval.metrics.stats.boxratiostats import boxratiostats
-from dataeval.metrics.stats.datasetstats import (
+from dataeval.metrics.stats._boxratiostats import boxratiostats
+from dataeval.metrics.stats._dimensionstats import dimensionstats
+from dataeval.metrics.stats._hashstats import hashstats
+from dataeval.metrics.stats._imagestats import imagestats
+from dataeval.metrics.stats._labelstats import labelstats
+from dataeval.metrics.stats._pixelstats import pixelstats
+from dataeval.metrics.stats._visualstats import visualstats
+from dataeval.outputs._stats import (
     ChannelStatsOutput,
-    DatasetStatsOutput,
-    channelstats,
-    datasetstats,
+    DimensionStatsOutput,
+    HashStatsOutput,
+    ImageStatsOutput,
+    LabelStatsOutput,
+    PixelStatsOutput,
+    VisualStatsOutput,
 )
-from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput, dimensionstats
-from dataeval.metrics.stats.hashstats import HashStatsOutput, hashstats
-from dataeval.metrics.stats.labelstats import LabelStatsOutput, labelstats
-from dataeval.metrics.stats.pixelstats import PixelStatsOutput, pixelstats
-from dataeval.metrics.stats.visualstats import VisualStatsOutput, visualstats
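In user code, the `datasetstats`/`channelstats` entry points are folded into a single `imagestats` function. A before/after import sketch based only on the exported names in this hunk (behavioral equivalence of the new function is an assumption):

# dataeval 0.76.1
from dataeval.metrics.stats import DatasetStatsOutput, channelstats, datasetstats

# dataeval 0.82.0 -- replaced by a single imagestats entry point
from dataeval.metrics.stats import ImageStatsOutput, imagestats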
dataeval/metrics/stats/{base.py → _base.py}

@@ -1,39 +1,27 @@
 from __future__ import annotations
 
-from dataeval.utils.plot import histogram_plot
-
 __all__ = []
 
 import re
 import warnings
+from collections import ChainMap
+from copy import deepcopy
 from dataclasses import dataclass
 from functools import partial
-from itertools import repeat
 from multiprocessing import Pool
-from typing import Any, Callable, Generic, Iterable, NamedTuple, Optional, TypeVar, Union
+from typing import Any, Callable, Generic, Iterable, Sequence, TypeVar, cast
 
 import numpy as np
 import tqdm
-from numpy.typing import ArrayLike, NDArray
+from numpy.typing import NDArray
 
-from dataeval.interop import to_numpy_iter
-from dataeval.output import Output
-from dataeval.utils.image import normalize_image_shape, rescale
+from dataeval.config import get_max_processes
+from dataeval.outputs._stats import BaseStatsOutput, SourceIndex
+from dataeval.typing import ArrayLike, Dataset, ObjectDetectionTarget
+from dataeval.utils._array import to_numpy
+from dataeval.utils._image import normalize_image_shape, rescale
 
 DTYPE_REGEX = re.compile(r"NDArray\[np\.(.*?)\]")
-SOURCE_INDEX = "source_index"
-BOX_COUNT = "box_count"
-
-# TODO: Replace with global config
-DEFAULT_PROCESSES: int | None = None
-
-OptionalRange = Optional[Union[int, Iterable[int]]]
-
-
-def matches(index: int | None, opt_range: OptionalRange) -> bool:
-    if index is None or opt_range is None:
-        return True
-    return index in opt_range if isinstance(opt_range, Iterable) else index == opt_range
 
 
 def normalize_box_shape(bounding_box: NDArray[Any]) -> NDArray[Any]:
@@ -49,86 +37,6 @@ def normalize_box_shape(bounding_box: NDArray[Any]) -> NDArray[Any]:
     return bounding_box
 
 
-class SourceIndex(NamedTuple):
-    """
-    Attributes
-    ----------
-    image: int
-        Index of the source image
-    box : int | None
-        Index of the box of the source image
-    channel : int | None
-        Index of the channel of the source image
-    """
-
-    image: int
-    box: int | None
-    channel: int | None
-
-
-@dataclass(frozen=True)
-class BaseStatsOutput(Output):
-    """
-    Attributes
-    ----------
-    source_index : List[SourceIndex]
-        Mapping from statistic to source image, box and channel index
-    box_count : NDArray[np.uint16]
-    """
-
-    source_index: list[SourceIndex]
-    box_count: NDArray[np.uint16]
-
-    def get_channel_mask(
-        self,
-        channel_index: OptionalRange,
-        channel_count: OptionalRange = None,
-    ) -> list[bool]:
-        """
-        Boolean mask for results filtered to specified channel index and optionally the count
-        of the channels per image.
-
-        Parameters
-        ----------
-        channel_index : int | Iterable[int] | None
-            Index or indices of channel(s) to filter for
-        channel_count : int | Iterable[int] | None
-            Optional count(s) of channels to filter for
-        """
-        mask: list[bool] = []
-        cur_mask: list[bool] = []
-        cur_image = 0
-        cur_max_channel = 0
-        for source_index in list(self.source_index) + [None]:
-            if source_index is None or source_index.image > cur_image:
-                mask.extend(cur_mask if matches(cur_max_channel + 1, channel_count) else [False for _ in cur_mask])
-                if source_index is not None:
-                    cur_image = source_index.image
-                    cur_max_channel = 0
-                cur_mask.clear()
-            if source_index is not None:
-                cur_mask.append(matches(source_index.channel, channel_index))
-                cur_max_channel = max(cur_max_channel, source_index.channel or 0)
-        return mask
-
-    def __len__(self) -> int:
-        return len(self.source_index)
-
-
-def _is_plottable(k: str, v: Any, excluded_keys: Iterable[str]) -> bool:
-    return isinstance(v, np.ndarray) and v[v != 0].size > 0 and all(k != x for x in excluded_keys)
-
-
-class HistogramPlotMixin:
-    _excluded_keys: Iterable[str] = []
-
-    def dict(self) -> dict[str, Any]: ...
-
-    def plot(self, log: bool) -> None:
-        data_dict = {k: v for k, v in self.dict().items() if _is_plottable(k, v, self._excluded_keys)}
-        histogram_plot(data_dict, log)
-
-
 TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput, covariant=True)
 
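`SourceIndex` and `BaseStatsOutput` move out of this module and are now imported from `dataeval.outputs._stats`. A worked illustration of `get_channel_mask` as implemented in the removed 0.76.1 code above, assuming its behavior is unchanged after the move; `stats` is a hypothetical per-channel stats output:

# Hypothetical per-channel run over two images: image 0 has 3 channels, image 1 has 1.
# stats.source_index == [SourceIndex(0, None, 0), SourceIndex(0, None, 1),
#                        SourceIndex(0, None, 2), SourceIndex(1, None, 0)]
#
# stats.get_channel_mask(channel_index=0)                   -> [True, False, False, True]
# stats.get_channel_mask(channel_index=0, channel_count=3)  -> [True, False, False, False]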
 
@@ -193,10 +101,9 @@ class StatsProcessor(Generic[TStatsOutput]):
         cls, source: dict[str, Any], source_index: list[SourceIndex], box_count: list[int]
     ) -> TStatsOutput:
         output = {}
-        for key in source:
-            if key not in cls.output_class.__annotations__:
-                continue
-            stat_type: str = cls.output_class.__annotations__[key]
+        attrs = dict(ChainMap(*(getattr(c, "__annotations__", {}) for c in cls.output_class.__mro__)))
+        for key in (key for key in source if key in attrs):
+            stat_type: str = attrs[key]
             dtype_match = re.match(DTYPE_REGEX, stat_type)
             if dtype_match is not None:
                 output[key] = np.asarray(source[key], dtype=np.dtype(dtype_match.group(1)))
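The `ChainMap` rewrite lets `convert_output` see annotations inherited from base output classes rather than only those declared directly on `cls.output_class` (whose stat fields are annotated with `NDArray[...]` strings matched by `DTYPE_REGEX`). A self-contained sketch of the merged lookup, using plain types for brevity:

from collections import ChainMap
from dataclasses import dataclass


@dataclass(frozen=True)
class BaseOutput:
    source_index: list


@dataclass(frozen=True)
class MeanOutput(BaseOutput):
    mean: float


# A class's own __annotations__ only lists fields declared directly on it
print(MeanOutput.__annotations__)  # {'mean': <class 'float'>}

# Merging annotations across the MRO also recovers inherited fields;
# lookups prefer the subclass when names collide.
attrs = dict(ChainMap(*(getattr(c, "__annotations__", {}) for c in MeanOutput.__mro__)))
print(sorted(attrs))  # ['mean', 'source_index']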
@@ -205,7 +112,8 @@
         return cls.output_class(**output, source_index=source_index, box_count=np.asarray(box_count, dtype=np.uint16))
 
 
-class StatsProcessorOutput(NamedTuple):
+@dataclass
+class StatsProcessorOutput:
     results: list[dict[str, Any]]
     source_indices: list[SourceIndex]
     box_counts: list[int]
@@ -214,16 +122,20 @@
 
 def process_stats(
     i: int,
-    image_boxes: tuple[NDArray[Any], NDArray[Any] | None],
+    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    per_box: bool,
     per_channel: bool,
     stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
 ) -> StatsProcessorOutput:
-    image, boxes = image_boxes
+    data = dataset[i]
+    image, target = (to_numpy(cast(ArrayLike, data[0])), data[1]) if isinstance(data, tuple) else (to_numpy(data), None)
+    target = None if not isinstance(target, ObjectDetectionTarget) else target
+    boxes = to_numpy(target.boxes) if target is not None else None
     results_list: list[dict[str, Any]] = []
     source_indices: list[SourceIndex] = []
     box_counts: list[int] = []
     warnings_list: list[str] = []
-    nboxes = [None] if boxes is None else normalize_box_shape(boxes)
+    nboxes = [None] if boxes is None or not per_box else normalize_box_shape(boxes)
     for i_b, box in enumerate(nboxes):
         i_b = None if box is None else i_b
         processor_list = [p(image, box, per_channel) for p in stats_processor_cls]
@@ -231,7 +143,7 @@ def process_stats(
             warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} is out of bounds of {image.shape}.")
         results_list.append({k: v for p in processor_list for k, v in p.process().items()})
         if per_channel:
-            source_indices.extend([SourceIndex(i, i_b, c) for c in range(image_boxes[0].shape[-3])])
+            source_indices.extend([SourceIndex(i, i_b, c) for c in range(image.shape[-3])])
         else:
             source_indices.append(SourceIndex(i, i_b, None))
         box_counts.append(0 if boxes is None else len(boxes))
@@ -239,16 +151,18 @@
 
 
 def process_stats_unpack(
-    args: tuple[int, tuple[NDArray[Any], NDArray[Any] | None]],
+    i: int,
+    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    per_box: bool,
     per_channel: bool,
     stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
 ) -> StatsProcessorOutput:
-    return process_stats(*args, per_channel=per_channel, stats_processor_cls=stats_processor_cls)
+    return process_stats(i, dataset, per_box=per_box, per_channel=per_channel, stats_processor_cls=stats_processor_cls)
 
 
 def run_stats(
-    images: Iterable[ArrayLike],
-    bboxes: Iterable[ArrayLike] | None,
+    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    per_box: bool,
     per_channel: bool,
     stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
 ) -> list[TStatsOutput]:
@@ -261,26 +175,20 @@
 
     Parameters
     ----------
-    images : Iterable[ArrayLike]
-        An iterable of images (e.g., list of arrays), where each image is represented as an
-        array-like structure (e.g., :term:`NumPy` arrays).
-    bboxes : Iterable[ArrayLike]
-        An iterable of bounding boxes (e.g. list of arrays) where each bounding box is represented
-        as an array-like structure in the format of (X0, Y0, X1, Y1). The length of the bounding boxes
-        iterable should match the length of the input images.
+    data : Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]]
+        A dataset of images and targets to compute statistics on.
+    per_box : bool
+        A flag which determines if the statistics should be evaluated on a per-box basis or not.
+        If the dataset does not include bounding boxes, this flag is ignored.
     per_channel : bool
        A flag which determines if the states should be evaluated on a per-channel basis or not.
     stats_processor_cls : Iterable[type[StatsProcessor]]
        An iterable of stats processor classes that calculate stats and return output classes.
-    processes : int | None, default None
-        Number of processes to use, defaults to None which uses all available CPU cores.
 
     Returns
     -------
-    dict[str, NDArray]]
-        A dictionary containing the computed statistics for each image.
-        The dictionary keys correspond to the names of the statistics, and the values are :term:`NumPy` arrays
-        with the results of the computations.
+    list[TStatsOutput]
+        A list of output classes containing the computed statistics
 
     Note
     ----
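`run_stats` now indexes a `Dataset` directly instead of zipping image and bounding-box iterables. A minimal sketch of a dataset shape it could consume, based only on the accesses visible in `process_stats` above (`dataset[i]`, tuple unpacking, `target.boxes`); the `Dataset` and `ObjectDetectionTarget` protocols in `dataeval.typing` are not shown in this diff, so both classes below are assumptions:

import numpy as np


class FakeTarget:
    """Stand-in for an object detection target exposing (x0, y0, x1, y1) boxes."""

    def __init__(self, boxes: np.ndarray) -> None:
        self.boxes = boxes


class FakeDetectionDataset:
    """Index-able dataset yielding (image, target, metadata) triples."""

    def __init__(self, images: list[np.ndarray], boxes: list[np.ndarray]) -> None:
        self._images = images
        self._boxes = boxes

    def __len__(self) -> int:
        return len(self._images)

    def __getitem__(self, i: int):
        # process_stats treats the first tuple element as the image and the
        # second as the detection target; the third slot (metadata) is unused here.
        return self._images[i], FakeTarget(self._boxes[i]), {}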
@@ -294,20 +202,24 @@
     results_list: list[dict[str, NDArray[np.float64]]] = []
     source_index: list[SourceIndex] = []
     box_count: list[int] = []
-    bbox_iter = repeat(None) if bboxes is None else to_numpy_iter(bboxes)
 
     warning_list = []
-    total_for_status = getattr(images, "__len__")() if hasattr(images, "__len__") else None
     stats_processor_cls = stats_processor_cls if isinstance(stats_processor_cls, Iterable) else [stats_processor_cls]
 
     # TODO: Introduce global controls for CPU job parallelism and GPU configurations
-    with Pool(processes=DEFAULT_PROCESSES) as p:
+    with Pool(processes=get_max_processes()) as p:
         for r in tqdm.tqdm(
             p.imap(
-                partial(process_stats_unpack, per_channel=per_channel, stats_processor_cls=stats_processor_cls),
-                enumerate(zip(to_numpy_iter(images), bbox_iter)),
+                partial(
+                    process_stats_unpack,
+                    dataset=dataset,
+                    per_box=per_box,
+                    per_channel=per_channel,
+                    stats_processor_cls=stats_processor_cls,
+                ),
+                range(len(dataset)),
             ),
-            total=total_for_status,
+            total=len(dataset),
         ):
             results_list.extend(r.results)
             source_index.extend(r.source_indices)
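Worker-count configuration moves from the removed module-level `DEFAULT_PROCESSES` constant to the new `dataeval.config` module. A small sketch of the pass-through behavior; only `get_max_processes` is confirmed by this hunk, and its default return value is an assumption:

from multiprocessing import Pool

from dataeval.config import get_max_processes


def _square(x: int) -> int:
    return x * x


if __name__ == "__main__":
    # Pool(processes=None) falls back to os.cpu_count(), so a None return from
    # get_max_processes() would reproduce the old DEFAULT_PROCESSES = None behavior.
    with Pool(processes=get_max_processes()) as pool:
        print(pool.map(_square, range(4)))  # [0, 1, 4, 9]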
@@ -330,3 +242,40 @@
 
     outputs = [s.convert_output(output, source_index, box_count) for s in stats_processor_cls]
     return outputs
+
+
+def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
+    if type(a) is not type(b):
+        raise TypeError(f"Types {type(a)} and {type(b)} cannot be added.")
+
+    sum_dict = deepcopy(a.dict())
+
+    for k in sum_dict:
+        if isinstance(sum_dict[k], list):
+            sum_dict[k].extend(b.dict()[k])
+        else:
+            sum_dict[k] = np.concatenate((sum_dict[k], b.dict()[k]))
+
+    return type(a)(**sum_dict)
+
+
+def combine_stats(stats: Sequence[TStatsOutput]) -> tuple[TStatsOutput, list[int]]:
+    output = None
+    dataset_steps = []
+    cur_len = 0
+    for s in stats:
+        output = s if output is None else add_stats(output, s)
+        cur_len += len(s)
+        dataset_steps.append(cur_len)
+    if output is None:
+        raise TypeError("Cannot combine empty sequence of stats.")
+    return output, dataset_steps
+
+
+def get_dataset_step_from_idx(idx: int, dataset_steps: list[int]) -> tuple[int, int]:
+    last_step = 0
+    for i, step in enumerate(dataset_steps):
+        if idx < step:
+            return i, idx - last_step
+        last_step = step
+    return -1, idx
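The new helpers allow compatible stats outputs to be concatenated and rows of the combined result mapped back to their source. A sketch using the functions added above, which live in the internal `_base` module; `dataset_a`/`dataset_b` are placeholders for any dataset accepted by `dimensionstats`:

from dataeval.metrics.stats import dimensionstats
from dataeval.metrics.stats._base import combine_stats, get_dataset_step_from_idx

stats_a = dimensionstats(dataset_a)  # dataset_a / dataset_b: placeholder datasets
stats_b = dimensionstats(dataset_b)

combined, dataset_steps = combine_stats([stats_a, stats_b])
# dataset_steps holds cumulative lengths, e.g. [len(stats_a), len(stats_a) + len(stats_b)]

# Map row 5 of the combined output back to (which input, local index within it)
source, local_idx = get_dataset_step_from_idx(5, dataset_steps)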
dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py}

@@ -8,9 +8,8 @@ from typing import Any, Callable, Generic, TypeVar, cast
 import numpy as np
 from numpy.typing import NDArray
 
-from dataeval.metrics.stats.base import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput
-from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput
-from dataeval.output import set_metadata
+from dataeval.outputs._base import set_metadata
+from dataeval.outputs._stats import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput, DimensionStatsOutput
 
 TStatOutput = TypeVar("TStatOutput", bound=BaseStatsOutput, contravariant=True)
 ArraySlice = tuple[int, int]
@@ -50,7 +49,7 @@ RATIOSTATS_OVERRIDE_MAP: dict[type, dict[str, Callable[..., NDArray[Any]]]] = {
             "depth": lambda x: x.box["depth"],
             "distance": lambda x: x.box["distance"],
         }
-    )
+    ),
 }
 
 
@@ -87,11 +86,8 @@ def calculate_ratios(key: str, box_stats: BaseStatsOutput, img_stats: BaseStatsO
         stats = BoxImageStatsOutputSlice(box_stats, (box_i, box_j), img_stats, (img_i, img_j))
         out_type = type(box_stats)
         use_override = out_type in RATIOSTATS_OVERRIDE_MAP and key in RATIOSTATS_OVERRIDE_MAP[out_type]
-        ratio = (
-            RATIOSTATS_OVERRIDE_MAP[out_type][key](stats)
-            if use_override
-            else np.nan_to_num(stats.box[key] / stats.img[key])
-        )
+        with np.errstate(divide="ignore", invalid="ignore"):
+            ratio = RATIOSTATS_OVERRIDE_MAP[out_type][key](stats) if use_override else stats.box[key] / stats.img[key]
         out_stats[box_i:box_j] = ratio.reshape(-1, *out_stats[box_i].shape)
     return out_stats
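The replaced expression changes how degenerate ratios are handled: `np.nan_to_num` coerced `nan`/`inf` results to finite values, while `np.errstate` only suppresses the warnings and leaves `nan`/`inf` in place. A standalone comparison with plain NumPy:

import numpy as np

box = np.array([1.0, 0.0])
img = np.array([0.0, 0.0])

old = np.nan_to_num(box / img)  # inf -> ~1.8e308, nan -> 0.0 (and RuntimeWarnings are emitted)

with np.errstate(divide="ignore", invalid="ignore"):
    new = box / img             # [inf, nan], warnings suppressed

print(old, new)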
 
@@ -128,27 +124,28 @@
 
     Examples
     --------
-    Calculating the box ratio statistics using the dimension stats of the boxes and images
+    Calculate the box ratio statistics using the dimension stats of the images and boxes
+    on a dataset containing 15 targets.
 
     >>> from dataeval.metrics.stats import dimensionstats
-    >>> imagestats = dimensionstats(stats_images)
-    >>> boxstats = dimensionstats(stats_images, bboxes)
+    >>> imagestats = dimensionstats(dataset, per_box=False)
+    >>> boxstats = dimensionstats(dataset, per_box=True)
     >>> ratiostats = boxratiostats(boxstats, imagestats)
     >>> print(ratiostats.aspect_ratio)
-    [ 0.86376953 0.58837891 16. 0.85714286 1.26959707 0.43772894
-    0.66650391 3.83296703 1.95018315]
+    [ 0.864 0.588 16. 0.857 1.27 0.438 0.667 3.833 1.95 0.833
+    1. 0.6 0.522 15. 3.834]
     >>> print(ratiostats.size)
-    [0.0255127 0.01037598 0.00097656 0.01822917 0.02327474 0.00683594
-    0.00915527 0.03369141 0.02115885]
+    [0.026 0.01 0.001 0.018 0.023 0.007 0.009 0.034 0.021 0.007 0.001 0.008
+    0.017 0.001 0.008]
     """
     output_cls = type(boxstats)
     if type(boxstats) is not type(imgstats):
         raise TypeError("Must provide stats outputs of the same type.")
     if boxstats.source_index[-1].image != imgstats.source_index[-1].image:
         raise ValueError("Stats index_map length mismatch. Check if the correct box and image stats were provided.")
-    if all(count == 0 for count in boxstats.box_count):
+    if any(src_idx.box is None for src_idx in boxstats.source_index):
         raise ValueError("Input for boxstats must contain box information.")
-    if any(count != 0 for count in imgstats.box_count):
+    if any(src_idx.box is not None for src_idx in imgstats.source_index):
         raise ValueError("Input for imgstats must not contain box information.")
     boxstats_has_channels = any(si.channel is None for si in boxstats.source_index)
     imgstats_has_channels = any(si.channel is None for si in imgstats.source_index)
dataeval/metrics/stats/_dimensionstats.py (new file)

@@ -0,0 +1,75 @@
+from __future__ import annotations
+
+__all__ = []
+
+from typing import Any, Callable
+
+import numpy as np
+
+from dataeval.metrics.stats._base import StatsProcessor, run_stats
+from dataeval.outputs import DimensionStatsOutput
+from dataeval.outputs._base import set_metadata
+from dataeval.typing import ArrayLike, Dataset
+from dataeval.utils._image import get_bitdepth
+
+
+class DimensionStatsProcessor(StatsProcessor[DimensionStatsOutput]):
+    output_class: type = DimensionStatsOutput
+    image_function_map: dict[str, Callable[[StatsProcessor[DimensionStatsOutput]], Any]] = {
+        "left": lambda x: x.box[0],
+        "top": lambda x: x.box[1],
+        "width": lambda x: x.box[2] - x.box[0],
+        "height": lambda x: x.box[3] - x.box[1],
+        "channels": lambda x: x.shape[-3],
+        "size": lambda x: (x.box[2] - x.box[0]) * (x.box[3] - x.box[1]),
+        "aspect_ratio": lambda x: (x.box[2] - x.box[0]) / (x.box[3] - x.box[1]),
+        "depth": lambda x: get_bitdepth(x.image).depth,
+        "center": lambda x: np.asarray([(x.box[0] + x.box[2]) / 2, (x.box[1] + x.box[3]) / 2]),
+        "distance": lambda x: np.sqrt(
+            np.square(((x.box[0] + x.box[2]) / 2) - (x.shape[-1] / 2))
+            + np.square(((x.box[1] + x.box[3]) / 2) - (x.shape[-2] / 2))
+        ),
+    }
+
+
+@set_metadata
+def dimensionstats(
+    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    *,
+    per_box: bool = False,
+) -> DimensionStatsOutput:
+    """
+    Calculates dimension :term:`statistics<Statistics>` for each image.
+
+    This function computes various dimensional metrics (e.g., width, height, channels)
+    on the images or individual bounding boxes for each image.
+
+    Parameters
+    ----------
+    dataset : Dataset
+        Dataset to perform calculations on.
+    per_box : bool, default False
+        If True, perform calculations on each bounding box.
+
+    Returns
+    -------
+    DimensionStatsOutput
+        A dictionary-like object containing the computed dimension statistics for each image or bounding
+        box. The keys correspond to the names of the statistics (e.g., 'width', 'height'), and the values
+        are lists of results for each image or :term:NumPy` arrays when the results are multi-dimensional.
+
+    See Also
+    --------
+    pixelstats, visualstats, Outliers
+
+    Examples
+    --------
+    Calculate the dimension statistics of a dataset of 8 images, whose shape is (C, H, W).
+
+    >>> results = dimensionstats(dataset)
+    >>> print(results.aspect_ratio)
+    [1. 1. 1.333 1. 0.667 1. 1. 1. ]
+    >>> print(results.channels)
+    [3 3 1 3 1 3 3 3]
+    """
+    return run_stats(dataset, per_box, False, [DimensionStatsProcessor])[0]
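As a quick check on the `image_function_map` formulas above, the arithmetic for one hypothetical box `(x0, y0, x1, y1) = (10, 20, 50, 100)` on a `(C, H, W) = (3, 128, 256)` image:

box = (10, 20, 50, 100)     # hypothetical (x0, y0, x1, y1)
shape = (3, 128, 256)       # hypothetical (C, H, W)

width = box[2] - box[0]                                   # 40
height = box[3] - box[1]                                  # 80
size = width * height                                     # 3200
aspect_ratio = width / height                             # 0.5
center = ((box[0] + box[2]) / 2, (box[1] + box[3]) / 2)   # (30.0, 60.0)
# distance of the box center from the image center (W/2, H/2) = (128, 64)
distance = ((center[0] - shape[-1] / 2) ** 2 + (center[1] - shape[-2] / 2) ** 2) ** 0.5  # ~98.08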