dataeval 0.69.3__py3-none-any.whl → 0.70.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. dataeval/__init__.py +3 -3
  2. dataeval/_internal/datasets.py +300 -0
  3. dataeval/_internal/detectors/drift/base.py +5 -6
  4. dataeval/_internal/detectors/drift/mmd.py +3 -3
  5. dataeval/_internal/detectors/duplicates.py +62 -45
  6. dataeval/_internal/detectors/merged_stats.py +23 -54
  7. dataeval/_internal/detectors/ood/ae.py +3 -3
  8. dataeval/_internal/detectors/outliers.py +133 -61
  9. dataeval/_internal/interop.py +11 -7
  10. dataeval/_internal/metrics/balance.py +9 -9
  11. dataeval/_internal/metrics/ber.py +3 -3
  12. dataeval/_internal/metrics/divergence.py +3 -3
  13. dataeval/_internal/metrics/diversity.py +6 -6
  14. dataeval/_internal/metrics/parity.py +24 -16
  15. dataeval/_internal/metrics/stats/base.py +231 -0
  16. dataeval/_internal/metrics/stats/boxratiostats.py +159 -0
  17. dataeval/_internal/metrics/stats/datasetstats.py +97 -0
  18. dataeval/_internal/metrics/stats/dimensionstats.py +111 -0
  19. dataeval/_internal/metrics/stats/hashstats.py +73 -0
  20. dataeval/_internal/metrics/stats/labelstats.py +125 -0
  21. dataeval/_internal/metrics/stats/pixelstats.py +117 -0
  22. dataeval/_internal/metrics/stats/visualstats.py +122 -0
  23. dataeval/_internal/metrics/uap.py +2 -2
  24. dataeval/_internal/metrics/utils.py +28 -13
  25. dataeval/_internal/output.py +3 -18
  26. dataeval/_internal/workflows/sufficiency.py +123 -133
  27. dataeval/metrics/stats/__init__.py +14 -3
  28. dataeval/workflows/__init__.py +2 -2
  29. {dataeval-0.69.3.dist-info → dataeval-0.70.0.dist-info}/METADATA +3 -2
  30. {dataeval-0.69.3.dist-info → dataeval-0.70.0.dist-info}/RECORD +32 -26
  31. {dataeval-0.69.3.dist-info → dataeval-0.70.0.dist-info}/WHEEL +1 -1
  32. dataeval/_internal/flags.py +0 -77
  33. dataeval/_internal/metrics/stats.py +0 -397
  34. dataeval/flags/__init__.py +0 -3
  35. {dataeval-0.69.3.dist-info → dataeval-0.70.0.dist-info}/LICENSE.txt +0 -0
dataeval/_internal/metrics/stats/base.py
@@ -0,0 +1,231 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ import warnings
5
+ from dataclasses import dataclass
6
+ from typing import Any, Callable, Iterable, NamedTuple, Optional, Union
7
+
8
+ import numpy as np
9
+ from numpy.typing import ArrayLike, NDArray
10
+
11
+ from dataeval._internal.interop import to_numpy_iter
12
+ from dataeval._internal.metrics.utils import normalize_box_shape, normalize_image_shape, rescale
13
+ from dataeval._internal.output import OutputMetadata
14
+
15
+ DTYPE_REGEX = re.compile(r"NDArray\[np\.(.*?)\]")
16
+ SOURCE_INDEX = "source_index"
17
+ BOX_COUNT = "box_count"
18
+
19
+ OptionalRange = Optional[Union[int, Iterable[int]]]
20
+
21
+
22
+ def matches(index: int | None, opt_range: OptionalRange) -> bool:
23
+ if index is None or opt_range is None:
24
+ return True
25
+ return index in opt_range if isinstance(opt_range, Iterable) else index == opt_range
26
+
27
+
28
+ class SourceIndex(NamedTuple):
29
+ """
30
+ Attributes
31
+ ----------
32
+ image: int
33
+ Index of the source image
34
+ box : int | None
35
+ Index of the box of the source image
36
+ channel : int | None
37
+ Index of the channel of the source image
38
+ """
39
+
40
+ image: int
41
+ box: int | None
42
+ channel: int | None
43
+
44
+
45
+ @dataclass(frozen=True)
46
+ class BaseStatsOutput(OutputMetadata):
47
+ """
48
+ Attributes
49
+ ----------
50
+ source_index : List[SourceIndex]
51
+ Mapping from statistic to source image, box and channel index
52
+ box_count : NDArray[np.uint16]
53
+ """
54
+
55
+ source_index: list[SourceIndex]
56
+ box_count: NDArray[np.uint16]
57
+
58
+ def get_channel_mask(
59
+ self,
60
+ channel_index: OptionalRange,
61
+ channel_count: OptionalRange = None,
62
+ ) -> list[bool]:
63
+ """
64
+ Boolean mask for results filtered to specified channel index and optionally the count
65
+ and, optionally, to images with the specified channel count.
66
+
67
+ Parameters
68
+ ----------
69
+ channel_index : int | Iterable[int] | None
70
+ Index or indices of channel(s) to filter for
71
+ channel_count : int | Iterable[int] | None
72
+ Optional count(s) of channels to filter for
73
+ """
74
+ mask: list[bool] = []
75
+ cur_mask: list[bool] = []
76
+ cur_image = 0
77
+ cur_max_channel = 0
78
+ for source_index in list(self.source_index) + [None]:
79
+ if source_index is None or source_index.image > cur_image:
80
+ mask.extend(cur_mask if matches(cur_max_channel + 1, channel_count) else [False for _ in cur_mask])
81
+ if source_index is None:
82
+ break
83
+ cur_image = source_index.image
84
+ cur_max_channel = 0
85
+ cur_mask.clear()
86
+ cur_mask.append(matches(source_index.channel, channel_index))
87
+ cur_max_channel = max(cur_max_channel, source_index.channel or 0)
88
+ return mask
89
+
90
+ def __len__(self) -> int:
91
+ return len(self.source_index)
92
+
93
+
94
+ class StatsProcessor:
95
+ cache_keys: list[str] = []
96
+ image_function_map: dict[str, Callable[[StatsProcessor], Any]] = {}
97
+ channel_function_map: dict[str, Callable[[StatsProcessor], Any]] = {}
98
+
99
+ def __init__(self, image: NDArray, box: NDArray | None, per_channel: bool):
100
+ self.raw = image
101
+ self.width = image.shape[-1]
102
+ self.height = image.shape[-2]
103
+ self.box = np.array([0, 0, self.width, self.height]) if box is None else box
104
+ self.per_channel = per_channel
105
+ self._image = None
106
+ self._shape = None
107
+ self._scaled = None
108
+ self.cache = {}
109
+ self.fn_map = self.channel_function_map if per_channel else self.image_function_map
110
+ self.is_valid_slice = box is None or bool(
111
+ box[0] >= 0 and box[1] >= 0 and box[2] <= image.shape[-1] and box[3] <= image.shape[-2]
112
+ )
113
+
114
+ def get(self, fn_key: str) -> NDArray:
115
+ if fn_key in self.cache_keys:
116
+ if fn_key not in self.cache:
117
+ self.cache[fn_key] = self.fn_map[fn_key](self)
118
+ return self.cache[fn_key]
119
+ else:
120
+ return self.fn_map[fn_key](self)
121
+
122
+ @property
123
+ def image(self) -> NDArray:
124
+ if self._image is None:
125
+ if self.is_valid_slice:
126
+ norm = normalize_image_shape(self.raw)
127
+ self._image = norm[:, self.box[1] : self.box[3], self.box[0] : self.box[2]]
128
+ else:
129
+ self._image = np.zeros((self.raw.shape[0], self.box[3] - self.box[1], self.box[2] - self.box[0]))
130
+ return self._image
131
+
132
+ @property
133
+ def shape(self) -> tuple:
134
+ if self._shape is None:
135
+ self._shape = self.image.shape
136
+ return self._shape
137
+
138
+ @property
139
+ def scaled(self) -> NDArray:
140
+ if self._scaled is None:
141
+ self._scaled = rescale(self.image)
142
+ if self.per_channel:
143
+ self._scaled = self._scaled.reshape(self.image.shape[0], -1)
144
+ return self._scaled
145
+
146
+
147
+ def run_stats(
148
+ images: Iterable[ArrayLike],
149
+ bboxes: Iterable[ArrayLike] | None,
150
+ per_channel: bool,
151
+ stats_processor_cls: type,
152
+ output_cls: type,
153
+ ) -> dict:
154
+ """
155
+ Compute specified statistics on a set of images.
156
+
157
+ This function applies a set of statistical operations to each image in the input iterable,
158
+ based on the specified output class. The function determines which statistics to apply
159
+ using a function map. It also supports optional image flattening for pixel-wise calculations.
160
+
161
+ Parameters
162
+ ----------
163
+ images : Iterable[ArrayLike]
164
+ An iterable of images (e.g., list of arrays), where each image is represented as an
165
+ array-like structure (e.g., NumPy arrays).
166
+ bboxes : Iterable[ArrayLike] or None
167
+ An iterable of bounding boxes (e.g. list of arrays) where each bounding box is represented
168
+ as an array-like structure in the format of (X0, Y0, X1, Y1). The length of the bounding boxes
169
+ iterable should match the length of the input images.
170
+ per_channel : bool
171
+ A flag which determines if the stats should be evaluated on a per-channel basis or not.
172
+ stats_processor_cls : type
+ The StatsProcessor subclass that defines which statistics are computed.
+ output_cls : type
173
+ The output class for which stats values will be calculated.
174
+
175
+ Returns
176
+ -------
177
+ dict[str, NDArray]
178
+ A dictionary containing the computed statistics for each image.
179
+ The dictionary keys correspond to the names of the statistics, and the values are NumPy arrays
180
+ with the results of the computations.
181
+
182
+ Notes
183
+ -----
184
+ - The function performs image normalization (rescaling the image values)
185
+ before applying some of the statistics.
186
+ - Pixel-level statistics (e.g., brightness, entropy) are computed after
187
+ rescaling and, optionally, flattening the images.
188
+ - For statistics like histograms and entropy, intermediate results may
189
+ be reused to avoid redundant computation.
190
+ """
191
+ results_list: list[dict[str, NDArray]] = []
192
+ output_list = list(output_cls.__annotations__)
193
+ source_index = []
194
+ box_count = []
195
+ bbox_iter = (None for _ in images) if bboxes is None else to_numpy_iter(bboxes)
196
+
197
+ for i, (boxes, image) in enumerate(zip(bbox_iter, to_numpy_iter(images))):
198
+ nboxes = [None] if boxes is None else normalize_box_shape(boxes)
199
+ for i_b, box in enumerate(nboxes):
200
+ i_b = None if box is None else i_b
201
+ processor: StatsProcessor = stats_processor_cls(image, box, per_channel)
202
+ if not processor.is_valid_slice:
203
+ warnings.warn(f"Bounding box {i_b}: {box} is out of bounds of image {i}: {image.shape}.")
204
+ results_list.append({stat: processor.get(stat) for stat in output_list})
205
+ if per_channel:
206
+ source_index.extend([SourceIndex(i, i_b, c) for c in range(image.shape[-3])])
207
+ else:
208
+ source_index.append(SourceIndex(i, i_b, None))
209
+ box_count.append(0 if boxes is None else len(boxes))
210
+
211
+ output = {}
212
+ if per_channel:
213
+ for i, results in enumerate(results_list):
214
+ for stat, result in results.items():
215
+ output.setdefault(stat, []).extend(result.tolist())
216
+ else:
217
+ for results in results_list:
218
+ for stat, result in results.items():
219
+ output.setdefault(stat, []).append(result.tolist() if isinstance(result, np.ndarray) else result)
220
+
221
+ for stat in output:
222
+ stat_type: str = output_cls.__annotations__[stat]
223
+
224
+ dtype_match = re.match(DTYPE_REGEX, stat_type)
225
+ if dtype_match is not None:
226
+ output[stat] = np.asarray(output[stat], dtype=np.dtype(dtype_match.group(1)))
227
+
228
+ output[SOURCE_INDEX] = source_index
229
+ output[BOX_COUNT] = np.asarray(box_count, dtype=np.uint16)
230
+
231
+ return output
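The new base module centralizes the stats machinery: a StatsProcessor subclass declares a function map, a frozen dataclass derived from BaseStatsOutput declares the result fields, and run_stats drives both across the images and optional boxes. A minimal sketch of that pattern follows; the MeanStatsProcessor and MeanStatsOutput names are hypothetical stand-ins for the real processors added in this release (dimension, pixel, visual, hash).

    from __future__ import annotations  # keeps annotations as strings so run_stats can parse the NDArray dtypes

    from dataclasses import dataclass

    import numpy as np
    from numpy.typing import NDArray

    from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats


    class MeanStatsProcessor(StatsProcessor):
        # Maps each output field to a function of the processor state;
        # `x.scaled` is the rescaled image (flattened per channel when per_channel=True).
        image_function_map = {"mean": lambda x: np.mean(x.scaled)}


    @dataclass(frozen=True)
    class MeanStatsOutput(BaseStatsOutput):
        mean: NDArray[np.float16]


    rng = np.random.default_rng(0)
    images = [rng.integers(0, 255, (3, 16, 16), dtype=np.uint8) for _ in range(4)]
    stats = MeanStatsOutput(**run_stats(images, None, False, MeanStatsProcessor, MeanStatsOutput))
    print(stats.mean)       # one value per image
    print(stats.box_count)  # all zeros because no bounding boxes were passed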
dataeval/_internal/metrics/stats/boxratiostats.py
@@ -0,0 +1,159 @@
1
+ from __future__ import annotations
2
+
3
+ import copy
4
+ from typing import Callable, Generic, TypeVar, cast
5
+
6
+ import numpy as np
7
+ from numpy.typing import NDArray
8
+
9
+ from dataeval._internal.metrics.stats.base import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput
10
+ from dataeval._internal.metrics.stats.dimensionstats import DimensionStatsOutput
11
+ from dataeval._internal.output import set_metadata
12
+
13
+ TStatOutput = TypeVar("TStatOutput", bound=BaseStatsOutput, contravariant=True)
14
+ ArraySlice = tuple[int, int]
15
+
16
+
17
+ class BoxImageStatsOutputSlice(Generic[TStatOutput]):
18
+ class StatSlicer:
19
+ def __init__(self, stats: TStatOutput, slice: ArraySlice, channels: int = 0) -> None: # noqa: A002
20
+ self._stats = stats
21
+ self._slice = slice
22
+ self._channels = channels
23
+
24
+ def __getitem__(self, key: str) -> NDArray[np.float64]:
25
+ _stat = cast(np.ndarray, getattr(self._stats, key)).astype(np.float64)
26
+ _shape = _stat[0].shape
27
+ _slice = _stat[self._slice[0] : self._slice[1]]
28
+ return _slice.reshape(-1, self._channels, *_shape) if self._channels else _slice.reshape(-1, *_shape)
29
+
30
+ box: StatSlicer
31
+ img: StatSlicer
32
+ channels: int
33
+
34
+ def __init__(
35
+ self, box_stats: TStatOutput, box_slice: ArraySlice, img_stats: TStatOutput, img_slice: ArraySlice
36
+ ) -> None:
37
+ self.channels = img_slice[1] - img_slice[0]
38
+ self.box = self.StatSlicer(box_stats, box_slice, self.channels)
39
+ self.img = self.StatSlicer(img_stats, img_slice)
40
+
41
+
42
+ RATIOSTATS_OVERRIDE_MAP: dict[type, dict[str, Callable[[BoxImageStatsOutputSlice], NDArray]]] = {
43
+ DimensionStatsOutput: {
44
+ "left": lambda x: x.box["left"] / x.img["width"],
45
+ "top": lambda x: x.box["top"] / x.img["height"],
46
+ "channels": lambda x: x.box["channels"],
47
+ "depth": lambda x: x.box["depth"],
48
+ "distance": lambda x: x.box["distance"],
49
+ }
50
+ }
51
+
52
+
53
+ def get_index_map(stats: BaseStatsOutput) -> list[int]:
54
+ index_map: list[int] = []
55
+ cur_index = -1
56
+ for i, s in enumerate(stats.source_index):
57
+ if s.image > cur_index:
58
+ index_map.append(i)
59
+ cur_index = s.image
60
+ return index_map
61
+
62
+
63
+ def calculate_ratios(key: str, box_stats: BaseStatsOutput, img_stats: BaseStatsOutput) -> NDArray:
64
+ if not hasattr(box_stats, key) or not hasattr(img_stats, key):
65
+ raise KeyError("Invalid key for provided stats output object.")
66
+
67
+ stats = getattr(box_stats, key)
68
+
69
+ # Copy over stats index maps and box counts
70
+ if key == SOURCE_INDEX:
71
+ return copy.deepcopy(stats)
72
+ elif key == BOX_COUNT:
73
+ return np.copy(stats)
74
+
75
+ # Calculate ratios for each stat
76
+ out_stats: np.ndarray = np.copy(stats).astype(np.float64)
77
+
78
+ box_map = get_index_map(box_stats)
79
+ img_map = get_index_map(img_stats)
80
+ for i, (box_i, img_i) in enumerate(zip(box_map, img_map)):
81
+ box_j = len(box_stats) if i == len(box_map) - 1 else box_map[i + 1]
82
+ img_j = len(img_stats) if i == len(img_map) - 1 else img_map[i + 1]
83
+ stats = BoxImageStatsOutputSlice(box_stats, (box_i, box_j), img_stats, (img_i, img_j))
84
+ out_type = type(box_stats)
85
+ use_override = out_type in RATIOSTATS_OVERRIDE_MAP and key in RATIOSTATS_OVERRIDE_MAP[out_type]
86
+ ratio = (
87
+ RATIOSTATS_OVERRIDE_MAP[out_type][key](stats)
88
+ if use_override
89
+ else np.nan_to_num(stats.box[key] / stats.img[key])
90
+ )
91
+ out_stats[box_i:box_j] = ratio.reshape(-1, *out_stats[box_i].shape)
92
+ return out_stats
93
+
94
+
95
+ @set_metadata("dataeval.metrics")
96
+ def boxratiostats(
97
+ boxstats: TStatOutput,
98
+ imgstats: TStatOutput,
99
+ ) -> TStatOutput:
100
+ """
101
+ Calculates ratio statistics of box outputs over image outputs
102
+
103
+ Parameters
104
+ ----------
105
+ boxstats : DimensionStatsOutput | PixelStatsOutput | VisualStatsOutput
106
+ Box statistics outputs to perform calculations on
107
+ imgstats : DimensionStatsOutput | PixelStatsOutput | VisualStatsOutput
108
+ Image statistics outputs to perform calculations on
109
+
110
+ Returns
111
+ -------
112
+ DimensionStatsOutput | PixelStatsOutput | VisualStatsOutput
113
+ A dictionary-like object containing the ratios of the box statistics to the
114
+ image statistics.
115
+
116
+ See Also
117
+ --------
118
+ dimensionstats, pixelstats, visualstats
119
+
120
+ Note
121
+ ----
122
+ DimensionStatsOutput values for channels, depth and distances are the original values
123
+ provided by the box outputs
124
+
125
+ Examples
126
+ --------
127
+ Calculating the box ratio statistics using the dimension stats of the boxes and images
128
+
129
+ >>> imagestats = dimensionstats(images)
130
+ >>> boxstats = dimensionstats(images, bboxes)
131
+ >>> ratiostats = boxratiostats(boxstats, imagestats)
132
+ >>> print(ratiostats.aspect_ratio)
133
+ [ 1.15169271 0.78450521 21.33333333 1.5234375 2.25651042 0.77799479
134
+ 0.88867188 3.40625 1.73307292 1.11132812 0.75018315 0.45018315
135
+ 0.69596354 20. 5.11197917 2.33333333 0.75 0.70019531]
136
+ >>> print(ratiostats.size)
137
+ [0.03401693 0.01383464 0.00130208 0.01822917 0.02327474 0.00683594
138
+ 0.01220703 0.0168457 0.01057943 0.00976562 0.00130208 0.01098633
139
+ 0.02246094 0.0012207 0.01123047 0.00911458 0.02636719 0.06835938]
140
+ """
141
+ output_cls = type(boxstats)
142
+ if type(boxstats) is not type(imgstats):
143
+ raise TypeError("Must provide stats outputs of the same type.")
144
+ if boxstats.source_index[-1].image != imgstats.source_index[-1].image:
145
+ raise ValueError("Stats index_map length mismatch. Check if the correct box and image stats were provided.")
146
+ if all(count == 0 for count in boxstats.box_count):
147
+ raise TypeError("Input for boxstats must contain box information.")
148
+ if any(count != 0 for count in imgstats.box_count):
149
+ raise TypeError("Input for imgstats must not contain box information.")
150
+ boxstats_has_channels = any(si.channel is None for si in boxstats.source_index)
151
+ imgstats_has_channels = any(si.channel is None for si in imgstats.source_index)
152
+ if boxstats_has_channels != imgstats_has_channels:
153
+ raise TypeError("Input for boxstats and imgstats must have matching channel information.")
154
+
155
+ output_dict = {}
156
+ for key in boxstats.dict():
157
+ output_dict[key] = calculate_ratios(key, boxstats, imgstats)
158
+
159
+ return output_cls(**output_dict)
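A hedged usage sketch for boxratiostats with pixel statistics, assuming the new functions are re-exported from dataeval.metrics.stats and that PixelStatsOutput exposes a mean field; the pairing rule enforced above is that both outputs must be the same stats type over the same images, one computed with boxes and one without.

    import numpy as np

    from dataeval.metrics.stats import boxratiostats, pixelstats  # assumed public exports

    rng = np.random.default_rng(0)
    images = [rng.integers(0, 255, (3, 32, 32), dtype=np.uint8) for _ in range(2)]
    bboxes = [np.array([[0, 0, 16, 16], [8, 8, 24, 24]]), np.array([[4, 4, 20, 20]])]

    imgstats = pixelstats(images)          # image-level stats, no box information
    boxstats = pixelstats(images, bboxes)  # box-level stats over the same images
    ratios = boxratiostats(boxstats, imgstats)
    # Each row is the box statistic divided by its parent image's statistic;
    # mismatched types, image sets, or channel settings raise TypeError/ValueError.
    print(ratios.mean)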
dataeval/_internal/metrics/stats/datasetstats.py
@@ -0,0 +1,97 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Iterable
5
+
6
+ from numpy.typing import ArrayLike
7
+
8
+ from dataeval._internal.metrics.stats.base import BaseStatsOutput
9
+ from dataeval._internal.metrics.stats.dimensionstats import DimensionStatsOutput, dimensionstats
10
+ from dataeval._internal.metrics.stats.labelstats import LabelStatsOutput, labelstats
11
+ from dataeval._internal.metrics.stats.pixelstats import PixelStatsOutput, pixelstats
12
+ from dataeval._internal.metrics.stats.visualstats import VisualStatsOutput, visualstats
13
+ from dataeval._internal.output import OutputMetadata, set_metadata
14
+
15
+
16
+ @dataclass(frozen=True)
17
+ class DatasetStatsOutput(OutputMetadata):
18
+ """
19
+ This class represents the outputs of various stats functions against a single
20
+ dataset, such that each index across all stat outputs are representative of
21
+ the same source image. Modifying or mixing outputs will result in inaccurate
22
+ outlier calculations if not created correctly.
23
+
24
+ Attributes
25
+ ----------
26
+ dimensionstats : DimensionStatsOutput or None
27
+ pixelstats : PixelStatsOutput or None
28
+ visualstats : VisualStatsOutput or None
29
+ labelstats : LabelStatsOutput or None, default None
30
+ """
31
+
32
+ dimensionstats: DimensionStatsOutput | None
33
+ pixelstats: PixelStatsOutput | None
34
+ visualstats: VisualStatsOutput | None
35
+ labelstats: LabelStatsOutput | None = None
36
+
37
+ def outputs(self) -> list[BaseStatsOutput]:
38
+ return [s for s in (self.dimensionstats, self.pixelstats, self.visualstats) if s is not None]
39
+
40
+ def __post_init__(self):
41
+ lengths = [len(s) for s in self.outputs()]
42
+ if not all(length == lengths[0] for length in lengths):
43
+ raise ValueError("All StatsOutput classes must contain the same number of image sources.")
44
+
45
+
46
+ @set_metadata("dataeval.metrics")
47
+ def datasetstats(
48
+ images: Iterable[ArrayLike],
49
+ bboxes: Iterable[ArrayLike] | None = None,
50
+ labels: Iterable[ArrayLike] | None = None,
51
+ use_dimension: bool = True,
52
+ use_pixel: bool = True,
53
+ use_visual: bool = True,
54
+ ) -> DatasetStatsOutput:
55
+ """
56
+ Calculates various statistics for each image
57
+
58
+ This function computes dimension, pixel and visual metrics
59
+ on the images or individual bounding boxes for each image as
60
+ well as label statistics if provided.
61
+
62
+ Parameters
63
+ ----------
64
+ images : Iterable[ArrayLike]
65
+ Images to perform calculations on
66
+ bboxes : Iterable[ArrayLike] or None
67
+ Bounding boxes in `xyxy` format for each image to perform calculations on
68
+ labels : Iterable[ArrayLike] or None
69
+ Labels of images or boxes to perform calculations on
+ use_dimension : bool, default True
+ Flag to run the dimension statistics
+ use_pixel : bool, default True
+ Flag to run the pixel statistics
+ use_visual : bool, default True
+ Flag to run the visual statistics
70
+
71
+ Returns
72
+ -------
73
+ DatasetStatsOutput
74
+ Output class containing the outputs of various stats functions
75
+
76
+ See Also
77
+ --------
78
+ dimensionstats, labelstats, pixelstats, visualstats, Outliers
79
+
80
+ Examples
81
+ --------
82
+ Calculating the dimension, pixel and visual stats for a dataset with bounding boxes
83
+
84
+ >>> stats = datasetstats(images, bboxes)
85
+ >>> print(stats.dimensionstats.aspect_ratio)
86
+ [ 0.864 0.5884 16. 1.143 1.692 0.5835 0.6665 2.555 1.3
87
+ 0.8335 1. 0.6 0.522 15. 3.834 1.75 0.75 0.7 ]
88
+ >>> print(stats.visualstats.contrast)
89
+ [1.744 1.946 0.1164 0.0635 0.0633 0.06274 0.0429 0.0317 0.0317
90
+ 0.02576 0.02081 0.02171 0.01915 0.01767 0.01799 0.01595 0.01433 0.01478]
91
+ """
92
+ return DatasetStatsOutput(
93
+ dimensionstats(images, bboxes) if use_dimension else None,
94
+ pixelstats(images, bboxes) if use_pixel else None,
95
+ visualstats(images, bboxes) if use_visual else None,
96
+ labelstats(labels) if labels else None,
97
+ )
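A short sketch of the new datasetstats entry point, assuming it is re-exported from dataeval.metrics.stats; the use_* flags select which sub-stats run, and disabled outputs are simply None.

    import numpy as np

    from dataeval.metrics.stats import datasetstats  # assumed public export

    rng = np.random.default_rng(0)
    images = [rng.integers(0, 255, (1, 24, 24), dtype=np.uint8) for _ in range(3)]

    stats = datasetstats(images, use_visual=False)
    print(stats.visualstats)          # None, visual stats were disabled
    print(len(stats.dimensionstats))  # 3, one entry per image
    print(len(stats.outputs()))       # 2, only the populated stats outputs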
dataeval/_internal/metrics/stats/dimensionstats.py
@@ -0,0 +1,111 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Iterable
5
+
6
+ import numpy as np
7
+ from numpy.typing import ArrayLike, NDArray
8
+
9
+ from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
10
+ from dataeval._internal.metrics.utils import get_bitdepth
11
+ from dataeval._internal.output import set_metadata
12
+
13
+
14
+ class DimensionStatsProcessor(StatsProcessor):
15
+ image_function_map = {
16
+ "left": lambda x: x.box[0],
17
+ "top": lambda x: x.box[1],
18
+ "width": lambda x: x.shape[-1],
19
+ "height": lambda x: x.shape[-2],
20
+ "channels": lambda x: x.shape[-3],
21
+ "size": lambda x: np.prod(x.shape[-2:]),
22
+ "aspect_ratio": lambda x: x.shape[-1] / x.shape[-2],
23
+ "depth": lambda x: get_bitdepth(x.image).depth,
24
+ "center": lambda x: np.asarray([(x.box[0] + x.box[2]) / 2, (x.box[1] + x.box[3]) / 2]),
25
+ "distance": lambda x: np.sqrt(
26
+ np.square(((x.box[0] + x.box[2]) / 2) - (x.width / 2))
27
+ + np.square(((x.box[1] + x.box[3]) / 2) - (x.height / 2))
28
+ ),
29
+ }
30
+
31
+
32
+ @dataclass(frozen=True)
33
+ class DimensionStatsOutput(BaseStatsOutput):
34
+ """
35
+ Attributes
36
+ ----------
37
+ left : NDArray[np.int32]
38
+ Offsets from the left edge of images in pixels
39
+ top : NDArray[np.int32]
40
+ Offsets from the top edge of images in pixels
41
+ width : NDArray[np.uint32]
42
+ Width of the images in pixels
43
+ height : NDArray[np.uint32]
44
+ Height of the images in pixels
45
+ channels : NDArray[np.uint8]
46
+ Channel count of the images in pixels
47
+ size : NDArray[np.uint32]
48
+ Size of the images in pixels
49
+ aspect_ratio : NDArray[np.float16]
50
+ Aspect ratio of the images (width/height)
51
+ depth : NDArray[np.uint8]
52
+ Color depth of the images in bits
53
+ center : NDArray[np.uint16]
54
+ Offset from center in [x,y] coordinates of the images in pixels
55
+ distance : NDArray[np.float16]
56
+ Distance in pixels from center
57
+ """
58
+
59
+ left: NDArray[np.int32]
60
+ top: NDArray[np.int32]
61
+ width: NDArray[np.uint32]
62
+ height: NDArray[np.uint32]
63
+ channels: NDArray[np.uint8]
64
+ size: NDArray[np.uint32]
65
+ aspect_ratio: NDArray[np.float16]
66
+ depth: NDArray[np.uint8]
67
+ center: NDArray[np.int16]
68
+ distance: NDArray[np.float16]
69
+
70
+
71
+ @set_metadata("dataeval.metrics")
72
+ def dimensionstats(
73
+ images: Iterable[ArrayLike],
74
+ bboxes: Iterable[ArrayLike] | None = None,
75
+ ) -> DimensionStatsOutput:
76
+ """
77
+ Calculates dimension statistics for each image
78
+
79
+ This function computes various dimensional metrics (e.g., width, height, channels)
80
+ on the images or individual bounding boxes for each image.
81
+
82
+ Parameters
83
+ ----------
84
+ images : Iterable[ArrayLike]
85
+ Images to perform calculations on
86
+ bboxes : Iterable[ArrayLike] or None
87
+ Bounding boxes in `xyxy` format for each image to perform calculations on
88
+
89
+ Returns
90
+ -------
91
+ DimensionStatsOutput
92
+ A dictionary-like object containing the computed dimension statistics for each image or bounding
93
+ box. The keys correspond to the names of the statistics (e.g., 'width', 'height'), and the values
94
+ are lists of results for each image or numpy arrays when the results are multi-dimensional.
95
+
96
+ See Also
97
+ --------
98
+ pixelstats, visualstats, Outliers
99
+
100
+ Examples
101
+ --------
102
+ Calculating the dimension statistics on the images, whose shape is (C, H, W)
103
+
104
+ >>> results = dimensionstats(images)
105
+ >>> print(results.aspect_ratio)
106
+ [0.75 0.75 0.75 0.75 0.75 0.75 1.333 0.75 0.75 1. ]
107
+ >>> print(results.channels)
108
+ [1 1 1 1 1 1 3 1 1 3]
109
+ """
110
+ output = run_stats(images, bboxes, False, DimensionStatsProcessor, DimensionStatsOutput)
111
+ return DimensionStatsOutput(**output)
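A hedged example using the new center and distance fields to flag boxes that sit far from the image center, assuming dimensionstats is re-exported from dataeval.metrics.stats.

    import numpy as np

    from dataeval.metrics.stats import dimensionstats  # assumed public export

    rng = np.random.default_rng(0)
    images = [rng.integers(0, 255, (1, 64, 64), dtype=np.uint8) for _ in range(2)]
    bboxes = [np.array([[0, 0, 8, 8], [24, 24, 40, 40]]), np.array([[56, 0, 64, 8]])]

    results = dimensionstats(images, bboxes)
    # `center` holds each box center as [x, y]; `distance` is how far that center
    # lies from the image center, so boxes hugging an edge stand out.
    print(results.center)
    print(results.distance > 20)
    print(results.source_index)  # maps each row back to its (image, box, channel)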
dataeval/_internal/metrics/stats/hashstats.py
@@ -0,0 +1,73 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Iterable
5
+
6
+ from numpy.typing import ArrayLike
7
+
8
+ from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
9
+ from dataeval._internal.metrics.utils import pchash, xxhash
10
+ from dataeval._internal.output import set_metadata
11
+
12
+
13
+ class HashStatsProcessor(StatsProcessor):
14
+ image_function_map = {
15
+ "xxhash": lambda x: xxhash(x.image),
16
+ "pchash": lambda x: pchash(x.image),
17
+ }
18
+
19
+
20
+ @dataclass(frozen=True)
21
+ class HashStatsOutput(BaseStatsOutput):
22
+ """
23
+ Attributes
24
+ ----------
25
+ xxhash : List[str]
26
+ xxHash hash of the images as a hex string
27
+ pchash : List[str]
28
+ Perception hash of the images as a hex string
29
+ """
30
+
31
+ xxhash: list[str]
32
+ pchash: list[str]
33
+
34
+
35
+ @set_metadata("dataeval.metrics")
36
+ def hashstats(
37
+ images: Iterable[ArrayLike],
38
+ bboxes: Iterable[ArrayLike] | None = None,
39
+ ) -> HashStatsOutput:
40
+ """
41
+ Calculates hashes for each image
42
+
43
+ This function computes hashes from the images including exact hashes and perception-based
44
+ hashes. These hash values can be used to determine if images are exact or near matches.
45
+
46
+ Parameters
47
+ ----------
48
+ images : Iterable[ArrayLike]
49
+ Images to hash
50
+ bboxes : Iterable[ArrayLike] or None
51
+ Bounding boxes in `xyxy` format for each image
52
+
53
+ Returns
54
+ -------
55
+ HashStatsOutput
56
+ A dictionary-like object containing the computed hashes for each image.
57
+
58
+ See Also
59
+ --------
60
+ Duplicates
61
+
62
+ Examples
63
+ --------
64
+ Calculating the statistics on the images, whose shape is (C, H, W)
65
+
66
+ >>> results = hashstats(images)
67
+ >>> print(results.xxhash)
68
+ ['a72434443d6e7336', 'efc12c2f14581d79', '4a1e03483a27d674', '3a3ecedbcf814226']
69
+ >>> print(results.pchash)
70
+ ['8f25506af46a7c6a', '8000808000008080', '8e71f18e0ef18e0e', 'a956d6a956d6a928']
71
+ """
72
+ output = run_stats(images, bboxes, False, HashStatsProcessor, HashStatsOutput)
73
+ return HashStatsOutput(**output)
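A hedged sketch pairing hashstats with a simple grouping pass to surface exact-duplicate candidates (the Duplicates detector automates this), assuming hashstats is re-exported from dataeval.metrics.stats.

    from collections import defaultdict

    import numpy as np

    from dataeval.metrics.stats import hashstats  # assumed public export

    rng = np.random.default_rng(0)
    base = rng.integers(0, 255, (1, 32, 32), dtype=np.uint8)
    images = [base, base.copy(), rng.integers(0, 255, (1, 32, 32), dtype=np.uint8)]

    results = hashstats(images)
    # Group image indices by exact (xxhash) digest; any group with more than
    # one member is an exact-duplicate candidate.
    groups = defaultdict(list)
    for i, digest in enumerate(results.xxhash):
        groups[digest].append(i)
    print([idxs for idxs in groups.values() if len(idxs) > 1])  # expect [[0, 1]]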