dataeval 0.81.0__py3-none-any.whl → 0.82.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. dataeval/__init__.py +1 -1
  2. dataeval/config.py +68 -11
  3. dataeval/detectors/drift/__init__.py +2 -2
  4. dataeval/detectors/drift/_base.py +8 -64
  5. dataeval/detectors/drift/_mmd.py +12 -38
  6. dataeval/detectors/drift/_torch.py +7 -7
  7. dataeval/detectors/drift/_uncertainty.py +6 -5
  8. dataeval/detectors/drift/updates.py +20 -3
  9. dataeval/detectors/linters/__init__.py +3 -2
  10. dataeval/detectors/linters/duplicates.py +14 -46
  11. dataeval/detectors/linters/outliers.py +25 -159
  12. dataeval/detectors/ood/__init__.py +1 -1
  13. dataeval/detectors/ood/ae.py +6 -5
  14. dataeval/detectors/ood/base.py +2 -2
  15. dataeval/detectors/ood/metadata_ood_mi.py +4 -6
  16. dataeval/detectors/ood/mixin.py +3 -4
  17. dataeval/detectors/ood/vae.py +3 -2
  18. dataeval/metadata/__init__.py +2 -1
  19. dataeval/metadata/_distance.py +134 -0
  20. dataeval/metadata/_ood.py +30 -49
  21. dataeval/metadata/_utils.py +44 -0
  22. dataeval/metrics/bias/__init__.py +5 -4
  23. dataeval/metrics/bias/_balance.py +17 -149
  24. dataeval/metrics/bias/_coverage.py +4 -106
  25. dataeval/metrics/bias/_diversity.py +12 -107
  26. dataeval/metrics/bias/_parity.py +7 -71
  27. dataeval/metrics/estimators/__init__.py +5 -4
  28. dataeval/metrics/estimators/_ber.py +2 -20
  29. dataeval/metrics/estimators/_clusterer.py +1 -61
  30. dataeval/metrics/estimators/_divergence.py +2 -19
  31. dataeval/metrics/estimators/_uap.py +2 -16
  32. dataeval/metrics/stats/__init__.py +15 -12
  33. dataeval/metrics/stats/_base.py +41 -128
  34. dataeval/metrics/stats/_boxratiostats.py +13 -13
  35. dataeval/metrics/stats/_dimensionstats.py +17 -58
  36. dataeval/metrics/stats/_hashstats.py +19 -35
  37. dataeval/metrics/stats/_imagestats.py +94 -0
  38. dataeval/metrics/stats/_labelstats.py +42 -121
  39. dataeval/metrics/stats/_pixelstats.py +19 -51
  40. dataeval/metrics/stats/_visualstats.py +19 -51
  41. dataeval/outputs/__init__.py +57 -0
  42. dataeval/outputs/_base.py +182 -0
  43. dataeval/outputs/_bias.py +381 -0
  44. dataeval/outputs/_drift.py +83 -0
  45. dataeval/outputs/_estimators.py +114 -0
  46. dataeval/outputs/_linters.py +186 -0
  47. dataeval/outputs/_metadata.py +54 -0
  48. dataeval/{detectors/ood/output.py → outputs/_ood.py} +22 -22
  49. dataeval/outputs/_stats.py +393 -0
  50. dataeval/outputs/_utils.py +44 -0
  51. dataeval/outputs/_workflows.py +364 -0
  52. dataeval/typing.py +187 -7
  53. dataeval/utils/_method.py +1 -5
  54. dataeval/utils/_plot.py +2 -2
  55. dataeval/utils/data/__init__.py +5 -1
  56. dataeval/utils/data/_dataset.py +217 -0
  57. dataeval/utils/data/_embeddings.py +12 -14
  58. dataeval/utils/data/_images.py +30 -27
  59. dataeval/utils/data/_metadata.py +28 -11
  60. dataeval/utils/data/_selection.py +25 -22
  61. dataeval/utils/data/_split.py +5 -29
  62. dataeval/utils/data/_targets.py +14 -2
  63. dataeval/utils/data/datasets/_base.py +5 -5
  64. dataeval/utils/data/datasets/_cifar10.py +1 -1
  65. dataeval/utils/data/datasets/_milco.py +1 -1
  66. dataeval/utils/data/datasets/_mnist.py +1 -1
  67. dataeval/utils/data/datasets/_ships.py +1 -1
  68. dataeval/utils/data/{_types.py → datasets/_types.py} +10 -16
  69. dataeval/utils/data/datasets/_voc.py +1 -1
  70. dataeval/utils/data/selections/_classfilter.py +4 -5
  71. dataeval/utils/data/selections/_indices.py +2 -2
  72. dataeval/utils/data/selections/_limit.py +2 -2
  73. dataeval/utils/data/selections/_reverse.py +2 -2
  74. dataeval/utils/data/selections/_shuffle.py +2 -2
  75. dataeval/utils/torch/_internal.py +5 -5
  76. dataeval/utils/torch/trainer.py +8 -8
  77. dataeval/workflows/__init__.py +2 -1
  78. dataeval/workflows/sufficiency.py +6 -342
  79. {dataeval-0.81.0.dist-info → dataeval-0.82.1.dist-info}/METADATA +2 -2
  80. dataeval-0.82.1.dist-info/RECORD +105 -0
  81. dataeval/_output.py +0 -137
  82. dataeval/detectors/ood/metadata_ks_compare.py +0 -129
  83. dataeval/metrics/stats/_datasetstats.py +0 -198
  84. dataeval-0.81.0.dist-info/RECORD +0 -94
  85. {dataeval-0.81.0.dist-info → dataeval-0.82.1.dist-info}/LICENSE.txt +0 -0
  86. {dataeval-0.81.0.dist-info → dataeval-0.82.1.dist-info}/WHEEL +0 -0
@@ -4,35 +4,24 @@ __all__ = []
4
4
 
5
5
  import re
6
6
  import warnings
7
+ from collections import ChainMap
7
8
  from copy import deepcopy
8
9
  from dataclasses import dataclass
9
10
  from functools import partial
10
- from itertools import repeat
11
11
  from multiprocessing import Pool
12
- from typing import Any, Callable, Generic, Iterable, Optional, Sequence, Sized, TypeVar, Union
12
+ from typing import Any, Callable, Generic, Iterable, Sequence, TypeVar, cast
13
13
 
14
14
  import numpy as np
15
15
  import tqdm
16
16
  from numpy.typing import NDArray
17
17
 
18
- from dataeval._output import Output
19
18
  from dataeval.config import get_max_processes
20
- from dataeval.typing import ArrayLike
21
- from dataeval.utils._array import to_numpy_iter
19
+ from dataeval.outputs._stats import BaseStatsOutput, SourceIndex
20
+ from dataeval.typing import ArrayLike, Dataset, ObjectDetectionTarget
21
+ from dataeval.utils._array import to_numpy
22
22
  from dataeval.utils._image import normalize_image_shape, rescale
23
- from dataeval.utils._plot import histogram_plot
24
23
 
25
24
  DTYPE_REGEX = re.compile(r"NDArray\[np\.(.*?)\]")
26
- SOURCE_INDEX = "source_index"
27
- BOX_COUNT = "box_count"
28
-
29
- OptionalRange = Optional[Union[int, Iterable[int]]]
30
-
31
-
32
- def matches(index: int | None, opt_range: OptionalRange) -> bool:
33
- if index is None or opt_range is None:
34
- return True
35
- return index in opt_range if isinstance(opt_range, Iterable) else index == opt_range
36
25
 
37
26
 
38
27
  def normalize_box_shape(bounding_box: NDArray[Any]) -> NDArray[Any]:
@@ -48,87 +37,6 @@ def normalize_box_shape(bounding_box: NDArray[Any]) -> NDArray[Any]:
48
37
  return bounding_box
49
38
 
50
39
 
51
- @dataclass
52
- class SourceIndex:
53
- """
54
- Attributes
55
- ----------
56
- image: int
57
- Index of the source image
58
- box : int | None
59
- Index of the box of the source image
60
- channel : int | None
61
- Index of the channel of the source image
62
- """
63
-
64
- image: int
65
- box: int | None
66
- channel: int | None
67
-
68
-
69
- @dataclass(frozen=True)
70
- class BaseStatsOutput(Output):
71
- """
72
- Attributes
73
- ----------
74
- source_index : List[SourceIndex]
75
- Mapping from statistic to source image, box and channel index
76
- box_count : NDArray[np.uint16]
77
- """
78
-
79
- source_index: list[SourceIndex]
80
- box_count: NDArray[np.uint16]
81
-
82
- def get_channel_mask(
83
- self,
84
- channel_index: OptionalRange,
85
- channel_count: OptionalRange = None,
86
- ) -> list[bool]:
87
- """
88
- Boolean mask for results filtered to specified channel index and optionally the count
89
- of the channels per image.
90
-
91
- Parameters
92
- ----------
93
- channel_index : int | Iterable[int] | None
94
- Index or indices of channel(s) to filter for
95
- channel_count : int | Iterable[int] | None
96
- Optional count(s) of channels to filter for
97
- """
98
- mask: list[bool] = []
99
- cur_mask: list[bool] = []
100
- cur_image = 0
101
- cur_max_channel = 0
102
- for source_index in list(self.source_index) + [None]:
103
- if source_index is None or source_index.image > cur_image:
104
- mask.extend(cur_mask if matches(cur_max_channel + 1, channel_count) else [False for _ in cur_mask])
105
- if source_index is not None:
106
- cur_image = source_index.image
107
- cur_max_channel = 0
108
- cur_mask.clear()
109
- if source_index is not None:
110
- cur_mask.append(matches(source_index.channel, channel_index))
111
- cur_max_channel = max(cur_max_channel, source_index.channel or 0)
112
- return mask
113
-
114
- def __len__(self) -> int:
115
- return len(self.source_index)
116
-
117
-
118
- def _is_plottable(k: str, v: Any, excluded_keys: Iterable[str]) -> bool:
119
- return isinstance(v, np.ndarray) and v[v != 0].size > 0 and all(k != x for x in excluded_keys)
120
-
121
-
122
- class HistogramPlotMixin:
123
- _excluded_keys: Iterable[str] = []
124
-
125
- def dict(self) -> dict[str, Any]: ...
126
-
127
- def plot(self, log: bool) -> None:
128
- data_dict = {k: v for k, v in self.dict().items() if _is_plottable(k, v, self._excluded_keys)}
129
- histogram_plot(data_dict, log)
130
-
131
-
132
40
  TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput, covariant=True)
133
41
 
134
42
 
@@ -193,10 +101,9 @@ class StatsProcessor(Generic[TStatsOutput]):
193
101
  cls, source: dict[str, Any], source_index: list[SourceIndex], box_count: list[int]
194
102
  ) -> TStatsOutput:
195
103
  output = {}
196
- for key in source:
197
- if key not in cls.output_class.__annotations__:
198
- continue
199
- stat_type: str = cls.output_class.__annotations__[key]
104
+ attrs = dict(ChainMap(*(getattr(c, "__annotations__", {}) for c in cls.output_class.__mro__)))
105
+ for key in (key for key in source if key in attrs):
106
+ stat_type: str = attrs[key]
200
107
  dtype_match = re.match(DTYPE_REGEX, stat_type)
201
108
  if dtype_match is not None:
202
109
  output[key] = np.asarray(source[key], dtype=np.dtype(dtype_match.group(1)))
@@ -215,16 +122,20 @@ class StatsProcessorOutput:
215
122
 
216
123
  def process_stats(
217
124
  i: int,
218
- image_boxes: tuple[NDArray[Any], NDArray[Any] | None],
125
+ dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
126
+ per_box: bool,
219
127
  per_channel: bool,
220
128
  stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
221
129
  ) -> StatsProcessorOutput:
222
- image, boxes = image_boxes
130
+ data = dataset[i]
131
+ image, target = (to_numpy(cast(ArrayLike, data[0])), data[1]) if isinstance(data, tuple) else (to_numpy(data), None)
132
+ target = None if not isinstance(target, ObjectDetectionTarget) else target
133
+ boxes = to_numpy(target.boxes) if target is not None else None
223
134
  results_list: list[dict[str, Any]] = []
224
135
  source_indices: list[SourceIndex] = []
225
136
  box_counts: list[int] = []
226
137
  warnings_list: list[str] = []
227
- nboxes = [None] if boxes is None else normalize_box_shape(boxes)
138
+ nboxes = [None] if boxes is None or not per_box else normalize_box_shape(boxes)
228
139
  for i_b, box in enumerate(nboxes):
229
140
  i_b = None if box is None else i_b
230
141
  processor_list = [p(image, box, per_channel) for p in stats_processor_cls]
@@ -232,7 +143,7 @@ def process_stats(
232
143
  warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} is out of bounds of {image.shape}.")
233
144
  results_list.append({k: v for p in processor_list for k, v in p.process().items()})
234
145
  if per_channel:
235
- source_indices.extend([SourceIndex(i, i_b, c) for c in range(image_boxes[0].shape[-3])])
146
+ source_indices.extend([SourceIndex(i, i_b, c) for c in range(image.shape[-3])])
236
147
  else:
237
148
  source_indices.append(SourceIndex(i, i_b, None))
238
149
  box_counts.append(0 if boxes is None else len(boxes))
@@ -240,16 +151,18 @@ def process_stats(
240
151
 
241
152
 
242
153
  def process_stats_unpack(
243
- args: tuple[int, tuple[NDArray[Any], NDArray[Any] | None]],
154
+ i: int,
155
+ dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
156
+ per_box: bool,
244
157
  per_channel: bool,
245
158
  stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
246
159
  ) -> StatsProcessorOutput:
247
- return process_stats(*args, per_channel=per_channel, stats_processor_cls=stats_processor_cls)
160
+ return process_stats(i, dataset, per_box=per_box, per_channel=per_channel, stats_processor_cls=stats_processor_cls)
248
161
 
249
162
 
250
163
  def run_stats(
251
- images: Iterable[ArrayLike],
252
- bboxes: Iterable[ArrayLike] | None,
164
+ dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
165
+ per_box: bool,
253
166
  per_channel: bool,
254
167
  stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
255
168
  ) -> list[TStatsOutput]:
@@ -262,13 +175,11 @@ def run_stats(
262
175
 
263
176
  Parameters
264
177
  ----------
265
- images : Iterable[ArrayLike]
266
- An iterable of images (e.g., list of arrays), where each image is represented as an
267
- array-like structure (e.g., :term:`NumPy` arrays).
268
- bboxes : Iterable[ArrayLike]
269
- An iterable of bounding boxes (e.g. list of arrays) where each bounding box is represented
270
- as an array-like structure in the format of (X0, Y0, X1, Y1). The length of the bounding boxes
271
- iterable should match the length of the input images.
178
+ dataset : Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]]
179
+ A dataset of images and targets to compute statistics on.
180
+ per_box : bool
181
+ A flag which determines if the statistics should be evaluated on a per-box basis or not.
182
+ If the dataset does not include bounding boxes, this flag is ignored.
272
183
  per_channel : bool
273
184
  A flag which determines if the stats should be evaluated on a per-channel basis or not.
274
185
  stats_processor_cls : Iterable[type[StatsProcessor]]
@@ -276,10 +187,8 @@ def run_stats(
276
187
 
277
188
  Returns
278
189
  -------
279
- dict[str, NDArray]]
280
- A dictionary containing the computed statistics for each image.
281
- The dictionary keys correspond to the names of the statistics, and the values are :term:`NumPy` arrays
282
- with the results of the computations.
190
+ list[TStatsOutput]
191
+ A list of output classes containing the computed statistics
283
192
 
284
193
  Note
285
194
  ----
@@ -293,20 +202,24 @@ def run_stats(
293
202
  results_list: list[dict[str, NDArray[np.float64]]] = []
294
203
  source_index: list[SourceIndex] = []
295
204
  box_count: list[int] = []
296
- bbox_iter = repeat(None) if bboxes is None else to_numpy_iter(bboxes)
297
205
 
298
206
  warning_list = []
299
- total_for_status = len(images) if isinstance(images, Sized) else None
300
207
  stats_processor_cls = stats_processor_cls if isinstance(stats_processor_cls, Iterable) else [stats_processor_cls]
301
208
 
302
209
  # TODO: Introduce global controls for CPU job parallelism and GPU configurations
303
210
  with Pool(processes=get_max_processes()) as p:
304
211
  for r in tqdm.tqdm(
305
212
  p.imap(
306
- partial(process_stats_unpack, per_channel=per_channel, stats_processor_cls=stats_processor_cls),
307
- enumerate(zip(to_numpy_iter(images), bbox_iter)),
213
+ partial(
214
+ process_stats_unpack,
215
+ dataset=dataset,
216
+ per_box=per_box,
217
+ per_channel=per_channel,
218
+ stats_processor_cls=stats_processor_cls,
219
+ ),
220
+ range(len(dataset)),
308
221
  ),
309
- total=total_for_status,
222
+ total=len(dataset),
310
223
  ):
311
224
  results_list.extend(r.results)
312
225
  source_index.extend(r.source_indices)
@@ -335,13 +248,13 @@ def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
335
248
  if type(a) is not type(b):
336
249
  raise TypeError(f"Types {type(a)} and {type(b)} cannot be added.")
337
250
 
338
- sum_dict = deepcopy(a.dict())
251
+ sum_dict = deepcopy(a.data())
339
252
 
340
253
  for k in sum_dict:
341
254
  if isinstance(sum_dict[k], list):
342
- sum_dict[k].extend(b.dict()[k])
255
+ sum_dict[k].extend(b.data()[k])
343
256
  else:
344
- sum_dict[k] = np.concatenate((sum_dict[k], b.dict()[k]))
257
+ sum_dict[k] = np.concatenate((sum_dict[k], b.data()[k]))
345
258
 
346
259
  return type(a)(**sum_dict)
347
260
 
@@ -8,9 +8,8 @@ from typing import Any, Callable, Generic, TypeVar, cast
8
8
  import numpy as np
9
9
  from numpy.typing import NDArray
10
10
 
11
- from dataeval._output import set_metadata
12
- from dataeval.metrics.stats._base import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput
13
- from dataeval.metrics.stats._dimensionstats import DimensionStatsOutput
11
+ from dataeval.outputs._base import set_metadata
12
+ from dataeval.outputs._stats import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput, DimensionStatsOutput
14
13
 
15
14
  TStatOutput = TypeVar("TStatOutput", bound=BaseStatsOutput, contravariant=True)
16
15
  ArraySlice = tuple[int, int]
@@ -125,27 +124,28 @@ def boxratiostats(
125
124
 
126
125
  Examples
127
126
  --------
128
- Calculating the box ratio statistics using the dimension stats of the boxes and images
127
+ Calculate the box ratio statistics using the dimension stats of the images and boxes
128
+ on a dataset containing 15 targets.
129
129
 
130
130
  >>> from dataeval.metrics.stats import dimensionstats
131
- >>> imagestats = dimensionstats(stats_images)
132
- >>> boxstats = dimensionstats(stats_images, bboxes)
131
+ >>> imagestats = dimensionstats(dataset, per_box=False)
132
+ >>> boxstats = dimensionstats(dataset, per_box=True)
133
133
  >>> ratiostats = boxratiostats(boxstats, imagestats)
134
134
  >>> print(ratiostats.aspect_ratio)
135
- [ 0.86376953 0.58837891 16. 0.85714286 1.26959707 0.43772894
136
- 0.66650391 3.83296703 1.95018315]
135
+ [ 0.864 0.588 16. 0.857 1.27 0.438 0.667 3.833 1.95 0.833
136
+ 1. 0.6 0.522 15. 3.834]
137
137
  >>> print(ratiostats.size)
138
- [0.0255127 0.01037598 0.00097656 0.01822917 0.02327474 0.00683594
139
- 0.00915527 0.03369141 0.02115885]
138
+ [0.026 0.01 0.001 0.018 0.023 0.007 0.009 0.034 0.021 0.007 0.001 0.008
139
+ 0.017 0.001 0.008]
140
140
  """
141
141
  output_cls = type(boxstats)
142
142
  if type(boxstats) is not type(imgstats):
143
143
  raise TypeError("Must provide stats outputs of the same type.")
144
144
  if boxstats.source_index[-1].image != imgstats.source_index[-1].image:
145
145
  raise ValueError("Stats index_map length mismatch. Check if the correct box and image stats were provided.")
146
- if all(count == 0 for count in boxstats.box_count):
146
+ if any(src_idx.box is None for src_idx in boxstats.source_index):
147
147
  raise ValueError("Input for boxstats must contain box information.")
148
- if any(count != 0 for count in imgstats.box_count):
148
+ if any(src_idx.box is not None for src_idx in imgstats.source_index):
149
149
  raise ValueError("Input for imgstats must not contain box information.")
150
150
  boxstats_has_channels = any(si.channel is None for si in boxstats.source_index)
151
151
  imgstats_has_channels = any(si.channel is None for si in imgstats.source_index)
@@ -153,7 +153,7 @@ def boxratiostats(
153
153
  raise ValueError("Input for boxstats and imgstats must have matching channel information.")
154
154
 
155
155
  output_dict = {}
156
- for key in boxstats.dict():
156
+ for key in boxstats.data():
157
157
  output_dict[key] = calculate_ratios(key, boxstats, imgstats)
158
158
 
159
159
  return output_cls(**output_dict)
@@ -2,59 +2,17 @@ from __future__ import annotations
2
2
 
3
3
  __all__ = []
4
4
 
5
- from dataclasses import dataclass
6
- from typing import Any, Callable, Iterable
5
+ from typing import Any, Callable
7
6
 
8
7
  import numpy as np
9
- from numpy.typing import NDArray
10
8
 
11
- from dataeval._output import set_metadata
12
- from dataeval.metrics.stats._base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
13
- from dataeval.typing import ArrayLike
9
+ from dataeval.metrics.stats._base import StatsProcessor, run_stats
10
+ from dataeval.outputs import DimensionStatsOutput
11
+ from dataeval.outputs._base import set_metadata
12
+ from dataeval.typing import ArrayLike, Dataset
14
13
  from dataeval.utils._image import get_bitdepth
15
14
 
16
15
 
17
- @dataclass(frozen=True)
18
- class DimensionStatsOutput(BaseStatsOutput, HistogramPlotMixin):
19
- """
20
- Output class for :func:`.dimensionstats` stats metric.
21
-
22
- Attributes
23
- ----------
24
- left : NDArray[np.int32]
25
- Offsets from the left edge of images in pixels
26
- top : NDArray[np.int32]
27
- Offsets from the top edge of images in pixels
28
- width : NDArray[np.uint32]
29
- Width of the images in pixels
30
- height : NDArray[np.uint32]
31
- Height of the images in pixels
32
- channels : NDArray[np.uint8]
33
- Channel count of the images in pixels
34
- size : NDArray[np.uint32]
35
- Size of the images in pixels
36
- aspect_ratio : NDArray[np.float16]
37
- :term:`ASspect Ratio<Aspect Ratio>` of the images (width/height)
38
- depth : NDArray[np.uint8]
39
- Color depth of the images in bits
40
- center : NDArray[np.uint16]
41
- Offset from center in [x,y] coordinates of the images in pixels
42
- distance : NDArray[np.float16]
43
- Distance in pixels from center
44
- """
45
-
46
- left: NDArray[np.int32]
47
- top: NDArray[np.int32]
48
- width: NDArray[np.uint32]
49
- height: NDArray[np.uint32]
50
- channels: NDArray[np.uint8]
51
- size: NDArray[np.uint32]
52
- aspect_ratio: NDArray[np.float16]
53
- depth: NDArray[np.uint8]
54
- center: NDArray[np.int16]
55
- distance: NDArray[np.float16]
56
-
57
-
58
16
  class DimensionStatsProcessor(StatsProcessor[DimensionStatsOutput]):
59
17
  output_class: type = DimensionStatsOutput
60
18
  image_function_map: dict[str, Callable[[StatsProcessor[DimensionStatsOutput]], Any]] = {
@@ -76,8 +34,9 @@ class DimensionStatsProcessor(StatsProcessor[DimensionStatsOutput]):
76
34
 
77
35
  @set_metadata
78
36
  def dimensionstats(
79
- images: Iterable[ArrayLike],
80
- bboxes: Iterable[ArrayLike] | None = None,
37
+ dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
38
+ *,
39
+ per_box: bool = False,
81
40
  ) -> DimensionStatsOutput:
82
41
  """
83
42
  Calculates dimension :term:`statistics<Statistics>` for each image.
@@ -87,10 +46,10 @@ def dimensionstats(
87
46
 
88
47
  Parameters
89
48
  ----------
90
- images : Iterable[ArrayLike]
91
- Images to perform calculations on
92
- bboxes : Iterable[ArrayLike] or None
93
- Bounding boxes in `xyxy` format for each image to perform calculations on
49
+ dataset : Dataset
50
+ Dataset to perform calculations on.
51
+ per_box : bool, default False
52
+ If True, perform calculations on each bounding box.
94
53
 
95
54
  Returns
96
55
  -------
@@ -105,12 +64,12 @@ def dimensionstats(
105
64
 
106
65
  Examples
107
66
  --------
108
- Calculating the dimension statistics on the images, whose shape is (C, H, W)
67
+ Calculate the dimension statistics of a dataset of 8 images, whose shape is (C, H, W).
109
68
 
110
- >>> results = dimensionstats(stats_images)
69
+ >>> results = dimensionstats(dataset)
111
70
  >>> print(results.aspect_ratio)
112
- [1. 1. 1.333 1. 0.6665]
71
+ [1. 1. 1.333 1. 0.667 1. 1. 1. ]
113
72
  >>> print(results.channels)
114
- [3 3 1 3 1]
73
+ [3 3 1 3 1 3 3 3]
115
74
  """
116
- return run_stats(images, bboxes, False, [DimensionStatsProcessor])[0]
75
+ return run_stats(dataset, per_box, False, [DimensionStatsProcessor])[0]
@@ -4,17 +4,17 @@ import warnings
4
4
 
5
5
  __all__ = []
6
6
 
7
- from dataclasses import dataclass
8
- from typing import Callable, Iterable
7
+ from typing import Any, Callable
9
8
 
10
9
  import numpy as np
11
10
  import xxhash as xxh
12
11
  from PIL import Image
13
12
  from scipy.fftpack import dct
14
13
 
15
- from dataeval._output import set_metadata
16
- from dataeval.metrics.stats._base import BaseStatsOutput, StatsProcessor, run_stats
17
- from dataeval.typing import ArrayLike
14
+ from dataeval.metrics.stats._base import StatsProcessor, run_stats
15
+ from dataeval.outputs import HashStatsOutput
16
+ from dataeval.outputs._base import set_metadata
17
+ from dataeval.typing import ArrayLike, Dataset
18
18
  from dataeval.utils._array import as_numpy
19
19
  from dataeval.utils._image import normalize_image_shape, rescale
20
20
 
@@ -22,23 +22,6 @@ HASH_SIZE = 8
22
22
  MAX_FACTOR = 4
23
23
 
24
24
 
25
- @dataclass(frozen=True)
26
- class HashStatsOutput(BaseStatsOutput):
27
- """
28
- Output class for :func:`.hashstats` stats metric.
29
-
30
- Attributes
31
- ----------
32
- xxhash : List[str]
33
- xxHash hash of the images as a hex string
34
- pchash : List[str]
35
- :term:`Perception-based Hash` of the images as a hex string
36
- """
37
-
38
- xxhash: list[str]
39
- pchash: list[str]
40
-
41
-
42
25
  def pchash(image: ArrayLike) -> str:
43
26
  """
44
27
  Performs a perceptual hash on an image by resizing to a square NxN image
@@ -122,8 +105,9 @@ class HashStatsProcessor(StatsProcessor[HashStatsOutput]):
122
105
 
123
106
  @set_metadata
124
107
  def hashstats(
125
- images: Iterable[ArrayLike],
126
- bboxes: Iterable[ArrayLike] | None = None,
108
+ dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
109
+ *,
110
+ per_box: bool = False,
127
111
  ) -> HashStatsOutput:
128
112
  """
129
113
  Calculates hashes for each image.
@@ -133,10 +117,10 @@ def hashstats(
133
117
 
134
118
  Parameters
135
119
  ----------
136
- images : ArrayLike
137
- Images to hashing
138
- bboxes : Iterable[ArrayLike] or None
139
- Bounding boxes in `xyxy` format for each image
120
+ dataset : Dataset
121
+ Dataset to perform calculations on.
122
+ per_box : bool, default False
123
+ If True, perform calculations on each bounding box.
140
124
 
141
125
  Returns
142
126
  -------
@@ -149,12 +133,12 @@ def hashstats(
149
133
 
150
134
  Examples
151
135
  --------
152
- Calculating the statistics on the images, whose shape is (C, H, W)
136
+ Calculate the hashes of a dataset of images, whose shape is (C, H, W)
153
137
 
154
- >>> results = hashstats(stats_images)
155
- >>> print(results.xxhash)
156
- ['6274f837b34ed9f0', '256504fdb6e3d2a4', '7dd0c56ca8474fb0', '50956ad4592f5bbc', '5ba2354079d42aa5']
157
- >>> print(results.pchash)
158
- ['a666999999666666', 'e666999999266666', 'e666999966663299', 'e666999999266666', '96e91656e91616e9']
138
+ >>> results = hashstats(dataset)
139
+ >>> print(results.xxhash[:5])
140
+ ['66a93f556577c086', 'd8b686fb405c4105', '7ffdb4990ad44ac6', '42cd4c34c80f6006', 'c5519e36ac1f8839']
141
+ >>> print(results.pchash[:5])
142
+ ['e666999999266666', 'e666999999266666', 'e666999966666299', 'e666999999266666', '96e91656e91616e9']
159
143
  """
160
- return run_stats(images, bboxes, False, [HashStatsProcessor])[0]
144
+ return run_stats(dataset, per_box, False, [HashStatsProcessor])[0]
@@ -0,0 +1,94 @@
1
+ from __future__ import annotations
2
+
3
+ __all__ = []
4
+
5
+ from typing import Any, Literal, overload
6
+
7
+ from dataeval.metrics.stats._base import run_stats
8
+ from dataeval.metrics.stats._dimensionstats import DimensionStatsProcessor
9
+ from dataeval.metrics.stats._pixelstats import PixelStatsProcessor
10
+ from dataeval.metrics.stats._visualstats import VisualStatsProcessor
11
+ from dataeval.outputs import ChannelStatsOutput, ImageStatsOutput
12
+ from dataeval.outputs._base import set_metadata
13
+ from dataeval.typing import ArrayLike, Dataset
14
+
15
+
16
+ @overload
17
+ def imagestats(
18
+ dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
19
+ *,
20
+ per_box: bool = False,
21
+ per_channel: Literal[True],
22
+ ) -> ChannelStatsOutput: ...
23
+
24
+
25
+ @overload
26
+ def imagestats(
27
+ dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
28
+ *,
29
+ per_box: bool = False,
30
+ per_channel: Literal[False] = False,
31
+ ) -> ImageStatsOutput: ...
32
+
33
+
34
+ @set_metadata
35
+ def imagestats(
36
+ dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
37
+ *,
38
+ per_box: bool = False,
39
+ per_channel: bool = False,
40
+ ) -> ImageStatsOutput | ChannelStatsOutput:
41
+ """
42
+ Calculates various :term:`statistics<Statistics>` for each image.
43
+
44
+ This function computes dimension, pixel and visual metrics
45
+ on the images or individual bounding boxes for each image. If
46
+ performing calculations per channel, dimension stats are excluded.
47
+
48
+ Parameters
49
+ ----------
50
+ dataset : Dataset
51
+ Dataset to perform calculations on.
52
+ per_box : bool, default False
53
+ If True, perform calculations on each bounding box.
54
+ per_channel : bool, default False
55
+ If True, perform calculations on each channel.
56
+
57
+ Returns
58
+ -------
59
+ ImageStatsOutput or ChannelStatsOutput
60
+ Output class containing the outputs of various stats functions
61
+
62
+ See Also
63
+ --------
64
+ dimensionstats, pixelstats, visualstats
65
+
66
+ Examples
67
+ --------
68
+ Calculate dimension, pixel and visual statistics for a dataset containing 8
69
+ images.
70
+
71
+ >>> stats = imagestats(dataset)
72
+ >>> print(stats.aspect_ratio)
73
+ [1. 1. 1.333 1. 0.667 1. 1. 1. ]
74
+
75
+ >>> print(stats.sharpness)
76
+ [20.23 20.23 23.33 20.23 77.06 20.23 20.23 20.23]
77
+
78
+ Calculate the pixel and visual stats for a dataset containing 6 3-channel
79
+ images and 2 1-channel images for a total of 20 channels.
80
+
81
+ >>> ch_stats = imagestats(dataset, per_channel=True)
82
+ >>> print(ch_stats.brightness)
83
+ [0.027 0.152 0.277 0.127 0.135 0.142 0.259 0.377 0.385 0.392 0.508 0.626
84
+ 0.634 0.642 0.751 0.759 0.767 0.876 0.884 0.892]
85
+ """
86
+ if per_channel:
87
+ processors = [PixelStatsProcessor, VisualStatsProcessor]
88
+ output_cls = ChannelStatsOutput
89
+ else:
90
+ processors = [DimensionStatsProcessor, PixelStatsProcessor, VisualStatsProcessor]
91
+ output_cls = ImageStatsOutput
92
+
93
+ outputs = run_stats(dataset, per_box, per_channel, processors)
94
+ return output_cls(**{k: v for d in outputs for k, v in d.data().items()})