dataeval 0.86.0__py3-none-any.whl → 0.86.2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (66)
  1. dataeval/__init__.py +1 -1
  2. dataeval/_log.py +1 -1
  3. dataeval/config.py +21 -4
  4. dataeval/data/_embeddings.py +2 -2
  5. dataeval/data/_images.py +2 -3
  6. dataeval/data/_metadata.py +188 -178
  7. dataeval/data/_selection.py +1 -2
  8. dataeval/data/_split.py +4 -5
  9. dataeval/data/_targets.py +17 -13
  10. dataeval/data/selections/_classfilter.py +2 -5
  11. dataeval/data/selections/_prioritize.py +6 -9
  12. dataeval/data/selections/_shuffle.py +3 -1
  13. dataeval/detectors/drift/_base.py +4 -5
  14. dataeval/detectors/drift/_mmd.py +3 -6
  15. dataeval/detectors/drift/_nml/_base.py +4 -2
  16. dataeval/detectors/drift/_nml/_chunk.py +11 -19
  17. dataeval/detectors/drift/_nml/_domainclassifier.py +8 -19
  18. dataeval/detectors/drift/_nml/_result.py +8 -9
  19. dataeval/detectors/drift/_nml/_thresholds.py +66 -77
  20. dataeval/detectors/linters/outliers.py +7 -7
  21. dataeval/metadata/_distance.py +10 -7
  22. dataeval/metadata/_ood.py +11 -103
  23. dataeval/metrics/bias/_balance.py +23 -33
  24. dataeval/metrics/bias/_diversity.py +16 -14
  25. dataeval/metrics/bias/_parity.py +18 -18
  26. dataeval/metrics/estimators/_divergence.py +2 -4
  27. dataeval/metrics/stats/_base.py +103 -42
  28. dataeval/metrics/stats/_boxratiostats.py +21 -19
  29. dataeval/metrics/stats/_dimensionstats.py +14 -10
  30. dataeval/metrics/stats/_hashstats.py +1 -1
  31. dataeval/metrics/stats/_pixelstats.py +6 -6
  32. dataeval/metrics/stats/_visualstats.py +3 -3
  33. dataeval/outputs/_base.py +22 -7
  34. dataeval/outputs/_bias.py +24 -70
  35. dataeval/outputs/_drift.py +1 -9
  36. dataeval/outputs/_linters.py +11 -11
  37. dataeval/outputs/_stats.py +82 -23
  38. dataeval/outputs/_workflows.py +2 -2
  39. dataeval/utils/_array.py +6 -9
  40. dataeval/utils/_bin.py +1 -2
  41. dataeval/utils/_clusterer.py +7 -4
  42. dataeval/utils/_fast_mst.py +27 -13
  43. dataeval/utils/_image.py +65 -11
  44. dataeval/utils/_mst.py +1 -3
  45. dataeval/utils/_plot.py +15 -10
  46. dataeval/utils/data/_dataset.py +54 -28
  47. dataeval/utils/data/metadata.py +104 -82
  48. dataeval/utils/datasets/__init__.py +2 -0
  49. dataeval/utils/datasets/_antiuav.py +189 -0
  50. dataeval/utils/datasets/_base.py +11 -8
  51. dataeval/utils/datasets/_cifar10.py +104 -45
  52. dataeval/utils/datasets/_fileio.py +21 -47
  53. dataeval/utils/datasets/_milco.py +22 -12
  54. dataeval/utils/datasets/_mixin.py +2 -4
  55. dataeval/utils/datasets/_mnist.py +3 -4
  56. dataeval/utils/datasets/_ships.py +14 -7
  57. dataeval/utils/datasets/_voc.py +229 -42
  58. dataeval/utils/torch/models.py +5 -10
  59. dataeval/utils/torch/trainer.py +3 -3
  60. dataeval/workflows/sufficiency.py +2 -2
  61. {dataeval-0.86.0.dist-info → dataeval-0.86.2.dist-info}/METADATA +2 -1
  62. dataeval-0.86.2.dist-info/RECORD +114 -0
  63. dataeval/detectors/ood/vae.py +0 -74
  64. dataeval-0.86.0.dist-info/RECORD +0 -114
  65. {dataeval-0.86.0.dist-info → dataeval-0.86.2.dist-info}/LICENSE.txt +0 -0
  66. {dataeval-0.86.0.dist-info → dataeval-0.86.2.dist-info}/WHEEL +0 -0
dataeval/utils/_image.py CHANGED
@@ -12,6 +12,9 @@ from scipy.signal import convolve2d
 EDGE_KERNEL = np.array([[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]], dtype=np.int8)
 BIT_DEPTH = (1, 8, 12, 16, 32)
 
+Box = tuple[int, int, int, int]
+"""Bounding box as tuple of integers in x0, y0, x1, y1 format."""
+
 
 @dataclass
 class BitDepth:
@@ -25,12 +28,11 @@ def get_bitdepth(image: NDArray[Any]) -> BitDepth:
     Approximates the bit depth of the image using the
     min and max pixel values.
     """
-    pmin, pmax = np.min(image), np.max(image)
+    pmin, pmax = np.nanmin(image), np.nanmax(image)
     if pmin < 0:
         return BitDepth(0, pmin, pmax)
-    else:
-        depth = ([x for x in BIT_DEPTH if 2**x > pmax] or [max(BIT_DEPTH)])[0]
-        return BitDepth(depth, 0, 2**depth - 1)
+    depth = ([x for x in BIT_DEPTH if 2**x > pmax] or [max(BIT_DEPTH)])[0]
+    return BitDepth(depth, 0, 2**depth - 1)
 
 
 def rescale(image: NDArray[Any], depth: int = 1) -> NDArray[Any]:
@@ -40,9 +42,8 @@ def rescale(image: NDArray[Any], depth: int = 1) -> NDArray[Any]:
     bitdepth = get_bitdepth(image)
     if bitdepth.depth == depth:
         return image
-    else:
-        normalized = (image + bitdepth.pmin) / (bitdepth.pmax - bitdepth.pmin)
-        return normalized * (2**depth - 1)
+    normalized = (image + bitdepth.pmin) / (bitdepth.pmax - bitdepth.pmin)
+    return normalized * (2**depth - 1)
 
 
 def normalize_image_shape(image: NDArray[Any]) -> NDArray[Any]:
@@ -52,13 +53,12 @@ def normalize_image_shape(image: NDArray[Any]) -> NDArray[Any]:
     ndim = image.ndim
     if ndim == 2:
         return np.expand_dims(image, axis=0)
-    elif ndim == 3:
+    if ndim == 3:
         return image
-    elif ndim > 3:
+    if ndim > 3:
         # Slice all but the last 3 dimensions
         return image[(0,) * (ndim - 3)]
-    else:
-        raise ValueError("Images must have 2 or more dimensions.")
+    raise ValueError("Images must have 2 or more dimensions.")
 
 
 def edge_filter(image: NDArray[Any], offset: float = 0.5) -> NDArray[np.uint8]:
@@ -71,3 +71,57 @@ def edge_filter(image: NDArray[Any], offset: float = 0.5) -> NDArray[np.uint8]:
     edges = convolve2d(image, EDGE_KERNEL, mode="same", boundary="symm") + offset
     np.clip(edges, 0, 255, edges)
     return edges
+
+
+def clip_box(image: NDArray[Any], box: Box) -> Box:
+    """
+    Clip the box to inside the provided image dimensions.
+    """
+    x0, y0, x1, y1 = box
+    h, w = image.shape[-2:]
+
+    return max(0, x0), max(0, y0), min(w, x1), min(h, y1)
+
+
+def is_valid_box(box: Box) -> bool:
+    """
+    Check if the box dimensions provided are a valid image.
+    """
+    return box[2] > box[0] and box[3] > box[1]
+
+
+def clip_and_pad(image: NDArray[Any], box: Box) -> NDArray[Any]:
+    """
+    Extract a region from an image based on a bounding box, clipping to image boundaries
+    and padding out-of-bounds areas with np.nan.
+
+    Parameters:
+    -----------
+    image : NDArray[Any]
+        Input image array in format C, H, W (channels first)
+    box : Box
+        Bounding box coordinates as (x0, y0, x1, y1) where (x0, y0) is top-left and (x1, y1) is bottom-right
+
+    Returns:
+    --------
+    NDArray[Any]
+        The extracted region with out-of-bounds areas padded with np.nan
+    """
+
+    # Create output array filled with NaN with a minimum size of 1x1
+    bw, bh = max(1, box[2] - box[0]), max(1, box[3] - box[1])
+
+    output = np.full((image.shape[-3] if image.ndim > 2 else 1, bh, bw), np.nan)
+
+    # Calculate source box
+    sbox = clip_box(image, box)
+
+    # Calculate destination box
+    x0, y0 = sbox[0] - box[0], sbox[1] - box[1]
+    x1, y1 = x0 + (sbox[2] - sbox[0]), y0 + (sbox[3] - sbox[1])
+
+    # Copy the source if valid from the image to the output
+    if is_valid_box(sbox):
+        output[:, y0:y1, x0:x1] = image[:, sbox[1] : sbox[3], sbox[0] : sbox[2]]
+
+    return output
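The new clip_and_pad helper pairs with the np.nanmin/np.nanmax change in get_bitdepth: pixels outside the image become NaN rather than skewing statistics. A minimal usage sketch, assuming the helpers are imported from the private module dataeval/utils/_image.py shown in this diff:

    import numpy as np
    from dataeval.utils._image import clip_and_pad  # private module; path per this diff

    # 3-channel 4x4 image; the box extends one column past the right edge
    image = np.arange(3 * 4 * 4, dtype=np.float64).reshape(3, 4, 4)
    region = clip_and_pad(image, (2, 1, 5, 3))  # (x0, y0, x1, y1)

    print(region.shape)            # (3, 2, 3): channels, box height, box width
    print(np.isnan(region).any())  # True: the column at x=4 falls outside the image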
dataeval/utils/_mst.py CHANGED
@@ -83,6 +83,4 @@ def compute_neighbors(
 
     nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=algorithm).fit(B)
     nns = nbrs.kneighbors(A)[1]
-    nns = nns[:, 1:].squeeze()
-
-    return nns
+    return nns[:, 1:].squeeze()
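For context, the pattern above requests k + 1 neighbors because when a point set is queried against itself, each point's nearest neighbor is itself in column 0. A standalone scikit-learn sketch of the same idiom (not dataeval code):

    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    points = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [5.0, 5.0]])
    k = 2
    nbrs = NearestNeighbors(n_neighbors=k + 1).fit(points)
    indices = nbrs.kneighbors(points)[1]  # column 0 is each point itself
    print(indices[:, 1:])                 # the k true nearest neighbors per point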
dataeval/utils/_plot.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
 __all__ = []
 
 import contextlib
+import math
 from typing import Any
 
 import numpy as np
@@ -160,11 +161,9 @@ def histogram_plot(
     import matplotlib.pyplot as plt
 
     num_metrics = len(data_dict)
-    if num_metrics > 2:
-        rows = int(len(data_dict) / 3)
-        fig, axs = plt.subplots(rows, 3, figsize=(10, rows * 2.5))
-    else:
-        fig, axs = plt.subplots(1, num_metrics, figsize=(4 * num_metrics, 4))
+    rows = math.ceil(num_metrics / 3)
+    cols = min(num_metrics, 3)
+    fig, axs = plt.subplots(rows, 3, figsize=(cols * 3 + 1, rows * 3))
 
     for ax, metric in zip(
         axs.flat,
@@ -178,6 +177,10 @@ def histogram_plot(
         ax.set_ylabel(ylabel)
         ax.set_xlabel(xlabel)
 
+    for ax in axs.flat[num_metrics:]:
+        ax.axis("off")
+        ax.set_visible(False)
+
     fig.tight_layout()
     return fig
 
@@ -216,11 +219,9 @@ def channel_histogram_plot(
     label_kwargs = {"label": [f"Channel {i}" for i in range(max_channels)]}
 
     num_metrics = len(data_keys)
-    if num_metrics > 2:
-        rows = int(len(data_keys) / 3)
-        fig, axs = plt.subplots(rows, 3, figsize=(10, rows * 2.5))
-    else:
-        fig, axs = plt.subplots(1, num_metrics, figsize=(4 * num_metrics, 4))
+    rows = math.ceil(num_metrics / 3)
+    cols = min(num_metrics, 3)
+    fig, axs = plt.subplots(rows, 3, figsize=(cols * 3 + 1, rows * 3))
 
     for ax, metric in zip(
         axs.flat,
@@ -245,5 +246,9 @@ def channel_histogram_plot(
         ax.set_ylabel(ylabel)
         ax.set_xlabel(xlabel)
 
+    for ax in axs.flat[num_metrics:]:
+        ax.axis("off")
+        ax.set_visible(False)
+
     fig.tight_layout()
     return fig
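Both plotting functions now share the same grid arithmetic: always three columns, math.ceil rows, and any leftover axes switched off. The old int(n / 3) branch under-allocated rows (e.g. four metrics produced a 1x3 grid, so the zip over axs.flat silently dropped the fourth plot). A quick check of the new layout math:

    import math

    for num_metrics in (1, 3, 4, 7):
        rows = math.ceil(num_metrics / 3)
        cols = min(num_metrics, 3)
        print(f"{num_metrics} metrics -> {rows} row(s), figsize {(cols * 3 + 1, rows * 3)}")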
dataeval/utils/data/_dataset.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 __all__ = []
 
-from typing import Any, Generic, Iterable, Literal, Sequence, TypeVar
+from typing import Any, Generic, Iterable, Literal, Sequence, SupportsFloat, SupportsInt, TypeVar, cast
 
 from dataeval.typing import (
     Array,
@@ -17,9 +17,9 @@ from dataeval.utils._array import as_numpy
 def _validate_data(
     datum_type: Literal["ic", "od"],
     images: Array | Sequence[Array],
-    labels: Sequence[int] | Sequence[Sequence[int]],
-    bboxes: Sequence[Sequence[Sequence[float]]] | None,
-    metadata: Sequence[dict[str, Any]] | None,
+    labels: Array | Sequence[int] | Sequence[Array] | Sequence[Sequence[int]],
+    bboxes: Array | Sequence[Array] | Sequence[Sequence[Array]] | Sequence[Sequence[Sequence[float]]] | None,
+    metadata: Sequence[dict[str, Any]] | dict[str, Sequence[Any]] | None,
 ) -> None:
     # Validate inputs
     dataset_len = len(images)
@@ -30,20 +30,31 @@ def _validate_data(
         raise ValueError(f"Number of labels ({len(labels)}) does not match number of images ({dataset_len}).")
     if bboxes is not None and len(bboxes) != dataset_len:
         raise ValueError(f"Number of bboxes ({len(bboxes)}) does not match number of images ({dataset_len}).")
-    if metadata is not None and len(metadata) != dataset_len:
+    if metadata is not None and (
+        len(metadata) != dataset_len
+        if isinstance(metadata, Sequence)
+        else any(
+            not isinstance(metadatum, Sequence) or len(metadatum) != dataset_len for metadatum in metadata.values()
+        )
+    ):
         raise ValueError(f"Number of metadata ({len(metadata)}) does not match number of images ({dataset_len}).")
 
     if datum_type == "ic":
-        if not isinstance(labels, Sequence) or not isinstance(labels[0], int):
+        if not isinstance(labels, (Sequence, Array)) or not isinstance(labels[0], (int, SupportsInt)):
            raise TypeError("Labels must be a sequence of integers for image classification.")
     elif datum_type == "od":
-        if not isinstance(labels, Sequence) or not isinstance(labels[0], Sequence) or not isinstance(labels[0][0], int):
+        if (
+            not isinstance(labels, (Sequence, Array))
+            or not isinstance(labels[0], (Sequence, Array))
+            or not isinstance(cast(Sequence[Any], labels[0])[0], (int, SupportsInt))
+        ):
             raise TypeError("Labels must be a sequence of sequences of integers for object detection.")
         if (
             bboxes is None
             or not isinstance(bboxes, (Sequence, Array))
             or not isinstance(bboxes[0], (Sequence, Array))
             or not isinstance(bboxes[0][0], (Sequence, Array))
+            or not isinstance(bboxes[0][0][0], (float, SupportsFloat))
             or not len(bboxes[0][0]) == 4
         ):
             raise TypeError("Boxes must be a sequence of sequences of (x0, y0, x1, y1) for object detection.")
@@ -51,12 +62,19 @@ def _validate_data(
         raise ValueError(f"Unknown datum type '{datum_type}'. Must be 'ic' or 'od'.")
 
 
+def _listify_metadata(
+    metadata: Sequence[dict[str, Any]] | dict[str, Sequence[Any]] | None,
+) -> Sequence[dict[str, Any]] | None:
+    if isinstance(metadata, dict):
+        return [{k: v[i] for k, v in metadata.items()} for i in range(len(next(iter(metadata.values()))))]
+    return metadata
+
+
 def _find_max(arr: ArrayLike) -> Any:
-    if isinstance(arr, (Iterable, Sequence, Array)):
+    if not isinstance(arr, (bytes, str)) and isinstance(arr, (Iterable, Sequence, Array)):
         if isinstance(arr[0], (Iterable, Sequence, Array)):
             return max([_find_max(x) for x in arr])  # type: ignore
-        else:
-            return max(arr)
+        return max(arr)
     return arr
 
 
@@ -92,12 +110,14 @@ class CustomImageClassificationDataset(BaseAnnotatedDataset[Sequence[int]], Imag
     def __init__(
         self,
         images: Array | Sequence[Array],
-        labels: Sequence[int],
+        labels: Array | Sequence[int],
         metadata: Sequence[dict[str, Any]] | None,
         classes: Sequence[str] | None,
         name: str | None = None,
     ) -> None:
-        super().__init__("ic", images, labels, metadata, classes)
+        super().__init__(
+            "ic", images, as_numpy(labels).tolist() if isinstance(labels, Array) else labels, metadata, classes
+        )
         if name is not None:
             self.__name__ = name
             self.__class__.__name__ = name
@@ -135,18 +155,24 @@ class CustomObjectDetectionDataset(BaseAnnotatedDataset[Sequence[Sequence[int]]]
     def __init__(
         self,
         images: Array | Sequence[Array],
-        labels: Sequence[Sequence[int]],
-        bboxes: Sequence[Sequence[Sequence[float]]],
+        labels: Array | Sequence[Array] | Sequence[Sequence[int]],
+        bboxes: Array | Sequence[Array] | Sequence[Sequence[Array]] | Sequence[Sequence[Sequence[float]]],
        metadata: Sequence[dict[str, Any]] | None,
        classes: Sequence[str] | None,
        name: str | None = None,
     ) -> None:
-        super().__init__("od", images, labels, metadata, classes)
+        super().__init__(
+            "od",
+            images,
+            [as_numpy(label).tolist() if isinstance(label, Array) else label for label in labels],
+            metadata,
+            classes,
+        )
         if name is not None:
             self.__name__ = name
             self.__class__.__name__ = name
             self.__class__.__qualname__ = name
-        self._bboxes = bboxes
+        self._bboxes = [[as_numpy(box).tolist() if isinstance(box, Array) else box for box in bbox] for bbox in bboxes]
 
     @property
     def metadata(self) -> DatasetMetadata:
@@ -162,8 +188,8 @@ class CustomObjectDetectionDataset(BaseAnnotatedDataset[Sequence[Sequence[int]]]
 
 def to_image_classification_dataset(
     images: Array | Sequence[Array],
-    labels: Sequence[int],
-    metadata: Sequence[dict[str, Any]] | None,
+    labels: Array | Sequence[int],
+    metadata: Sequence[dict[str, Any]] | dict[str, Sequence[Any]] | None,
     classes: Sequence[str] | None,
     name: str | None = None,
 ) -> ImageClassificationDataset:
@@ -174,9 +200,9 @@ def to_image_classification_dataset(
     ----------
     images : Array | Sequence[Array]
         The images to use in the dataset.
-    labels : Sequence[int]
+    labels : Array | Sequence[int]
         The labels to use in the dataset.
-    metadata : Sequence[dict[str, Any]] | None
+    metadata : Sequence[dict[str, Any]] | dict[str, Sequence[Any]] | None
         The metadata to use in the dataset.
     classes : Sequence[str] | None
         The classes to use in the dataset.
@@ -186,14 +212,14 @@ def to_image_classification_dataset(
     ImageClassificationDataset
     """
     _validate_data("ic", images, labels, None, metadata)
-    return CustomImageClassificationDataset(images, labels, metadata, classes, name)
+    return CustomImageClassificationDataset(images, labels, _listify_metadata(metadata), classes, name)
 
 
 def to_object_detection_dataset(
     images: Array | Sequence[Array],
-    labels: Sequence[Sequence[int]],
-    bboxes: Sequence[Sequence[Sequence[float]]],
-    metadata: Sequence[dict[str, Any]] | None,
+    labels: Array | Sequence[Array] | Sequence[Sequence[int]],
+    bboxes: Array | Sequence[Array] | Sequence[Sequence[Array]] | Sequence[Sequence[Sequence[float]]],
+    metadata: Sequence[dict[str, Any]] | dict[str, Sequence[Any]] | None,
     classes: Sequence[str] | None,
     name: str | None = None,
 ) -> ObjectDetectionDataset:
@@ -204,11 +230,11 @@ def to_object_detection_dataset(
     ----------
     images : Array | Sequence[Array]
         The images to use in the dataset.
-    labels : Sequence[Sequence[int]]
+    labels : Array | Sequence[Array] | Sequence[Sequence[int]]
        The labels to use in the dataset.
-    bboxes : Sequence[Sequence[Sequence[float]]]
+    bboxes : Array | Sequence[Array] | Sequence[Sequence[Array]] | Sequence[Sequence[Sequence[float]]]
        The bounding boxes (x0,y0,x1,y0) to use in the dataset.
-    metadata : Sequence[dict[str, Any]] | None
+    metadata : Sequence[dict[str, Any]] | dict[str, Sequence[Any]] | None
        The metadata to use in the dataset.
     classes : Sequence[str] | None
        The classes to use in the dataset.
@@ -218,4 +244,4 @@ def to_object_detection_dataset(
     ObjectDetectionDataset
     """
     _validate_data("od", images, labels, bboxes, metadata)
-    return CustomObjectDetectionDataset(images, labels, bboxes, metadata, classes, name)
+    return CustomObjectDetectionDataset(images, labels, bboxes, _listify_metadata(metadata), classes, name)
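The widened signatures accept NumPy-style arrays for labels and boxes and a dict-of-sequences for metadata, which _listify_metadata converts to per-image dicts. A hypothetical usage sketch; the public import path and class names are assumptions, since this diff only shows the private module dataeval/utils/data/_dataset.py:

    import numpy as np
    from dataeval.utils.data import to_object_detection_dataset  # import path assumed

    images = [np.zeros((3, 16, 16)) for _ in range(2)]
    labels = [np.array([0]), np.array([1, 1])]  # Sequence[Array], one label per box
    bboxes = [
        np.array([[0.0, 0.0, 4.0, 4.0]]),
        np.array([[1.0, 1.0, 5.0, 5.0], [2.0, 2.0, 6.0, 6.0]]),
    ]
    metadata = {"altitude": [100, 250]}  # dict[str, Sequence[Any]], one value per image

    dataset = to_object_detection_dataset(images, labels, bboxes, metadata, classes=["boat", "buoy"])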
dataeval/utils/data/metadata.py CHANGED
@@ -228,58 +228,130 @@ def flatten(
 
     if return_dropped:
         return output, size, _sorted_drop_reasons(dropped)
+    if dropped:
+        dropped_items = "\n".join([f" {k}: {v}" for k, v in _sorted_drop_reasons(dropped).items()])
+        warnings.warn(f"Metadata entries were dropped:\n{dropped_items}")
+    return output, size
+
+
+def _flatten_for_merge(
+    metadatum: Mapping[str, Any],
+    ignore_lists: bool,
+    fully_qualified: bool,
+    targets: int | None,
+) -> tuple[dict[str, list[Any]] | dict[str, Any], int, dict[str, list[str]]]:
+    flattened, image_repeats, dropped_inner = flatten(
+        metadatum, return_dropped=True, ignore_lists=ignore_lists, fully_qualified=fully_qualified
+    )
+    if targets is not None:
+        # check for mismatch in targets per image and force ignore_lists
+        if not ignore_lists and targets != image_repeats:
+            flattened, image_repeats, dropped_inner = flatten(
+                metadatum, return_dropped=True, ignore_lists=True, fully_qualified=fully_qualified
+            )
+        if targets != image_repeats:
+            flattened = {k: [v] * targets for k, v in flattened.items()}
+            image_repeats = targets
+    return flattened, image_repeats, dropped_inner
+
+
+def _merge(
+    dicts: list[Mapping[str, Any]],
+    ignore_lists: bool,
+    fully_qualified: bool,
+    targets_per_image: Sequence[int] | None,
+) -> tuple[dict[str, list[Any]], dict[str, set[DropReason]], NDArray[np.intp]]:
+    merged: dict[str, list[Any]] = {}
+    isect: set[str] = set()
+    union: set[str] = set()
+    image_repeats = np.zeros(len(dicts), dtype=np.int_)
+    dropped: dict[str, set[DropReason]] = {}
+    for i, d in enumerate(dicts):
+        targets = None if targets_per_image is None else targets_per_image[i]
+        flattened, image_repeats[i], dropped_inner = _flatten_for_merge(d, ignore_lists, fully_qualified, targets)
+        isect = isect.intersection(flattened.keys()) if isect else set(flattened.keys())
+        union.update(flattened.keys())
+        for k, v in dropped_inner.items():
+            dropped.setdefault(k, set()).update({DropReason(vv) for vv in v})
+        for k, v in flattened.items():
+            merged.setdefault(k, []).extend(flattened[k]) if isinstance(v, list) else merged.setdefault(k, []).append(v)
+
+    for k in union - isect:
+        dropped.setdefault(k, set()).add(DropReason.INCONSISTENT_KEY)
+
+    if image_repeats.sum() == image_repeats.size:
+        image_indices = np.arange(image_repeats.size)
     else:
-        if dropped:
-            dropped_items = "\n".join([f" {k}: {v}" for k, v in _sorted_drop_reasons(dropped).items()])
-            warnings.warn(f"Metadata entries were dropped:\n{dropped_items}")
-        return output, size
+        image_ids = np.arange(image_repeats.size)
+        image_data = np.concatenate(
+            [np.repeat(image_ids[i], image_repeats[i]) for i in range(image_ids.size)], dtype=np.int_
+        )
+        _, image_unsorted = np.unique(image_data, return_inverse=True)
+        image_indices = np.sort(image_unsorted)
 
+    merged = {k: _simplify_type(v) for k, v in merged.items() if k in isect}
+    return merged, dropped, image_indices
 
-def _is_metadata_dict_of_dicts(metadata: Mapping) -> bool:
-    """EXPERIMENTAL: Attempt to detect if metadata is a dict of dicts"""
-    # single dict
-    if len(metadata) < 2:
-        return False
 
-    # dict of non dicts
-    keys = list(metadata)
-    if not isinstance(metadata[keys[0]], Mapping):
-        return False
+@overload
+def merge(
+    metadata: Iterable[Mapping[str, Any]],
+    *,
+    return_dropped: Literal[True],
+    return_numpy: Literal[False] = False,
+    ignore_lists: bool = False,
+    fully_qualified: bool = False,
+    targets_per_image: Sequence[int] | None = None,
+    image_index_key: str = "_image_index",
+) -> tuple[dict[str, list[Any]], dict[str, list[str]]]: ...
 
-    # dict of dicts with matching keys
-    return set(metadata[keys[0]]) == set(metadata[keys[1]])
+@overload
+def merge(
+    metadata: Iterable[Mapping[str, Any]],
+    *,
+    return_dropped: Literal[False] = False,
+    return_numpy: Literal[False] = False,
+    ignore_lists: bool = False,
+    fully_qualified: bool = False,
+    targets_per_image: Sequence[int] | None = None,
+    image_index_key: str = "_image_index",
+) -> dict[str, list[Any]]: ...
 
 
 @overload
 def merge(
     metadata: Iterable[Mapping[str, Any]],
+    *,
     return_dropped: Literal[True],
+    return_numpy: Literal[True],
     ignore_lists: bool = False,
     fully_qualified: bool = False,
-    return_numpy: bool = False,
     targets_per_image: Sequence[int] | None = None,
     image_index_key: str = "_image_index",
-) -> tuple[dict[str, list[Any]] | dict[str, NDArray[Any]], dict[str, list[str]]]: ...
+) -> tuple[dict[str, NDArray[Any]], dict[str, list[str]]]: ...
 
 
 @overload
 def merge(
     metadata: Iterable[Mapping[str, Any]],
+    *,
     return_dropped: Literal[False] = False,
+    return_numpy: Literal[True],
     ignore_lists: bool = False,
     fully_qualified: bool = False,
-    return_numpy: bool = False,
     targets_per_image: Sequence[int] | None = None,
     image_index_key: str = "_image_index",
-) -> dict[str, list[Any]] | dict[str, NDArray[Any]]: ...
+) -> dict[str, NDArray[Any]]: ...
 
 
 def merge(
     metadata: Iterable[Mapping[str, Any]],
+    *,
     return_dropped: bool = False,
+    return_numpy: bool = False,
     ignore_lists: bool = False,
     fully_qualified: bool = False,
-    return_numpy: bool = False,
     targets_per_image: Sequence[int] | None = None,
     image_index_key: str = "_image_index",
 ):
@@ -298,12 +370,12 @@ def merge(
         Iterable collection of metadata dictionaries to flatten and merge
     return_dropped: bool, default False
         Option to return a dictionary of dropped keys and the reason(s) for dropping
+    return_numpy : bool, default False
+        Option to return results as lists or NumPy arrays
     ignore_lists : bool, default False
         Option to skip expanding lists within metadata
     fully_qualified : bool, default False
         Option to return dictionary keys full qualified instead of minimized
-    return_numpy : bool, default False
-        Option to return results as lists or NumPy arrays
     targets_per_image : Sequence[int] or None, default None
         Number of targets for each image metadata entry
     image_index_key : str, default "_image_index"
@@ -330,74 +402,24 @@ def merge(
     >>> dropped_keys
     {'target_c': ['inconsistent_key']}
     """
-    merged: dict[str, list[Any]] = {}
-    isect: set[str] = set()
-    union: set[str] = set()
-    keys: list[str] | None = None
-    dicts: list[Mapping[str, Any]]
-
-    # EXPERIMENTAL
-    if isinstance(metadata, Mapping) and _is_metadata_dict_of_dicts(metadata):
-        warnings.warn("Experimental processing for dict of dicts.")
-        keys = [str(k) for k in metadata]
-        dicts = list(metadata.values())
-        ignore_lists = True
-    else:
-        dicts = list(metadata)
+
+    dicts: list[Mapping[str, Any]] = list(metadata)
 
     if targets_per_image is not None and len(dicts) != len(targets_per_image):
         raise ValueError("Number of targets per image must be equal to number of metadata entries.")
 
-    image_repeats = np.zeros(len(dicts), dtype=np.int_)
-    dropped: dict[str, set[DropReason]] = {}
-    for i, d in enumerate(dicts):
-        flattened, image_repeats[i], dropped_inner = flatten(
-            d, return_dropped=True, ignore_lists=ignore_lists, fully_qualified=fully_qualified
-        )
-        if targets_per_image is not None:
-            # check for mismatch in targets per image and force ignore_lists
-            if not ignore_lists and targets_per_image[i] != image_repeats[i]:
-                flattened, image_repeats[i], dropped_inner = flatten(
-                    d, return_dropped=True, ignore_lists=True, fully_qualified=fully_qualified
-                )
-            if targets_per_image[i] != image_repeats[i]:
-                flattened = {k: [v] * targets_per_image[i] for k, v in flattened.items()}
-                image_repeats[i] = targets_per_image[i]
-        isect = isect.intersection(flattened.keys()) if isect else set(flattened.keys())
-        union.update(flattened.keys())
-        for k, v in dropped_inner.items():
-            dropped.setdefault(k, set()).update({DropReason(vv) for vv in v})
-        for k, v in flattened.items():
-            merged.setdefault(k, []).extend(flattened[k]) if isinstance(v, list) else merged.setdefault(k, []).append(v)
-
-    for k in union - isect:
-        dropped.setdefault(k, set()).add(DropReason.INCONSISTENT_KEY)
-
-    if image_repeats.sum() == image_repeats.size:
-        image_indices = np.arange(image_repeats.size)
-    else:
-        image_ids = np.arange(image_repeats.size)
-        image_data = np.concatenate(
-            [np.repeat(image_ids[i], image_repeats[i]) for i in range(image_ids.size)], dtype=np.int_
-        )
-        _, image_unsorted = np.unique(image_data, return_inverse=True)
-        image_indices = np.sort(image_unsorted)
-
-    output: dict[str, Any] = {}
+    merged, dropped, image_indices = _merge(dicts, ignore_lists, fully_qualified, targets_per_image)
 
-    if keys:
-        output["keys"] = np.array(keys) if return_numpy else keys
+    output: dict[str, Any] = {k: np.asarray(v) for k, v in merged.items()} if return_numpy else merged
 
-    for k in (key for key in merged if key in isect):
-        cv = _simplify_type(merged[k])
-        output[k] = np.array(cv) if return_numpy else cv
     if image_index_key not in output:
         output[image_index_key] = image_indices if return_numpy else image_indices.tolist()
 
     if return_dropped:
         return output, _sorted_drop_reasons(dropped)
-    else:
-        if dropped:
-            dropped_items = "\n".join([f" {k}: {v}" for k, v in _sorted_drop_reasons(dropped).items()])
-            warnings.warn(f"Metadata entries were dropped:\n{dropped_items}")
-        return output
+
+    if dropped:
+        dropped_items = "\n".join([f" {k}: {v}" for k, v in _sorted_drop_reasons(dropped).items()])
+        warnings.warn(f"Metadata entries were dropped:\n{dropped_items}")
+
+    return output
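With the arguments now keyword-only, callers must name return_dropped and return_numpy. A small sketch of the documented inconsistent-key behavior; the expected output is inferred from the docstring example above:

    from dataeval.utils.data.metadata import merge

    metadata = [
        {"time": 1.2, "altitude": 100},
        {"time": 3.4, "altitude": 250, "temperature": 21.0},  # extra key in one entry
    ]
    merged, dropped = merge(metadata, return_dropped=True)
    print(dropped)                 # expected: {'temperature': ['inconsistent_key']}
    print(merged["_image_index"])  # expected: [0, 1], one row per image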
dataeval/utils/datasets/__init__.py CHANGED
@@ -1,5 +1,6 @@
 """Provides access to common Computer Vision datasets."""
 
+from dataeval.utils.datasets._antiuav import AntiUAVDetection
 from dataeval.utils.datasets._cifar10 import CIFAR10
 from dataeval.utils.datasets._milco import MILCO
 from dataeval.utils.datasets._mnist import MNIST
@@ -10,6 +11,7 @@ __all__ = [
     "MNIST",
     "Ships",
     "CIFAR10",
+    "AntiUAVDetection",
     "MILCO",
     "VOCDetection",
     "VOCDetectionTorch",