dataeval 0.85.0__py3-none-any.whl → 0.86.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. dataeval/__init__.py +1 -1
  2. dataeval/_log.py +1 -1
  3. dataeval/config.py +21 -4
  4. dataeval/data/_embeddings.py +2 -2
  5. dataeval/data/_images.py +2 -3
  6. dataeval/data/_metadata.py +65 -42
  7. dataeval/data/_selection.py +2 -3
  8. dataeval/data/_split.py +2 -3
  9. dataeval/data/_targets.py +17 -13
  10. dataeval/data/selections/_classfilter.py +6 -8
  11. dataeval/data/selections/_prioritize.py +6 -9
  12. dataeval/data/selections/_shuffle.py +3 -1
  13. dataeval/detectors/drift/__init__.py +4 -1
  14. dataeval/detectors/drift/_base.py +4 -5
  15. dataeval/detectors/drift/_mmd.py +3 -6
  16. dataeval/detectors/drift/_mvdc.py +92 -0
  17. dataeval/detectors/drift/_nml/__init__.py +6 -0
  18. dataeval/detectors/drift/_nml/_base.py +70 -0
  19. dataeval/detectors/drift/_nml/_chunk.py +396 -0
  20. dataeval/detectors/drift/_nml/_domainclassifier.py +181 -0
  21. dataeval/detectors/drift/_nml/_result.py +97 -0
  22. dataeval/detectors/drift/_nml/_thresholds.py +269 -0
  23. dataeval/detectors/linters/outliers.py +7 -7
  24. dataeval/metrics/bias/_parity.py +10 -13
  25. dataeval/metrics/estimators/_divergence.py +2 -4
  26. dataeval/metrics/stats/_base.py +103 -42
  27. dataeval/metrics/stats/_boxratiostats.py +21 -19
  28. dataeval/metrics/stats/_dimensionstats.py +14 -10
  29. dataeval/metrics/stats/_hashstats.py +1 -1
  30. dataeval/metrics/stats/_pixelstats.py +6 -6
  31. dataeval/metrics/stats/_visualstats.py +3 -3
  32. dataeval/outputs/__init__.py +2 -1
  33. dataeval/outputs/_base.py +22 -7
  34. dataeval/outputs/_bias.py +27 -31
  35. dataeval/outputs/_drift.py +60 -0
  36. dataeval/outputs/_linters.py +12 -17
  37. dataeval/outputs/_stats.py +83 -29
  38. dataeval/outputs/_workflows.py +2 -2
  39. dataeval/utils/_array.py +6 -9
  40. dataeval/utils/_bin.py +1 -2
  41. dataeval/utils/_clusterer.py +7 -4
  42. dataeval/utils/_fast_mst.py +27 -13
  43. dataeval/utils/_image.py +65 -11
  44. dataeval/utils/_mst.py +1 -3
  45. dataeval/utils/_plot.py +15 -10
  46. dataeval/utils/data/_dataset.py +32 -20
  47. dataeval/utils/data/metadata.py +104 -82
  48. dataeval/utils/datasets/__init__.py +2 -0
  49. dataeval/utils/datasets/_antiuav.py +189 -0
  50. dataeval/utils/datasets/_base.py +11 -8
  51. dataeval/utils/datasets/_cifar10.py +104 -45
  52. dataeval/utils/datasets/_fileio.py +21 -47
  53. dataeval/utils/datasets/_milco.py +19 -11
  54. dataeval/utils/datasets/_mixin.py +2 -4
  55. dataeval/utils/datasets/_mnist.py +3 -4
  56. dataeval/utils/datasets/_ships.py +14 -7
  57. dataeval/utils/datasets/_voc.py +229 -42
  58. dataeval/utils/torch/models.py +5 -10
  59. dataeval/utils/torch/trainer.py +3 -3
  60. dataeval/workflows/sufficiency.py +2 -2
  61. {dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/METADATA +3 -2
  62. dataeval-0.86.1.dist-info/RECORD +114 -0
  63. dataeval/detectors/ood/vae.py +0 -74
  64. dataeval-0.85.0.dist-info/RECORD +0 -107
  65. {dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/LICENSE.txt +0 -0
  66. {dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/WHEEL +0 -0
dataeval/utils/_mst.py CHANGED
@@ -83,6 +83,4 @@ def compute_neighbors(
 
      nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=algorithm).fit(B)
      nns = nbrs.kneighbors(A)[1]
-     nns = nns[:, 1:].squeeze()
-
-     return nns
+     return nns[:, 1:].squeeze()
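
For context, a minimal standalone sketch of what the rewritten tail of compute_neighbors computes (the sample arrays are hypothetical; only the NearestNeighbors usage mirrors the diff). The first returned neighbor column is dropped because, when a point set is queried against itself, that column is each point's own index:

    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    rng = np.random.default_rng(0)
    A = rng.random((5, 2))   # query points
    B = rng.random((10, 2))  # reference points
    k = 2

    nbrs = NearestNeighbors(n_neighbors=k + 1).fit(B)
    nns = nbrs.kneighbors(A)[1]        # indices of the k+1 nearest points in B
    print(nns[:, 1:].squeeze().shape)  # (5, 2): first column dropped, as in the new return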
dataeval/utils/_plot.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
  __all__ = []
 
  import contextlib
+ import math
  from typing import Any
 
  import numpy as np
@@ -160,11 +161,9 @@ def histogram_plot(
      import matplotlib.pyplot as plt
 
      num_metrics = len(data_dict)
-     if num_metrics > 2:
-         rows = int(len(data_dict) / 3)
-         fig, axs = plt.subplots(rows, 3, figsize=(10, rows * 2.5))
-     else:
-         fig, axs = plt.subplots(1, num_metrics, figsize=(4 * num_metrics, 4))
+     rows = math.ceil(num_metrics / 3)
+     cols = min(num_metrics, 3)
+     fig, axs = plt.subplots(rows, 3, figsize=(cols * 3 + 1, rows * 3))
 
      for ax, metric in zip(
          axs.flat,
@@ -178,6 +177,10 @@
          ax.set_ylabel(ylabel)
          ax.set_xlabel(xlabel)
 
+     for ax in axs.flat[num_metrics:]:
+         ax.axis("off")
+         ax.set_visible(False)
+
      fig.tight_layout()
      return fig
@@ -216,11 +219,9 @@ def channel_histogram_plot(
      label_kwargs = {"label": [f"Channel {i}" for i in range(max_channels)]}
 
      num_metrics = len(data_keys)
-     if num_metrics > 2:
-         rows = int(len(data_keys) / 3)
-         fig, axs = plt.subplots(rows, 3, figsize=(10, rows * 2.5))
-     else:
-         fig, axs = plt.subplots(1, num_metrics, figsize=(4 * num_metrics, 4))
+     rows = math.ceil(num_metrics / 3)
+     cols = min(num_metrics, 3)
+     fig, axs = plt.subplots(rows, 3, figsize=(cols * 3 + 1, rows * 3))
 
      for ax, metric in zip(
          axs.flat,
@@ -245,5 +246,9 @@
          ax.set_ylabel(ylabel)
          ax.set_xlabel(xlabel)
 
+     for ax in axs.flat[num_metrics:]:
+         ax.axis("off")
+         ax.set_visible(False)
+
      fig.tight_layout()
      return fig
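
The _plot.py change above replaces the two-branch subplot logic with a fixed three-column grid sized by ceiling division, then blanks the trailing axes that have no metric assigned. A self-contained sketch of that layout logic (the sample data_dict is made up; only the grid math and axis hiding come from the diff):

    import math

    import matplotlib.pyplot as plt
    import numpy as np

    data_dict = {f"metric_{i}": np.random.rand(100) for i in range(5)}  # hypothetical metrics

    num_metrics = len(data_dict)
    rows = math.ceil(num_metrics / 3)  # enough rows for three plots per row
    cols = min(num_metrics, 3)         # only used to size the figure
    fig, axs = plt.subplots(rows, 3, figsize=(cols * 3 + 1, rows * 3))

    for ax, (name, values) in zip(axs.flat, data_dict.items()):
        ax.hist(values, bins=20)
        ax.set_title(name)

    for ax in axs.flat[num_metrics:]:  # turn off the axes beyond the last metric
        ax.axis("off")
        ax.set_visible(False)

    fig.tight_layout()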
dataeval/utils/data/_dataset.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
  __all__ = []
 
- from typing import Any, Generic, Iterable, Literal, Sequence, TypeVar
+ from typing import Any, Generic, Iterable, Literal, Sequence, SupportsFloat, SupportsInt, TypeVar, cast
 
  from dataeval.typing import (
      Array,
@@ -17,8 +17,8 @@ from dataeval.utils._array import as_numpy
  def _validate_data(
      datum_type: Literal["ic", "od"],
      images: Array | Sequence[Array],
-     labels: Sequence[int] | Sequence[Sequence[int]],
-     bboxes: Sequence[Sequence[Sequence[float]]] | None,
+     labels: Array | Sequence[int] | Sequence[Array] | Sequence[Sequence[int]],
+     bboxes: Array | Sequence[Array] | Sequence[Sequence[Array]] | Sequence[Sequence[Sequence[float]]] | None,
      metadata: Sequence[dict[str, Any]] | None,
  ) -> None:
      # Validate inputs
@@ -34,16 +34,21 @@
          raise ValueError(f"Number of metadata ({len(metadata)}) does not match number of images ({dataset_len}).")
 
      if datum_type == "ic":
-         if not isinstance(labels, Sequence) or not isinstance(labels[0], int):
+         if not isinstance(labels, (Sequence, Array)) or not isinstance(labels[0], (int, SupportsInt)):
              raise TypeError("Labels must be a sequence of integers for image classification.")
      elif datum_type == "od":
-         if not isinstance(labels, Sequence) or not isinstance(labels[0], Sequence) or not isinstance(labels[0][0], int):
+         if (
+             not isinstance(labels, (Sequence, Array))
+             or not isinstance(labels[0], (Sequence, Array))
+             or not isinstance(cast(Sequence[Any], labels[0])[0], (int, SupportsInt))
+         ):
              raise TypeError("Labels must be a sequence of sequences of integers for object detection.")
          if (
              bboxes is None
              or not isinstance(bboxes, (Sequence, Array))
              or not isinstance(bboxes[0], (Sequence, Array))
              or not isinstance(bboxes[0][0], (Sequence, Array))
+             or not isinstance(bboxes[0][0][0], (float, SupportsFloat))
              or not len(bboxes[0][0]) == 4
          ):
              raise TypeError("Boxes must be a sequence of sequences of (x0, y0, x1, y1) for object detection.")
@@ -52,11 +57,10 @@
 
 
  def _find_max(arr: ArrayLike) -> Any:
-     if isinstance(arr, (Iterable, Sequence, Array)):
+     if not isinstance(arr, (bytes, str)) and isinstance(arr, (Iterable, Sequence, Array)):
          if isinstance(arr[0], (Iterable, Sequence, Array)):
              return max([_find_max(x) for x in arr])  # type: ignore
-         else:
-             return max(arr)
+         return max(arr)
      return arr
 
 
@@ -92,12 +96,14 @@ class CustomImageClassificationDataset(BaseAnnotatedDataset[Sequence[int]], Imag
      def __init__(
          self,
          images: Array | Sequence[Array],
-         labels: Sequence[int],
+         labels: Array | Sequence[int],
          metadata: Sequence[dict[str, Any]] | None,
          classes: Sequence[str] | None,
         name: str | None = None,
      ) -> None:
-         super().__init__("ic", images, labels, metadata, classes)
+         super().__init__(
+             "ic", images, as_numpy(labels).tolist() if isinstance(labels, Array) else labels, metadata, classes
+         )
          if name is not None:
              self.__name__ = name
              self.__class__.__name__ = name
@@ -135,18 +141,24 @@ class CustomObjectDetectionDataset(BaseAnnotatedDataset[Sequence[Sequence[int]]]
      def __init__(
          self,
          images: Array | Sequence[Array],
-         labels: Sequence[Sequence[int]],
-         bboxes: Sequence[Sequence[Sequence[float]]],
+         labels: Array | Sequence[Array] | Sequence[Sequence[int]],
+         bboxes: Array | Sequence[Array] | Sequence[Sequence[Array]] | Sequence[Sequence[Sequence[float]]],
          metadata: Sequence[dict[str, Any]] | None,
          classes: Sequence[str] | None,
          name: str | None = None,
      ) -> None:
-         super().__init__("od", images, labels, metadata, classes)
+         super().__init__(
+             "od",
+             images,
+             [as_numpy(label).tolist() if isinstance(label, Array) else label for label in labels],
+             metadata,
+             classes,
+         )
          if name is not None:
              self.__name__ = name
              self.__class__.__name__ = name
              self.__class__.__qualname__ = name
-         self._bboxes = bboxes
+         self._bboxes = [[as_numpy(box).tolist() if isinstance(box, Array) else box for box in bbox] for bbox in bboxes]
 
      @property
      def metadata(self) -> DatasetMetadata:
@@ -162,7 +174,7 @@
 
  def to_image_classification_dataset(
      images: Array | Sequence[Array],
-     labels: Sequence[int],
+     labels: Array | Sequence[int],
      metadata: Sequence[dict[str, Any]] | None,
      classes: Sequence[str] | None,
      name: str | None = None,
@@ -174,7 +186,7 @@ def to_image_classification_dataset(
      ----------
      images : Array | Sequence[Array]
          The images to use in the dataset.
-     labels : Sequence[int]
+     labels : Array | Sequence[int]
          The labels to use in the dataset.
      metadata : Sequence[dict[str, Any]] | None
          The metadata to use in the dataset.
@@ -191,8 +203,8 @@
 
  def to_object_detection_dataset(
      images: Array | Sequence[Array],
-     labels: Sequence[Sequence[int]],
-     bboxes: Sequence[Sequence[Sequence[float]]],
+     labels: Array | Sequence[Array] | Sequence[Sequence[int]],
+     bboxes: Array | Sequence[Array] | Sequence[Sequence[Array]] | Sequence[Sequence[Sequence[float]]],
      metadata: Sequence[dict[str, Any]] | None,
      classes: Sequence[str] | None,
      name: str | None = None,
@@ -204,9 +216,9 @@ def to_object_detection_dataset(
      ----------
      images : Array | Sequence[Array]
          The images to use in the dataset.
-     labels : Sequence[Sequence[int]]
+     labels : Array | Sequence[Array] | Sequence[Sequence[int]]
          The labels to use in the dataset.
-     bboxes : Sequence[Sequence[Sequence[float]]]
+     bboxes : Array | Sequence[Array] | Sequence[Sequence[Array]] | Sequence[Sequence[Sequence[float]]]
          The bounding boxes (x0,y0,x1,y1) to use in the dataset.
      metadata : Sequence[dict[str, Any]] | None
          The metadata to use in the dataset.
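
Taken together, the _dataset.py changes let the dataset factories accept arrays anywhere plain sequences were required. A hedged usage sketch (the import path, class names, and sample shapes are assumptions; the package may re-export these helpers from a public module):

    import numpy as np

    from dataeval.utils.data._dataset import to_object_detection_dataset

    images = [np.zeros((3, 16, 16)) for _ in range(3)]             # three CHW images
    labels = [np.array([1]), np.array([0]), np.array([1])]         # per-image label arrays now accepted
    bboxes = [np.array([[0.0, 0.0, 8.0, 8.0]]) for _ in range(3)]  # one (x0, y0, x1, y1) box each

    dataset = to_object_detection_dataset(images, labels, bboxes, None, ["background", "UAV"])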
dataeval/utils/data/metadata.py CHANGED
@@ -228,58 +228,130 @@ def flatten(
 
      if return_dropped:
          return output, size, _sorted_drop_reasons(dropped)
+     if dropped:
+         dropped_items = "\n".join([f" {k}: {v}" for k, v in _sorted_drop_reasons(dropped).items()])
+         warnings.warn(f"Metadata entries were dropped:\n{dropped_items}")
+     return output, size
+
+
+ def _flatten_for_merge(
+     metadatum: Mapping[str, Any],
+     ignore_lists: bool,
+     fully_qualified: bool,
+     targets: int | None,
+ ) -> tuple[dict[str, list[Any]] | dict[str, Any], int, dict[str, list[str]]]:
+     flattened, image_repeats, dropped_inner = flatten(
+         metadatum, return_dropped=True, ignore_lists=ignore_lists, fully_qualified=fully_qualified
+     )
+     if targets is not None:
+         # check for mismatch in targets per image and force ignore_lists
+         if not ignore_lists and targets != image_repeats:
+             flattened, image_repeats, dropped_inner = flatten(
+                 metadatum, return_dropped=True, ignore_lists=True, fully_qualified=fully_qualified
+             )
+         if targets != image_repeats:
+             flattened = {k: [v] * targets for k, v in flattened.items()}
+             image_repeats = targets
+     return flattened, image_repeats, dropped_inner
+
+
+ def _merge(
+     dicts: list[Mapping[str, Any]],
+     ignore_lists: bool,
+     fully_qualified: bool,
+     targets_per_image: Sequence[int] | None,
+ ) -> tuple[dict[str, list[Any]], dict[str, set[DropReason]], NDArray[np.intp]]:
+     merged: dict[str, list[Any]] = {}
+     isect: set[str] = set()
+     union: set[str] = set()
+     image_repeats = np.zeros(len(dicts), dtype=np.int_)
+     dropped: dict[str, set[DropReason]] = {}
+     for i, d in enumerate(dicts):
+         targets = None if targets_per_image is None else targets_per_image[i]
+         flattened, image_repeats[i], dropped_inner = _flatten_for_merge(d, ignore_lists, fully_qualified, targets)
+         isect = isect.intersection(flattened.keys()) if isect else set(flattened.keys())
+         union.update(flattened.keys())
+         for k, v in dropped_inner.items():
+             dropped.setdefault(k, set()).update({DropReason(vv) for vv in v})
+         for k, v in flattened.items():
+             merged.setdefault(k, []).extend(flattened[k]) if isinstance(v, list) else merged.setdefault(k, []).append(v)
+
+     for k in union - isect:
+         dropped.setdefault(k, set()).add(DropReason.INCONSISTENT_KEY)
+
+     if image_repeats.sum() == image_repeats.size:
+         image_indices = np.arange(image_repeats.size)
      else:
-         if dropped:
-             dropped_items = "\n".join([f" {k}: {v}" for k, v in _sorted_drop_reasons(dropped).items()])
-             warnings.warn(f"Metadata entries were dropped:\n{dropped_items}")
-         return output, size
+         image_ids = np.arange(image_repeats.size)
+         image_data = np.concatenate(
+             [np.repeat(image_ids[i], image_repeats[i]) for i in range(image_ids.size)], dtype=np.int_
+         )
+         _, image_unsorted = np.unique(image_data, return_inverse=True)
+         image_indices = np.sort(image_unsorted)
 
+     merged = {k: _simplify_type(v) for k, v in merged.items() if k in isect}
+     return merged, dropped, image_indices
 
- def _is_metadata_dict_of_dicts(metadata: Mapping) -> bool:
-     """EXPERIMENTAL: Attempt to detect if metadata is a dict of dicts"""
-     # single dict
-     if len(metadata) < 2:
-         return False
 
-     # dict of non dicts
-     keys = list(metadata)
-     if not isinstance(metadata[keys[0]], Mapping):
-         return False
+ @overload
+ def merge(
+     metadata: Iterable[Mapping[str, Any]],
+     *,
+     return_dropped: Literal[True],
+     return_numpy: Literal[False] = False,
+     ignore_lists: bool = False,
+     fully_qualified: bool = False,
+     targets_per_image: Sequence[int] | None = None,
+     image_index_key: str = "_image_index",
+ ) -> tuple[dict[str, list[Any]], dict[str, list[str]]]: ...
+
 
-     # dict of dicts with matching keys
-     return set(metadata[keys[0]]) == set(metadata[keys[1]])
+ @overload
+ def merge(
+     metadata: Iterable[Mapping[str, Any]],
+     *,
+     return_dropped: Literal[False] = False,
+     return_numpy: Literal[False] = False,
+     ignore_lists: bool = False,
+     fully_qualified: bool = False,
+     targets_per_image: Sequence[int] | None = None,
+     image_index_key: str = "_image_index",
+ ) -> dict[str, list[Any]]: ...
 
 
  @overload
  def merge(
      metadata: Iterable[Mapping[str, Any]],
+     *,
      return_dropped: Literal[True],
+     return_numpy: Literal[True],
      ignore_lists: bool = False,
      fully_qualified: bool = False,
-     return_numpy: bool = False,
      targets_per_image: Sequence[int] | None = None,
      image_index_key: str = "_image_index",
- ) -> tuple[dict[str, list[Any]] | dict[str, NDArray[Any]], dict[str, list[str]]]: ...
+ ) -> tuple[dict[str, NDArray[Any]], dict[str, list[str]]]: ...
 
 
  @overload
  def merge(
      metadata: Iterable[Mapping[str, Any]],
+     *,
      return_dropped: Literal[False] = False,
+     return_numpy: Literal[True],
      ignore_lists: bool = False,
      fully_qualified: bool = False,
-     return_numpy: bool = False,
      targets_per_image: Sequence[int] | None = None,
      image_index_key: str = "_image_index",
- ) -> dict[str, list[Any]] | dict[str, NDArray[Any]]: ...
+ ) -> dict[str, NDArray[Any]]: ...
 
 
  def merge(
      metadata: Iterable[Mapping[str, Any]],
+     *,
      return_dropped: bool = False,
+     return_numpy: bool = False,
      ignore_lists: bool = False,
      fully_qualified: bool = False,
-     return_numpy: bool = False,
      targets_per_image: Sequence[int] | None = None,
      image_index_key: str = "_image_index",
  ):
@@ -298,12 +370,12 @@ def merge(
          Iterable collection of metadata dictionaries to flatten and merge
      return_dropped: bool, default False
          Option to return a dictionary of dropped keys and the reason(s) for dropping
+     return_numpy : bool, default False
+         Option to return results as lists or NumPy arrays
      ignore_lists : bool, default False
          Option to skip expanding lists within metadata
      fully_qualified : bool, default False
          Option to return dictionary keys fully qualified instead of minimized
-     return_numpy : bool, default False
-         Option to return results as lists or NumPy arrays
      targets_per_image : Sequence[int] or None, default None
          Number of targets for each image metadata entry
      image_index_key : str, default "_image_index"
@@ -330,74 +402,24 @@ def merge(
      >>> dropped_keys
      {'target_c': ['inconsistent_key']}
      """
-     merged: dict[str, list[Any]] = {}
-     isect: set[str] = set()
-     union: set[str] = set()
-     keys: list[str] | None = None
-     dicts: list[Mapping[str, Any]]
-
-     # EXPERIMENTAL
-     if isinstance(metadata, Mapping) and _is_metadata_dict_of_dicts(metadata):
-         warnings.warn("Experimental processing for dict of dicts.")
-         keys = [str(k) for k in metadata]
-         dicts = list(metadata.values())
-         ignore_lists = True
-     else:
-         dicts = list(metadata)
+
+     dicts: list[Mapping[str, Any]] = list(metadata)
 
      if targets_per_image is not None and len(dicts) != len(targets_per_image):
          raise ValueError("Number of targets per image must be equal to number of metadata entries.")
 
-     image_repeats = np.zeros(len(dicts), dtype=np.int_)
-     dropped: dict[str, set[DropReason]] = {}
-     for i, d in enumerate(dicts):
-         flattened, image_repeats[i], dropped_inner = flatten(
-             d, return_dropped=True, ignore_lists=ignore_lists, fully_qualified=fully_qualified
-         )
-         if targets_per_image is not None:
-             # check for mismatch in targets per image and force ignore_lists
-             if not ignore_lists and targets_per_image[i] != image_repeats[i]:
-                 flattened, image_repeats[i], dropped_inner = flatten(
-                     d, return_dropped=True, ignore_lists=True, fully_qualified=fully_qualified
-                 )
-             if targets_per_image[i] != image_repeats[i]:
-                 flattened = {k: [v] * targets_per_image[i] for k, v in flattened.items()}
-                 image_repeats[i] = targets_per_image[i]
-         isect = isect.intersection(flattened.keys()) if isect else set(flattened.keys())
-         union.update(flattened.keys())
-         for k, v in dropped_inner.items():
-             dropped.setdefault(k, set()).update({DropReason(vv) for vv in v})
-         for k, v in flattened.items():
-             merged.setdefault(k, []).extend(flattened[k]) if isinstance(v, list) else merged.setdefault(k, []).append(v)
-
-     for k in union - isect:
-         dropped.setdefault(k, set()).add(DropReason.INCONSISTENT_KEY)
-
-     if image_repeats.sum() == image_repeats.size:
-         image_indices = np.arange(image_repeats.size)
-     else:
-         image_ids = np.arange(image_repeats.size)
-         image_data = np.concatenate(
-             [np.repeat(image_ids[i], image_repeats[i]) for i in range(image_ids.size)], dtype=np.int_
-         )
-         _, image_unsorted = np.unique(image_data, return_inverse=True)
-         image_indices = np.sort(image_unsorted)
-
-     output: dict[str, Any] = {}
+     merged, dropped, image_indices = _merge(dicts, ignore_lists, fully_qualified, targets_per_image)
 
-     if keys:
-         output["keys"] = np.array(keys) if return_numpy else keys
+     output: dict[str, Any] = {k: np.asarray(v) for k, v in merged.items()} if return_numpy else merged
 
-     for k in (key for key in merged if key in isect):
-         cv = _simplify_type(merged[k])
-         output[k] = np.array(cv) if return_numpy else cv
      if image_index_key not in output:
          output[image_index_key] = image_indices if return_numpy else image_indices.tolist()
 
      if return_dropped:
          return output, _sorted_drop_reasons(dropped)
-     else:
-         if dropped:
-             dropped_items = "\n".join([f" {k}: {v}" for k, v in _sorted_drop_reasons(dropped).items()])
-             warnings.warn(f"Metadata entries were dropped:\n{dropped_items}")
-         return output
+
+     if dropped:
+         dropped_items = "\n".join([f" {k}: {v}" for k, v in _sorted_drop_reasons(dropped).items()])
+         warnings.warn(f"Metadata entries were dropped:\n{dropped_items}")
+
+     return output
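
With the refactor above, merge always treats its input as an iterable of mappings (the experimental dict-of-dicts path is gone) and the output-shaping flags are keyword-only. A small hedged sketch of the new call shapes (the sample metadata and the resulting index values are illustrative):

    from dataeval.utils.data.metadata import merge

    metadata = [
        {"scene": "day", "targets": {"kind": [0, 1]}},   # two targets in image 0
        {"scene": "night", "targets": {"kind": [1]}},    # one target in image 1
    ]

    merged, dropped = merge(metadata, return_dropped=True)  # keyword-only flag
    as_arrays = merge(metadata, return_numpy=True)           # values as NumPy arrays
    print(merged["_image_index"])                            # per-target image indices, e.g. [0, 0, 1]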
dataeval/utils/datasets/__init__.py CHANGED
@@ -1,5 +1,6 @@
  """Provides access to common Computer Vision datasets."""
 
+ from dataeval.utils.datasets._antiuav import AntiUAVDetection
  from dataeval.utils.datasets._cifar10 import CIFAR10
  from dataeval.utils.datasets._milco import MILCO
  from dataeval.utils.datasets._mnist import MNIST
@@ -10,6 +11,7 @@ __all__ = [
      "MNIST",
      "Ships",
      "CIFAR10",
+     "AntiUAVDetection",
      "MILCO",
      "VOCDetection",
      "VOCDetectionTorch",
dataeval/utils/datasets/_antiuav.py ADDED
@@ -0,0 +1,189 @@
+ from __future__ import annotations
+
+ __all__ = []
+
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, Literal, Sequence
+
+ from defusedxml.ElementTree import parse
+ from numpy.typing import NDArray
+
+ from dataeval.utils.datasets._base import BaseODDataset, DataLocation
+ from dataeval.utils.datasets._mixin import BaseDatasetNumpyMixin
+
+ if TYPE_CHECKING:
+     from dataeval.typing import Transform
+
+
+ class AntiUAVDetection(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
+     """
+     A UAV detection dataset focused on detecting UAVs in natural images against large variation in backgrounds.
+
+     The dataset comes from the paper
+     `Vision-based Anti-UAV Detection and Tracking <https://ieeexplore.ieee.org/document/9785379>`_
+     by Jie Zhao et al. (2022).
+
+     The dataset is approximately 1.3 GB and can be found `here <https://github.com/wangdongdut/DUT-Anti-UAV>`_.
+     Images are collected against a variety of different backgrounds with variation in the number and type of UAVs.
+     Ground truth labels are provided for the train, validation and test sets.
+     There are 35 different types of drones along with a variety of lighting and weather conditions.
+
+     There are 10,000 images: 5200 images in the training set, 2200 images in the validation set,
+     and 2600 images in the test set.
+     The dataset only has a single UAV class with the focus being on identifying object location in the image.
+     Ground-truth bounding boxes are provided in (x0, y0, x1, y1) format.
+     The images come in a variety of sizes from 3744 x 5616 to 160 x 240.
+
+     Parameters
+     ----------
+     root : str or pathlib.Path
+         Root directory where the data should be downloaded to or
+         the ``antiuavdetection`` folder of the already downloaded data.
+     image_set: "train", "val", "test", or "base", default "train"
+         If "base", then the full dataset is selected (train, val and test).
+     transforms : Transform, Sequence[Transform] or None, default None
+         Transform(s) to apply to the data.
+     download : bool, default False
+         If True, downloads the dataset from the internet and puts it in root directory.
+         Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
+     verbose : bool, default False
+         If True, outputs print statements.
+
+     Attributes
+     ----------
+     path : pathlib.Path
+         Location of the folder containing the data.
+     image_set : "train", "val", "test", or "base"
+         The selected image set from the dataset.
+     index2label : dict[int, str]
+         Dictionary which translates from class integers to the associated class strings.
+     label2index : dict[str, int]
+         Dictionary which translates from class strings to the associated class integers.
+     metadata : DatasetMetadata
+         Typed dictionary containing dataset metadata, such as `id` which returns the dataset class name.
+     transforms : Sequence[Transform]
+         The transforms to be applied to the data.
+     size : int
+         The size of the dataset.
+
+     Note
+     ----
+     Data License: `Apache 2.0 <https://www.apache.org/licenses/LICENSE-2.0.txt>`_
+     """
+
+     # Need to run the sha256 on the files and then store that
+     _resources = [
+         DataLocation(
+             url="https://drive.usercontent.google.com/download?id=1RVsSGPUKTdmoyoPTBTWwroyulLek1eTj&export=download&authuser=0&confirm=t&uuid=6bca4f94-a242-4bc2-9663-fb03cd94ef2c&at=APcmpox0--NroQ_3bqeTFaJxP7Pw%3A1746552902927",
+             filename="train.zip",
+             md5=False,
+             checksum="14f927290556df60e23cedfa80dffc10dc21e4a3b6843e150cfc49644376eece",
+         ),
+         DataLocation(
+             url="https://drive.usercontent.google.com/download?id=1333uEQfGuqTKslRkkeLSCxylh6AQ0X6n&export=download&authuser=0&confirm=t&uuid=c2ad2f01-aca8-4a85-96bb-b8ef6e40feea&at=APcmpozY-8bhk3nZSFaYbE8rq1Fi%3A1746551543297",
+             filename="val.zip",
+             md5=False,
+             checksum="238be0ceb3e7c5be6711ee3247e49df2750d52f91f54f5366c68bebac112ebf8",
+         ),
+         DataLocation(
+             url="https://drive.usercontent.google.com/download?id=1L1zeW1EMDLlXHClSDcCjl3rs_A6sVai0&export=download&authuser=0&confirm=t&uuid=5a1d7650-d8cd-4461-8354-7daf7292f06c&at=APcmpozLQC1CuP-n5_UX2JnP53Zo%3A1746551676177",
+             filename="test.zip",
+             md5=False,
+             checksum="a671989a01cff98c684aeb084e59b86f4152c50499d86152eb970a9fc7fb1cbe",
+         ),
+     ]
+
+     index2label: dict[int, str] = {
+         0: "unknown",
+         1: "UAV",
+     }
+
+     def __init__(
+         self,
+         root: str | Path,
+         image_set: Literal["train", "val", "test", "base"] = "train",
+         transforms: Transform[NDArray[Any]] | Sequence[Transform[NDArray[Any]]] | None = None,
+         download: bool = False,
+         verbose: bool = False,
+     ) -> None:
+         super().__init__(
+             root,
+             image_set,
+             transforms,
+             download,
+             verbose,
+         )
+
+     def _load_data(self) -> tuple[list[str], list[str], dict[str, list[Any]]]:
+         filepaths: list[str] = []
+         targets: list[str] = []
+         datum_metadata: dict[str, list[Any]] = {}
+
+         # If base, load all resources
+         if self.image_set == "base":
+             metadata_list: list[dict[str, Any]] = []
+
+             for resource in self._resources:
+                 self._resource = resource
+                 resource_filepaths, resource_targets, resource_metadata = super()._load_data()
+                 filepaths.extend(resource_filepaths)
+                 targets.extend(resource_targets)
+                 metadata_list.append(resource_metadata)
+
+             # Combine metadata
+             for data_dict in metadata_list:
+                 for key, val in data_dict.items():
+                     str_key = str(key)  # Ensure key is string
+                     if str_key not in datum_metadata:
+                         datum_metadata[str_key] = []
+                     datum_metadata[str_key].extend(val)
+
+         else:
+             # Grab only the desired data
+             for resource in self._resources:
+                 if self.image_set in resource.filename:
+                     self._resource = resource
+                     resource_filepaths, resource_targets, resource_metadata = super()._load_data()
+                     filepaths.extend(resource_filepaths)
+                     targets.extend(resource_targets)
+                     datum_metadata.update(resource_metadata)
+
+         return filepaths, targets, datum_metadata
+
+     def _load_data_inner(self) -> tuple[list[str], list[str], dict[str, Any]]:
+         resource_name = self._resource.filename[:-4]
+         base_dir = self.path / resource_name
+         data_folder = sorted((base_dir / "img").glob("*.jpg"))
+         if not data_folder:
+             raise FileNotFoundError
+
+         file_data = {"image_id": [f"{resource_name}_{entry.name}" for entry in data_folder]}
+         data = [str(entry) for entry in data_folder]
+         annotations = sorted(str(entry) for entry in (base_dir / "xml").glob("*.xml"))
+
+         return data, annotations, file_data
+
+     def _read_annotations(self, annotation: str) -> tuple[list[list[float]], list[int], dict[str, Any]]:
+         """Function for extracting the info for the label and boxes"""
+         boxes: list[list[float]] = []
+         labels = []
+         root = parse(annotation).getroot()
+         if root is None:
+             raise ValueError(f"Unable to parse {annotation}")
+         additional_meta: dict[str, Any] = {
+             "image_width": int(root.findtext("size/width", default="-1")),
+             "image_height": int(root.findtext("size/height", default="-1")),
+             "image_depth": int(root.findtext("size/depth", default="-1")),
+         }
+         for obj in root.findall("object"):
+             labels.append(1 if obj.findtext("name", default="") == "UAV" else 0)
+             boxes.append(
+                 [
+                     float(obj.findtext("bndbox/xmin", default="0")),
+                     float(obj.findtext("bndbox/ymin", default="0")),
+                     float(obj.findtext("bndbox/xmax", default="0")),
+                     float(obj.findtext("bndbox/ymax", default="0")),
+                 ]
+             )
+
+         return boxes, labels, additional_meta
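
The new AntiUAVDetection class is exported from dataeval.utils.datasets alongside the other dataset wrappers. A hedged usage sketch (the tuple layout of a datum is an assumption based on the BaseODDataset conventions; only the constructor arguments come from the class above):

    from dataeval.utils.datasets import AntiUAVDetection

    # First use downloads val.zip into root; image_set="base" would load train, val and test
    dataset = AntiUAVDetection(root="./data", image_set="val", download=True)

    image, target, datum_metadata = dataset[0]  # assumed (image, target, metadata) layout
    print(len(dataset), datum_metadata["image_width"])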