dataeval 0.86.0__py3-none-any.whl → 0.86.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +1 -1
- dataeval/_log.py +1 -1
- dataeval/config.py +21 -4
- dataeval/data/_embeddings.py +2 -2
- dataeval/data/_images.py +2 -3
- dataeval/data/_metadata.py +188 -178
- dataeval/data/_selection.py +1 -2
- dataeval/data/_split.py +4 -5
- dataeval/data/_targets.py +17 -13
- dataeval/data/selections/_classfilter.py +2 -5
- dataeval/data/selections/_prioritize.py +6 -9
- dataeval/data/selections/_shuffle.py +3 -1
- dataeval/detectors/drift/_base.py +4 -5
- dataeval/detectors/drift/_mmd.py +3 -6
- dataeval/detectors/drift/_nml/_base.py +4 -2
- dataeval/detectors/drift/_nml/_chunk.py +11 -19
- dataeval/detectors/drift/_nml/_domainclassifier.py +8 -19
- dataeval/detectors/drift/_nml/_result.py +8 -9
- dataeval/detectors/drift/_nml/_thresholds.py +66 -77
- dataeval/detectors/linters/outliers.py +7 -7
- dataeval/metadata/_distance.py +10 -7
- dataeval/metadata/_ood.py +11 -103
- dataeval/metrics/bias/_balance.py +23 -33
- dataeval/metrics/bias/_diversity.py +16 -14
- dataeval/metrics/bias/_parity.py +18 -18
- dataeval/metrics/estimators/_divergence.py +2 -4
- dataeval/metrics/stats/_base.py +103 -42
- dataeval/metrics/stats/_boxratiostats.py +21 -19
- dataeval/metrics/stats/_dimensionstats.py +14 -10
- dataeval/metrics/stats/_hashstats.py +1 -1
- dataeval/metrics/stats/_pixelstats.py +6 -6
- dataeval/metrics/stats/_visualstats.py +3 -3
- dataeval/outputs/_base.py +22 -7
- dataeval/outputs/_bias.py +24 -70
- dataeval/outputs/_drift.py +1 -9
- dataeval/outputs/_linters.py +11 -11
- dataeval/outputs/_stats.py +82 -23
- dataeval/outputs/_workflows.py +2 -2
- dataeval/utils/_array.py +6 -9
- dataeval/utils/_bin.py +1 -2
- dataeval/utils/_clusterer.py +7 -4
- dataeval/utils/_fast_mst.py +27 -13
- dataeval/utils/_image.py +65 -11
- dataeval/utils/_mst.py +1 -3
- dataeval/utils/_plot.py +15 -10
- dataeval/utils/data/_dataset.py +54 -28
- dataeval/utils/data/metadata.py +104 -82
- dataeval/utils/datasets/__init__.py +2 -0
- dataeval/utils/datasets/_antiuav.py +189 -0
- dataeval/utils/datasets/_base.py +11 -8
- dataeval/utils/datasets/_cifar10.py +104 -45
- dataeval/utils/datasets/_fileio.py +21 -47
- dataeval/utils/datasets/_milco.py +22 -12
- dataeval/utils/datasets/_mixin.py +2 -4
- dataeval/utils/datasets/_mnist.py +3 -4
- dataeval/utils/datasets/_ships.py +14 -7
- dataeval/utils/datasets/_voc.py +229 -42
- dataeval/utils/torch/models.py +5 -10
- dataeval/utils/torch/trainer.py +3 -3
- dataeval/workflows/sufficiency.py +2 -2
- {dataeval-0.86.0.dist-info → dataeval-0.86.2.dist-info}/METADATA +2 -1
- dataeval-0.86.2.dist-info/RECORD +114 -0
- dataeval/detectors/ood/vae.py +0 -74
- dataeval-0.86.0.dist-info/RECORD +0 -114
- {dataeval-0.86.0.dist-info → dataeval-0.86.2.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.86.0.dist-info → dataeval-0.86.2.dist-info}/WHEEL +0 -0
dataeval/utils/_image.py
CHANGED
@@ -12,6 +12,9 @@ from scipy.signal import convolve2d
 EDGE_KERNEL = np.array([[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]], dtype=np.int8)
 BIT_DEPTH = (1, 8, 12, 16, 32)
 
+Box = tuple[int, int, int, int]
+"""Bounding box as tuple of integers in x0, y0, x1, y1 format."""
+
 
 @dataclass
 class BitDepth:
@@ -25,12 +28,11 @@ def get_bitdepth(image: NDArray[Any]) -> BitDepth:
     Approximates the bit depth of the image using the
     min and max pixel values.
     """
-    pmin, pmax = np.min(image), np.max(image)
+    pmin, pmax = np.nanmin(image), np.nanmax(image)
     if pmin < 0:
         return BitDepth(0, pmin, pmax)
-    else:
-        depth = ([x for x in BIT_DEPTH if 2**x > pmax] or [max(BIT_DEPTH)])[0]
-        return BitDepth(depth, 0, 2**depth - 1)
+    depth = ([x for x in BIT_DEPTH if 2**x > pmax] or [max(BIT_DEPTH)])[0]
+    return BitDepth(depth, 0, 2**depth - 1)
 
 
 def rescale(image: NDArray[Any], depth: int = 1) -> NDArray[Any]:
@@ -40,9 +42,8 @@ def rescale(image: NDArray[Any], depth: int = 1) -> NDArray[Any]:
     bitdepth = get_bitdepth(image)
     if bitdepth.depth == depth:
         return image
-    else:
-        normalized = (image + bitdepth.pmin) / (bitdepth.pmax - bitdepth.pmin)
-        return normalized * (2**depth - 1)
+    normalized = (image + bitdepth.pmin) / (bitdepth.pmax - bitdepth.pmin)
+    return normalized * (2**depth - 1)
 
 
 def normalize_image_shape(image: NDArray[Any]) -> NDArray[Any]:
@@ -52,13 +53,12 @@ def normalize_image_shape(image: NDArray[Any]) -> NDArray[Any]:
     ndim = image.ndim
     if ndim == 2:
         return np.expand_dims(image, axis=0)
-    elif ndim == 3:
+    if ndim == 3:
         return image
-    elif ndim > 3:
+    if ndim > 3:
         # Slice all but the last 3 dimensions
         return image[(0,) * (ndim - 3)]
-    else:
-        raise ValueError("Images must have 2 or more dimensions.")
+    raise ValueError("Images must have 2 or more dimensions.")
 
 
 def edge_filter(image: NDArray[Any], offset: float = 0.5) -> NDArray[np.uint8]:
@@ -71,3 +71,57 @@ def edge_filter(image: NDArray[Any], offset: float = 0.5) -> NDArray[np.uint8]:
     edges = convolve2d(image, EDGE_KERNEL, mode="same", boundary="symm") + offset
     np.clip(edges, 0, 255, edges)
     return edges
+
+
+def clip_box(image: NDArray[Any], box: Box) -> Box:
+    """
+    Clip the box to inside the provided image dimensions.
+    """
+    x0, y0, x1, y1 = box
+    h, w = image.shape[-2:]
+
+    return max(0, x0), max(0, y0), min(w, x1), min(h, y1)
+
+
+def is_valid_box(box: Box) -> bool:
+    """
+    Check if the box dimensions provided are a valid image.
+    """
+    return box[2] > box[0] and box[3] > box[1]
+
+
+def clip_and_pad(image: NDArray[Any], box: Box) -> NDArray[Any]:
+    """
+    Extract a region from an image based on a bounding box, clipping to image boundaries
+    and padding out-of-bounds areas with np.nan.
+
+    Parameters:
+    -----------
+    image : NDArray[Any]
+        Input image array in format C, H, W (channels first)
+    box : Box
+        Bounding box coordinates as (x0, y0, x1, y1) where (x0, y0) is top-left and (x1, y1) is bottom-right
+
+    Returns:
+    --------
+    NDArray[Any]
+        The extracted region with out-of-bounds areas padded with np.nan
+    """
+
+    # Create output array filled with NaN with a minimum size of 1x1
+    bw, bh = max(1, box[2] - box[0]), max(1, box[3] - box[1])
+
+    output = np.full((image.shape[-3] if image.ndim > 2 else 1, bh, bw), np.nan)
+
+    # Calculate source box
+    sbox = clip_box(image, box)
+
+    # Calculate destination box
+    x0, y0 = sbox[0] - box[0], sbox[1] - box[1]
+    x1, y1 = x0 + (sbox[2] - sbox[0]), y0 + (sbox[3] - sbox[1])
+
+    # Copy the source if valid from the image to the output
+    if is_valid_box(sbox):
+        output[:, y0:y1, x0:x1] = image[:, sbox[1] : sbox[3], sbox[0] : sbox[2]]
+
+    return output
dataeval/utils/_mst.py
CHANGED
dataeval/utils/_plot.py
CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
 __all__ = []
 
 import contextlib
+import math
 from typing import Any
 
 import numpy as np
@@ -160,11 +161,9 @@ def histogram_plot(
     import matplotlib.pyplot as plt
 
     num_metrics = len(data_dict)
-
-
-
-    else:
-        fig, axs = plt.subplots(1, num_metrics, figsize=(4 * num_metrics, 4))
+    rows = math.ceil(num_metrics / 3)
+    cols = min(num_metrics, 3)
+    fig, axs = plt.subplots(rows, 3, figsize=(cols * 3 + 1, rows * 3))
 
     for ax, metric in zip(
         axs.flat,
@@ -178,6 +177,10 @@ def histogram_plot(
         ax.set_ylabel(ylabel)
         ax.set_xlabel(xlabel)
 
+    for ax in axs.flat[num_metrics:]:
+        ax.axis("off")
+        ax.set_visible(False)
+
     fig.tight_layout()
     return fig
 
@@ -216,11 +219,9 @@ def channel_histogram_plot(
     label_kwargs = {"label": [f"Channel {i}" for i in range(max_channels)]}
 
     num_metrics = len(data_keys)
-
-
-
-    else:
-        fig, axs = plt.subplots(1, num_metrics, figsize=(4 * num_metrics, 4))
+    rows = math.ceil(num_metrics / 3)
+    cols = min(num_metrics, 3)
+    fig, axs = plt.subplots(rows, 3, figsize=(cols * 3 + 1, rows * 3))
 
     for ax, metric in zip(
         axs.flat,
@@ -245,5 +246,9 @@ def channel_histogram_plot(
         ax.set_ylabel(ylabel)
         ax.set_xlabel(xlabel)
 
+    for ax in axs.flat[num_metrics:]:
+        ax.axis("off")
+        ax.set_visible(False)
+
     fig.tight_layout()
     return fig
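Editor's note: both plot helpers replace the old single-row layout (plt.subplots(1, num_metrics, ...)) with a fixed three-column grid and hide any unused axes. A quick check of the new layout arithmetic (numbers are illustrative):

    import math

    num_metrics = 7
    rows = math.ceil(num_metrics / 3)  # 3 -> a 3x3 grid of 9 axes
    cols = min(num_metrics, 3)         # caps figure width at 3 columns
    # figsize == (cols * 3 + 1, rows * 3) == (10, 9); the two leftover axes
    # (axs.flat[7:]) are hidden by the new axis("off") / set_visible(False) loop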
dataeval/utils/data/_dataset.py
CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 __all__ = []
 
-from typing import Any, Generic, Iterable, Literal, Sequence, TypeVar
+from typing import Any, Generic, Iterable, Literal, Sequence, SupportsFloat, SupportsInt, TypeVar, cast
 
 from dataeval.typing import (
     Array,
@@ -17,9 +17,9 @@ from dataeval.utils._array import as_numpy
 def _validate_data(
     datum_type: Literal["ic", "od"],
     images: Array | Sequence[Array],
-    labels: Sequence[int] | Sequence[Sequence[int]],
-    bboxes: Sequence[Sequence[Sequence[float]]] | None,
-    metadata: Sequence[dict[str, Any]] | None,
+    labels: Array | Sequence[int] | Sequence[Array] | Sequence[Sequence[int]],
+    bboxes: Array | Sequence[Array] | Sequence[Sequence[Array]] | Sequence[Sequence[Sequence[float]]] | None,
+    metadata: Sequence[dict[str, Any]] | dict[str, Sequence[Any]] | None,
 ) -> None:
     # Validate inputs
     dataset_len = len(images)
@@ -30,20 +30,31 @@ def _validate_data(
         raise ValueError(f"Number of labels ({len(labels)}) does not match number of images ({dataset_len}).")
     if bboxes is not None and len(bboxes) != dataset_len:
         raise ValueError(f"Number of bboxes ({len(bboxes)}) does not match number of images ({dataset_len}).")
-    if metadata is not None and len(metadata) != dataset_len:
+    if metadata is not None and (
+        len(metadata) != dataset_len
+        if isinstance(metadata, Sequence)
+        else any(
+            not isinstance(metadatum, Sequence) or len(metadatum) != dataset_len for metadatum in metadata.values()
+        )
+    ):
         raise ValueError(f"Number of metadata ({len(metadata)}) does not match number of images ({dataset_len}).")
 
     if datum_type == "ic":
-        if not isinstance(labels, Sequence) or not isinstance(labels[0], int):
+        if not isinstance(labels, (Sequence, Array)) or not isinstance(labels[0], (int, SupportsInt)):
             raise TypeError("Labels must be a sequence of integers for image classification.")
     elif datum_type == "od":
-        if not isinstance(labels, Sequence) or not isinstance(labels[0], Sequence) or not isinstance(labels[0][0], int):
+        if (
+            not isinstance(labels, (Sequence, Array))
+            or not isinstance(labels[0], (Sequence, Array))
+            or not isinstance(cast(Sequence[Any], labels[0])[0], (int, SupportsInt))
+        ):
             raise TypeError("Labels must be a sequence of sequences of integers for object detection.")
         if (
             bboxes is None
             or not isinstance(bboxes, (Sequence, Array))
             or not isinstance(bboxes[0], (Sequence, Array))
            or not isinstance(bboxes[0][0], (Sequence, Array))
+            or not isinstance(bboxes[0][0][0], (float, SupportsFloat))
             or not len(bboxes[0][0]) == 4
         ):
             raise TypeError("Boxes must be a sequence of sequences of (x0, y0, x1, y1) for object detection.")
@@ -51,12 +62,19 @@ def _validate_data(
         raise ValueError(f"Unknown datum type '{datum_type}'. Must be 'ic' or 'od'.")
 
 
+def _listify_metadata(
+    metadata: Sequence[dict[str, Any]] | dict[str, Sequence[Any]] | None,
+) -> Sequence[dict[str, Any]] | None:
+    if isinstance(metadata, dict):
+        return [{k: v[i] for k, v in metadata.items()} for i in range(len(next(iter(metadata.values()))))]
+    return metadata
+
+
 def _find_max(arr: ArrayLike) -> Any:
-    if isinstance(arr, (Iterable, Sequence, Array)):
+    if not isinstance(arr, (bytes, str)) and isinstance(arr, (Iterable, Sequence, Array)):
         if isinstance(arr[0], (Iterable, Sequence, Array)):
             return max([_find_max(x) for x in arr])  # type: ignore
-        else:
-            return max(arr)
+        return max(arr)
     return arr
 
 
@@ -92,12 +110,14 @@ class CustomImageClassificationDataset(BaseAnnotatedDataset[Sequence[int]], ImageClassificationDataset):
     def __init__(
         self,
         images: Array | Sequence[Array],
-        labels: Sequence[int],
+        labels: Array | Sequence[int],
         metadata: Sequence[dict[str, Any]] | None,
         classes: Sequence[str] | None,
         name: str | None = None,
     ) -> None:
-        super().__init__("ic", images, labels, metadata, classes)
+        super().__init__(
+            "ic", images, as_numpy(labels).tolist() if isinstance(labels, Array) else labels, metadata, classes
+        )
         if name is not None:
             self.__name__ = name
             self.__class__.__name__ = name
@@ -135,18 +155,24 @@ class CustomObjectDetectionDataset(BaseAnnotatedDataset[Sequence[Sequence[int]]], ObjectDetectionDataset):
     def __init__(
         self,
         images: Array | Sequence[Array],
-        labels: Sequence[Sequence[int]],
-        bboxes: Sequence[Sequence[Sequence[float]]],
+        labels: Array | Sequence[Array] | Sequence[Sequence[int]],
+        bboxes: Array | Sequence[Array] | Sequence[Sequence[Array]] | Sequence[Sequence[Sequence[float]]],
         metadata: Sequence[dict[str, Any]] | None,
         classes: Sequence[str] | None,
         name: str | None = None,
     ) -> None:
-        super().__init__("od", images, labels, metadata, classes)
+        super().__init__(
+            "od",
+            images,
+            [as_numpy(label).tolist() if isinstance(label, Array) else label for label in labels],
+            metadata,
+            classes,
+        )
         if name is not None:
             self.__name__ = name
             self.__class__.__name__ = name
             self.__class__.__qualname__ = name
-        self._bboxes = bboxes
+        self._bboxes = [[as_numpy(box).tolist() if isinstance(box, Array) else box for box in bbox] for bbox in bboxes]
 
     @property
     def metadata(self) -> DatasetMetadata:
@@ -162,8 +188,8 @@ class CustomObjectDetectionDataset(BaseAnnotatedDataset[Sequence[Sequence[int]]], ObjectDetectionDataset):
 
 def to_image_classification_dataset(
     images: Array | Sequence[Array],
-    labels: Sequence[int],
-    metadata: Sequence[dict[str, Any]] | None,
+    labels: Array | Sequence[int],
+    metadata: Sequence[dict[str, Any]] | dict[str, Sequence[Any]] | None,
     classes: Sequence[str] | None,
     name: str | None = None,
 ) -> ImageClassificationDataset:
@@ -174,9 +200,9 @@ def to_image_classification_dataset(
     ----------
     images : Array | Sequence[Array]
         The images to use in the dataset.
-    labels : Sequence[int]
+    labels : Array | Sequence[int]
         The labels to use in the dataset.
-    metadata : Sequence[dict[str, Any]] | None
+    metadata : Sequence[dict[str, Any]] | dict[str, Sequence[Any]] | None
         The metadata to use in the dataset.
     classes : Sequence[str] | None
         The classes to use in the dataset.
@@ -186,14 +212,14 @@ def to_image_classification_dataset(
     ImageClassificationDataset
     """
     _validate_data("ic", images, labels, None, metadata)
-    return CustomImageClassificationDataset(images, labels, metadata, classes, name)
+    return CustomImageClassificationDataset(images, labels, _listify_metadata(metadata), classes, name)
 
 
 def to_object_detection_dataset(
     images: Array | Sequence[Array],
-    labels: Sequence[Sequence[int]],
-    bboxes: Sequence[Sequence[Sequence[float]]],
-    metadata: Sequence[dict[str, Any]] | None,
+    labels: Array | Sequence[Array] | Sequence[Sequence[int]],
+    bboxes: Array | Sequence[Array] | Sequence[Sequence[Array]] | Sequence[Sequence[Sequence[float]]],
+    metadata: Sequence[dict[str, Any]] | dict[str, Sequence[Any]] | None,
     classes: Sequence[str] | None,
     name: str | None = None,
 ) -> ObjectDetectionDataset:
@@ -204,11 +230,11 @@ def to_object_detection_dataset(
     ----------
     images : Array | Sequence[Array]
         The images to use in the dataset.
-    labels : Sequence[Sequence[int]]
+    labels : Array | Sequence[Array] | Sequence[Sequence[int]]
         The labels to use in the dataset.
-    bboxes : Sequence[Sequence[Sequence[float]]]
+    bboxes : Array | Sequence[Array] | Sequence[Sequence[Array]] | Sequence[Sequence[Sequence[float]]]
         The bounding boxes (x0,y0,x1,y0) to use in the dataset.
-    metadata : Sequence[dict[str, Any]] | None
+    metadata : Sequence[dict[str, Any]] | dict[str, Sequence[Any]] | None
         The metadata to use in the dataset.
     classes : Sequence[str] | None
         The classes to use in the dataset.
@@ -218,4 +244,4 @@ def to_object_detection_dataset(
     ObjectDetectionDataset
     """
    _validate_data("od", images, labels, bboxes, metadata)
-    return CustomObjectDetectionDataset(images, labels, bboxes, metadata, classes, name)
+    return CustomObjectDetectionDataset(images, labels, bboxes, _listify_metadata(metadata), classes, name)
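Editor's note: the net effect is that the dataset factories now accept NumPy-style arrays for labels and boxes, plus a transposed dict-of-sequences form for metadata that _listify_metadata converts back to one dict per image. A hedged sketch (sample data invented; the import assumes the factory is re-exported from dataeval.utils.data, otherwise use the private _dataset module shown above):

    import numpy as np
    from dataeval.utils.data import to_object_detection_dataset

    images = [np.zeros((3, 16, 16)) for _ in range(2)]
    labels = [np.array([0]), np.array([1, 1])]        # Sequence[Array] now validates
    bboxes = [
        np.array([[0.0, 0.0, 4.0, 4.0]]),
        np.array([[1.0, 1.0, 5.0, 5.0], [2.0, 2.0, 6.0, 6.0]]),
    ]
    metadata = {"source": ["cam_a", "cam_b"]}         # dict[str, Sequence[Any]] now accepted

    dataset = to_object_detection_dataset(images, labels, bboxes, metadata, classes=["uav", "bird"])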
dataeval/utils/data/metadata.py
CHANGED
@@ -228,58 +228,130 @@ def flatten(
 
     if return_dropped:
         return output, size, _sorted_drop_reasons(dropped)
+    if dropped:
+        dropped_items = "\n".join([f"  {k}: {v}" for k, v in _sorted_drop_reasons(dropped).items()])
+        warnings.warn(f"Metadata entries were dropped:\n{dropped_items}")
+    return output, size
+
+
+def _flatten_for_merge(
+    metadatum: Mapping[str, Any],
+    ignore_lists: bool,
+    fully_qualified: bool,
+    targets: int | None,
+) -> tuple[dict[str, list[Any]] | dict[str, Any], int, dict[str, list[str]]]:
+    flattened, image_repeats, dropped_inner = flatten(
+        metadatum, return_dropped=True, ignore_lists=ignore_lists, fully_qualified=fully_qualified
+    )
+    if targets is not None:
+        # check for mismatch in targets per image and force ignore_lists
+        if not ignore_lists and targets != image_repeats:
+            flattened, image_repeats, dropped_inner = flatten(
+                metadatum, return_dropped=True, ignore_lists=True, fully_qualified=fully_qualified
+            )
+        if targets != image_repeats:
+            flattened = {k: [v] * targets for k, v in flattened.items()}
+            image_repeats = targets
+    return flattened, image_repeats, dropped_inner
+
+
+def _merge(
+    dicts: list[Mapping[str, Any]],
+    ignore_lists: bool,
+    fully_qualified: bool,
+    targets_per_image: Sequence[int] | None,
+) -> tuple[dict[str, list[Any]], dict[str, set[DropReason]], NDArray[np.intp]]:
+    merged: dict[str, list[Any]] = {}
+    isect: set[str] = set()
+    union: set[str] = set()
+    image_repeats = np.zeros(len(dicts), dtype=np.int_)
+    dropped: dict[str, set[DropReason]] = {}
+    for i, d in enumerate(dicts):
+        targets = None if targets_per_image is None else targets_per_image[i]
+        flattened, image_repeats[i], dropped_inner = _flatten_for_merge(d, ignore_lists, fully_qualified, targets)
+        isect = isect.intersection(flattened.keys()) if isect else set(flattened.keys())
+        union.update(flattened.keys())
+        for k, v in dropped_inner.items():
+            dropped.setdefault(k, set()).update({DropReason(vv) for vv in v})
+        for k, v in flattened.items():
+            merged.setdefault(k, []).extend(flattened[k]) if isinstance(v, list) else merged.setdefault(k, []).append(v)
+
+    for k in union - isect:
+        dropped.setdefault(k, set()).add(DropReason.INCONSISTENT_KEY)
+
+    if image_repeats.sum() == image_repeats.size:
+        image_indices = np.arange(image_repeats.size)
     else:
-        if dropped:
-            dropped_items = "\n".join([f"  {k}: {v}" for k, v in _sorted_drop_reasons(dropped).items()])
-            warnings.warn(f"Metadata entries were dropped:\n{dropped_items}")
-        return output, size
+        image_ids = np.arange(image_repeats.size)
+        image_data = np.concatenate(
+            [np.repeat(image_ids[i], image_repeats[i]) for i in range(image_ids.size)], dtype=np.int_
+        )
+        _, image_unsorted = np.unique(image_data, return_inverse=True)
+        image_indices = np.sort(image_unsorted)
 
+    merged = {k: _simplify_type(v) for k, v in merged.items() if k in isect}
+    return merged, dropped, image_indices
 
-def _is_metadata_dict_of_dicts(metadata: Mapping) -> bool:
-    """EXPERIMENTAL: Attempt to detect if metadata is a dict of dicts"""
-    # single dict
-    if len(metadata) < 2:
-        return False
 
-
-
-
-
+@overload
+def merge(
+    metadata: Iterable[Mapping[str, Any]],
+    *,
+    return_dropped: Literal[True],
+    return_numpy: Literal[False] = False,
+    ignore_lists: bool = False,
+    fully_qualified: bool = False,
+    targets_per_image: Sequence[int] | None = None,
+    image_index_key: str = "_image_index",
+) -> tuple[dict[str, list[Any]], dict[str, list[str]]]: ...
+
 
-
-
+@overload
+def merge(
+    metadata: Iterable[Mapping[str, Any]],
+    *,
+    return_dropped: Literal[False] = False,
+    return_numpy: Literal[False] = False,
+    ignore_lists: bool = False,
+    fully_qualified: bool = False,
+    targets_per_image: Sequence[int] | None = None,
+    image_index_key: str = "_image_index",
+) -> dict[str, list[Any]]: ...
 
 
 @overload
 def merge(
     metadata: Iterable[Mapping[str, Any]],
+    *,
     return_dropped: Literal[True],
+    return_numpy: Literal[True],
     ignore_lists: bool = False,
     fully_qualified: bool = False,
-    return_numpy: bool = False,
     targets_per_image: Sequence[int] | None = None,
     image_index_key: str = "_image_index",
-) -> tuple[dict[str,
+) -> tuple[dict[str, NDArray[Any]], dict[str, list[str]]]: ...
 
 
 @overload
 def merge(
     metadata: Iterable[Mapping[str, Any]],
+    *,
     return_dropped: Literal[False] = False,
+    return_numpy: Literal[True],
     ignore_lists: bool = False,
     fully_qualified: bool = False,
-    return_numpy: bool = False,
     targets_per_image: Sequence[int] | None = None,
     image_index_key: str = "_image_index",
-) -> dict[str,
+) -> dict[str, NDArray[Any]]: ...
 
 
 def merge(
     metadata: Iterable[Mapping[str, Any]],
+    *,
     return_dropped: bool = False,
+    return_numpy: bool = False,
     ignore_lists: bool = False,
     fully_qualified: bool = False,
-    return_numpy: bool = False,
     targets_per_image: Sequence[int] | None = None,
     image_index_key: str = "_image_index",
 ):
@@ -298,12 +370,12 @@ def merge(
         Iterable collection of metadata dictionaries to flatten and merge
     return_dropped: bool, default False
         Option to return a dictionary of dropped keys and the reason(s) for dropping
+    return_numpy : bool, default False
+        Option to return results as lists or NumPy arrays
     ignore_lists : bool, default False
         Option to skip expanding lists within metadata
     fully_qualified : bool, default False
         Option to return dictionary keys full qualified instead of minimized
-    return_numpy : bool, default False
-        Option to return results as lists or NumPy arrays
     targets_per_image : Sequence[int] or None, default None
         Number of targets for each image metadata entry
     image_index_key : str, default "_image_index"
@@ -330,74 +402,24 @@ def merge(
     >>> dropped_keys
     {'target_c': ['inconsistent_key']}
     """
-    merged: dict[str, list[Any]] = {}
-    isect: set[str] = set()
-    union: set[str] = set()
-    keys: list[str] | None = None
-    dicts: list[Mapping[str, Any]]
-
-    # EXPERIMENTAL
-    if isinstance(metadata, Mapping) and _is_metadata_dict_of_dicts(metadata):
-        warnings.warn("Experimental processing for dict of dicts.")
-        keys = [str(k) for k in metadata]
-        dicts = list(metadata.values())
-        ignore_lists = True
-    else:
-        dicts = list(metadata)
+
+    dicts: list[Mapping[str, Any]] = list(metadata)
 
     if targets_per_image is not None and len(dicts) != len(targets_per_image):
         raise ValueError("Number of targets per image must be equal to number of metadata entries.")
 
-    image_repeats = np.zeros(len(dicts), dtype=np.int_)
-    dropped: dict[str, set[DropReason]] = {}
-    for i, d in enumerate(dicts):
-        flattened, image_repeats[i], dropped_inner = flatten(
-            d, return_dropped=True, ignore_lists=ignore_lists, fully_qualified=fully_qualified
-        )
-        if targets_per_image is not None:
-            # check for mismatch in targets per image and force ignore_lists
-            if not ignore_lists and targets_per_image[i] != image_repeats[i]:
-                flattened, image_repeats[i], dropped_inner = flatten(
-                    d, return_dropped=True, ignore_lists=True, fully_qualified=fully_qualified
-                )
-            if targets_per_image[i] != image_repeats[i]:
-                flattened = {k: [v] * targets_per_image[i] for k, v in flattened.items()}
-                image_repeats[i] = targets_per_image[i]
-        isect = isect.intersection(flattened.keys()) if isect else set(flattened.keys())
-        union.update(flattened.keys())
-        for k, v in dropped_inner.items():
-            dropped.setdefault(k, set()).update({DropReason(vv) for vv in v})
-        for k, v in flattened.items():
-            merged.setdefault(k, []).extend(flattened[k]) if isinstance(v, list) else merged.setdefault(k, []).append(v)
-
-    for k in union - isect:
-        dropped.setdefault(k, set()).add(DropReason.INCONSISTENT_KEY)
-
-    if image_repeats.sum() == image_repeats.size:
-        image_indices = np.arange(image_repeats.size)
-    else:
-        image_ids = np.arange(image_repeats.size)
-        image_data = np.concatenate(
-            [np.repeat(image_ids[i], image_repeats[i]) for i in range(image_ids.size)], dtype=np.int_
-        )
-        _, image_unsorted = np.unique(image_data, return_inverse=True)
-        image_indices = np.sort(image_unsorted)
-
-    output: dict[str, Any] = {}
+    merged, dropped, image_indices = _merge(dicts, ignore_lists, fully_qualified, targets_per_image)
 
-    if keys:
-        output["keys"] = np.array(keys) if return_numpy else keys
+    output: dict[str, Any] = {k: np.asarray(v) for k, v in merged.items()} if return_numpy else merged
 
-    for k in (key for key in merged if key in isect):
-        cv = _simplify_type(merged[k])
-        output[k] = np.array(cv) if return_numpy else cv
     if image_index_key not in output:
         output[image_index_key] = image_indices if return_numpy else image_indices.tolist()
 
     if return_dropped:
         return output, _sorted_drop_reasons(dropped)
-    else:
-        if dropped:
-            dropped_items = "\n".join([f"  {k}: {v}" for k, v in _sorted_drop_reasons(dropped).items()])
-            warnings.warn(f"Metadata entries were dropped:\n{dropped_items}")
-        return output
+
+    if dropped:
+        dropped_items = "\n".join([f"  {k}: {v}" for k, v in _sorted_drop_reasons(dropped).items()])
+        warnings.warn(f"Metadata entries were dropped:\n{dropped_items}")
+
    return output
dataeval/utils/datasets/__init__.py
CHANGED
@@ -1,5 +1,6 @@
 """Provides access to common Computer Vision datasets."""
 
+from dataeval.utils.datasets._antiuav import AntiUAVDetection
 from dataeval.utils.datasets._cifar10 import CIFAR10
 from dataeval.utils.datasets._milco import MILCO
 from dataeval.utils.datasets._mnist import MNIST
@@ -10,6 +11,7 @@ __all__ = [
     "MNIST",
     "Ships",
     "CIFAR10",
+    "AntiUAVDetection",
     "MILCO",
     "VOCDetection",
     "VOCDetectionTorch",
|