dataeval 0.85.0__py3-none-any.whl → 0.86.1__py3-none-any.whl
This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their respective public registries.
- dataeval/__init__.py +1 -1
- dataeval/_log.py +1 -1
- dataeval/config.py +21 -4
- dataeval/data/_embeddings.py +2 -2
- dataeval/data/_images.py +2 -3
- dataeval/data/_metadata.py +65 -42
- dataeval/data/_selection.py +2 -3
- dataeval/data/_split.py +2 -3
- dataeval/data/_targets.py +17 -13
- dataeval/data/selections/_classfilter.py +6 -8
- dataeval/data/selections/_prioritize.py +6 -9
- dataeval/data/selections/_shuffle.py +3 -1
- dataeval/detectors/drift/__init__.py +4 -1
- dataeval/detectors/drift/_base.py +4 -5
- dataeval/detectors/drift/_mmd.py +3 -6
- dataeval/detectors/drift/_mvdc.py +92 -0
- dataeval/detectors/drift/_nml/__init__.py +6 -0
- dataeval/detectors/drift/_nml/_base.py +70 -0
- dataeval/detectors/drift/_nml/_chunk.py +396 -0
- dataeval/detectors/drift/_nml/_domainclassifier.py +181 -0
- dataeval/detectors/drift/_nml/_result.py +97 -0
- dataeval/detectors/drift/_nml/_thresholds.py +269 -0
- dataeval/detectors/linters/outliers.py +7 -7
- dataeval/metrics/bias/_parity.py +10 -13
- dataeval/metrics/estimators/_divergence.py +2 -4
- dataeval/metrics/stats/_base.py +103 -42
- dataeval/metrics/stats/_boxratiostats.py +21 -19
- dataeval/metrics/stats/_dimensionstats.py +14 -10
- dataeval/metrics/stats/_hashstats.py +1 -1
- dataeval/metrics/stats/_pixelstats.py +6 -6
- dataeval/metrics/stats/_visualstats.py +3 -3
- dataeval/outputs/__init__.py +2 -1
- dataeval/outputs/_base.py +22 -7
- dataeval/outputs/_bias.py +27 -31
- dataeval/outputs/_drift.py +60 -0
- dataeval/outputs/_linters.py +12 -17
- dataeval/outputs/_stats.py +83 -29
- dataeval/outputs/_workflows.py +2 -2
- dataeval/utils/_array.py +6 -9
- dataeval/utils/_bin.py +1 -2
- dataeval/utils/_clusterer.py +7 -4
- dataeval/utils/_fast_mst.py +27 -13
- dataeval/utils/_image.py +65 -11
- dataeval/utils/_mst.py +1 -3
- dataeval/utils/_plot.py +15 -10
- dataeval/utils/data/_dataset.py +32 -20
- dataeval/utils/data/metadata.py +104 -82
- dataeval/utils/datasets/__init__.py +2 -0
- dataeval/utils/datasets/_antiuav.py +189 -0
- dataeval/utils/datasets/_base.py +11 -8
- dataeval/utils/datasets/_cifar10.py +104 -45
- dataeval/utils/datasets/_fileio.py +21 -47
- dataeval/utils/datasets/_milco.py +19 -11
- dataeval/utils/datasets/_mixin.py +2 -4
- dataeval/utils/datasets/_mnist.py +3 -4
- dataeval/utils/datasets/_ships.py +14 -7
- dataeval/utils/datasets/_voc.py +229 -42
- dataeval/utils/torch/models.py +5 -10
- dataeval/utils/torch/trainer.py +3 -3
- dataeval/workflows/sufficiency.py +2 -2
- {dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/METADATA +3 -2
- dataeval-0.86.1.dist-info/RECORD +114 -0
- dataeval/detectors/ood/vae.py +0 -74
- dataeval-0.85.0.dist-info/RECORD +0 -107
- {dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/WHEEL +0 -0
dataeval/metrics/stats/_base.py
CHANGED
@@ -10,23 +10,86 @@ from copy import deepcopy
 from dataclasses import dataclass
 from functools import partial
 from multiprocessing import Pool
-from typing import Any, Callable, Generic, Iterable, Sequence, TypeVar
+from typing import Any, Callable, Generic, Iterable, Iterator, Sequence, TypeVar

 import numpy as np
 import tqdm
 from numpy.typing import NDArray

 from dataeval.config import get_max_processes
-from dataeval.outputs._stats import BaseStatsOutput, SourceIndex
+from dataeval.outputs._stats import BASE_ATTRS, BaseStatsOutput, SourceIndex
 from dataeval.typing import Array, ArrayLike, Dataset, ObjectDetectionTarget
 from dataeval.utils._array import as_numpy, to_numpy
-from dataeval.utils._image import normalize_image_shape, rescale
+from dataeval.utils._image import clip_and_pad, clip_box, is_valid_box, normalize_image_shape, rescale

 DTYPE_REGEX = re.compile(r"NDArray\[np\.(.*?)\]")

-BoundingBox = tuple[float, float, float, float]
 TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput, covariant=True)

+_S = TypeVar("_S")
+_T = TypeVar("_T")
+
+
+@dataclass
+class BoundingBox:
+    x0: float
+    y0: float
+    x1: float
+    y1: float
+
+    def __post_init__(self) -> None:
+        # Test for invalid coordinates
+        x_swap = self.x0 > self.x1
+        y_swap = self.y0 > self.y1
+        if x_swap or y_swap:
+            warnings.warn(f"Invalid bounding box coordinates: {self} - swapping invalid coordinates.")
+            if x_swap:
+                self.x0, self.x1 = self.x1, self.x0
+            if y_swap:
+                self.y0, self.y1 = self.y1, self.y0
+
+    @property
+    def width(self) -> float:
+        return self.x1 - self.x0
+
+    @property
+    def height(self) -> float:
+        return self.y1 - self.y0
+
+    def to_int(self) -> tuple[int, int, int, int]:
+        """
+        Returns the bounding box as a tuple of integers.
+        """
+        x0_int = math.floor(self.x0)
+        y0_int = math.floor(self.y0)
+        x1_int = math.ceil(self.x1)
+        y1_int = math.ceil(self.y1)
+        return x0_int, y0_int, x1_int, y1_int
+
+
+class PoolWrapper:
+    """
+    Wraps `multiprocessing.Pool` to allow for easy switching between
+    multiprocessing and single-threaded execution.
+
+    This helps with debugging and profiling, as well as usage with Jupyter notebooks
+    in VS Code, which does not support subprocess debugging.
+    """
+
+    def __init__(self, processes: int | None) -> None:
+        self.pool = Pool(processes) if processes is not None and processes > 1 else None
+
+    def imap(self, func: Callable[[_S], _T], iterable: Iterable[_S]) -> Iterator[_T]:
+        return map(func, iterable) if self.pool is None else self.pool.imap(func, iterable)
+
+    def __enter__(self, *args: Any, **kwargs: Any) -> PoolWrapper:
+        return self
+
+    def __exit__(self, *args: Any) -> None:
+        if self.pool is not None:
+            self.pool.close()
+            self.pool.join()
+

 class StatsProcessor(Generic[TStatsOutput]):
     output_class: type[TStatsOutput]
@@ -34,32 +97,26 @@ class StatsProcessor(Generic[TStatsOutput]):
     image_function_map: dict[str, Callable[[StatsProcessor[TStatsOutput]], Any]] = {}
     channel_function_map: dict[str, Callable[[StatsProcessor[TStatsOutput]], Any]] = {}

-    def __init__(self, image: NDArray[Any], box: BoundingBox | None, per_channel: bool) -> None:
+    def __init__(self, image: NDArray[Any], box: BoundingBox | Iterable[Any] | None, per_channel: bool) -> None:
         self.raw = image
         self.width: int = image.shape[-1]
         self.height: int = image.shape[-2]
-        box = …
-        …
-        x0, y0 = (min(j, max(0, math.floor(box[i]))) for i, j in zip((0, 1), (self.width - 1, self.height - 1)))
-        x1, y1 = (min(j, max(1, math.ceil(box[i]))) for i, j in zip((2, 3), (self.width, self.height)))
-        self.box: NDArray[np.int64] = np.array([x0, y0, x1, y1], dtype=np.int64)
+        box = (0, 0, self.width, self.height) if box is None else box
+        self.box = box if isinstance(box, BoundingBox) else BoundingBox(*box)
         self._per_channel = per_channel
         self._image = None
         self._shape = None
         self._scaled = None
         self._cache = {}
         self._fn_map = self.channel_function_map if per_channel else self.image_function_map
-        self.…
-            box[0] >= 0 and box[1] >= 0 and box[2] <= image.shape[-1] and box[3] <= image.shape[-2]
-        )
+        self._is_valid_box = is_valid_box(clip_box(image, self.box.to_int()))

     def get(self, fn_key: str) -> NDArray[Any]:
         if fn_key in self.cache_keys:
             if fn_key not in self._cache:
                 self._cache[fn_key] = self._fn_map[fn_key](self)
             return self._cache[fn_key]
-
-        return self._fn_map[fn_key](self)
+        return self._fn_map[fn_key](self)

     def process(self) -> dict[str, Any]:
         return {k: self._fn_map[k](self) for k in self._fn_map}
@@ -67,11 +124,7 @@ class StatsProcessor(Generic[TStatsOutput]):
     @property
     def image(self) -> NDArray[Any]:
         if self._image is None:
-            if …:
-                norm = normalize_image_shape(self.raw)
-                self._image = norm[:, self.box[1] : self.box[3], self.box[0] : self.box[2]]
-            else:
-                self._image = np.zeros((self.raw.shape[0], self.box[3] - self.box[1], self.box[2] - self.box[0]))
+            self._image = clip_and_pad(normalize_image_shape(self.raw), self.box.to_int())
         return self._image

     @property
@@ -90,9 +143,9 @@ class StatsProcessor(Generic[TStatsOutput]):

     @classmethod
     def convert_output(
-        cls, source: dict[str, Any], source_index: list[SourceIndex], …
+        cls, source: dict[str, Any], source_index: list[SourceIndex], object_count: list[int], image_count: int
     ) -> TStatsOutput:
-        output = {}
+        output: dict[str, Any] = {}
         attrs = dict(ChainMap(*(getattr(c, "__annotations__", {}) for c in cls.output_class.__mro__)))
         for key in (key for key in source if key in attrs):
             stat_type: str = attrs[key]
@@ -101,14 +154,17 @@ class StatsProcessor(Generic[TStatsOutput]):
                 output[key] = np.asarray(source[key], dtype=np.dtype(dtype_match.group(1)))
             else:
                 output[key] = source[key]
-        …
+        base_attrs: dict[str, Any] = dict(
+            zip(BASE_ATTRS, (source_index, np.asarray(object_count, dtype=np.uint16), image_count))
+        )
+        return cls.output_class(**output, **base_attrs)


 @dataclass
 class StatsProcessorOutput:
     results: list[dict[str, Any]]
     source_indices: list[SourceIndex]
-    …
+    object_counts: list[int]
     warnings_list: list[str]


@@ -119,18 +175,18 @@ def process_stats(
     per_channel: bool,
     stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
 ) -> StatsProcessorOutput:
-    …
+    np_image = to_numpy(image)
     results_list: list[dict[str, Any]] = []
     source_indices: list[SourceIndex] = []
     box_counts: list[int] = []
     warnings_list: list[str] = []
     for i_b, box in [(None, None)] if boxes is None else enumerate(boxes):
-        processor_list = [p(…
-        if any(not p.…
-            warnings_list.append(f"Bounding box [{i}][{i_b}]: {box}…
+        processor_list = [p(np_image, box, per_channel) for p in stats_processor_cls]
+        if any(not p._is_valid_box for p in processor_list) and i_b is not None and box is not None:
+            warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} for image shape {np_image.shape} is invalid.")
         results_list.append({k: v for p in processor_list for k, v in p.process().items()})
         if per_channel:
-            source_indices.extend([SourceIndex(i, i_b, c) for c in range(…
+            source_indices.extend([SourceIndex(i, i_b, c) for c in range(np_image.shape[-3])])
         else:
             source_indices.append(SourceIndex(i, i_b, None))
         box_counts.append(0 if boxes is None else len(boxes))
@@ -145,13 +201,18 @@ def process_stats_unpack(
     return process_stats(*args, per_channel=per_channel, stats_processor_cls=stats_processor_cls)


-def _enumerate(…
+def _enumerate(
+    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]], per_box: bool
+) -> Iterator[tuple[int, ArrayLike, Any]]:
     for i in range(len(dataset)):
         d = dataset[i]
         image = d[0] if isinstance(d, tuple) else d
         if per_box and isinstance(d, tuple) and isinstance(d[1], ObjectDetectionTarget):
-            …
-            …
+            try:
+                boxes = d[1].boxes if isinstance(d[1].boxes, Array) else as_numpy(d[1].boxes)
+                target = [BoundingBox(*(float(box[i]) for i in range(4))) for box in boxes]
+            except (ValueError, IndexError):
+                raise ValueError(f"Invalid bounding box format for image {i}: {d[1].boxes}")
         else:
             target = None

@@ -199,12 +260,13 @@ def run_stats(
     """
     results_list: list[dict[str, NDArray[np.float64]]] = []
     source_index: list[SourceIndex] = []
-    …
+    object_count: list[int] = []
+    image_count: int = len(dataset)

     warning_list = []
     stats_processor_cls = stats_processor_cls if isinstance(stats_processor_cls, Iterable) else [stats_processor_cls]

-    with …
+    with PoolWrapper(processes=get_max_processes()) as p:
         for r in tqdm.tqdm(
             p.imap(
                 partial(
@@ -214,14 +276,12 @@ def run_stats(
                 ),
                 _enumerate(dataset, per_box),
             ),
-            total=…
+            total=image_count,
         ):
             results_list.extend(r.results)
             source_index.extend(r.source_indices)
-            …
+            object_count.extend(r.object_counts)
             warning_list.extend(r.warnings_list)
-        p.close()
-        p.join()

     # warnings are not emitted while in multiprocessing pools so we emit after gathering all warnings
     for w in warning_list:
@@ -235,8 +295,7 @@ def run_stats(
         else:
             output.setdefault(stat, []).append(result.tolist() if isinstance(result, np.ndarray) else result)

-    …
-    return outputs
+    return [s.convert_output(output, source_index, object_count, image_count) for s in stats_processor_cls]


 def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
@@ -246,10 +305,12 @@ def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
     sum_dict = deepcopy(a.data())

     for k in sum_dict:
-        if isinstance(sum_dict[k], …
+        if isinstance(sum_dict[k], Sequence):
             sum_dict[k].extend(b.data()[k])
-        …
+        elif isinstance(sum_dict[k], Array):
             sum_dict[k] = np.concatenate((sum_dict[k], b.data()[k]))
+        else:
+            sum_dict[k] += b.data()[k]

     return type(a)(**sum_dict)

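The structural headline in this file: `BoundingBox` changes from a plain 4-tuple alias to a dataclass that self-corrects swapped coordinates, and pool lifecycle handling moves into the new `PoolWrapper` context manager. A minimal usage sketch (illustrative only; these names live in a private `_base` module, so the import path is not a stable public API):

import warnings

from dataeval.metrics.stats._base import BoundingBox, PoolWrapper

# Swapped coordinates are corrected with a warning instead of failing downstream:
bb = BoundingBox(10.0, 5.5, 3.0, 20.0)  # x0 > x1, so the x pair is swapped
print(bb.width, bb.height)  # -> 7.0 14.5 (float precision is preserved)
print(bb.to_int())          # -> (3, 5, 10, 20): floor the mins, ceil the maxes

# PoolWrapper falls back to plain map() when processes is None or <= 1, which
# keeps stats collection debuggable in notebooks and under the VS Code debugger:
with PoolWrapper(processes=1) as pool:
    squares = list(pool.imap(lambda v: v * v, range(4)))  # [0, 1, 4, 9]

Note that the single-threaded path also tolerates non-picklable callables such as the lambda above; with `processes > 1` the function must be picklable, as with any `multiprocessing.Pool`.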
dataeval/metrics/stats/_boxratiostats.py
CHANGED
@@ -8,8 +8,9 @@ from typing import Any, Callable, Generic, TypeVar, cast
 import numpy as np
 from numpy.typing import NDArray

+from dataeval.config import EPSILON
 from dataeval.outputs._base import set_metadata
-from dataeval.outputs._stats import …
+from dataeval.outputs._stats import BASE_ATTRS, BaseStatsOutput

 TStatOutput = TypeVar("TStatOutput", bound=BaseStatsOutput, contravariant=True)
 ArraySlice = tuple[int, int]
@@ -40,15 +41,19 @@ class BoxImageStatsOutputSlice(Generic[TStatOutput]):
         self.img = self.StatSlicer(img_stats, img_slice)


-RATIOSTATS_OVERRIDE_MAP: dict[…
-…
-…
-…
-…
-…
-…
-…
-…
+RATIOSTATS_OVERRIDE_MAP: dict[str, Callable[[BoxImageStatsOutputSlice[Any]], NDArray[Any]]] = {
+    "offset_x": lambda x: x.box["offset_x"] / x.img["width"],
+    "offset_y": lambda x: x.box["offset_y"] / x.img["height"],
+    "channels": lambda x: x.box["channels"],
+    "depth": lambda x: x.box["depth"],
+    "distance_center": lambda x: x.box["distance_center"]
+    / (np.sqrt(np.square(x.img["width"]) + np.square(x.img["height"])) / 2),
+    "distance_edge": lambda x: x.box["distance_edge"]
+    / (
+        x.img["width"]
+        if np.min([np.abs(x.box["offset_x"]), np.abs((x.box["width"] + x.box["offset_x"]) - x.img["width"])])
+        < np.min([np.abs(x.box["offset_y"]), np.abs((x.box["height"] + x.box["offset_y"]) - x.img["height"])])
+        else x.img["height"]
     ),
 }

@@ -69,11 +74,9 @@ def calculate_ratios(key: str, box_stats: BaseStatsOutput, img_stats: BaseStatsO

     stats = getattr(box_stats, key)

-    # Copy over …
-    if key in …
+    # Copy over base attributes
+    if key in BASE_ATTRS:
         return copy.deepcopy(stats)
-    elif key == BOX_COUNT:
-        return np.copy(stats)

     # Calculate ratios for each stat
     out_stats: np.ndarray = np.copy(stats).astype(np.float64)
@@ -84,10 +87,9 @@ def calculate_ratios(key: str, box_stats: BaseStatsOutput, img_stats: BaseStatsO
         box_j = len(box_stats) if i == len(box_map) - 1 else box_map[i + 1]
         img_j = len(img_stats) if i == len(img_map) - 1 else img_map[i + 1]
         stats = BoxImageStatsOutputSlice(box_stats, (box_i, box_j), img_stats, (img_i, img_j))
-        …
-        use_override = out_type in RATIOSTATS_OVERRIDE_MAP and key in RATIOSTATS_OVERRIDE_MAP[out_type]
+        use_override = key in RATIOSTATS_OVERRIDE_MAP
         with np.errstate(divide="ignore", invalid="ignore"):
-            ratio = RATIOSTATS_OVERRIDE_MAP[…
+            ratio = RATIOSTATS_OVERRIDE_MAP[key](stats) if use_override else stats.box[key] / (stats.img[key] + EPSILON)
         out_stats[box_i:box_j] = ratio.reshape(-1, *out_stats[box_i].shape)
     return out_stats

@@ -141,8 +143,8 @@ def boxratiostats(
     output_cls = type(boxstats)
     if type(boxstats) is not type(imgstats):
         raise TypeError("Must provide stats outputs of the same type.")
-    if boxstats.…
-        raise ValueError("Stats…
+    if boxstats.image_count != imgstats.image_count:
+        raise ValueError("Stats image count length mismatch. Check if the correct box and image stats were provided.")
     if any(src_idx.box is None for src_idx in boxstats.source_index):
         raise ValueError("Input for boxstats must contain box information.")
     if any(src_idx.box is not None for src_idx in imgstats.source_index):
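The ratio computation above simplifies in two ways: the override map is now keyed on the stat name alone (previously it was also keyed by output type), and the default path guards the denominator with the new `EPSILON` constant. A sketch of the default rule, with `EPSILON` stubbed locally since only its role (a small positive float from `dataeval.config`) matters here:

import numpy as np

EPSILON = 1e-10  # stand-in; the real constant is imported from dataeval.config

def default_ratio(box_stat: np.ndarray, img_stat: np.ndarray) -> np.ndarray:
    # Per-box stat over the parent image stat; EPSILON keeps a zero image
    # stat from yielding inf or nan instead of a usable ratio.
    return box_stat / (img_stat + EPSILON)

print(default_ratio(np.array([2.0, 0.0]), np.array([4.0, 0.0])))  # [0.5 0. ]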
dataeval/metrics/stats/_dimensionstats.py
CHANGED
@@ -6,6 +6,7 @@ from typing import Any, Callable

 import numpy as np

+from dataeval.config import EPSILON
 from dataeval.metrics.stats._base import StatsProcessor, run_stats
 from dataeval.outputs import DimensionStatsOutput
 from dataeval.outputs._base import set_metadata
@@ -16,18 +17,21 @@ from dataeval.utils._image import get_bitdepth
 class DimensionStatsProcessor(StatsProcessor[DimensionStatsOutput]):
     output_class: type = DimensionStatsOutput
     image_function_map: dict[str, Callable[[StatsProcessor[DimensionStatsOutput]], Any]] = {
-        "…
-        "…
-        "width": lambda x: x.box…
-        "height": lambda x: x.box…
+        "offset_x": lambda x: x.box.x0,
+        "offset_y": lambda x: x.box.y0,
+        "width": lambda x: x.box.width,
+        "height": lambda x: x.box.height,
         "channels": lambda x: x.shape[-3],
-        "size": lambda x: …
-        "aspect_ratio": lambda x: …
+        "size": lambda x: x.box.width * x.box.height,
+        "aspect_ratio": lambda x: x.box.width / (x.box.height + EPSILON),
         "depth": lambda x: get_bitdepth(x.image).depth,
-        "center": lambda x: np.asarray([(x.box…
-        "…
-            np.square(((x.box…
-            + np.square(((x.box…
+        "center": lambda x: np.asarray([(x.box.x0 + x.box.x1) / 2, (x.box.y0 + x.box.y1) / 2]),
+        "distance_center": lambda x: np.sqrt(
+            np.square(((x.box.x0 + x.box.x1) / 2) - (x.raw.shape[-1] / 2))
+            + np.square(((x.box.y0 + x.box.y1) / 2) - (x.raw.shape[-2] / 2))
+        ),
+        "distance_edge": lambda x: np.min(
+            [np.abs(x.box.x0), np.abs(x.box.y0), np.abs(x.box.x1 - x.raw.shape[-1]), np.abs(x.box.y1 - x.raw.shape[-2])]
         ),
     }

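The added `distance_center` and `distance_edge` entries are plain geometry over the box coordinates. The same arithmetic as a standalone sketch (local helpers for illustration, not the package API):

import numpy as np

def distance_center(x0: float, y0: float, x1: float, y1: float, img_w: int, img_h: int) -> float:
    # Euclidean distance from the box center to the image center
    cx, cy = (x0 + x1) / 2, (y0 + y1) / 2
    return float(np.sqrt(np.square(cx - img_w / 2) + np.square(cy - img_h / 2)))

def distance_edge(x0: float, y0: float, x1: float, y1: float, img_w: int, img_h: int) -> float:
    # Smallest absolute distance from any box edge to the image border
    return float(np.min(np.abs([x0, y0, x1 - img_w, y1 - img_h])))

# A 10x10 box centered in a 100x100 image sits exactly on the center and
# 45 pixels from the nearest border:
print(distance_center(45, 45, 55, 55, 100, 100))  # 0.0
print(distance_edge(45, 45, 55, 55, 100, 100))    # 45.0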
dataeval/metrics/stats/_hashstats.py
CHANGED
@@ -137,7 +137,7 @@ def hashstats(

     >>> results = hashstats(dataset)
     >>> print(results.xxhash[:5])
-    ['…
+    ['69b50a5f06af238c', '5a861d7a23d1afe7', '7ffdb4990ad44ac6', '4f0c366a3298ceac', 'c5519e36ac1f8839']
     >>> print(results.pchash[:5])
     ['e666999999266666', 'e666999999266666', 'e666999966666299', 'e666999999266666', '96e91656e91616e9']
     """
dataeval/metrics/stats/_pixelstats.py
CHANGED
@@ -16,18 +16,18 @@ from dataeval.typing import ArrayLike, Dataset
 class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
     output_class: type = PixelStatsOutput
     image_function_map: dict[str, Callable[[StatsProcessor[PixelStatsOutput]], Any]] = {
-        "mean": lambda x: np.…
-        "std": lambda x: np.…
-        "var": lambda x: np.…
+        "mean": lambda x: np.nanmean(x.scaled),
+        "std": lambda x: np.nanstd(x.scaled),
+        "var": lambda x: np.nanvar(x.scaled),
         "skew": lambda x: np.nan_to_num(skew(x.scaled.ravel())),
         "kurtosis": lambda x: np.nan_to_num(kurtosis(x.scaled.ravel())),
         "histogram": lambda x: np.histogram(x.scaled, 256, (0, 1))[0],
         "entropy": lambda x: entropy(x.get("histogram")),
     }
     channel_function_map: dict[str, Callable[[StatsProcessor[PixelStatsOutput]], Any]] = {
-        "mean": lambda x: np.…
-        "std": lambda x: np.…
-        "var": lambda x: np.…
+        "mean": lambda x: np.nanmean(x.scaled, axis=1),
+        "std": lambda x: np.nanstd(x.scaled, axis=1),
+        "var": lambda x: np.nanvar(x.scaled, axis=1),
         "skew": lambda x: np.nan_to_num(skew(x.scaled, axis=1)),
         "kurtosis": lambda x: np.nan_to_num(kurtosis(x.scaled, axis=1)),
         "histogram": lambda x: np.apply_along_axis(lambda y: np.histogram(y, 256, (0, 1))[0], 1, x.scaled),
dataeval/metrics/stats/_visualstats.py
CHANGED
@@ -24,8 +24,8 @@ class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
         else (np.max(x.get("percentiles")) - np.min(x.get("percentiles"))) / np.mean(x.get("percentiles")),
         "darkness": lambda x: x.get("percentiles")[-2],
         "missing": lambda x: np.count_nonzero(np.isnan(np.sum(x.image, axis=0))) / np.prod(x.shape[-2:]),
-        "sharpness": lambda x: np.…
-        "zeros": lambda x: np.count_nonzero(np.…
+        "sharpness": lambda x: np.nanstd(edge_filter(np.mean(x.image, axis=0))),
+        "zeros": lambda x: np.count_nonzero(np.nansum(x.image, axis=0) == 0) / np.prod(x.shape[-2:]),
         "percentiles": lambda x: np.nanpercentile(x.scaled, q=QUARTILES),
     }
     channel_function_map: dict[str, Callable[[StatsProcessor[VisualStatsOutput]], Any]] = {
@@ -36,7 +36,7 @@ class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
         ),
         "darkness": lambda x: x.get("percentiles")[:, -2],
         "missing": lambda x: np.count_nonzero(np.isnan(x.image), axis=(1, 2)) / np.prod(x.shape[-2:]),
-        "sharpness": lambda x: np.…
+        "sharpness": lambda x: np.nanstd(np.vectorize(edge_filter, signature="(m,n)->(m,n)")(x.image), axis=(1, 2)),
         "zeros": lambda x: np.count_nonzero(x.image == 0, axis=(1, 2)) / np.prod(x.shape[-2:]),
         "percentiles": lambda x: np.nanpercentile(x.scaled, q=QUARTILES, axis=1).T,
     }
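Both the pixel and visual stats above swap `np.mean`/`np.std`/`np.var`/`np.sum` for their `nan*` counterparts, so a single NaN pixel no longer poisons the statistic for the whole image or channel. The effect in plain numpy:

import numpy as np

scaled = np.array([0.1, 0.5, np.nan, 0.9])
print(np.mean(scaled))     # nan -- the pre-0.86 behavior
print(np.nanmean(scaled))  # 0.5 -- NaN entries are simply excluded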
dataeval/outputs/__init__.py
CHANGED
@@ -5,7 +5,7 @@ as well as runtime metadata for reproducibility and logging.

 from ._base import ExecutionMetadata
 from ._bias import BalanceOutput, CompletenessOutput, CoverageOutput, DiversityOutput, LabelParityOutput, ParityOutput
-from ._drift import DriftMMDOutput, DriftOutput
+from ._drift import DriftMMDOutput, DriftMVDCOutput, DriftOutput
 from ._estimators import BEROutput, ClustererOutput, DivergenceOutput, UAPOutput
 from ._linters import DuplicatesOutput, OutliersOutput
 from ._metadata import MetadataDistanceOutput, MetadataDistanceValues, MostDeviatedFactorsOutput, OODPredictorOutput
@@ -34,6 +34,7 @@ __all__ = [
     "DivergenceOutput",
     "DiversityOutput",
     "DriftMMDOutput",
+    "DriftMVDCOutput",
     "DriftOutput",
     "DuplicatesOutput",
     "ExecutionMetadata",
dataeval/outputs/_base.py
CHANGED
@@ -66,25 +66,40 @@ class GenericOutput(Generic[T]):
     def meta(self) -> ExecutionMetadata:
         """
         Metadata about the execution of the function or method for the Output class.
+
+        Returns
+        -------
+        ExecutionMetadata
         """
         return self._meta or ExecutionMetadata.empty()


 class Output(GenericOutput[dict[str, Any]]):
     def data(self) -> dict[str, Any]:
-        …
+        """
+        The output data as a dictionary.

-        …
-        …
+        Returns
+        -------
+        dict[str, Any]
+        """
+        return {k: v for k, v in self.__dict__.items() if k != "_meta"}

     def __str__(self) -> str:
-        return …
+        return str(self.data())


 class BaseCollectionMixin(Collection[Any]):
     __slots__ = ["_data"]

     def data(self) -> Any:
+        """
+        The output data as a collection.
+
+        Returns
+        -------
+        Collection
+        """
         return self._data

     def __len__(self) -> int:
@@ -102,7 +117,7 @@ TValue = TypeVar("TValue")


 class MappingOutput(Mapping[TKey, TValue], BaseCollectionMixin, GenericOutput[Mapping[TKey, TValue]]):
-    def __init__(self, data: Mapping[TKey, TValue]):
+    def __init__(self, data: Mapping[TKey, TValue]) -> None:
         self._data = data

     def __getitem__(self, key: TKey) -> TValue:
@@ -113,7 +128,7 @@ class MappingOutput(Mapping[TKey, TValue], BaseCollectionMixin, GenericOutput[Ma


 class SequenceOutput(Sequence[TValue], BaseCollectionMixin, GenericOutput[Sequence[TValue]]):
-    def __init__(self, data: Sequence[TValue]):
+    def __init__(self, data: Sequence[TValue]) -> None:
         self._data = data

     @overload
@@ -140,7 +155,7 @@ def set_metadata(fn: Callable[P, R] | None = None, *, state: list[str] | None =

     @wraps(fn)
     def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
-        def fmt(v):
+        def fmt(v: Any) -> Any:
             if np.isscalar(v):
                 return v
             if hasattr(v, "shape"):
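Beyond the added docstrings, `Output.data()` and `__str__` now have concrete, documented bodies: every public attribute except the private `_meta` is exposed through `data()`, and printing an output prints that dictionary. A stand-in sketch of the contract (not the real class, which also carries `ExecutionMetadata`):

from typing import Any

class OutputLike:
    """Stand-in mirroring the data()/__str__ contract shown in the diff."""

    def __init__(self) -> None:
        self._meta = object()  # private; excluded from data()
        self.score = 0.97
        self.labels = ["cat", "dog"]

    def data(self) -> dict[str, Any]:
        return {k: v for k, v in self.__dict__.items() if k != "_meta"}

    def __str__(self) -> str:
        return str(self.data())

print(OutputLike())  # {'score': 0.97, 'labels': ['cat', 'dog']}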
dataeval/outputs/_bias.py
CHANGED
@@ -7,10 +7,10 @@ from dataclasses import asdict, dataclass
 from typing import Any, Literal, TypeVar, overload

 import numpy as np
+import pandas as pd
 from numpy.typing import NDArray

 with contextlib.suppress(ImportError):
-    import pandas as pd
     from matplotlib.figure import Figure

 from dataeval.data._images import Images
@@ -38,8 +38,6 @@ class ToDataFrameMixin:
     -----
     This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.
     """
-    import pandas as pd
-
     return pd.DataFrame(
         index=self.factor_names,  # type: ignore - list[str] is documented as acceptable index type
         data={
@@ -130,33 +128,30 @@ class CoverageOutput(Output):

         import matplotlib.pyplot as plt

+        images = Images(images) if isinstance(images, Dataset) else images
+        if np.max(self.uncovered_indices) > len(images):
+            raise ValueError(
+                f"Uncovered indices {self.uncovered_indices} specify images "
+                f"unavailable in the provided number of images {len(images)}."
+            )
+
         # Determine which images to plot
         selected_indices = self.uncovered_indices[:top_k]

-        images = Images(images) if isinstance(images, Dataset) else images
-
         # Plot the images
         num_images = min(top_k, len(selected_indices))

         rows = int(np.ceil(num_images / 3))
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        for i in range(rows):
-            for j in range(3):
-                i_j = i * 3 + j
-                if i_j >= len(selected_indices):
-                    continue
-                image = channels_first_to_last(as_numpy(images[selected_indices[i_j]]))
-                axs[i, j].imshow(image)
-                axs[i, j].axis("off")
+        cols = min(3, num_images)
+        fig, axs = plt.subplots(rows, cols, figsize=(3 * cols, 3 * rows))
+
+        for image, ax in zip(images[:num_images], axs.flat):
+            image = channels_first_to_last(as_numpy(image))
+            ax.imshow(image)
+            ax.axis("off")
+
+        for ax in axs.flat[num_images:]:
+            ax.axis("off")

         fig.tight_layout()
         return fig
@@ -235,14 +230,15 @@ class BalanceOutput(Output):
         # return the masked attribute
         if attr == "factor_names":
             return [x.replace(f"-{factor_type}", "") for x in self.factor_names if mask_lambda(x)]
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
+        factor_type_mask = np.asarray([mask_lambda(x) for x in self.factor_names])
+        if attr == "factors":
+            return self.factors[factor_type_mask[1:]][:, factor_type_mask[1:]]
+        if attr == "balance":
+            return self.balance[factor_type_mask]
+        if attr == "classwise":
+            return self.classwise[:, factor_type_mask]
+
+        raise ValueError(f"Unknown attr {attr} specified.")

     def plot(
         self,