dataeval 0.85.0__py3-none-any.whl → 0.86.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. dataeval/__init__.py +1 -1
  2. dataeval/_log.py +1 -1
  3. dataeval/config.py +21 -4
  4. dataeval/data/_embeddings.py +2 -2
  5. dataeval/data/_images.py +2 -3
  6. dataeval/data/_metadata.py +65 -42
  7. dataeval/data/_selection.py +2 -3
  8. dataeval/data/_split.py +2 -3
  9. dataeval/data/_targets.py +17 -13
  10. dataeval/data/selections/_classfilter.py +6 -8
  11. dataeval/data/selections/_prioritize.py +6 -9
  12. dataeval/data/selections/_shuffle.py +3 -1
  13. dataeval/detectors/drift/__init__.py +4 -1
  14. dataeval/detectors/drift/_base.py +4 -5
  15. dataeval/detectors/drift/_mmd.py +3 -6
  16. dataeval/detectors/drift/_mvdc.py +92 -0
  17. dataeval/detectors/drift/_nml/__init__.py +6 -0
  18. dataeval/detectors/drift/_nml/_base.py +70 -0
  19. dataeval/detectors/drift/_nml/_chunk.py +396 -0
  20. dataeval/detectors/drift/_nml/_domainclassifier.py +181 -0
  21. dataeval/detectors/drift/_nml/_result.py +97 -0
  22. dataeval/detectors/drift/_nml/_thresholds.py +269 -0
  23. dataeval/detectors/linters/outliers.py +7 -7
  24. dataeval/metrics/bias/_parity.py +10 -13
  25. dataeval/metrics/estimators/_divergence.py +2 -4
  26. dataeval/metrics/stats/_base.py +103 -42
  27. dataeval/metrics/stats/_boxratiostats.py +21 -19
  28. dataeval/metrics/stats/_dimensionstats.py +14 -10
  29. dataeval/metrics/stats/_hashstats.py +1 -1
  30. dataeval/metrics/stats/_pixelstats.py +6 -6
  31. dataeval/metrics/stats/_visualstats.py +3 -3
  32. dataeval/outputs/__init__.py +2 -1
  33. dataeval/outputs/_base.py +22 -7
  34. dataeval/outputs/_bias.py +27 -31
  35. dataeval/outputs/_drift.py +60 -0
  36. dataeval/outputs/_linters.py +12 -17
  37. dataeval/outputs/_stats.py +83 -29
  38. dataeval/outputs/_workflows.py +2 -2
  39. dataeval/utils/_array.py +6 -9
  40. dataeval/utils/_bin.py +1 -2
  41. dataeval/utils/_clusterer.py +7 -4
  42. dataeval/utils/_fast_mst.py +27 -13
  43. dataeval/utils/_image.py +65 -11
  44. dataeval/utils/_mst.py +1 -3
  45. dataeval/utils/_plot.py +15 -10
  46. dataeval/utils/data/_dataset.py +32 -20
  47. dataeval/utils/data/metadata.py +104 -82
  48. dataeval/utils/datasets/__init__.py +2 -0
  49. dataeval/utils/datasets/_antiuav.py +189 -0
  50. dataeval/utils/datasets/_base.py +11 -8
  51. dataeval/utils/datasets/_cifar10.py +104 -45
  52. dataeval/utils/datasets/_fileio.py +21 -47
  53. dataeval/utils/datasets/_milco.py +19 -11
  54. dataeval/utils/datasets/_mixin.py +2 -4
  55. dataeval/utils/datasets/_mnist.py +3 -4
  56. dataeval/utils/datasets/_ships.py +14 -7
  57. dataeval/utils/datasets/_voc.py +229 -42
  58. dataeval/utils/torch/models.py +5 -10
  59. dataeval/utils/torch/trainer.py +3 -3
  60. dataeval/workflows/sufficiency.py +2 -2
  61. {dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/METADATA +3 -2
  62. dataeval-0.86.1.dist-info/RECORD +114 -0
  63. dataeval/detectors/ood/vae.py +0 -74
  64. dataeval-0.85.0.dist-info/RECORD +0 -107
  65. {dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/LICENSE.txt +0 -0
  66. {dataeval-0.85.0.dist-info → dataeval-0.86.1.dist-info}/WHEEL +0 -0
@@ -10,23 +10,86 @@ from copy import deepcopy
10
10
  from dataclasses import dataclass
11
11
  from functools import partial
12
12
  from multiprocessing import Pool
13
- from typing import Any, Callable, Generic, Iterable, Sequence, TypeVar
13
+ from typing import Any, Callable, Generic, Iterable, Iterator, Sequence, TypeVar
14
14
 
15
15
  import numpy as np
16
16
  import tqdm
17
17
  from numpy.typing import NDArray
18
18
 
19
19
  from dataeval.config import get_max_processes
20
- from dataeval.outputs._stats import BaseStatsOutput, SourceIndex
20
+ from dataeval.outputs._stats import BASE_ATTRS, BaseStatsOutput, SourceIndex
21
21
  from dataeval.typing import Array, ArrayLike, Dataset, ObjectDetectionTarget
22
22
  from dataeval.utils._array import as_numpy, to_numpy
23
- from dataeval.utils._image import normalize_image_shape, rescale
23
+ from dataeval.utils._image import clip_and_pad, clip_box, is_valid_box, normalize_image_shape, rescale
24
24
 
25
25
  DTYPE_REGEX = re.compile(r"NDArray\[np\.(.*?)\]")
26
26
 
27
- BoundingBox = tuple[float, float, float, float]
28
27
  TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput, covariant=True)
29
28
 
29
+ _S = TypeVar("_S")
30
+ _T = TypeVar("_T")
31
+
32
+
33
@dataclass
class BoundingBox:
    """
    Bounding box described by two x coordinates and two y coordinates.

    On construction each coordinate pair is checked for ordering. If ``x0 > x1``
    or ``y0 > y1`` a warning is emitted and the offending pair is swapped in
    place, so that ``x0 <= x1`` and ``y0 <= y1`` hold afterwards. No clipping
    against an image is performed by this class.

    Attributes
    ----------
    x0 : float
        Lower x coordinate (after any swap).
    y0 : float
        Lower y coordinate (after any swap).
    x1 : float
        Upper x coordinate (after any swap).
    y1 : float
        Upper y coordinate (after any swap).
    """

    x0: float
    y0: float
    x1: float
    y1: float

    def __post_init__(self) -> None:
        # Test for invalid (descending) coordinate pairs
        x_swap = self.x0 > self.x1
        y_swap = self.y0 > self.y1
        if x_swap or y_swap:
            # The message is built before swapping so it reports the coordinates
            # exactly as they were provided. stacklevel=3 skips this method and
            # the dataclass-generated __init__, attributing the warning to the
            # code that constructed the box.
            warnings.warn(
                f"Invalid bounding box coordinates: {self} - swapping invalid coordinates.",
                stacklevel=3,
            )
        if x_swap:
            self.x0, self.x1 = self.x1, self.x0
        if y_swap:
            self.y0, self.y1 = self.y1, self.y0

    @property
    def width(self) -> float:
        """
        Horizontal extent of the box, ``x1 - x0``.

        Returns
        -------
        float
        """
        return self.x1 - self.x0

    @property
    def height(self) -> float:
        """
        Vertical extent of the box, ``y1 - y0``.

        Returns
        -------
        float
        """
        return self.y1 - self.y0

    def to_int(self) -> tuple[int, int, int, int]:
        """
        Returns the bounding box as a tuple of integers.

        The lower coordinates are rounded down and the upper coordinates are
        rounded up, so the integer box always contains the original box.

        Returns
        -------
        tuple[int, int, int, int]
            ``(x0, y0, x1, y1)`` as integers.
        """
        return (
            math.floor(self.x0),
            math.floor(self.y0),
            math.ceil(self.x1),
            math.ceil(self.y1),
        )
68
+
69
+
70
+ class PoolWrapper:
71
+ """
72
+ Wraps `multiprocessing.Pool` to allow for easy switching between
73
+ multiprocessing and single-threaded execution.
74
+
75
+ This helps with debugging and profiling, as well as usage with Jupyter notebooks
76
+ in VS Code, which does not support subprocess debugging.
77
+ """
78
+
79
+ def __init__(self, processes: int | None) -> None:
80
+ self.pool = Pool(processes) if processes is not None and processes > 1 else None
81
+
82
+ def imap(self, func: Callable[[_S], _T], iterable: Iterable[_S]) -> Iterator[_T]:
83
+ return map(func, iterable) if self.pool is None else self.pool.imap(func, iterable)
84
+
85
+ def __enter__(self, *args: Any, **kwargs: Any) -> PoolWrapper:
86
+ return self
87
+
88
+ def __exit__(self, *args: Any) -> None:
89
+ if self.pool is not None:
90
+ self.pool.close()
91
+ self.pool.join()
92
+
30
93
 
31
94
  class StatsProcessor(Generic[TStatsOutput]):
32
95
  output_class: type[TStatsOutput]
@@ -34,32 +97,26 @@ class StatsProcessor(Generic[TStatsOutput]):
34
97
  image_function_map: dict[str, Callable[[StatsProcessor[TStatsOutput]], Any]] = {}
35
98
  channel_function_map: dict[str, Callable[[StatsProcessor[TStatsOutput]], Any]] = {}
36
99
 
37
- def __init__(self, image: NDArray[Any], box: BoundingBox | None, per_channel: bool) -> None:
100
+ def __init__(self, image: NDArray[Any], box: BoundingBox | Iterable[Any] | None, per_channel: bool) -> None:
38
101
  self.raw = image
39
102
  self.width: int = image.shape[-1]
40
103
  self.height: int = image.shape[-2]
41
- box = BoundingBox((0, 0, self.width, self.height)) if box is None else box
42
- # Clip the bounding box to image
43
- x0, y0 = (min(j, max(0, math.floor(box[i]))) for i, j in zip((0, 1), (self.width - 1, self.height - 1)))
44
- x1, y1 = (min(j, max(1, math.ceil(box[i]))) for i, j in zip((2, 3), (self.width, self.height)))
45
- self.box: NDArray[np.int64] = np.array([x0, y0, x1, y1], dtype=np.int64)
104
+ box = (0, 0, self.width, self.height) if box is None else box
105
+ self.box = box if isinstance(box, BoundingBox) else BoundingBox(*box)
46
106
  self._per_channel = per_channel
47
107
  self._image = None
48
108
  self._shape = None
49
109
  self._scaled = None
50
110
  self._cache = {}
51
111
  self._fn_map = self.channel_function_map if per_channel else self.image_function_map
52
- self._is_valid_slice = box is None or bool(
53
- box[0] >= 0 and box[1] >= 0 and box[2] <= image.shape[-1] and box[3] <= image.shape[-2]
54
- )
112
+ self._is_valid_box = is_valid_box(clip_box(image, self.box.to_int()))
55
113
 
56
114
  def get(self, fn_key: str) -> NDArray[Any]:
57
115
  if fn_key in self.cache_keys:
58
116
  if fn_key not in self._cache:
59
117
  self._cache[fn_key] = self._fn_map[fn_key](self)
60
118
  return self._cache[fn_key]
61
- else:
62
- return self._fn_map[fn_key](self)
119
+ return self._fn_map[fn_key](self)
63
120
 
64
121
  def process(self) -> dict[str, Any]:
65
122
  return {k: self._fn_map[k](self) for k in self._fn_map}
@@ -67,11 +124,7 @@ class StatsProcessor(Generic[TStatsOutput]):
67
124
  @property
68
125
  def image(self) -> NDArray[Any]:
69
126
  if self._image is None:
70
- if self._is_valid_slice:
71
- norm = normalize_image_shape(self.raw)
72
- self._image = norm[:, self.box[1] : self.box[3], self.box[0] : self.box[2]]
73
- else:
74
- self._image = np.zeros((self.raw.shape[0], self.box[3] - self.box[1], self.box[2] - self.box[0]))
127
+ self._image = clip_and_pad(normalize_image_shape(self.raw), self.box.to_int())
75
128
  return self._image
76
129
 
77
130
  @property
@@ -90,9 +143,9 @@ class StatsProcessor(Generic[TStatsOutput]):
90
143
 
91
144
  @classmethod
92
145
  def convert_output(
93
- cls, source: dict[str, Any], source_index: list[SourceIndex], box_count: list[int]
146
+ cls, source: dict[str, Any], source_index: list[SourceIndex], object_count: list[int], image_count: int
94
147
  ) -> TStatsOutput:
95
- output = {}
148
+ output: dict[str, Any] = {}
96
149
  attrs = dict(ChainMap(*(getattr(c, "__annotations__", {}) for c in cls.output_class.__mro__)))
97
150
  for key in (key for key in source if key in attrs):
98
151
  stat_type: str = attrs[key]
@@ -101,14 +154,17 @@ class StatsProcessor(Generic[TStatsOutput]):
101
154
  output[key] = np.asarray(source[key], dtype=np.dtype(dtype_match.group(1)))
102
155
  else:
103
156
  output[key] = source[key]
104
- return cls.output_class(**output, source_index=source_index, box_count=np.asarray(box_count, dtype=np.uint16))
157
+ base_attrs: dict[str, Any] = dict(
158
+ zip(BASE_ATTRS, (source_index, np.asarray(object_count, dtype=np.uint16), image_count))
159
+ )
160
+ return cls.output_class(**output, **base_attrs)
105
161
 
106
162
 
107
163
  @dataclass
108
164
  class StatsProcessorOutput:
109
165
  results: list[dict[str, Any]]
110
166
  source_indices: list[SourceIndex]
111
- box_counts: list[int]
167
+ object_counts: list[int]
112
168
  warnings_list: list[str]
113
169
 
114
170
 
@@ -119,18 +175,18 @@ def process_stats(
119
175
  per_channel: bool,
120
176
  stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
121
177
  ) -> StatsProcessorOutput:
122
- image = to_numpy(image)
178
+ np_image = to_numpy(image)
123
179
  results_list: list[dict[str, Any]] = []
124
180
  source_indices: list[SourceIndex] = []
125
181
  box_counts: list[int] = []
126
182
  warnings_list: list[str] = []
127
183
  for i_b, box in [(None, None)] if boxes is None else enumerate(boxes):
128
- processor_list = [p(image, box, per_channel) for p in stats_processor_cls]
129
- if any(not p._is_valid_slice for p in processor_list) and i_b is not None and box is not None:
130
- warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} is out of bounds of {image.shape}.")
184
+ processor_list = [p(np_image, box, per_channel) for p in stats_processor_cls]
185
+ if any(not p._is_valid_box for p in processor_list) and i_b is not None and box is not None:
186
+ warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} for image shape {np_image.shape} is invalid.")
131
187
  results_list.append({k: v for p in processor_list for k, v in p.process().items()})
132
188
  if per_channel:
133
- source_indices.extend([SourceIndex(i, i_b, c) for c in range(image.shape[-3])])
189
+ source_indices.extend([SourceIndex(i, i_b, c) for c in range(np_image.shape[-3])])
134
190
  else:
135
191
  source_indices.append(SourceIndex(i, i_b, None))
136
192
  box_counts.append(0 if boxes is None else len(boxes))
@@ -145,13 +201,18 @@ def process_stats_unpack(
145
201
  return process_stats(*args, per_channel=per_channel, stats_processor_cls=stats_processor_cls)
146
202
 
147
203
 
148
- def _enumerate(dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]], per_box: bool):
204
+ def _enumerate(
205
+ dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]], per_box: bool
206
+ ) -> Iterator[tuple[int, ArrayLike, Any]]:
149
207
  for i in range(len(dataset)):
150
208
  d = dataset[i]
151
209
  image = d[0] if isinstance(d, tuple) else d
152
210
  if per_box and isinstance(d, tuple) and isinstance(d[1], ObjectDetectionTarget):
153
- boxes = d[1].boxes if isinstance(d[1].boxes, Array) else as_numpy(d[1].boxes)
154
- target = [BoundingBox(float(box[i]) for i in range(4)) for box in boxes]
211
+ try:
212
+ boxes = d[1].boxes if isinstance(d[1].boxes, Array) else as_numpy(d[1].boxes)
213
+ target = [BoundingBox(*(float(box[i]) for i in range(4))) for box in boxes]
214
+ except (ValueError, IndexError):
215
+ raise ValueError(f"Invalid bounding box format for image {i}: {d[1].boxes}")
155
216
  else:
156
217
  target = None
157
218
 
@@ -199,12 +260,13 @@ def run_stats(
199
260
  """
200
261
  results_list: list[dict[str, NDArray[np.float64]]] = []
201
262
  source_index: list[SourceIndex] = []
202
- box_count: list[int] = []
263
+ object_count: list[int] = []
264
+ image_count: int = len(dataset)
203
265
 
204
266
  warning_list = []
205
267
  stats_processor_cls = stats_processor_cls if isinstance(stats_processor_cls, Iterable) else [stats_processor_cls]
206
268
 
207
- with Pool(processes=get_max_processes()) as p:
269
+ with PoolWrapper(processes=get_max_processes()) as p:
208
270
  for r in tqdm.tqdm(
209
271
  p.imap(
210
272
  partial(
@@ -214,14 +276,12 @@ def run_stats(
214
276
  ),
215
277
  _enumerate(dataset, per_box),
216
278
  ),
217
- total=len(dataset),
279
+ total=image_count,
218
280
  ):
219
281
  results_list.extend(r.results)
220
282
  source_index.extend(r.source_indices)
221
- box_count.extend(r.box_counts)
283
+ object_count.extend(r.object_counts)
222
284
  warning_list.extend(r.warnings_list)
223
- p.close()
224
- p.join()
225
285
 
226
286
  # warnings are not emitted while in multiprocessing pools so we emit after gathering all warnings
227
287
  for w in warning_list:
@@ -235,8 +295,7 @@ def run_stats(
235
295
  else:
236
296
  output.setdefault(stat, []).append(result.tolist() if isinstance(result, np.ndarray) else result)
237
297
 
238
- outputs = [s.convert_output(output, source_index, box_count) for s in stats_processor_cls]
239
- return outputs
298
+ return [s.convert_output(output, source_index, object_count, image_count) for s in stats_processor_cls]
240
299
 
241
300
 
242
301
  def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
@@ -246,10 +305,12 @@ def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
246
305
  sum_dict = deepcopy(a.data())
247
306
 
248
307
  for k in sum_dict:
249
- if isinstance(sum_dict[k], list):
308
+ if isinstance(sum_dict[k], Sequence):
250
309
  sum_dict[k].extend(b.data()[k])
251
- else:
310
+ elif isinstance(sum_dict[k], Array):
252
311
  sum_dict[k] = np.concatenate((sum_dict[k], b.data()[k]))
312
+ else:
313
+ sum_dict[k] += b.data()[k]
253
314
 
254
315
  return type(a)(**sum_dict)
255
316
 
@@ -8,8 +8,9 @@ from typing import Any, Callable, Generic, TypeVar, cast
8
8
  import numpy as np
9
9
  from numpy.typing import NDArray
10
10
 
11
+ from dataeval.config import EPSILON
11
12
  from dataeval.outputs._base import set_metadata
12
- from dataeval.outputs._stats import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput, DimensionStatsOutput
13
+ from dataeval.outputs._stats import BASE_ATTRS, BaseStatsOutput
13
14
 
14
15
  TStatOutput = TypeVar("TStatOutput", bound=BaseStatsOutput, contravariant=True)
15
16
  ArraySlice = tuple[int, int]
@@ -40,15 +41,19 @@ class BoxImageStatsOutputSlice(Generic[TStatOutput]):
40
41
  self.img = self.StatSlicer(img_stats, img_slice)
41
42
 
42
43
 
43
- RATIOSTATS_OVERRIDE_MAP: dict[type, dict[str, Callable[..., NDArray[Any]]]] = {
44
- DimensionStatsOutput: dict[str, Callable[[BoxImageStatsOutputSlice[DimensionStatsOutput]], NDArray[Any]]](
45
- {
46
- "left": lambda x: x.box["left"] / x.img["width"],
47
- "top": lambda x: x.box["top"] / x.img["height"],
48
- "channels": lambda x: x.box["channels"],
49
- "depth": lambda x: x.box["depth"],
50
- "distance": lambda x: x.box["distance"],
51
- }
44
+ RATIOSTATS_OVERRIDE_MAP: dict[str, Callable[[BoxImageStatsOutputSlice[Any]], NDArray[Any]]] = {
45
+ "offset_x": lambda x: x.box["offset_x"] / x.img["width"],
46
+ "offset_y": lambda x: x.box["offset_y"] / x.img["height"],
47
+ "channels": lambda x: x.box["channels"],
48
+ "depth": lambda x: x.box["depth"],
49
+ "distance_center": lambda x: x.box["distance_center"]
50
+ / (np.sqrt(np.square(x.img["width"]) + np.square(x.img["height"])) / 2),
51
+ "distance_edge": lambda x: x.box["distance_edge"]
52
+ / (
53
+ x.img["width"]
54
+ if np.min([np.abs(x.box["offset_x"]), np.abs((x.box["width"] + x.box["offset_x"]) - x.img["width"])])
55
+ < np.min([np.abs(x.box["offset_y"]), np.abs((x.box["height"] + x.box["offset_y"]) - x.img["height"])])
56
+ else x.img["height"]
52
57
  ),
53
58
  }
54
59
 
@@ -69,11 +74,9 @@ def calculate_ratios(key: str, box_stats: BaseStatsOutput, img_stats: BaseStatsO
69
74
 
70
75
  stats = getattr(box_stats, key)
71
76
 
72
- # Copy over stats index maps and box counts
73
- if key in (SOURCE_INDEX):
77
+ # Copy over base attributes
78
+ if key in BASE_ATTRS:
74
79
  return copy.deepcopy(stats)
75
- elif key == BOX_COUNT:
76
- return np.copy(stats)
77
80
 
78
81
  # Calculate ratios for each stat
79
82
  out_stats: np.ndarray = np.copy(stats).astype(np.float64)
@@ -84,10 +87,9 @@ def calculate_ratios(key: str, box_stats: BaseStatsOutput, img_stats: BaseStatsO
84
87
  box_j = len(box_stats) if i == len(box_map) - 1 else box_map[i + 1]
85
88
  img_j = len(img_stats) if i == len(img_map) - 1 else img_map[i + 1]
86
89
  stats = BoxImageStatsOutputSlice(box_stats, (box_i, box_j), img_stats, (img_i, img_j))
87
- out_type = type(box_stats)
88
- use_override = out_type in RATIOSTATS_OVERRIDE_MAP and key in RATIOSTATS_OVERRIDE_MAP[out_type]
90
+ use_override = key in RATIOSTATS_OVERRIDE_MAP
89
91
  with np.errstate(divide="ignore", invalid="ignore"):
90
- ratio = RATIOSTATS_OVERRIDE_MAP[out_type][key](stats) if use_override else stats.box[key] / stats.img[key]
92
+ ratio = RATIOSTATS_OVERRIDE_MAP[key](stats) if use_override else stats.box[key] / (stats.img[key] + EPSILON)
91
93
  out_stats[box_i:box_j] = ratio.reshape(-1, *out_stats[box_i].shape)
92
94
  return out_stats
93
95
 
@@ -141,8 +143,8 @@ def boxratiostats(
141
143
  output_cls = type(boxstats)
142
144
  if type(boxstats) is not type(imgstats):
143
145
  raise TypeError("Must provide stats outputs of the same type.")
144
- if boxstats.source_index[-1].image != imgstats.source_index[-1].image:
145
- raise ValueError("Stats index_map length mismatch. Check if the correct box and image stats were provided.")
146
+ if boxstats.image_count != imgstats.image_count:
147
+ raise ValueError("Stats image count length mismatch. Check if the correct box and image stats were provided.")
146
148
  if any(src_idx.box is None for src_idx in boxstats.source_index):
147
149
  raise ValueError("Input for boxstats must contain box information.")
148
150
  if any(src_idx.box is not None for src_idx in imgstats.source_index):
@@ -6,6 +6,7 @@ from typing import Any, Callable
6
6
 
7
7
  import numpy as np
8
8
 
9
+ from dataeval.config import EPSILON
9
10
  from dataeval.metrics.stats._base import StatsProcessor, run_stats
10
11
  from dataeval.outputs import DimensionStatsOutput
11
12
  from dataeval.outputs._base import set_metadata
@@ -16,18 +17,21 @@ from dataeval.utils._image import get_bitdepth
16
17
  class DimensionStatsProcessor(StatsProcessor[DimensionStatsOutput]):
17
18
  output_class: type = DimensionStatsOutput
18
19
  image_function_map: dict[str, Callable[[StatsProcessor[DimensionStatsOutput]], Any]] = {
19
- "left": lambda x: x.box[0],
20
- "top": lambda x: x.box[1],
21
- "width": lambda x: x.box[2] - x.box[0],
22
- "height": lambda x: x.box[3] - x.box[1],
20
+ "offset_x": lambda x: x.box.x0,
21
+ "offset_y": lambda x: x.box.y0,
22
+ "width": lambda x: x.box.width,
23
+ "height": lambda x: x.box.height,
23
24
  "channels": lambda x: x.shape[-3],
24
- "size": lambda x: (x.box[2] - x.box[0]) * (x.box[3] - x.box[1]),
25
- "aspect_ratio": lambda x: (x.box[2] - x.box[0]) / (x.box[3] - x.box[1]),
25
+ "size": lambda x: x.box.width * x.box.height,
26
+ "aspect_ratio": lambda x: x.box.width / (x.box.height + EPSILON),
26
27
  "depth": lambda x: get_bitdepth(x.image).depth,
27
- "center": lambda x: np.asarray([(x.box[0] + x.box[2]) / 2, (x.box[1] + x.box[3]) / 2]),
28
- "distance": lambda x: np.sqrt(
29
- np.square(((x.box[0] + x.box[2]) / 2) - (x.shape[-1] / 2))
30
- + np.square(((x.box[1] + x.box[3]) / 2) - (x.shape[-2] / 2))
28
+ "center": lambda x: np.asarray([(x.box.x0 + x.box.x1) / 2, (x.box.y0 + x.box.y1) / 2]),
29
+ "distance_center": lambda x: np.sqrt(
30
+ np.square(((x.box.x0 + x.box.x1) / 2) - (x.raw.shape[-1] / 2))
31
+ + np.square(((x.box.y0 + x.box.y1) / 2) - (x.raw.shape[-2] / 2))
32
+ ),
33
+ "distance_edge": lambda x: np.min(
34
+ [np.abs(x.box.x0), np.abs(x.box.y0), np.abs(x.box.x1 - x.raw.shape[-1]), np.abs(x.box.y1 - x.raw.shape[-2])]
31
35
  ),
32
36
  }
33
37
 
@@ -137,7 +137,7 @@ def hashstats(
137
137
 
138
138
  >>> results = hashstats(dataset)
139
139
  >>> print(results.xxhash[:5])
140
- ['66a93f556577c086', 'd8b686fb405c4105', '7ffdb4990ad44ac6', '42cd4c34c80f6006', 'c5519e36ac1f8839']
140
+ ['69b50a5f06af238c', '5a861d7a23d1afe7', '7ffdb4990ad44ac6', '4f0c366a3298ceac', 'c5519e36ac1f8839']
141
141
  >>> print(results.pchash[:5])
142
142
  ['e666999999266666', 'e666999999266666', 'e666999966666299', 'e666999999266666', '96e91656e91616e9']
143
143
  """
@@ -16,18 +16,18 @@ from dataeval.typing import ArrayLike, Dataset
16
16
  class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
17
17
  output_class: type = PixelStatsOutput
18
18
  image_function_map: dict[str, Callable[[StatsProcessor[PixelStatsOutput]], Any]] = {
19
- "mean": lambda x: np.mean(x.scaled),
20
- "std": lambda x: np.std(x.scaled),
21
- "var": lambda x: np.var(x.scaled),
19
+ "mean": lambda x: np.nanmean(x.scaled),
20
+ "std": lambda x: np.nanstd(x.scaled),
21
+ "var": lambda x: np.nanvar(x.scaled),
22
22
  "skew": lambda x: np.nan_to_num(skew(x.scaled.ravel())),
23
23
  "kurtosis": lambda x: np.nan_to_num(kurtosis(x.scaled.ravel())),
24
24
  "histogram": lambda x: np.histogram(x.scaled, 256, (0, 1))[0],
25
25
  "entropy": lambda x: entropy(x.get("histogram")),
26
26
  }
27
27
  channel_function_map: dict[str, Callable[[StatsProcessor[PixelStatsOutput]], Any]] = {
28
- "mean": lambda x: np.mean(x.scaled, axis=1),
29
- "std": lambda x: np.std(x.scaled, axis=1),
30
- "var": lambda x: np.var(x.scaled, axis=1),
28
+ "mean": lambda x: np.nanmean(x.scaled, axis=1),
29
+ "std": lambda x: np.nanstd(x.scaled, axis=1),
30
+ "var": lambda x: np.nanvar(x.scaled, axis=1),
31
31
  "skew": lambda x: np.nan_to_num(skew(x.scaled, axis=1)),
32
32
  "kurtosis": lambda x: np.nan_to_num(kurtosis(x.scaled, axis=1)),
33
33
  "histogram": lambda x: np.apply_along_axis(lambda y: np.histogram(y, 256, (0, 1))[0], 1, x.scaled),
@@ -24,8 +24,8 @@ class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
24
24
  else (np.max(x.get("percentiles")) - np.min(x.get("percentiles"))) / np.mean(x.get("percentiles")),
25
25
  "darkness": lambda x: x.get("percentiles")[-2],
26
26
  "missing": lambda x: np.count_nonzero(np.isnan(np.sum(x.image, axis=0))) / np.prod(x.shape[-2:]),
27
- "sharpness": lambda x: np.std(edge_filter(np.mean(x.image, axis=0))),
28
- "zeros": lambda x: np.count_nonzero(np.sum(x.image, axis=0) == 0) / np.prod(x.shape[-2:]),
27
+ "sharpness": lambda x: np.nanstd(edge_filter(np.mean(x.image, axis=0))),
28
+ "zeros": lambda x: np.count_nonzero(np.nansum(x.image, axis=0) == 0) / np.prod(x.shape[-2:]),
29
29
  "percentiles": lambda x: np.nanpercentile(x.scaled, q=QUARTILES),
30
30
  }
31
31
  channel_function_map: dict[str, Callable[[StatsProcessor[VisualStatsOutput]], Any]] = {
@@ -36,7 +36,7 @@ class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
36
36
  ),
37
37
  "darkness": lambda x: x.get("percentiles")[:, -2],
38
38
  "missing": lambda x: np.count_nonzero(np.isnan(x.image), axis=(1, 2)) / np.prod(x.shape[-2:]),
39
- "sharpness": lambda x: np.std(np.vectorize(edge_filter, signature="(m,n)->(m,n)")(x.image), axis=(1, 2)),
39
+ "sharpness": lambda x: np.nanstd(np.vectorize(edge_filter, signature="(m,n)->(m,n)")(x.image), axis=(1, 2)),
40
40
  "zeros": lambda x: np.count_nonzero(x.image == 0, axis=(1, 2)) / np.prod(x.shape[-2:]),
41
41
  "percentiles": lambda x: np.nanpercentile(x.scaled, q=QUARTILES, axis=1).T,
42
42
  }
@@ -5,7 +5,7 @@ as well as runtime metadata for reproducibility and logging.
5
5
 
6
6
  from ._base import ExecutionMetadata
7
7
  from ._bias import BalanceOutput, CompletenessOutput, CoverageOutput, DiversityOutput, LabelParityOutput, ParityOutput
8
- from ._drift import DriftMMDOutput, DriftOutput
8
+ from ._drift import DriftMMDOutput, DriftMVDCOutput, DriftOutput
9
9
  from ._estimators import BEROutput, ClustererOutput, DivergenceOutput, UAPOutput
10
10
  from ._linters import DuplicatesOutput, OutliersOutput
11
11
  from ._metadata import MetadataDistanceOutput, MetadataDistanceValues, MostDeviatedFactorsOutput, OODPredictorOutput
@@ -34,6 +34,7 @@ __all__ = [
34
34
  "DivergenceOutput",
35
35
  "DiversityOutput",
36
36
  "DriftMMDOutput",
37
+ "DriftMVDCOutput",
37
38
  "DriftOutput",
38
39
  "DuplicatesOutput",
39
40
  "ExecutionMetadata",
dataeval/outputs/_base.py CHANGED
@@ -66,25 +66,40 @@ class GenericOutput(Generic[T]):
66
66
  def meta(self) -> ExecutionMetadata:
67
67
  """
68
68
  Metadata about the execution of the function or method for the Output class.
69
+
70
+ Returns
71
+ -------
72
+ ExecutionMetadata
69
73
  """
70
74
  return self._meta or ExecutionMetadata.empty()
71
75
 
72
76
 
73
77
  class Output(GenericOutput[dict[str, Any]]):
74
78
  def data(self) -> dict[str, Any]:
75
- return {k: v for k, v in self.__dict__.items() if k != "_meta"}
79
+ """
80
+ The output data as a dictionary.
76
81
 
77
- def __repr__(self) -> str:
78
- return str(self)
82
+ Returns
83
+ -------
84
+ dict[str, Any]
85
+ """
86
+ return {k: v for k, v in self.__dict__.items() if k != "_meta"}
79
87
 
80
88
  def __str__(self) -> str:
81
- return f"{self.__class__.__name__}({', '.join([f'{k}={v}' for k, v in self.data().items()])})"
89
+ return str(self.data())
82
90
 
83
91
 
84
92
  class BaseCollectionMixin(Collection[Any]):
85
93
  __slots__ = ["_data"]
86
94
 
87
95
  def data(self) -> Any:
96
+ """
97
+ The output data as a collection.
98
+
99
+ Returns
100
+ -------
101
+ Collection
102
+ """
88
103
  return self._data
89
104
 
90
105
  def __len__(self) -> int:
@@ -102,7 +117,7 @@ TValue = TypeVar("TValue")
102
117
 
103
118
 
104
119
  class MappingOutput(Mapping[TKey, TValue], BaseCollectionMixin, GenericOutput[Mapping[TKey, TValue]]):
105
- def __init__(self, data: Mapping[TKey, TValue]):
120
+ def __init__(self, data: Mapping[TKey, TValue]) -> None:
106
121
  self._data = data
107
122
 
108
123
  def __getitem__(self, key: TKey) -> TValue:
@@ -113,7 +128,7 @@ class MappingOutput(Mapping[TKey, TValue], BaseCollectionMixin, GenericOutput[Ma
113
128
 
114
129
 
115
130
  class SequenceOutput(Sequence[TValue], BaseCollectionMixin, GenericOutput[Sequence[TValue]]):
116
- def __init__(self, data: Sequence[TValue]):
131
+ def __init__(self, data: Sequence[TValue]) -> None:
117
132
  self._data = data
118
133
 
119
134
  @overload
@@ -140,7 +155,7 @@ def set_metadata(fn: Callable[P, R] | None = None, *, state: list[str] | None =
140
155
 
141
156
  @wraps(fn)
142
157
  def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
143
- def fmt(v):
158
+ def fmt(v: Any) -> Any:
144
159
  if np.isscalar(v):
145
160
  return v
146
161
  if hasattr(v, "shape"):
dataeval/outputs/_bias.py CHANGED
@@ -7,10 +7,10 @@ from dataclasses import asdict, dataclass
7
7
  from typing import Any, Literal, TypeVar, overload
8
8
 
9
9
  import numpy as np
10
+ import pandas as pd
10
11
  from numpy.typing import NDArray
11
12
 
12
13
  with contextlib.suppress(ImportError):
13
- import pandas as pd
14
14
  from matplotlib.figure import Figure
15
15
 
16
16
  from dataeval.data._images import Images
@@ -38,8 +38,6 @@ class ToDataFrameMixin:
38
38
  -----
39
39
  This method requires `pandas <https://pandas.pydata.org/>`_ to be installed.
40
40
  """
41
- import pandas as pd
42
-
43
41
  return pd.DataFrame(
44
42
  index=self.factor_names, # type: ignore - list[str] is documented as acceptable index type
45
43
  data={
@@ -130,33 +128,30 @@ class CoverageOutput(Output):
130
128
 
131
129
  import matplotlib.pyplot as plt
132
130
 
131
+ images = Images(images) if isinstance(images, Dataset) else images
132
+ if np.max(self.uncovered_indices) > len(images):
133
+ raise ValueError(
134
+ f"Uncovered indices {self.uncovered_indices} specify images "
135
+ f"unavailable in the provided number of images {len(images)}."
136
+ )
137
+
133
138
  # Determine which images to plot
134
139
  selected_indices = self.uncovered_indices[:top_k]
135
140
 
136
- images = Images(images) if isinstance(images, Dataset) else images
137
-
138
141
  # Plot the images
139
142
  num_images = min(top_k, len(selected_indices))
140
143
 
141
144
  rows = int(np.ceil(num_images / 3))
142
- fig, axs = plt.subplots(rows, 3, figsize=(9, 3 * rows))
143
-
144
- if rows == 1:
145
- for j in range(3):
146
- if j >= len(selected_indices):
147
- continue
148
- image = channels_first_to_last(as_numpy(images[selected_indices[j]]))
149
- axs[j].imshow(image)
150
- axs[j].axis("off")
151
- else:
152
- for i in range(rows):
153
- for j in range(3):
154
- i_j = i * 3 + j
155
- if i_j >= len(selected_indices):
156
- continue
157
- image = channels_first_to_last(as_numpy(images[selected_indices[i_j]]))
158
- axs[i, j].imshow(image)
159
- axs[i, j].axis("off")
145
+ cols = min(3, num_images)
146
+ fig, axs = plt.subplots(rows, cols, figsize=(3 * cols, 3 * rows))
147
+
148
+ for image, ax in zip(images[:num_images], axs.flat):
149
+ image = channels_first_to_last(as_numpy(image))
150
+ ax.imshow(image)
151
+ ax.axis("off")
152
+
153
+ for ax in axs.flat[num_images:]:
154
+ ax.axis("off")
160
155
 
161
156
  fig.tight_layout()
162
157
  return fig
@@ -235,14 +230,15 @@ class BalanceOutput(Output):
235
230
  # return the masked attribute
236
231
  if attr == "factor_names":
237
232
  return [x.replace(f"-{factor_type}", "") for x in self.factor_names if mask_lambda(x)]
238
- else:
239
- factor_type_mask = np.asarray([mask_lambda(x) for x in self.factor_names])
240
- if attr == "factors":
241
- return self.factors[factor_type_mask[1:]][:, factor_type_mask[1:]]
242
- elif attr == "balance":
243
- return self.balance[factor_type_mask]
244
- elif attr == "classwise":
245
- return self.classwise[:, factor_type_mask]
233
+ factor_type_mask = np.asarray([mask_lambda(x) for x in self.factor_names])
234
+ if attr == "factors":
235
+ return self.factors[factor_type_mask[1:]][:, factor_type_mask[1:]]
236
+ if attr == "balance":
237
+ return self.balance[factor_type_mask]
238
+ if attr == "classwise":
239
+ return self.classwise[:, factor_type_mask]
240
+
241
+ raise ValueError(f"Unknown attr {attr} specified.")
246
242
 
247
243
  def plot(
248
244
  self,