dataeval 0.82.0__py3-none-any.whl → 0.83.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. dataeval/__init__.py +7 -2
  2. dataeval/config.py +78 -11
  3. dataeval/detectors/drift/_mmd.py +9 -9
  4. dataeval/detectors/drift/_torch.py +7 -7
  5. dataeval/detectors/drift/_uncertainty.py +4 -4
  6. dataeval/detectors/linters/duplicates.py +3 -3
  7. dataeval/detectors/linters/outliers.py +3 -3
  8. dataeval/detectors/ood/ae.py +5 -4
  9. dataeval/detectors/ood/base.py +2 -2
  10. dataeval/detectors/ood/mixin.py +1 -1
  11. dataeval/detectors/ood/vae.py +2 -1
  12. dataeval/metadata/__init__.py +2 -2
  13. dataeval/metadata/_distance.py +11 -44
  14. dataeval/metadata/_ood.py +152 -33
  15. dataeval/metrics/bias/_balance.py +9 -5
  16. dataeval/metrics/bias/_diversity.py +3 -0
  17. dataeval/metrics/bias/_parity.py +2 -0
  18. dataeval/metrics/estimators/_ber.py +2 -1
  19. dataeval/metrics/stats/_base.py +20 -21
  20. dataeval/metrics/stats/_boxratiostats.py +1 -1
  21. dataeval/metrics/stats/_dimensionstats.py +2 -2
  22. dataeval/metrics/stats/_hashstats.py +2 -2
  23. dataeval/metrics/stats/_imagestats.py +8 -8
  24. dataeval/metrics/stats/_pixelstats.py +2 -2
  25. dataeval/metrics/stats/_visualstats.py +2 -2
  26. dataeval/outputs/__init__.py +5 -0
  27. dataeval/outputs/_base.py +50 -21
  28. dataeval/outputs/_bias.py +1 -1
  29. dataeval/outputs/_linters.py +4 -2
  30. dataeval/outputs/_metadata.py +61 -0
  31. dataeval/outputs/_stats.py +12 -6
  32. dataeval/typing.py +40 -9
  33. dataeval/utils/_mst.py +1 -2
  34. dataeval/utils/data/_embeddings.py +23 -19
  35. dataeval/utils/data/_metadata.py +16 -7
  36. dataeval/utils/data/_selection.py +22 -15
  37. dataeval/utils/data/_split.py +3 -2
  38. dataeval/utils/data/datasets/_base.py +4 -2
  39. dataeval/utils/data/datasets/_cifar10.py +17 -9
  40. dataeval/utils/data/datasets/_milco.py +18 -12
  41. dataeval/utils/data/datasets/_mnist.py +24 -8
  42. dataeval/utils/data/datasets/_ships.py +18 -8
  43. dataeval/utils/data/datasets/_types.py +1 -5
  44. dataeval/utils/data/datasets/_voc.py +47 -24
  45. dataeval/utils/data/selections/__init__.py +2 -0
  46. dataeval/utils/data/selections/_classfilter.py +5 -3
  47. dataeval/utils/data/selections/_prioritize.py +296 -0
  48. dataeval/utils/data/selections/_shuffle.py +13 -4
  49. dataeval/utils/torch/_gmm.py +3 -2
  50. dataeval/utils/torch/_internal.py +5 -5
  51. dataeval/utils/torch/trainer.py +8 -8
  52. {dataeval-0.82.0.dist-info → dataeval-0.83.0.dist-info}/METADATA +4 -4
  53. dataeval-0.83.0.dist-info/RECORD +105 -0
  54. dataeval/detectors/ood/metadata_ood_mi.py +0 -93
  55. dataeval-0.82.0.dist-info/RECORD +0 -104
  56. {dataeval-0.82.0.dist-info → dataeval-0.83.0.dist-info}/LICENSE.txt +0 -0
  57. {dataeval-0.82.0.dist-info → dataeval-0.83.0.dist-info}/WHEEL +0 -0
dataeval/metadata/_ood.py CHANGED
@@ -6,13 +6,44 @@ import warnings
 
 import numpy as np
 from numpy.typing import NDArray
+from sklearn.feature_selection import mutual_info_classif
 
+from dataeval.config import get_seed
 from dataeval.metadata._utils import _compare_keys, _validate_factors_and_data
-from dataeval.outputs import OODOutput
+from dataeval.outputs import MostDeviatedFactorsOutput, OODOutput, OODPredictorOutput
+from dataeval.outputs._base import set_metadata
 from dataeval.utils.data import Metadata
 
 
-def _combine_metadata(metadata_1: Metadata, metadata_2: Metadata) -> tuple[list[str], list[NDArray], list[NDArray]]:
+def _combine_discrete_continuous(metadata: Metadata) -> tuple[list[str], NDArray[np.float64]]:
+    """Combines the discrete and continuous data of a :class:`Metadata` object
+
+    Returns
+    -------
+    Tuple[list[str], NDArray]
+        The combined list of factors names and the combined discrete and continuous data
+
+    Note
+    ----
+    Discrete and continuous data must have the same number of samples
+    """
+    names = []
+    data = []
+
+    if metadata.discrete_factor_names and metadata.discrete_data.size != 0:
+        names.extend(metadata.discrete_factor_names)
+        data.append(metadata.discrete_data)
+
+    if metadata.continuous_factor_names and metadata.continuous_data.size != 0:
+        names.extend(metadata.continuous_factor_names)
+        data.append(metadata.continuous_data)
+
+    return names, np.hstack(data, dtype=np.float64) if data else np.array([], dtype=np.float64)
+
+
+def _combine_metadata(
+    metadata_1: Metadata, metadata_2: Metadata
+) -> tuple[list[str], list[NDArray[np.float64 | np.int64]], list[NDArray[np.int64 | np.float64]]]:
     """
     Combines the factor names and data arrays of metadata_1 and metadata_2 when the names
     match exactly and data has the same number of columns (factors).
@@ -41,8 +72,8 @@ def _combine_metadata(metadata_1: Metadata, metadata_2: Metadata) -> tuple[list[
         If the length of keys do not match the length of the data
     """
     factor_names: list[str] = []
-    m1_data: list[NDArray] = []
-    m2_data: list[NDArray] = []
+    m1_data: list[NDArray[np.int64 | np.float64]] = []
+    m2_data: list[NDArray[np.int64 | np.float64]] = []
 
     # Both metadata must have the same number of factors (cols), but not necessarily samples (row)
     if metadata_1.total_num_factors != metadata_2.total_num_factors:
@@ -119,36 +150,38 @@ def _calc_median_deviations(reference: NDArray, test: NDArray) -> NDArray:
     return np.abs(np.where(test_dev >= 0, test_dev / pscale, test_dev / nscale))  # (S_t, F)
 
 
-def most_deviated_factors(
-    metadata_1: Metadata,
-    metadata_2: Metadata,
+@set_metadata
+def find_most_deviated_factors(
+    metadata_ref: Metadata,
+    metadata_tst: Metadata,
     ood: OODOutput,
-) -> list[tuple[str, float]]:
+) -> MostDeviatedFactorsOutput:
     """
-    Determines greatest deviation in metadata features per out of distribution sample in metadata_2.
+    Determine greatest deviation in metadata features per out of distribution sample in test metadata.
 
     Parameters
     ----------
-    metadata_1 : Metadata
+    metadata_ref : Metadata
         A reference set of Metadata containing factor names and samples
         with discrete and/or continuous values per factor
-    metadata_2 : Metadata
+    metadata_tst : Metadata
         The set of Metadata that is tested against the reference metadata.
         This set must have the same number of features but does not require the same number of samples.
     ood : OODOutput
-        A class output by the DataEval's OOD functions that contains which examples are OOD.
+        A class output by DataEval's OOD functions that contains which examples are OOD.
 
     Returns
     -------
-    list[tuple[str, float]]
-        An array of the factor name and deviation of the highest metadata deviation for each OOD example in metadata_2.
+    MostDeviatedFactorsOutput
+        An output class containing the factor name and deviation of the highest metadata deviations for each
+        OOD example in the test metadata.
 
     Notes
     -----
     1. Both :class:`.Metadata` inputs must have discrete and continuous data in the shape (samples, factors)
        and have equivalent factor names and lengths
     2. The flag at index `i` in :attr:`.OODOutput.is_ood` must correspond
-       directly to sample `i` of `metadata_2` being out-of-distribution from `metadata_1`
+       directly to sample `i` of `metadata_tst` being out-of-distribution from `metadata_ref`
 
     Examples
     --------
@@ -158,50 +191,49 @@ def most_deviated_factors(
     All samples are out-of-distribution
 
     >>> is_ood = OODOutput(np.array([True, True, True]), np.array([]), np.array([]))
-    >>> most_deviated_factors(metadata1, metadata2, is_ood)
-    [('time', 2.0), ('time', 2.592), ('time', 3.51)]
+    >>> find_most_deviated_factors(metadata1, metadata2, is_ood)
+    MostDeviatedFactorsOutput([('time', 2.0), ('time', 2.592), ('time', 3.51)])
 
-    If there are no out-of-distribution samples, a list is returned
+    No samples are out-of-distribution
 
     >>> is_ood = OODOutput(np.array([False, False, False]), np.array([]), np.array([]))
-    >>> most_deviated_factors(metadata1, metadata2, is_ood)
-    []
+    >>> find_most_deviated_factors(metadata1, metadata2, is_ood)
+    MostDeviatedFactorsOutput([])
     """
 
     ood_mask: NDArray[np.bool] = ood.is_ood
 
     # No metadata correlated with out of distribution data
     if not any(ood_mask):
-        return []
+        return MostDeviatedFactorsOutput([])
 
     # Combines reference and test factor names and data if exists and match exactly
     # shape -> (samples, factors)
     factor_names, md_1, md_2 = _combine_metadata(
-        metadata_1=metadata_1,
-        metadata_2=metadata_2,
+        metadata_1=metadata_ref,
+        metadata_2=metadata_tst,
     )
 
     # Stack discrete and continuous factors as separate factors. Must have equal sample counts
-    metadata_ref = np.hstack(md_1) if md_1 else np.array([])
-    metadata_tst = np.hstack(md_2) if md_2 else np.array([])
+    ref_data = np.hstack(md_1) if md_1 else np.array([])  # (S, Fd + Fc)
+    tst_data = np.hstack(md_2) if md_2 else np.array([])  # (S, Fd + Fc)
 
-    if len(metadata_ref) < 3:
+    if len(ref_data) < 3:
         warnings.warn(
-            f"At least 3 reference metadata samples are needed, got {len(metadata_ref)}",
+            f"At least 3 reference metadata samples are needed, got {len(ref_data)}",
            UserWarning,
        )
-        return []
+        return MostDeviatedFactorsOutput([])
 
-    if len(metadata_tst) != len(ood_mask):
+    if len(tst_data) != len(ood_mask):
         raise ValueError(
-            f"ood and test metadata must have the same length, "
-            f"got {len(ood_mask)} and {len(metadata_tst)} respectively."
+            f"ood and test metadata must have the same length, got {len(ood_mask)} and {len(tst_data)} respectively."
        )
 
     # Calculates deviations of all samples in m2_data
     # from the median values of the corresponding index in m1_data
     # Guaranteed for inputs to not be empty
-    deviations = _calc_median_deviations(metadata_ref, metadata_tst)
+    deviations = _calc_median_deviations(ref_data, tst_data)
 
     # Get most impactful factor deviation of each sample for ood samples only
     deviation = np.max(deviations, axis=1)[ood_mask].astype(np.float16)
@@ -214,4 +246,91 @@ def most_deviated_factors(
 
     # List of tuples matching the factor name with its deviation
 
-    return [(factor, dev) for factor, dev in zip(most_ood_factors, deviation)]
+    return MostDeviatedFactorsOutput([(factor, dev) for factor, dev in zip(most_ood_factors, deviation)])
+
+
+_NATS2BITS = 1.442695
+"""
+_NATS2BITS is the reciprocal of natural log of 2. If you have an information/entropy-type quantity measured in nats,
+which is what many library functions return, multiply it by _NATS2BITS to get it in bits.
+"""
+
+
+def find_ood_predictors(
+    metadata: Metadata,
+    ood: OODOutput,
+) -> OODPredictorOutput:
+    """Computes mutual information between a set of metadata features and per sample out-of-distribution flags.
+
+    Given a set of metadata features per sample and a corresponding OODOutput that indicates whether a sample was
+    determined to be out of distribution, this function calculates the mutual information between each factor and being
+    out of distribution. In other words, it finds which metadata factors most likely correlate to an
+    out of distribution sample.
+
+    Note
+    ----
+    A high mutual information between a factor and ood samples is an indication of correlation, but not causation.
+    Additional analysis should be done to determine how to handle factors with a high mutual information.
+
+
+    Parameters
+    ----------
+    metadata : Metadata
+        A set of arrays of values, indexed by metadata feature names, with one value per data example per feature.
+    ood : OODOutput
+        A class output by DataEval's OOD functions that contains which examples are OOD.
+
+    Returns
+    -------
+    OODPredictorOutput
+        A dictionary with keys corresponding to metadata feature names, and values indicating the strength of
+        association between each named feature and the OOD flag, as mutual information measured in bits.
+
+    Examples
+    --------
+    >>> from dataeval.outputs import OODOutput
+
+    All samples are out-of-distribution
+
+    >>> is_ood = OODOutput(np.array([True, True, True]), np.array([]), np.array([]))
+    >>> find_ood_predictors(metadata1, is_ood)
+    OODPredictorOutput({'time': 8.008566032557951e-17, 'altitude': 8.008566032557951e-17})
+
+    No out-of-distribution samples
+
+    >> is_ood = OODOutput(np.array([False, False, False]), np.array([]), np.array([]))
+    >> find_ood_predictors(metadata1, is_ood)
+    OODPredictorOutput({})
+    """
+
+    ood_mask: NDArray[np.bool] = ood.is_ood
+
+    discrete_features_count = len(metadata.discrete_factor_names)
+    factors, data = _combine_discrete_continuous(metadata)  # (F, ), (S, F) => F = Fd + Fc
+
+    # No metadata correlated with out of distribution data, return 0.0 for all factors
+    if not any(ood_mask):
+        return OODPredictorOutput(dict.fromkeys(factors, 0.0))
+
+    if len(data) != len(ood_mask):
+        raise ValueError(
+            f"ood and metadata must have the same length, got {len(ood_mask)} and {len(data)} respectively."
+        )
+
+    # Calculate mean, std of each factor over all samples
+    scaled_data = (data - np.mean(data, axis=0)) / np.std(data, axis=0, ddof=1)  # (S, F)
+
+    discrete_features = np.zeros_like(factors, dtype=np.bool)
+    discrete_features[:discrete_features_count] = True
+
+    mutual_info_values = (
+        mutual_info_classif(
+            X=scaled_data,
+            y=ood_mask,
+            discrete_features=discrete_features,  # type: ignore -> sklearn issue - NDArray[bool] not of accepted type Union[ArrayLike, 'auto']
+            random_state=get_seed(),
+        )
+        * _NATS2BITS
+    )
+
+    return OODPredictorOutput({k: mutual_info_values[i] for i, k in enumerate(factors)})
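
Taken together with the removal of dataeval/detectors/ood/metadata_ood_mi.py (file 54 in the list above), these changes appear to consolidate the metadata OOD helpers: most_deviated_factors is renamed to find_most_deviated_factors and now returns MostDeviatedFactorsOutput, and the new find_ood_predictors reports per-factor mutual information as OODPredictorOutput. A minimal usage sketch based on the docstring examples in this diff; the reference and test Metadata objects (metadata_ref, metadata_tst) are assumed to be built elsewhere, just as in those examples, and the public import path from dataeval.metadata is an assumption since only the private module dataeval/metadata/_ood.py is shown here.

import numpy as np

from dataeval.metadata import find_most_deviated_factors, find_ood_predictors  # import path assumed
from dataeval.outputs import OODOutput

# metadata_ref and metadata_tst are pre-built dataeval.utils.data.Metadata objects (construction not shown)
is_ood = OODOutput(np.array([True, True, True]), np.array([]), np.array([]))

# Factor with the largest deviation from the reference medians, one entry per OOD sample
deviated = find_most_deviated_factors(metadata_ref, metadata_tst, is_ood)
# e.g. MostDeviatedFactorsOutput([('time', 2.0), ('time', 2.592), ('time', 3.51)])

# Mutual information (in bits) between each metadata factor and the OOD flag
predictors = find_ood_predictors(metadata_tst, is_ood)
# e.g. OODPredictorOutput({'time': ..., 'altitude': ...})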
dataeval/metrics/bias/_balance.py CHANGED
@@ -8,6 +8,7 @@ import numpy as np
 import scipy as sp
 from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
 
+from dataeval.config import EPSILON, get_seed
 from dataeval.outputs import BalanceOutput
 from dataeval.outputs._base import set_metadata
 from dataeval.utils._bin import get_counts
@@ -91,6 +92,9 @@ def balance(
     sklearn.feature_selection.mutual_info_regression
     sklearn.metrics.mutual_info_score
     """
+    if not metadata.discrete_factor_names and not metadata.continuous_factor_names:
+        raise ValueError("No factors found in provided metadata.")
+
     num_neighbors = _validate_num_neighbors(num_neighbors)
 
     num_factors = metadata.total_num_factors
@@ -110,7 +114,7 @@ def balance(
                 data[:, idx],
                 discrete_features=is_discrete,  # type: ignore
                 n_neighbors=num_neighbors,
-                random_state=0,
+                random_state=get_seed(),
             )
         else:
             mi[idx, :] = mutual_info_classif(
@@ -118,13 +122,13 @@ def balance(
                 data[:, idx],
                 discrete_features=is_discrete,  # type: ignore
                 n_neighbors=num_neighbors,
-                random_state=0,
+                random_state=get_seed(),
             )
 
     # Normalization via entropy
     bin_cnts = get_counts(discretized_data)
     ent_factor = sp.stats.entropy(bin_cnts, axis=0)
-    norm_factor = 0.5 * np.add.outer(ent_factor, ent_factor) + 1e-6
+    norm_factor = 0.5 * np.add.outer(ent_factor, ent_factor) + EPSILON
 
     # in principle MI should be symmetric, but it is not in practice.
     nmi = 0.5 * (mi + mi.T) / norm_factor
@@ -147,13 +151,13 @@ def balance(
             tgt_bin[:, idx],
             discrete_features=is_discrete,  # type: ignore
             n_neighbors=num_neighbors,
-            random_state=0,
+            random_state=get_seed(),
         )
 
     # Classwise normalization via entropy
     classwise_bin_cnts = get_counts(tgt_bin)
     ent_tgt_bin = sp.stats.entropy(classwise_bin_cnts, axis=0)
-    norm_factor = 0.5 * np.add.outer(ent_tgt_bin, ent_factor) + 1e-6
+    norm_factor = 0.5 * np.add.outer(ent_tgt_bin, ent_factor) + EPSILON
     classwise = classwise_mi / norm_factor
 
     # Grabbing factor names for plotting function
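
The hard-coded random_state=0 and 1e-6 literals in balance are replaced by get_seed() and the shared EPSILON constant from dataeval.config. A small self-contained NumPy sketch of the normalization step those lines implement; the EPSILON value below is a placeholder, since the real constant is defined in dataeval/config.py and its value is not shown in this diff.

import numpy as np

EPSILON = 1e-12  # placeholder value; the actual constant comes from dataeval.config

# mi: pairwise mutual information between factors; ent_factor: per-factor entropy
mi = np.array([[0.90, 0.20], [0.25, 0.70]])
ent_factor = np.array([1.0, 0.8])

# Normalization via entropy, as in balance(): average each pair of factor
# entropies and guard the division with EPSILON
norm_factor = 0.5 * np.add.outer(ent_factor, ent_factor) + EPSILON

# MI should be symmetric in principle, so symmetrize before normalizing
nmi = 0.5 * (mi + mi.T) / norm_factor
print(nmi)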
dataeval/metrics/bias/_diversity.py CHANGED
@@ -158,6 +158,9 @@ def diversity(
     --------
     scipy.stats.entropy
     """
+    if not metadata.discrete_factor_names and not metadata.continuous_factor_names:
+        raise ValueError("No factors found in provided metadata.")
+
     diversity_fn = get_method(_DIVERSITY_FN_MAP, method)
     discretized_data = np.hstack((metadata.class_labels[:, np.newaxis], metadata.discrete_data))
     cnts = get_counts(discretized_data)
dataeval/metrics/bias/_parity.py CHANGED
@@ -241,6 +241,8 @@ def parity(metadata: Metadata) -> ParityOutput:
     >>> parity(metadata)
     ParityOutput(score=array([7.357, 5.467, 0.515]), p_value=array([0.289, 0.243, 0.773]), factor_names=['age', 'income', 'gender'], insufficient_data={'age': {3: {'artist': 4}, 4: {'artist': 4, 'teacher': 3}}, 'income': {1: {'artist': 3}}})
     """  # noqa: E501
+    if not metadata.discrete_factor_names and not metadata.continuous_factor_names:
+        raise ValueError("No factors found in provided metadata.")
 
     chi_scores = np.zeros(metadata.discrete_data.shape[1])
     p_values = np.zeros_like(chi_scores)
dataeval/metrics/estimators/_ber.py CHANGED
@@ -19,6 +19,7 @@ from numpy.typing import NDArray
 from scipy.sparse import coo_matrix
 from scipy.stats import mode
 
+from dataeval.config import EPSILON
 from dataeval.outputs import BEROutput
 from dataeval.outputs._base import set_metadata
 from dataeval.typing import ArrayLike
@@ -82,7 +83,7 @@ def ber_knn(images: NDArray[np.float64], labels: NDArray[np.int_], k: int) -> tu
 
 def knn_lowerbound(value: float, classes: int, k: int) -> float:
     """Several cases for computing the BER lower bound"""
-    if value <= 1e-10:
+    if value <= EPSILON:
         return 0.0
 
     if classes == 2 and k != 1:
dataeval/metrics/stats/_base.py CHANGED
@@ -9,7 +9,7 @@ from copy import deepcopy
 from dataclasses import dataclass
 from functools import partial
 from multiprocessing import Pool
-from typing import Any, Callable, Generic, Iterable, Sequence, TypeVar, cast
+from typing import Any, Callable, Generic, Iterable, Sequence, TypeVar
 
 import numpy as np
 import tqdm
@@ -17,7 +17,7 @@ from numpy.typing import NDArray
 
 from dataeval.config import get_max_processes
 from dataeval.outputs._stats import BaseStatsOutput, SourceIndex
-from dataeval.typing import ArrayLike, Dataset, ObjectDetectionTarget
+from dataeval.typing import Array, ArrayLike, Dataset, ObjectDetectionTarget
 from dataeval.utils._array import to_numpy
 from dataeval.utils._image import normalize_image_shape, rescale
 
@@ -122,22 +122,19 @@ class StatsProcessorOutput:
 
 def process_stats(
     i: int,
-    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    image: ArrayLike,
+    target: Any,
     per_box: bool,
     per_channel: bool,
     stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
 ) -> StatsProcessorOutput:
-    data = dataset[i]
-    image, target = (to_numpy(cast(ArrayLike, data[0])), data[1]) if isinstance(data, tuple) else (to_numpy(data), None)
-    target = None if not isinstance(target, ObjectDetectionTarget) else target
-    boxes = to_numpy(target.boxes) if target is not None else None
+    image = to_numpy(image)
+    boxes = to_numpy(target.boxes) if isinstance(target, ObjectDetectionTarget) else None
     results_list: list[dict[str, Any]] = []
     source_indices: list[SourceIndex] = []
     box_counts: list[int] = []
     warnings_list: list[str] = []
-    nboxes = [None] if boxes is None or not per_box else normalize_box_shape(boxes)
-    for i_b, box in enumerate(nboxes):
-        i_b = None if box is None else i_b
+    for i_b, box in [(None, None)] if boxes is None else enumerate(normalize_box_shape(boxes)):
         processor_list = [p(image, box, per_channel) for p in stats_processor_cls]
         if any(not p._is_valid_slice for p in processor_list) and i_b is not None and box is not None:
             warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} is out of bounds of {image.shape}.")
@@ -151,17 +148,16 @@ def process_stats(
 
 
 def process_stats_unpack(
-    i: int,
-    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    args: tuple[int, ArrayLike, Any],
     per_box: bool,
     per_channel: bool,
     stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
 ) -> StatsProcessorOutput:
-    return process_stats(i, dataset, per_box=per_box, per_channel=per_channel, stats_processor_cls=stats_processor_cls)
+    return process_stats(*args, per_box=per_box, per_channel=per_channel, stats_processor_cls=stats_processor_cls)
 
 
 def run_stats(
-    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
     per_box: bool,
     per_channel: bool,
     stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
@@ -175,7 +171,7 @@ def run_stats(
 
     Parameters
     ----------
-    data : Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]]
+    data : Dataset[Array] | Dataset[tuple[Array, Any, Any]]
         A dataset of images and targets to compute statistics on.
     per_box : bool
         A flag which determines if the statistics should be evaluated on a per-box basis or not.
@@ -206,18 +202,21 @@ def run_stats(
     warning_list = []
     stats_processor_cls = stats_processor_cls if isinstance(stats_processor_cls, Iterable) else [stats_processor_cls]
 
-    # TODO: Introduce global controls for CPU job parallelism and GPU configurations
+    def _enumerate(dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]], per_box: bool):
+        for i in range(len(dataset)):
+            d = dataset[i]
+            yield i, d[0] if isinstance(d, tuple) else d, d[1] if isinstance(d, tuple) and per_box else None
+
     with Pool(processes=get_max_processes()) as p:
         for r in tqdm.tqdm(
             p.imap(
                 partial(
                     process_stats_unpack,
-                    dataset=dataset,
                     per_box=per_box,
                     per_channel=per_channel,
                    stats_processor_cls=stats_processor_cls,
                ),
-                range(len(dataset)),
+                _enumerate(dataset, per_box),
            ),
            total=len(dataset),
        ):
@@ -248,13 +247,13 @@ def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
     if type(a) is not type(b):
         raise TypeError(f"Types {type(a)} and {type(b)} cannot be added.")
 
-    sum_dict = deepcopy(a.dict())
+    sum_dict = deepcopy(a.data())
 
     for k in sum_dict:
         if isinstance(sum_dict[k], list):
-            sum_dict[k].extend(b.dict()[k])
+            sum_dict[k].extend(b.data()[k])
        else:
-            sum_dict[k] = np.concatenate((sum_dict[k], b.dict()[k]))
+            sum_dict[k] = np.concatenate((sum_dict[k], b.data()[k]))
 
     return type(a)(**sum_dict)
 
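
run_stats no longer hands the whole dataset to every worker and indexes it by position; a local _enumerate generator now streams (index, image, target) tuples through Pool.imap, and process_stats_unpack simply unpacks them. A stripped-down sketch of that pattern with a stub worker standing in for process_stats_unpack; the worker body and the toy dataset below are illustrative, not dataeval code.

from functools import partial
from multiprocessing import Pool
from typing import Any, Iterator

import numpy as np

def _enumerate(dataset, per_box: bool) -> Iterator[tuple[int, Any, Any]]:
    # Yield (index, image, target); the target is only kept when per_box is requested
    for i in range(len(dataset)):
        d = dataset[i]
        yield i, d[0] if isinstance(d, tuple) else d, d[1] if isinstance(d, tuple) and per_box else None

def _work(args: tuple[int, Any, Any], per_box: bool) -> int:
    # Stand-in for process_stats_unpack: just report the image size
    i, image, _target = args
    return int(np.asarray(image).size)

if __name__ == "__main__":
    dataset = [np.zeros((3, 8, 8)) for _ in range(4)]  # toy stand-in for a Dataset[Array]
    with Pool(processes=2) as p:
        results = list(p.imap(partial(_work, per_box=False), _enumerate(dataset, per_box=False)))
    print(results)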
dataeval/metrics/stats/_boxratiostats.py CHANGED
@@ -153,7 +153,7 @@ def boxratiostats(
         raise ValueError("Input for boxstats and imgstats must have matching channel information.")
 
     output_dict = {}
-    for key in boxstats.dict():
+    for key in boxstats.data():
         output_dict[key] = calculate_ratios(key, boxstats, imgstats)
 
     return output_cls(**output_dict)
dataeval/metrics/stats/_dimensionstats.py CHANGED
@@ -9,7 +9,7 @@ import numpy as np
 from dataeval.metrics.stats._base import StatsProcessor, run_stats
 from dataeval.outputs import DimensionStatsOutput
 from dataeval.outputs._base import set_metadata
-from dataeval.typing import ArrayLike, Dataset
+from dataeval.typing import Array, Dataset
 from dataeval.utils._image import get_bitdepth
 
 
@@ -34,7 +34,7 @@ class DimensionStatsProcessor(StatsProcessor[DimensionStatsOutput]):
 
 @set_metadata
 def dimensionstats(
-    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
     *,
     per_box: bool = False,
 ) -> DimensionStatsOutput:
dataeval/metrics/stats/_hashstats.py CHANGED
@@ -14,7 +14,7 @@ from scipy.fftpack import dct
 from dataeval.metrics.stats._base import StatsProcessor, run_stats
 from dataeval.outputs import HashStatsOutput
 from dataeval.outputs._base import set_metadata
-from dataeval.typing import ArrayLike, Dataset
+from dataeval.typing import Array, ArrayLike, Dataset
 from dataeval.utils._array import as_numpy
 from dataeval.utils._image import normalize_image_shape, rescale
 
@@ -105,7 +105,7 @@ class HashStatsProcessor(StatsProcessor[HashStatsOutput]):
 
 @set_metadata
 def hashstats(
-    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
     *,
     per_box: bool = False,
 ) -> HashStatsOutput:
dataeval/metrics/stats/_imagestats.py CHANGED
@@ -10,12 +10,12 @@ from dataeval.metrics.stats._pixelstats import PixelStatsProcessor
 from dataeval.metrics.stats._visualstats import VisualStatsProcessor
 from dataeval.outputs import ChannelStatsOutput, ImageStatsOutput
 from dataeval.outputs._base import set_metadata
-from dataeval.typing import ArrayLike, Dataset
+from dataeval.typing import Array, Dataset
 
 
 @overload
 def imagestats(
-    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
     *,
     per_box: bool = False,
     per_channel: Literal[True],
@@ -24,7 +24,7 @@ def imagestats(
 
 @overload
 def imagestats(
-    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
     *,
     per_box: bool = False,
     per_channel: Literal[False] = False,
@@ -33,7 +33,7 @@ def imagestats(
 
 @set_metadata
 def imagestats(
-    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
     *,
     per_box: bool = False,
     per_channel: bool = False,
@@ -42,8 +42,8 @@ def imagestats(
     Calculates various :term:`statistics<Statistics>` for each image.
 
     This function computes dimension, pixel and visual metrics
-    on the images or individual bounding boxes for each image as
-    well as label statistics if provided.
+    on the images or individual bounding boxes for each image. If
+    performing calculations per channel dimension stats are excluded.
 
     Parameters
     ----------
@@ -61,7 +61,7 @@ def imagestats(
 
 
     See Also
-    dimensionstats, labelstats, pixelstats, visualstats, Outliers
+    dimensionstats, pixelstats, visualstats
 
     Examples
     --------
@@ -91,4 +91,4 @@ def imagestats(
     output_cls = ImageStatsOutput
 
     outputs = run_stats(dataset, per_box, per_channel, processors)
-    return output_cls(**{k: v for d in outputs for k, v in d.dict().items()})
+    return output_cls(**{k: v for d in outputs for k, v in d.data().items()})
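
The imagestats overloads now accept Dataset[Array], and the revised docstring notes that dimension stats are excluded when running per channel. A hedged sketch of the two call shapes; the list of arrays below is a toy stand-in for a Dataset[Array], and the import path from dataeval.metrics.stats is an assumption since only the private module file appears in this diff.

import numpy as np

from dataeval.metrics.stats import imagestats  # import path assumed

# Toy stand-in for Dataset[Array]: anything indexable with a length, yielding CHW arrays
dataset = [np.random.default_rng(i).random((3, 16, 16)) for i in range(4)]

img_stats = imagestats(dataset)                     # ImageStatsOutput: dimension, pixel and visual stats per image
chan_stats = imagestats(dataset, per_channel=True)  # ChannelStatsOutput: per-channel stats; dimension stats excluded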
dataeval/metrics/stats/_pixelstats.py CHANGED
@@ -10,7 +10,7 @@ from scipy.stats import entropy, kurtosis, skew
 from dataeval.metrics.stats._base import StatsProcessor, run_stats
 from dataeval.outputs import PixelStatsOutput
 from dataeval.outputs._base import set_metadata
-from dataeval.typing import ArrayLike, Dataset
+from dataeval.typing import Array, Dataset
 
 
 class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
@@ -37,7 +37,7 @@ class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
 
 @set_metadata
 def pixelstats(
-    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
     *,
     per_box: bool = False,
     per_channel: bool = False,
dataeval/metrics/stats/_visualstats.py CHANGED
@@ -9,7 +9,7 @@ import numpy as np
 from dataeval.metrics.stats._base import StatsProcessor, run_stats
 from dataeval.outputs import VisualStatsOutput
 from dataeval.outputs._base import set_metadata
-from dataeval.typing import ArrayLike, Dataset
+from dataeval.typing import Array, Dataset
 from dataeval.utils._image import edge_filter
 
 QUARTILES = (0, 25, 50, 75, 100)
@@ -44,7 +44,7 @@ class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
 
 @set_metadata
 def visualstats(
-    dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
+    dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
     *,
     per_box: bool = False,
     per_channel: bool = False,
dataeval/outputs/__init__.py CHANGED
@@ -8,6 +8,7 @@ from ._bias import BalanceOutput, CoverageOutput, DiversityOutput, LabelParityOu
 from ._drift import DriftMMDOutput, DriftOutput
 from ._estimators import BEROutput, ClustererOutput, DivergenceOutput, UAPOutput
 from ._linters import DuplicatesOutput, OutliersOutput
+from ._metadata import MetadataDistanceOutput, MetadataDistanceValues, MostDeviatedFactorsOutput, OODPredictorOutput
 from ._ood import OODOutput, OODScoreOutput
 from ._stats import (
     ChannelStatsOutput,
@@ -39,7 +40,11 @@ __all__ = [
     "ImageStatsOutput",
     "LabelParityOutput",
     "LabelStatsOutput",
+    "MetadataDistanceOutput",
+    "MetadataDistanceValues",
+    "MostDeviatedFactorsOutput",
     "OODOutput",
+    "OODPredictorOutput",
     "OODScoreOutput",
     "OutliersOutput",
     "ParityOutput",