dataeval 0.76.0__py3-none-any.whl → 0.76.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dataeval/__init__.py CHANGED
@@ -8,7 +8,7 @@ shifts that impact performance of deployed models.
8
8
  from __future__ import annotations
9
9
 
10
10
  __all__ = ["detectors", "log", "metrics", "utils", "workflows"]
11
- __version__ = "0.76.0"
11
+ __version__ = "0.76.1"
12
12
 
13
13
  import logging
14
14
 
@@ -2,7 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  __all__ = []
4
4
 
5
- # import contextlib
5
+ import contextlib
6
6
  from dataclasses import dataclass
7
7
  from typing import Generic, Iterable, Literal, Sequence, TypeVar, Union, overload
8
8
 
@@ -18,8 +18,8 @@ from dataeval.metrics.stats.pixelstats import PixelStatsOutput
18
18
  from dataeval.metrics.stats.visualstats import VisualStatsOutput
19
19
  from dataeval.output import Output, set_metadata
20
20
 
21
- # with contextlib.suppress(ImportError):
22
- # import pandas as pd
21
+ with contextlib.suppress(ImportError):
22
+ import pandas as pd
23
23
 
24
24
 
25
25
  IndexIssueMap = dict[int, dict[str, float]]
@@ -69,16 +69,16 @@ def _create_table(metrics, class_wise):
69
69
  return table
70
70
 
71
71
 
72
- # def _create_pandas_dataframe(class_wise):
73
- # """Create data for pandas dataframe"""
74
- # data = []
75
- # for label, metrics_dict in class_wise.items():
76
- # row = {"Class": label}
77
- # total = sum(metrics_dict.values())
78
- # row.update(metrics_dict) # Add metric counts
79
- # row["Total"] = total
80
- # data.append(row)
81
- # return data
72
+ def _create_pandas_dataframe(class_wise):
73
+ """Create data for pandas dataframe"""
74
+ data = []
75
+ for label, metrics_dict in class_wise.items():
76
+ row = {"Class": label}
77
+ total = sum(metrics_dict.values())
78
+ row.update(metrics_dict) # Add metric counts
79
+ row["Total"] = total
80
+ data.append(row)
81
+ return data
82
82
 
83
83
 
84
84
  @dataclass(frozen=True)
@@ -120,23 +120,23 @@ class OutliersOutput(Generic[TIndexIssueMap], Output):
120
120
  table = "\n\n".join(outertable)
121
121
  return table
122
122
 
123
- # def to_dataframe(self, labelstats: LabelStatsOutput) -> pd.DataFrame:
124
- # import pandas as pd
125
-
126
- # if isinstance(self.issues, dict):
127
- # _, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
128
- # data = _create_pandas_dataframe(classwise)
129
- # df = pd.DataFrame(data)
130
- # else:
131
- # df_list = []
132
- # for i, d in enumerate(self.issues):
133
- # _, classwise = _reorganize_by_class_and_metric(d, labelstats)
134
- # data = _create_pandas_dataframe(classwise)
135
- # single_df = pd.DataFrame(data)
136
- # single_df["Dataset"] = i
137
- # df_list.append(single_df)
138
- # df = pd.concat(df_list)
139
- # return df
123
+ def to_dataframe(self, labelstats: LabelStatsOutput) -> pd.DataFrame:
124
+ import pandas as pd
125
+
126
+ if isinstance(self.issues, dict):
127
+ _, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
128
+ data = _create_pandas_dataframe(classwise)
129
+ df = pd.DataFrame(data)
130
+ else:
131
+ df_list = []
132
+ for i, d in enumerate(self.issues):
133
+ _, classwise = _reorganize_by_class_and_metric(d, labelstats)
134
+ data = _create_pandas_dataframe(classwise)
135
+ single_df = pd.DataFrame(data)
136
+ single_df["Dataset"] = i
137
+ df_list.append(single_df)
138
+ df = pd.concat(df_list)
139
+ return df
140
140
 
141
141
 
142
142
  def _get_outlier_mask(
@@ -253,13 +253,11 @@ def parity(metadata: Metadata) -> ParityOutput[NDArray[np.float64]]:
253
253
  >>> from dataeval.utils.metadata import preprocess
254
254
  >>> rng = np.random.default_rng(175)
255
255
  >>> labels = rng.choice([0, 1, 2], (100))
256
- >>> metadata_dict = [
257
- ... {
256
+ >>> metadata_dict = {
258
257
  ... "age": list(rng.choice([25, 30, 35, 45], (100))),
259
258
  ... "income": list(rng.choice([50000, 65000, 80000], (100))),
260
259
  ... "gender": list(rng.choice(["M", "F"], (100))),
261
- ... }
262
- ... ]
260
+ ... }
263
261
  >>> continuous_factor_bincounts = {"age": 4, "income": 3}
264
262
  >>> metadata = preprocess(metadata_dict, labels, continuous_factor_bincounts)
265
263
  >>> parity(metadata)
@@ -2,7 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  __all__ = []
4
4
 
5
- # import contextlib
5
+ import contextlib
6
6
  from collections import Counter, defaultdict
7
7
  from dataclasses import dataclass
8
8
  from typing import Any, Iterable, Mapping, TypeVar
@@ -13,8 +13,8 @@ from numpy.typing import ArrayLike
13
13
  from dataeval.interop import as_numpy
14
14
  from dataeval.output import Output, set_metadata
15
15
 
16
- # with contextlib.suppress(ImportError):
17
- # import pandas as pd
16
+ with contextlib.suppress(ImportError):
17
+ import pandas as pd
18
18
 
19
19
 
20
20
  @dataclass(frozen=True)
@@ -73,24 +73,24 @@ class LabelStatsOutput(Output):
73
73
 
74
74
  return table_str
75
75
 
76
- # def to_dataframe(self) -> pd.DataFrame:
77
- # import pandas as pd
78
-
79
- # class_list = []
80
- # total_count = []
81
- # image_count = []
82
- # for cls in self.label_counts_per_class:
83
- # class_list.append(cls)
84
- # total_count.append(self.label_counts_per_class[cls])
85
- # image_count.append(self.image_counts_per_label[cls])
86
-
87
- # return pd.DataFrame(
88
- # {
89
- # "Label": class_list,
90
- # "Total Count": total_count,
91
- # "Image Count": image_count,
92
- # }
93
- # )
76
+ def to_dataframe(self) -> pd.DataFrame:
77
+ import pandas as pd
78
+
79
+ class_list = []
80
+ total_count = []
81
+ image_count = []
82
+ for cls in self.label_counts_per_class:
83
+ class_list.append(cls)
84
+ total_count.append(self.label_counts_per_class[cls])
85
+ image_count.append(self.image_counts_per_label[cls])
86
+
87
+ return pd.DataFrame(
88
+ {
89
+ "Label": class_list,
90
+ "Total Count": total_count,
91
+ "Image Count": image_count,
92
+ }
93
+ )
94
94
 
95
95
 
96
96
  TKey = TypeVar("TKey", int, str)
@@ -9,6 +9,7 @@ __all__ = ["Metadata", "preprocess", "merge", "flatten"]
9
9
 
10
10
  import warnings
11
11
  from dataclasses import dataclass
12
+ from enum import Enum
12
13
  from typing import Any, Iterable, Literal, Mapping, TypeVar, overload
13
14
 
14
15
  import numpy as np
@@ -20,6 +21,13 @@ from dataeval.output import Output, set_metadata
20
21
 
21
22
  DISCRETE_MIN_WD = 0.054
22
23
  CONTINUOUS_MIN_SAMPLE_SIZE = 20
24
+ DEFAULT_IMAGE_INDEX_KEY = "_image_index"
25
+
26
+
27
+ class DropReason(Enum):
28
+ INCONSISTENT_KEY = "inconsistent_key"
29
+ INCONSISTENT_SIZE = "inconsistent_size"
30
+ NESTED_LIST = "nested_list"
23
31
 
24
32
 
25
33
  T = TypeVar("T")
@@ -41,8 +49,8 @@ def _convert_type(data: str) -> int | float | str: ...
41
49
 
42
50
  def _convert_type(data: list[str] | str) -> list[int] | list[float] | list[str] | int | float | str:
43
51
  """
44
- Converts a value or a list of values to the simplest form possible, in preferred order of `int`,
45
- `float`, or `string`.
52
+ Converts a value or a list of values to the simplest form possible,
53
+ in preferred order of `int`, `float`, or `string`.
46
54
 
47
55
  Parameters
48
56
  ----------
@@ -99,8 +107,16 @@ def _get_key_indices(keys: Iterable[tuple[str, ...]]) -> dict[tuple[str, ...], i
99
107
  return indices
100
108
 
101
109
 
110
+ def _sorted_drop_reasons(d: dict[str, set[DropReason]]) -> dict[str, list[str]]:
111
+ return {k: sorted({vv.value for vv in v}) for k, v in sorted(d.items(), key=lambda item: item[1])}
112
+
113
+
102
114
  def _flatten_dict_inner(
103
- d: Mapping[str, Any], parent_keys: tuple[str, ...], size: int | None = None, nested: bool = False
115
+ d: Mapping[str, Any],
116
+ dropped: dict[tuple[str, ...], set[DropReason]],
117
+ parent_keys: tuple[str, ...],
118
+ size: int | None = None,
119
+ nested: bool = False,
104
120
  ) -> tuple[dict[tuple[str, ...], Any], int | None]:
105
121
  """
106
122
  Recursive internal function for flattening a dictionary.
@@ -109,6 +125,8 @@ def _flatten_dict_inner(
109
125
  ----------
110
126
  d : dict[str, Any]
111
127
  Dictionary to flatten
128
+ dropped : dict[tuple[str, ...], set[DropReason]]
129
+ Reference to the dictionary that collects dropped keys and the reason(s) each was dropped
112
130
  parent_keys : tuple[str, ...]
113
131
  Parent keys to the current dictionary being flattened
114
132
  size : int or None, default None
@@ -119,33 +137,62 @@ def _flatten_dict_inner(
119
137
  Returns
120
138
  -------
121
139
  tuple[dict[tuple[str, ...], Any], int | None]
122
- - [0]: Dictionary of flattened values with the keys reformatted as a hierarchical tuple of strings
140
+ - [0]: Dictionary of flattened values with the keys reformatted as a
141
+ hierarchical tuple of strings
123
142
  - [1]: Size, if any, of the current list of values
124
143
  """
125
144
  items: dict[tuple[str, ...], Any] = {}
126
145
  for k, v in d.items():
127
146
  new_keys: tuple[str, ...] = parent_keys + (k,)
128
147
  if isinstance(v, dict):
129
- fd, size = _flatten_dict_inner(v, new_keys, size=size, nested=nested)
148
+ fd, size = _flatten_dict_inner(v, dropped, new_keys, size=size, nested=nested)
130
149
  items.update(fd)
131
150
  elif isinstance(v, (list, tuple)):
132
- if not nested and (size is None or size == len(v)):
151
+ if nested:
152
+ dropped.setdefault(parent_keys + (k,), set()).add(DropReason.NESTED_LIST)
153
+ elif size is not None and size != len(v):
154
+ dropped.setdefault(parent_keys + (k,), set()).add(DropReason.INCONSISTENT_SIZE)
155
+ else:
133
156
  size = len(v)
134
157
  if all(isinstance(i, dict) for i in v):
135
158
  for sub_dict in v:
136
- fd, size = _flatten_dict_inner(sub_dict, new_keys, size=size, nested=True)
159
+ fd, size = _flatten_dict_inner(sub_dict, dropped, new_keys, size=size, nested=True)
137
160
  for fk, fv in fd.items():
138
161
  items.setdefault(fk, []).append(fv)
139
162
  else:
140
163
  items[new_keys] = v
141
- else:
142
- warnings.warn(f"Dropping nested list found in '{parent_keys + (k, )}'.")
143
164
  else:
144
165
  items[new_keys] = v
145
166
  return items, size
146
167
 
147
168
 
148
- def flatten(d: Mapping[str, Any], sep: str, ignore_lists: bool, fully_qualified: bool) -> tuple[dict[str, Any], int]:
169
+ @overload
170
+ def flatten(
171
+ d: Mapping[str, Any],
172
+ return_dropped: Literal[True],
173
+ sep: str = "_",
174
+ ignore_lists: bool = False,
175
+ fully_qualified: bool = False,
176
+ ) -> tuple[dict[str, Any], int, dict[str, list[str]]]: ...
177
+
178
+
179
+ @overload
180
+ def flatten(
181
+ d: Mapping[str, Any],
182
+ return_dropped: Literal[False] = False,
183
+ sep: str = "_",
184
+ ignore_lists: bool = False,
185
+ fully_qualified: bool = False,
186
+ ) -> tuple[dict[str, Any], int]: ...
187
+
188
+
189
+ def flatten(
190
+ d: Mapping[str, Any],
191
+ return_dropped: bool = False,
192
+ sep: str = "_",
193
+ ignore_lists: bool = False,
194
+ fully_qualified: bool = False,
195
+ ):
149
196
  """
150
197
  Flattens a dictionary and converts values to numeric values when possible.
151
198
 
@@ -153,33 +200,53 @@ def flatten(d: Mapping[str, Any], sep: str, ignore_lists: bool, fully_qualified:
153
200
  ----------
154
201
  d : dict[str, Any]
155
202
  Dictionary to flatten
156
- sep : str
203
+ return_dropped : bool, default False
204
+ Option to return a dictionary of dropped keys and the reason(s) for dropping
205
+ sep : str, default "_"
157
206
  String separator to use when concatenating key names
158
- ignore_lists : bool
207
+ ignore_lists : bool, default False
159
208
  Option to skip expanding lists within metadata
160
- fully_qualified : bool
161
- Option to return dictionary keys full qualified instead of reduced
209
+ fully_qualified : bool, default False
210
+ Option to return dictionary keys fully qualified instead of reduced
162
211
 
163
212
  Returns
164
213
  -------
165
- tuple[dict[str, Any], int]
166
- A tuple of the flattened dictionary and the length of detected lists in metadata
214
+ dict[str, Any]
215
+ Dictionary of flattened values with the keys joined into single strings using `sep`
216
+ int
217
+ Size of the values in the flattened dictionary
218
+ dict[str, list[str]], Optional
219
+ Dictionary containing dropped keys and reason(s) for dropping
167
220
  """
168
- expanded, size = _flatten_dict_inner(d, parent_keys=(), nested=ignore_lists)
221
+ dropped_inner: dict[tuple[str, ...], set[DropReason]] = {}
222
+ expanded, size = _flatten_dict_inner(d, dropped=dropped_inner, parent_keys=(), nested=ignore_lists)
169
223
 
170
224
  output = {}
171
- if fully_qualified:
172
- expanded = {sep.join(k): v for k, v in expanded.items()}
173
- else:
174
- keys = _get_key_indices(expanded)
175
- expanded = {sep.join(k[keys[k] :]): v for k, v in expanded.items()}
176
225
  for k, v in expanded.items():
177
226
  cv = _convert_type(v)
178
- if isinstance(cv, list) and len(cv) == size:
179
- output[k] = cv
227
+ if isinstance(cv, list):
228
+ if len(cv) == size:
229
+ output[k] = cv
230
+ else:
231
+ dropped_inner.setdefault(k, set()).add(DropReason.INCONSISTENT_KEY)
180
232
  elif not isinstance(cv, list):
181
233
  output[k] = cv if not size else [cv] * size
182
- return output, size if size is not None else 1
234
+
235
+ if fully_qualified:
236
+ output = {sep.join(k): v for k, v in output.items()}
237
+ else:
238
+ keys = _get_key_indices(output)
239
+ output = {sep.join(k[keys[k] :]): v for k, v in output.items()}
240
+
241
+ size = size if size is not None else 1
242
+ dropped = {sep.join(k): v for k, v in dropped_inner.items()}
243
+
244
+ if return_dropped:
245
+ return output, size, _sorted_drop_reasons(dropped)
246
+ else:
247
+ if dropped:
248
+ warnings.warn(f"Metadata keys {list(dropped)} were dropped.")
249
+ return output, size
183
250
 
184
251
 
185
252
  def _is_metadata_dict_of_dicts(metadata: Mapping) -> bool:
@@ -197,48 +264,75 @@ def _is_metadata_dict_of_dicts(metadata: Mapping) -> bool:
197
264
  return set(metadata[keys[0]]) == set(metadata[keys[1]])
198
265
 
199
266
 
267
+ @overload
268
+ def merge(
269
+ metadata: Iterable[Mapping[str, Any]],
270
+ return_dropped: Literal[True],
271
+ ignore_lists: bool = False,
272
+ fully_qualified: bool = False,
273
+ return_numpy: bool = False,
274
+ ) -> tuple[dict[str, list[Any]] | dict[str, NDArray[Any]], dict[str, list[str]]]: ...
275
+
276
+
277
+ @overload
278
+ def merge(
279
+ metadata: Iterable[Mapping[str, Any]],
280
+ return_dropped: Literal[False] = False,
281
+ ignore_lists: bool = False,
282
+ fully_qualified: bool = False,
283
+ return_numpy: bool = False,
284
+ ) -> dict[str, list[Any]] | dict[str, NDArray[Any]]: ...
285
+
286
+
200
287
  def merge(
201
288
  metadata: Iterable[Mapping[str, Any]],
289
+ return_dropped: bool = False,
202
290
  ignore_lists: bool = False,
203
291
  fully_qualified: bool = False,
204
- as_numpy: bool = False,
205
- ) -> tuple[dict[str, list[Any]] | dict[str, NDArray[Any]], NDArray[np.int_]]:
292
+ return_numpy: bool = False,
293
+ ):
206
294
  """
207
- Merges a collection of metadata dictionaries into a single flattened dictionary of keys and values.
295
+ Merges a collection of metadata dictionaries into a single flattened
296
+ dictionary of keys and values.
208
297
 
209
- Nested dictionaries are flattened, and lists are expanded. Nested lists are dropped as the
210
- expanding into multiple hierarchical trees is not supported.
298
+ Nested dictionaries are flattened, and lists are expanded. Nested lists are
299
+ dropped because expanding them into multiple hierarchical trees is not supported.
300
+ The function adds an internal "_image_index" key to the metadata dictionary
301
+ for consumption by the preprocess function.
211
302
 
212
303
  Parameters
213
304
  ----------
214
305
  metadata : Iterable[Mapping[str, Any]]
215
306
  Iterable collection of metadata dictionaries to flatten and merge
307
+ return_dropped : bool, default False
308
+ Option to return a dictionary of dropped keys and the reason(s) for dropping
216
309
  ignore_lists : bool, default False
217
310
  Option to skip expanding lists within metadata
218
311
  fully_qualified : bool, default False
219
312
  Option to return dictionary keys fully qualified instead of minimized
220
- as_numpy : bool, default False
313
+ return_numpy : bool, default False
221
314
  Option to return results as lists or NumPy arrays
222
315
 
223
316
  Returns
224
317
  -------
225
- dict[str, list[Any]] or dict[str, NDArray[Any]]
318
+ dict[str, list[Any]] | dict[str, NDArray[Any]]
226
319
  A single dictionary containing the flattened data as lists or NumPy arrays
227
- NDArray[np.int_]
228
- Array defining where individual images start, helpful when working with object detection metadata
320
+ dict[str, list[str]], Optional
321
+ Dictionary containing dropped keys and reason(s) for dropping
229
322
 
230
323
  Note
231
324
  ----
232
- Nested lists of values and inconsistent keys are dropped in the merged metadata dictionary
325
+ Nested lists of values and inconsistent keys are dropped in the merged
326
+ metadata dictionary
233
327
 
234
328
  Example
235
329
  -------
236
330
  >>> list_metadata = [{"common": 1, "target": [{"a": 1, "b": 3, "c": 5}, {"a": 2, "b": 4}], "source": "example"}]
237
- >>> reorganized_metadata, image_indicies = merge(list_metadata)
331
+ >>> reorganized_metadata, dropped_keys = merge(list_metadata, return_dropped=True)
238
332
  >>> reorganized_metadata
239
- {'common': [1, 1], 'a': [1, 2], 'b': [3, 4], 'source': ['example', 'example']}
240
- >>> image_indicies
241
- array([0])
333
+ {'common': [1, 1], 'a': [1, 2], 'b': [3, 4], 'source': ['example', 'example'], '_image_index': [0, 0]}
334
+ >>> dropped_keys
335
+ {'target_c': ['inconsistent_key']}
242
336
  """
243
337
  merged: dict[str, list[Any]] = {}
244
338
  isect: set[str] = set()
@@ -255,37 +349,51 @@ def merge(
255
349
  else:
256
350
  dicts = list(metadata)
257
351
 
258
- image_repeats = np.zeros(len(dicts))
352
+ image_repeats = np.zeros(len(dicts), dtype=np.int_)
353
+ dropped: dict[str, set[DropReason]] = {}
259
354
  for i, d in enumerate(dicts):
260
- flattened, image_repeats[i] = flatten(d, sep="_", ignore_lists=ignore_lists, fully_qualified=fully_qualified)
355
+ flattened, image_repeats[i], dropped_inner = flatten(
356
+ d,
357
+ return_dropped=True,
358
+ ignore_lists=ignore_lists,
359
+ fully_qualified=fully_qualified,
360
+ )
261
361
  isect = isect.intersection(flattened.keys()) if isect else set(flattened.keys())
262
- union = union.union(flattened.keys())
362
+ union.update(flattened.keys())
363
+ for k, v in dropped_inner.items():
364
+ dropped.setdefault(k, set()).update({DropReason(vv) for vv in v})
263
365
  for k, v in flattened.items():
264
366
  merged.setdefault(k, []).extend(flattened[k]) if isinstance(v, list) else merged.setdefault(k, []).append(v)
265
367
 
266
- if len(union) > len(isect):
267
- warnings.warn(f"Inconsistent metadata keys found. Dropping {union - isect} from metadata.")
268
-
269
- output: dict[str, Any] = {}
368
+ for k in union - isect:
369
+ dropped.setdefault(k, set()).add(DropReason.INCONSISTENT_KEY)
270
370
 
271
371
  if image_repeats.sum() == image_repeats.size:
272
- image_indicies = np.arange(image_repeats.size)
372
+ image_indices = np.arange(image_repeats.size)
273
373
  else:
274
374
  image_ids = np.arange(image_repeats.size)
275
375
  image_data = np.concatenate(
276
376
  [np.repeat(image_ids[i], image_repeats[i]) for i in range(image_ids.size)], dtype=np.int_
277
377
  )
278
- _, image_unsorted = np.unique(image_data, return_index=True)
279
- image_indicies = np.sort(image_unsorted)
378
+ _, image_unsorted = np.unique(image_data, return_inverse=True)
379
+ image_indices = np.sort(image_unsorted)
380
+
381
+ output: dict[str, Any] = {}
280
382
 
281
383
  if keys:
282
- output["keys"] = np.array(keys) if as_numpy else keys
384
+ output["keys"] = np.array(keys) if return_numpy else keys
283
385
 
284
386
  for k in (key for key in merged if key in isect):
285
387
  cv = _convert_type(merged[k])
286
- output[k] = np.array(cv) if as_numpy else cv
388
+ output[k] = np.array(cv) if return_numpy else cv
389
+ output[DEFAULT_IMAGE_INDEX_KEY] = np.array(image_indices) if return_numpy else list(image_indices)
287
390
 
288
- return output, image_indicies
391
+ if return_dropped:
392
+ return output, _sorted_drop_reasons(dropped)
393
+ else:
394
+ if dropped:
395
+ warnings.warn(f"Metadata keys {list(dropped)} were dropped.")
396
+ return output
289
397
 
290
398
 
291
399
  @dataclass(frozen=True)
@@ -296,13 +404,16 @@ class Metadata(Output):
296
404
  Attributes
297
405
  ----------
298
406
  discrete_factor_names : list[str]
299
- List containing factor names for the original data that was discrete and the binned continuous data
407
+ List containing factor names for the original data that was discrete and
408
+ the binned continuous data
300
409
  discrete_data : NDArray[np.int]
301
- Array containing values for the original data that was discrete and the binned continuous data
410
+ Array containing values for the original data that was discrete and the
411
+ binned continuous data
302
412
  continuous_factor_names : list[str]
303
413
  List containing factor names for the original continuous data
304
414
  continuous_data : NDArray[np.int or np.double] | None
305
- Array containing values for the original continuous data or None if there was no continuous data
415
+ Array containing values for the original continuous data or None if there
416
+ was no continuous data
306
417
  class_labels : NDArray[np.int]
307
418
  Numerical class labels for the images/objects
308
419
  class_names : NDArray[Any]
@@ -322,11 +433,12 @@ class Metadata(Output):
322
433
 
323
434
  @set_metadata
324
435
  def preprocess(
325
- raw_metadata: Iterable[Mapping[str, Any]],
436
+ metadata: dict[str, list[Any]] | dict[str, NDArray[Any]],
326
437
  class_labels: ArrayLike | str,
327
438
  continuous_factor_bins: Mapping[str, int | Iterable[float]] | None = None,
328
439
  auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = "uniform_width",
329
440
  exclude: Iterable[str] | None = None,
441
+ image_index_key: str = "_image_index",
330
442
  ) -> Metadata:
331
443
  """
332
444
  Restructures the metadata to be in the correct format for the bias functions.
@@ -338,28 +450,54 @@ def preprocess(
338
450
 
339
451
  Parameters
340
452
  ----------
341
- raw_metadata : Iterable[Mapping[str, Any]]
342
- Iterable collection of metadata dictionaries to flatten and merge.
453
+ metadata : dict[str, list[Any] | NDArray[Any]]
454
+ A flat dictionary which contains all of the metadata on a per image (classification)
455
+ or per object (object detection) basis. The length of each list/array should match the length
456
+ of the label list/array.
343
457
  class_labels : ArrayLike or string
344
- If arraylike, expects the labels for each image (image classification) or each object (object detection).
345
- If the labels are included in the metadata dictionary, pass in the key value.
458
+ If arraylike, expects the labels for each image (image classification)
459
+ or each object (object detection). If the labels are included in the
460
+ metadata dictionary, pass in the key value.
346
461
  continuous_factor_bins : Mapping[str, int or Iterable[float]] or None, default None
347
- User provided dictionary specifying how to bin the continuous metadata factors where the value is either
348
- an int to represent the number of bins, or a list of floats representing the edges for each bin.
462
+ User provided dictionary specifying how to bin the continuous metadata
463
+ factors where the value is either an int to represent the number of bins,
464
+ or a list of floats representing the edges for each bin.
349
465
  auto_bin_method : "uniform_width" or "uniform_count" or "clusters", default "uniform_width"
350
- Method by which the function will automatically bin continuous metadata factors. It is recommended
351
- that the user provide the bins through the `continuous_factor_bins`.
466
+ Method by which the function will automatically bin continuous metadata factors.
467
+ It is recommended that the user provide the bins through the `continuous_factor_bins`.
352
468
  exclude : Iterable[str] or None, default None
353
469
  User provided collection of metadata keys to exclude when processing metadata.
470
+ image_index_key : str, default "_image_index"
471
+ User provided metadata key which maps the metadata entry to the source image.
354
472
 
355
473
  Returns
356
474
  -------
357
475
  Metadata
358
476
  Output class containing the binned metadata
477
+
478
+ See Also
479
+ --------
480
+ merge
359
481
  """
360
- # Transform metadata into single, flattened dictionary
361
- metadata, image_repeats = merge(raw_metadata)
482
+ # Check that metadata is a single, flattened dictionary with uniform array lengths
483
+ check_length = -1
484
+ for k, v in metadata.items():
485
+ if not isinstance(v, (list, tuple, np.ndarray)):
486
+ raise TypeError(
487
+ "Metadata dictionary needs to be a single dictionary whose values "
488
+ "are arraylike containing the metadata on a per image or per object basis."
489
+ )
490
+ else:
491
+ if check_length == -1:
492
+ check_length = len(v)
493
+ else:
494
+ if check_length != len(v):
495
+ raise ValueError(
496
+ "The lists/arrays in the metadata dict have varying lengths. "
497
+ "Preprocess needs them to be uniform in length."
498
+ )
362
499
 
500
+ # Grab continuous factors if supplied
363
501
  continuous_factor_bins = dict(continuous_factor_bins) if continuous_factor_bins else None
364
502
 
365
503
  # Drop any excluded metadata keys
@@ -368,19 +506,28 @@ def preprocess(
368
506
  if continuous_factor_bins:
369
507
  continuous_factor_bins.pop(k, None)
370
508
 
371
- # Get the class label array in numeric form
509
+ # Get the class label array in numeric form and check its dimensions
372
510
  class_array = as_numpy(metadata.pop(class_labels)) if isinstance(class_labels, str) else as_numpy(class_labels)
373
511
  if class_array.ndim > 1:
374
512
  raise ValueError(
375
513
  f"Got class labels with {class_array.ndim}-dimensional "
376
514
  f"shape {class_array.shape}, but expected a 1-dimensional array."
377
515
  )
516
+ # Check if the label array is the same length as the metadata arrays
517
+ elif len(class_array) != check_length:
518
+ raise ValueError(
519
+ f"The length of the label array {len(class_array)} is not the same as "
520
+ f"the length of the metadata arrays {check_length}."
521
+ )
378
522
  if not np.issubdtype(class_array.dtype, np.int_):
379
523
  unique_classes, numerical_labels = np.unique(class_array, return_inverse=True)
380
524
  else:
381
525
  numerical_labels = class_array
382
526
  unique_classes = np.unique(class_array)
383
527
 
528
+ # Determine if _image_index is given
529
+ image_indices = as_numpy(metadata[image_index_key]) if image_index_key in metadata else np.arange(check_length)
530
+
384
531
  # Bin according to user supplied bins
385
532
  continuous_metadata = {}
386
533
  discrete_metadata = {}
@@ -401,7 +548,7 @@ def preprocess(
401
548
  for key in remaining_keys:
402
549
  data = to_numpy(metadata[key])
403
550
  if np.issubdtype(data.dtype, np.number):
404
- result = _is_continuous(data, image_repeats)
551
+ result = _is_continuous(data, image_indices)
405
552
  if result:
406
553
  continuous_metadata[key] = data
407
554
  unique_samples, ordinal_data = np.unique(data, return_inverse=True)
@@ -419,7 +566,7 @@ def preprocess(
419
566
  else:
420
567
  _, discrete_metadata[key] = np.unique(data, return_inverse=True)
421
568
 
422
- # splitting out the dictionaries into the keys and values
569
+ # Split out the dictionaries into the keys and values
423
570
  discrete_factor_names = list(discrete_metadata.keys())
424
571
  discrete_data = np.stack(list(discrete_metadata.values()), axis=-1)
425
572
  continuous_factor_names = list(continuous_metadata.keys())
@@ -499,7 +646,7 @@ def _bin_data(data: NDArray[Any], bin_method: str) -> NDArray[np.int_]:
499
646
  return np.digitize(data, bin_edges) # type: ignore
500
647
 
501
648
 
502
- def _is_continuous(data: NDArray[np.number], image_indicies: NDArray[np.number]) -> bool:
649
+ def _is_continuous(data: NDArray[np.number], image_indices: NDArray[np.number]) -> bool:
503
650
  """
504
651
  Determines whether the data is continuous or discrete using the Wasserstein distance.
505
652
 
@@ -518,11 +665,11 @@ def _is_continuous(data: NDArray[np.number], image_indicies: NDArray[np.number])
518
665
  measured from a uniform distribution is greater or less than 0.054, respectively.
519
666
  """
520
667
  # Check if the metadata is image specific
521
- _, data_indicies_unsorted = np.unique(data, return_index=True)
522
- if data_indicies_unsorted.size == image_indicies.size:
523
- data_indicies = np.sort(data_indicies_unsorted)
524
- if (data_indicies == image_indicies).all():
525
- data = data[data_indicies]
668
+ _, data_indices_unsorted = np.unique(data, return_index=True)
669
+ if data_indices_unsorted.size == image_indices.size:
670
+ data_indices = np.sort(data_indices_unsorted)
671
+ if (data_indices == image_indices).all():
672
+ data = data[data_indices]
526
673
 
527
674
  # OLD METHOD
528
675
  # uvals = np.unique(data)
@@ -570,7 +717,7 @@ def get_counts(data: NDArray[np.int_], min_num_bins: int | None = None) -> NDArr
570
717
 
571
718
  Returns
572
719
  -------
573
- NDArray[np.int_]
720
+ NDArray[np.int]
574
721
  Bin counts per column of data.
575
722
  """
576
723
  max_value = data.max() + 1 if min_num_bins is None else min_num_bins
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dataeval
3
- Version: 0.76.0
3
+ Version: 0.76.1
4
4
  Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
5
5
  Home-page: https://dataeval.ai/
6
6
  License: MIT
@@ -21,8 +21,9 @@ Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Programming Language :: Python :: 3 :: Only
22
22
  Classifier: Topic :: Scientific/Engineering
23
23
  Provides-Extra: all
24
- Requires-Dist: matplotlib ; extra == "all"
24
+ Requires-Dist: matplotlib (>=3.7.1) ; extra == "all"
25
25
  Requires-Dist: numpy (>=1.24.2)
26
+ Requires-Dist: pandas (>=2.0) ; extra == "all"
26
27
  Requires-Dist: pillow (>=10.3.0)
27
28
  Requires-Dist: requests
28
29
  Requires-Dist: scikit-learn (>=1.5.0)
@@ -38,13 +39,17 @@ Description-Content-Type: text/markdown
38
39
 
39
40
  # DataEval
40
41
 
41
- To view our extensive collection of tutorials, how-to's, explanation guides, and reference material, please visit our documentation on **[Read the Docs](https://dataeval.readthedocs.io/)**
42
+ To view our extensive collection of tutorials, how-to's, explanation guides,
43
+ and reference material, please visit our documentation on
44
+ **[Read the Docs](https://dataeval.readthedocs.io/)**
42
45
 
43
46
  ## About DataEval
44
47
 
45
48
  <!-- start tagline -->
46
49
 
47
- DataEval curates datasets to train and test performant, robust, unbiased and reliable AI models and monitors for data shifts that impact performance of deployed models.
50
+ DataEval curates datasets to train and test performant, robust, unbiased and
51
+ reliable AI models and monitors for data shifts that impact performance of
52
+ deployed models.
48
53
 
49
54
  <!-- end tagline -->
50
55
 
@@ -52,22 +57,33 @@ DataEval curates datasets to train and test performant, robust, unbiased and rel
52
57
 
53
58
  <!-- start needs -->
54
59
 
55
- DataEval is an effective, powerful, and reliable set of tools for any T&E engineer. Throughout all stages of the machine learning lifecycle, DataEval supports model development, data analysis, and monitoring with state-of-the-art algorithms to help you solve difficult problems. With a focus on computer vision tasks, DataEval provides simple, but effective metrics for performance estimation, bias detection, and dataset linting.
60
+ DataEval is an effective, powerful, and reliable set of tools for any T&E
61
+ engineer. Throughout all stages of the machine learning lifecycle, DataEval
62
+ supports model development, data analysis, and monitoring with state-of-the-art
63
+ algorithms to help you solve difficult problems. With a focus on computer
64
+ vision tasks, DataEval provides simple but effective metrics for performance
65
+ estimation, bias detection, and dataset linting.
56
66
 
57
67
  <!-- end needs -->
58
68
 
59
69
  <!-- start JATIC interop -->
60
- DataEval is easy to install, supports a wide range of Python versions, and is compatible with many of the most popular packages in the scientific and T&E communities.
61
- DataEval also has native interopability between JATIC's suite of tools when using MAITE-compliant datasets and models.
70
+ DataEval is easy to install, supports a wide range of Python versions, and is
71
+ compatible with many of the most popular packages in the scientific and T&E
72
+ communities.
73
+
74
+ DataEval also has native interoperability between JATIC's suite of tools when
75
+ using MAITE-compliant datasets and models.
62
76
  <!-- end JATIC interop -->
63
77
 
64
78
  ## Getting Started
65
79
 
66
80
  **Python versions:** 3.9 - 3.12
67
81
 
68
- **Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*, *Gradient*
82
+ **Supported packages**: *NumPy*, *Pandas*, *scikit-learn*, *MAITE*, *NRTK*,
83
+ *Gradient*
69
84
 
70
- Choose your preferred method of installation below or follow our [installation guide](https://dataeval.readthedocs.io/en/v0.74.2/installation.html).
85
+ Choose your preferred method of installation below or follow our
86
+ [installation guide](https://dataeval.readthedocs.io/en/v0.74.2/installation.html).
71
87
 
72
88
  * [Installing with pip](#installing-with-pip)
73
89
  * [Installing with conda/mamba](#installing-with-conda)
@@ -75,7 +91,8 @@ Choose your preferred method of installation below or follow our [installation g
75
91
 
76
92
  ### **Installing with pip**
77
93
 
78
- You can install DataEval directly from pypi.org using the following command. The optional dependencies of DataEval are `all`.
94
+ You can install DataEval directly from pypi.org using the following command.
95
+ The optional dependencies of DataEval are `all`.
79
96
 
80
97
  ```bash
81
98
  pip install dataeval[all]
@@ -83,8 +100,9 @@ pip install dataeval[all]
83
100
 
84
101
  ### **Installing with conda**
85
102
 
86
- DataEval can be installed in a Conda/Mamba environment using the provided `environment.yaml` file. As some dependencies
87
- are installed from the `pytorch` channel, the channel is specified in the below example.
103
+ DataEval can be installed in a Conda/Mamba environment using the provided
104
+ `environment.yaml` file. As some dependencies are installed from the `pytorch`
105
+ channel, the channel is specified in the below example.
88
106
 
89
107
  ```bash
90
108
  micromamba create -f environment\environment.yaml -c pytorch
@@ -92,7 +110,9 @@ micromamba create -f environment\environment.yaml -c pytorch
92
110
 
93
111
  ### **Installing from GitHub**
94
112
 
95
- To install DataEval from source locally on Ubuntu, you will need `git-lfs` to download larger, binary source files and `poetry` for project dependency management.
113
+ To install DataEval from source locally on Ubuntu, you will need `git-lfs` to
114
+ download larger, binary source files and `poetry` for project dependency
115
+ management.
96
116
 
97
117
  ```bash
98
118
  sudo apt-get install git-lfs
@@ -112,7 +132,9 @@ Install DataEval with optional dependencies for development.
112
132
  poetry install --all-extras --with dev
113
133
  ```
114
134
 
115
- Now that DataEval is installed, you can run commands in the poetry virtual environment by prefixing shell commands with `poetry run`, or activate the virtual environment directly in the shell.
135
+ Now that DataEval is installed, you can run commands in the poetry virtual
136
+ environment by prefixing shell commands with `poetry run`, or activate the
137
+ virtual environment directly in the shell.
116
138
 
117
139
  ```bash
118
140
  poetry shell
@@ -131,7 +153,11 @@ If you have any questions, feel free to reach out to the people below:
131
153
 
132
154
  ### CDAO Funding Acknowledgement
133
155
 
134
- This material is based upon work supported by the Chief Digital and Artificial Intelligence Office under Contract No. W519TC-23-9-2033. The views and conclusions contained herein are those of the author(s) and should not be interpreted as necessarily representing the official policies or endorsements, either expressed or implied, of the U.S. Government.
156
+ This material is based upon work supported by the Chief Digital and Artificial
157
+ Intelligence Office under Contract No. W519TC-23-9-2033. The views and
158
+ conclusions contained herein are those of the author(s) and should not be
159
+ interpreted as necessarily representing the official policies or endorsements,
160
+ either expressed or implied, of the U.S. Government.
135
161
 
136
162
  <!-- end acknowledgement -->
137
163
 
@@ -1,4 +1,4 @@
1
- dataeval/__init__.py,sha256=TSINwIPlGIGiYd66kY8gnBnEpBhcgWm7_029htFBgv8,1474
1
+ dataeval/__init__.py,sha256=vqyenyxYGE0OXW3C8PC1YDZRak1uLFIYd45-vh9qafQ,1474
2
2
  dataeval/detectors/__init__.py,sha256=iifG-Z08mH5B4QhkKtAieDGJBKldKvmCXpDQJD9qVY8,206
3
3
  dataeval/detectors/drift/__init__.py,sha256=wO294Oz--l0GuZTAkBpyGwZphbQsot57HoiEX6kjNOc,652
4
4
  dataeval/detectors/drift/base.py,sha256=8zHUnUpmgpWMzDv5C-tUX61lbpDjhJ-eAIiNxaNvWP8,14469
@@ -12,7 +12,7 @@ dataeval/detectors/linters/__init__.py,sha256=CZV5naeYQYL3sHXO_CXB26AXkyTeKHI-TM
12
12
  dataeval/detectors/linters/clusterer.py,sha256=V-bNs4ut2E6SIqU4MR1Y96WBZcs4cavQhvXBB0vFZPw,20937
13
13
  dataeval/detectors/linters/duplicates.py,sha256=Ba-Nmbjqg_HDMlEBqlWW1aFO_BA-HSc-uWHc3cmI394,5620
14
14
  dataeval/detectors/linters/merged_stats.py,sha256=X-bDTwjyR8RuVmzxLaHZmQ5nI3oOWvsqVlitdSncapk,1355
15
- dataeval/detectors/linters/outliers.py,sha256=aGGGOJKs0FTObQtj1m-ench0MHADOhrhC8idf1wRB0s,13786
15
+ dataeval/detectors/linters/outliers.py,sha256=o0LtAHdazLfj5GM2HcVDjVY_AfSU5GpBUjxHPC9VfIc,13728
16
16
  dataeval/detectors/ood/__init__.py,sha256=Ws6_un4pFWNknki7Bp7qjrslZVB9pYNE-K72u2lF65k,291
17
17
  dataeval/detectors/ood/ae.py,sha256=SL8oKTERhMwaZTQWwDhQQ6H07UKj8ozXqEWO3TaOAos,2151
18
18
  dataeval/detectors/ood/base.py,sha256=-ApcC9lyZJAgk-joMpLXF20sJqtvlAugg-W18TcAsEw,3010
@@ -28,7 +28,7 @@ dataeval/metrics/bias/__init__.py,sha256=SIg4Qxza9BqXyKNQLIY0bpqoFvZfK5-GaejpTH6
28
28
  dataeval/metrics/bias/balance.py,sha256=B1sPackyodiBct9Hs88BR4nJde_R61JyjwSBIG_CFug,9171
29
29
  dataeval/metrics/bias/coverage.py,sha256=igVDWJSrO2MvaTEiDUhVzVWPGNB1QOZvngCi8UF0RwA,5746
30
30
  dataeval/metrics/bias/diversity.py,sha256=nF1y2FaQIU0yHQtckoddjqoty2hsVVMqwaXWHRdGfqA,8521
31
- dataeval/metrics/bias/parity.py,sha256=rzi7Z0Z6injCaj2vkbSsZvbKMfk1EN648oKinv5y5Dk,12760
31
+ dataeval/metrics/bias/parity.py,sha256=2gSpXkg6ASnkywRTqqx3b3k1T5Qg1Jm-ihMKNZgEwys,12732
32
32
  dataeval/metrics/estimators/__init__.py,sha256=oY_9jX7V-Kg7-4KpvMNB4rUhsk8QTA0DIoM8d2VtVIg,380
33
33
  dataeval/metrics/estimators/ber.py,sha256=vcndXr0PNLRlYz7u7K74f-B5g3DnUkaTO_WigGdj0cg,5012
34
34
  dataeval/metrics/estimators/divergence.py,sha256=joqqlH0AQFibJkHCCb7i7dMJIGF28fmZIR-tGupQQJQ,4247
@@ -39,7 +39,7 @@ dataeval/metrics/stats/boxratiostats.py,sha256=PS1wvWwhTCMJX56erfPW-BZymXrevvXnK
39
39
  dataeval/metrics/stats/datasetstats.py,sha256=mt5t5WhlVI7mo56dmhqgnk1eH8oBV7dahgmqkFDcKo0,7387
40
40
  dataeval/metrics/stats/dimensionstats.py,sha256=AlPor23dUH718jFNiVNedHQVaQzN-6OKQEVDQbnGE50,4027
41
41
  dataeval/metrics/stats/hashstats.py,sha256=5nNSJ3Tl8gPqpYlWpxl7EHfW6pJd1BtbXYUiuGgH4Eo,5070
42
- dataeval/metrics/stats/labelstats.py,sha256=v9EAg-9h0OtuoU0r3K5TJbHj87fjmnWnNdtg0EPp8co,7030
42
+ dataeval/metrics/stats/labelstats.py,sha256=MW6kB7V8pdIc7yHdXzRwlD6xSl6SYZonNsLUPKAVILI,6992
43
43
  dataeval/metrics/stats/pixelstats.py,sha256=tfvu0tYPgDS0jCCSY2sZ2Ice5r1nNuKx-LYXxZQCw7s,4220
44
44
  dataeval/metrics/stats/visualstats.py,sha256=pEQnAPFg-zQ1U5orwF0-U7kfHuZGjMJDsdEMAoDZd4I,4634
45
45
  dataeval/output.py,sha256=Dyfv1xlrwSbCe7HdDyq8t-kiIRJbBeaMEmMROr1FrVQ,4034
@@ -50,7 +50,7 @@ dataeval/utils/dataset/datasets.py,sha256=7tSqN3d8UncqmXh4eiEwarXgVxc4sMuIKPTqBC
50
50
  dataeval/utils/dataset/read.py,sha256=Q_RaNTFXhkMsx3PrgJEIySdHAA-QxGuih6eq6mnJv-4,1524
51
51
  dataeval/utils/dataset/split.py,sha256=1vNy5I1zZx-LIf8B0y57dUaO_UdVd1hyJggUANkwNtM,18958
52
52
  dataeval/utils/image.py,sha256=AQljELyMFkYsf2AoNOH5dZG8DYE4hPw0MCk85eIXqAw,1926
53
- dataeval/utils/metadata.py,sha256=SjYPXvM7x_3OyQbdfn4WsViqMplEjRxTdz8tjSJEP3E,22497
53
+ dataeval/utils/metadata.py,sha256=tRcXgJsM1l7vt_naNJj8g8_EHD_AB5MGi1uWxqZsA6M,27431
54
54
  dataeval/utils/plot.py,sha256=YyFL1KoJgnl2Bip7m73WVBJa6zbsBnn5c1b3skFfUrA,7068
55
55
  dataeval/utils/shared.py,sha256=xvF3VLfyheVwJtdtDrneOobkKf7t-JTmf_w91FWXmqo,3616
56
56
  dataeval/utils/torch/__init__.py,sha256=dn5mjCrFp0b1aL_UEURhONU0Ag0cmXoTOBSGagpkTiA,325
@@ -61,7 +61,7 @@ dataeval/utils/torch/models.py,sha256=Df3B_9x5uu-Y5ZOyhRZYpKJnDvxt0hgMeJLy1E4oxp
61
61
  dataeval/utils/torch/trainer.py,sha256=Qay0LK63RuyoGYiJ5zI2C5BVym309ORvp6shhpcrIU4,5589
62
62
  dataeval/workflows/__init__.py,sha256=L9yfBipNFGnYuN2JbMknIHDvziwfa2XAGFnOwifZbls,216
63
63
  dataeval/workflows/sufficiency.py,sha256=jf53J1PAlfRHSjGpMCWRJzImitLtCQvTMCaMm28ZuPM,18675
64
- dataeval-0.76.0.dist-info/LICENSE.txt,sha256=uAooygKWvX6NbU9Ran9oG2msttoG8aeTeHSTe5JeCnY,1061
65
- dataeval-0.76.0.dist-info/METADATA,sha256=zk12Bkp0R6Glx-VSrG7ip45aTU4y6i_P_mPw2c_SQ6w,5140
66
- dataeval-0.76.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
67
- dataeval-0.76.0.dist-info/RECORD,,
64
+ dataeval-0.76.1.dist-info/LICENSE.txt,sha256=uAooygKWvX6NbU9Ran9oG2msttoG8aeTeHSTe5JeCnY,1061
65
+ dataeval-0.76.1.dist-info/METADATA,sha256=w02IzEy_S5kgRZFRGbWayMg98uFdn3jJT4Gl6MOQzek,5196
66
+ dataeval-0.76.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
67
+ dataeval-0.76.1.dist-info/RECORD,,