dataeval 0.72.2__py3-none-any.whl → 0.73.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. dataeval/__init__.py +3 -3
  2. dataeval/detectors/__init__.py +1 -1
  3. dataeval/detectors/drift/__init__.py +1 -1
  4. dataeval/detectors/drift/base.py +2 -2
  5. dataeval/detectors/linters/clusterer.py +1 -1
  6. dataeval/detectors/ood/__init__.py +1 -1
  7. dataeval/detectors/ood/ae.py +14 -6
  8. dataeval/detectors/ood/aegmm.py +14 -6
  9. dataeval/detectors/ood/base.py +9 -3
  10. dataeval/detectors/ood/llr.py +22 -16
  11. dataeval/detectors/ood/vae.py +14 -6
  12. dataeval/detectors/ood/vaegmm.py +14 -6
  13. dataeval/interop.py +9 -7
  14. dataeval/metrics/bias/balance.py +50 -44
  15. dataeval/metrics/bias/coverage.py +38 -6
  16. dataeval/metrics/bias/diversity.py +117 -65
  17. dataeval/metrics/bias/metadata.py +225 -60
  18. dataeval/metrics/bias/parity.py +68 -54
  19. dataeval/utils/__init__.py +4 -3
  20. dataeval/utils/lazy.py +26 -0
  21. dataeval/utils/metadata.py +258 -0
  22. dataeval/utils/shared.py +1 -1
  23. dataeval/utils/split_dataset.py +12 -6
  24. dataeval/utils/tensorflow/_internal/gmm.py +8 -2
  25. dataeval/utils/tensorflow/_internal/loss.py +20 -11
  26. dataeval/utils/tensorflow/_internal/{pixelcnn.py → models.py} +371 -77
  27. dataeval/utils/tensorflow/_internal/trainer.py +12 -5
  28. dataeval/utils/tensorflow/_internal/utils.py +70 -71
  29. dataeval/utils/torch/datasets.py +2 -2
  30. dataeval/workflows/__init__.py +1 -1
  31. {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/METADATA +3 -3
  32. {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/RECORD +34 -33
  33. dataeval/utils/tensorflow/_internal/autoencoder.py +0 -316
  34. {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/LICENSE.txt +0 -0
  35. {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/WHEEL +0 -0
dataeval/metrics/bias/parity.py CHANGED
@@ -11,6 +11,7 @@ from numpy.typing import ArrayLike, NDArray
  from scipy.stats import chi2_contingency, chisquare

  from dataeval.interop import to_numpy
+ from dataeval.metrics.bias.metadata import CLASS_LABEL, preprocess_metadata
  from dataeval.output import OutputMetadata, set_metadata

  TData = TypeVar("TData", np.float64, NDArray[np.float64])
@@ -27,10 +28,13 @@ class ParityOutput(Generic[TData], OutputMetadata):
          chi-squared score(s) of the test
      p_value : np.float64 | NDArray[np.float64]
          p-value(s) of the test
+     metadata_names : list[str] | None
+         Names of each metadata factor
      """

      score: TData
      p_value: TData
+     metadata_names: list[str] | None


  def digitize_factor_bins(continuous_values: NDArray[Any], bins: int, factor_name: str) -> NDArray[np.intp]:
@@ -39,16 +43,16 @@ def digitize_factor_bins(continuous_values: NDArray[Any], bins: int, factor_name

      Parameters
      ----------
-     continuous_values: NDArray
+     continuous_values : NDArray
          The values to be digitized.
-     bins: int
+     bins : int
          The number of bins for the discrete values that continuous_values will be digitized into.
-     factor_name: str
+     factor_name : str
          The name of the factor to be digitized.

      Returns
      -------
-     NDArray
+     NDArray[np.intp]
          The digitized values
      """

@@ -66,17 +70,21 @@ def digitize_factor_bins(continuous_values: NDArray[Any], bins: int, factor_name


  def format_discretize_factors(
-     data_factors: Mapping[str, NDArray[Any]], continuous_factor_bincounts: Mapping[str, int]
+     data: NDArray[Any],
+     names: list[str],
+     is_categorical: list[bool],
+     continuous_factor_bincounts: Mapping[str, int] | None,
  ) -> dict[str, NDArray[Any]]:
      """
      Sets up the internal list of metadata factors.

      Parameters
      ----------
-     data_factors: Dict[str, NDArray]
+     data : NDArray
          The dataset factors, which are per-image attributes including class label and metadata.
-         Each key of dataset_factors is a factor, whose value is the per-image factor values.
-     continuous_factor_bincounts : Dict[str, int]
+     names : list[str]
+         The class label
+     continuous_factor_bincounts : Mapping[str, int] or None
          The factors in data_factors that have continuous values and the array of bin counts to
          discretize values into. All factors are treated as having discrete values unless they
          are specified as keys in this dictionary. Each element of this array must occur as a key
@@ -89,30 +97,33 @@ def format_discretize_factors(
          Each key is a metadata factor, whose value is the discrete per-image factor values.
      """

-     invalid_keys = set(continuous_factor_bincounts.keys()) - set(data_factors.keys())
-     if invalid_keys:
-         raise KeyError(
-             f"The continuous factor(s) {invalid_keys} do not exist in data_factors. Delete these "
-             "keys from `continuous_factor_names` or add corresponding entries to `data_factors`."
-         )
+     if continuous_factor_bincounts:
+         invalid_keys = set(continuous_factor_bincounts.keys()) - set(names)
+         if invalid_keys:
+             raise KeyError(
+                 f"The continuous factor(s) {invalid_keys} do not exist in data_factors. Delete these "
+                 "keys from `continuous_factor_names` or add corresponding entries to `data_factors`."
+             )

+     warn = []
      metadata_factors = {}
-
-     # make sure each factor has the same number of entries
-     lengths = []
-     for arr in data_factors.values():
-         lengths.append(arr.shape)
-
-     if lengths[1:] != lengths[:-1]:
-         raise ValueError("The lengths of each entry in the dictionary are not equal." f" Found lengths {lengths}")
-
-     metadata_factors = {
-         name: val
-         if name not in continuous_factor_bincounts
-         else digitize_factor_bins(val, continuous_factor_bincounts[name], name)
-         for name, val in data_factors.items()
-         if name != "class"
-     }
+     for i, name in enumerate(names):
+         if name == CLASS_LABEL:
+             continue
+         if continuous_factor_bincounts and name in continuous_factor_bincounts:
+             metadata_factors[name] = digitize_factor_bins(data[:, i], continuous_factor_bincounts[name], name)
+         elif not is_categorical[i]:
+             warn.append(name)
+             metadata_factors[name] = data[:, i]
+         else:
+             metadata_factors[name] = data[:, i]
+
+     if warn:
+         warnings.warn(
+             f"The following factors appear to be continuous but did not have the desired number of bins specified: \n\
+             {warn}",
+             UserWarning,
+         )

      return metadata_factors

@@ -126,14 +137,14 @@ def normalize_expected_dist(expected_dist: NDArray[Any], observed_dist: NDArray[

      Parameters
      ----------
-     expected_dist : np.ndarray
+     expected_dist : NDArray
          The expected label distribution. This array represents the anticipated distribution of labels.
-     observed_dist : np.ndarray
+     observed_dist : NDArray
          The observed label distribution. This array represents the actual distribution of labels in the dataset.

      Returns
      -------
-     np.ndarray
+     NDArray
          The normalized expected distribution, scaled to have the same sum as the observed distribution.

      Raises
@@ -173,6 +184,8 @@ def validate_dist(label_dist: NDArray[Any], label_name: str) -> None:
      ----------
      label_dist : NDArray
          Array representing label distributions
+     label_name : str
+         String representing label name

      Raises
      ------
@@ -213,7 +226,7 @@ def label_parity(
          List of class labels in the expected dataset
      observed_labels : ArrayLike
          List of class labels in the observed dataset
-     num_classes : int | None, default None
+     num_classes : int or None, default None
          The number of unique classes in the datasets. If not provided, the function will infer it
          from the set of unique labels in expected_labels and observed_labels

@@ -247,7 +260,7 @@ def label_parity(
      >>> expected_labels = np_random_gen.choice([0, 1, 2, 3, 4], (100))
      >>> observed_labels = np_random_gen.choice([2, 3, 0, 4, 1], (100))
      >>> label_parity(expected_labels, observed_labels)
-     ParityOutput(score=14.007374204742625, p_value=0.0072715574616218)
+     ParityOutput(score=14.007374204742625, p_value=0.0072715574616218, metadata_names=None)
      """

      # Calculate
@@ -278,13 +291,13 @@ def label_parity(
          )

      cs, p = chisquare(f_obs=observed_dist, f_exp=expected_dist)
-     return ParityOutput(cs, p)
+     return ParityOutput(cs, p, None)


  @set_metadata()
  def parity(
      class_labels: ArrayLike,
-     data_factors: Mapping[str, ArrayLike],
+     metadata: Mapping[str, ArrayLike],
      continuous_factor_bincounts: Mapping[str, int] | None = None,
  ) -> ParityOutput[NDArray[np.float64]]:
      """
@@ -297,14 +310,14 @@ def parity(

      Parameters
      ----------
-     class_labels: ArrayLike
+     class_labels : ArrayLike
          List of class labels for each image
-     data_factors: Mapping[str, ArrayLike]
+     metadata : Mapping[str, ArrayLike]
          The dataset factors, which are per-image metadata attributes.
          Each key of dataset_factors is a factor, whose value is the per-image factor values.
-     continuous_factor_bincounts : Mapping[str, int] | None, default None
+     continuous_factor_bincounts : Mapping[str, int] or None, default None
          A dictionary specifying the number of bins for discretizing the continuous factors.
-         The keys should correspond to the names of continuous factors in `data_factors`,
+         The keys should correspond to the names of continuous factors in `metadata`,
          and the values should be the number of bins to use for discretization.
          If not provided, no discretization is applied.

@@ -337,42 +350,43 @@ def parity(
      Randomly creating some "continuous" and categorical variables using ``np.random.default_rng``

      >>> labels = np_random_gen.choice([0, 1, 2], (100))
-     >>> data_factors = {
+     >>> metadata = {
      ...     "age": np_random_gen.choice([25, 30, 35, 45], (100)),
      ...     "income": np_random_gen.choice([50000, 65000, 80000], (100)),
      ...     "gender": np_random_gen.choice(["M", "F"], (100)),
      ... }
      >>> continuous_factor_bincounts = {"age": 4, "income": 3}
-     >>> parity(labels, data_factors, continuous_factor_bincounts)
-     ParityOutput(score=array([7.35731943, 5.46711299, 0.51506212]), p_value=array([0.28906231, 0.24263543, 0.77295762]))
-     """
+     >>> parity(labels, metadata, continuous_factor_bincounts)
+     ParityOutput(score=array([7.35731943, 5.46711299, 0.51506212]), p_value=array([0.28906231, 0.24263543, 0.77295762]), metadata_names=['age', 'income', 'gender'])
+     """  # noqa: E501
      if len(np.shape(class_labels)) > 1:
          raise ValueError(
              f"Got class labels with {len(np.shape(class_labels))}-dimensional",
              f" shape {np.shape(class_labels)}, but expected a 1-dimensional array.",
          )

-     data_factors_np = {k: to_numpy(v) for k, v in data_factors.items()}
-     continuous_factor_bincounts = continuous_factor_bincounts if continuous_factor_bincounts else {}
+     data, names, is_categorical, _ = preprocess_metadata(class_labels, metadata)
+
+     factors = format_discretize_factors(data, names, is_categorical, continuous_factor_bincounts)

-     labels = to_numpy(class_labels)
-     factors = format_discretize_factors(data_factors_np, continuous_factor_bincounts)
+     # unique class labels
+     class_idx = names.index(CLASS_LABEL)
+     u_cls = np.unique(data[:, class_idx])

      chi_scores = np.zeros(len(factors))
      p_values = np.zeros(len(factors))
-     n_cls = len(np.unique(labels))
      not_enough_data = {}
      for i, (current_factor_name, factor_values) in enumerate(factors.items()):
          unique_factor_values = np.unique(factor_values)
-         contingency_matrix = np.zeros((len(unique_factor_values), n_cls))
+         contingency_matrix = np.zeros((len(unique_factor_values), u_cls.size))
          # Builds a contingency matrix where entry at index (r,c) represents
          # the frequency of current_factor_name achieving value unique_factor_values[r]
          # at a data point with class c.

          # TODO: Vectorize this nested for loop
          for fi, factor_value in enumerate(unique_factor_values):
-             for label in range(n_cls):
-                 with_both = np.bitwise_and((labels == label), factor_values == factor_value)
+             for label in u_cls:
+                 with_both = np.bitwise_and((data[:, class_idx] == label), factor_values == factor_value)
                  contingency_matrix[fi, label] = np.sum(with_both)
                  if 0 < contingency_matrix[fi, label] < 5:
                      if current_factor_name not in not_enough_data:
@@ -414,4 +428,4 @@ def parity(
              UserWarning,
          )

-     return ParityOutput(chi_scores, p_values)
+     return ParityOutput(chi_scores, p_values, list(metadata.keys()))
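Note: the hunks above rename parity()'s second parameter from data_factors to metadata and add a metadata_names field to ParityOutput. A minimal migration sketch, assuming parity is imported from the module path shown in this diff (the sample data below is illustrative only, not taken from the package):

import numpy as np
from dataeval.metrics.bias.parity import parity

rng = np.random.default_rng(0)
labels = rng.choice([0, 1, 2], (100))
metadata = {
    "age": rng.choice([25, 30, 35, 45], (100)),
    "income": rng.choice([50000, 65000, 80000], (100)),
}

# 0.72.2: parity(labels, data_factors, continuous_factor_bincounts)
# 0.73.1: the second positional argument is now named `metadata`
result = parity(labels, metadata, continuous_factor_bincounts={"age": 4, "income": 3})
print(result.metadata_names)  # ['age', 'income'] -- the field added to ParityOutput in this release
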
dataeval/utils/__init__.py CHANGED
@@ -5,16 +5,17 @@ metrics. Currently DataEval supports both :term:`TensorFlow` and PyTorch backend
  """

  from dataeval import _IS_TENSORFLOW_AVAILABLE, _IS_TORCH_AVAILABLE
+ from dataeval.utils.metadata import merge_metadata
  from dataeval.utils.split_dataset import split_dataset

- __all__ = ["split_dataset"]
+ __all__ = ["split_dataset", "merge_metadata"]

- if _IS_TORCH_AVAILABLE:  # pragma: no cover
+ if _IS_TORCH_AVAILABLE:
      from dataeval.utils import torch

      __all__ += ["torch"]

- if _IS_TENSORFLOW_AVAILABLE:  # pragma: no cover
+ if _IS_TENSORFLOW_AVAILABLE:
      from dataeval.utils import tensorflow

      __all__ += ["tensorflow"]
dataeval/utils/lazy.py ADDED
@@ -0,0 +1,26 @@
+ from __future__ import annotations
+
+ from functools import cached_property
+ from importlib import import_module
+ from typing import Any
+
+
+ class LazyModule:
+     def __init__(self, name: str) -> None:
+         self._name = name
+
+     def __getattr__(self, key: str) -> Any:
+         return getattr(self._module, key)
+
+     @cached_property
+     def _module(self):
+         return import_module(self._name)
+
+
+ LAZY_MODULES: dict[str, LazyModule] = {}
+
+
+ def lazyload(name: str) -> LazyModule:
+     if name not in LAZY_MODULES:
+         LAZY_MODULES[name] = LazyModule(name)
+     return LAZY_MODULES[name]
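Note: LazyModule defers the real import until the first attribute access (the cached_property resolves and caches it once), and lazyload memoizes one proxy per module name. A small sketch of the pattern, using the standard-library json module purely as a stand-in:

from dataeval.utils.lazy import lazyload

json = lazyload("json")                    # no import has happened yet
print(json.dumps({"release": "0.73.1"}))   # first attribute access triggers import_module("json")
assert lazyload("json") is json            # LAZY_MODULES caches one proxy per name
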
dataeval/utils/metadata.py ADDED
@@ -0,0 +1,258 @@
+ from __future__ import annotations
+
+ __all__ = ["merge_metadata"]
+
+ import warnings
+ from typing import Any, Iterable, Mapping, TypeVar, overload
+
+ import numpy as np
+ from numpy.typing import NDArray
+
+ T = TypeVar("T")
+
+
+ def _try_cast(v: Any, t: type[T]) -> T | None:
+     """Casts a value to a type or returns None if unable"""
+     try:
+         return t(v)  # type: ignore
+     except (TypeError, ValueError):
+         return None
+
+
+ @overload
+ def _convert_type(data: list[str]) -> list[int] | list[float] | list[str]: ...
+ @overload
+ def _convert_type(data: str) -> int | float | str: ...
+
+
+ def _convert_type(data: list[str] | str) -> list[int] | list[float] | list[str] | int | float | str:
+     """
+     Converts a value or a list of values to the simplest form possible, in preferred order of `int`,
+     `float`, or `string`.
+
+     Parameters
+     ----------
+     data : list[str] | str
+         A list of values or a single value
+
+     Returns
+     -------
+     list[int | float | str] | int | float | str
+         The same values converted to the numerical type if possible
+     """
+     if not isinstance(data, list):
+         value = _try_cast(data, float)
+         return str(data) if value is None else int(value) if value.is_integer() else value
+
+     converted = []
+     TYPE_MAP = {int: 0, float: 1, str: 2}
+     max_type = 0
+     for value in data:
+         value = _convert_type(value)
+         max_type = max(max_type, TYPE_MAP.get(type(value), 2))
+         converted.append(value)
+     for i in range(len(converted)):
+         converted[i] = list(TYPE_MAP)[max_type](converted[i])
+     return converted
+
+
+ def _get_key_indices(keys: Iterable[tuple[str, ...]]) -> dict[tuple[str, ...], int]:
+     """
+     Finds indices to minimize unique tuple keys
+
+     Parameters
+     ----------
+     keys : Iterable[tuple[str, ...]]
+         Collection of unique expanded tuple keys
+
+     Returns
+     -------
+     dict[tuple[str, ...], int]
+         Mapping of tuple keys to starting index
+     """
+     indices = {k: -1 for k in keys}
+     ks = list(keys)
+     while len(ks) > 0:
+         seen: dict[tuple[str, ...], list[tuple[str, ...]]] = {}
+         for k in ks:
+             seen.setdefault(k[indices[k] :], []).append(k)
+         ks.clear()
+         for sk in seen.values():
+             if len(sk) > 1:
+                 ks.extend(sk)
+                 for k in sk:
+                     indices[k] -= 1
+     return indices
+
+
+ def _flatten_dict_inner(
+     d: Mapping[str, Any], parent_keys: tuple[str, ...], size: int | None = None, nested: bool = False
+ ) -> tuple[dict[tuple[str, ...], Any], int | None]:
+     """
+     Recursive internal function for flattening a dictionary.
+
+     Parameters
+     ----------
+     d : dict[str, Any]
+         Dictionary to flatten
+     parent_keys : tuple[str, ...]
+         Parent keys to the current dictionary being flattened
+     size : int or None, default None
+         Tracking int for length of lists
+     nested : bool, default False
+         Tracking if inside a list
+
+     Returns
+     -------
+     tuple[dict[tuple[str, ...], Any], int | None]
+         - [0]: Dictionary of flattened values with the keys reformatted as a hierarchical tuple of strings
+         - [1]: Size, if any, of the current list of values
+     """
+     items: dict[tuple[str, ...], Any] = {}
+     for k, v in d.items():
+         new_keys: tuple[str, ...] = parent_keys + (k,)
+         if isinstance(v, dict):
+             fd, size = _flatten_dict_inner(v, new_keys, size=size, nested=nested)
+             items.update(fd)
+         elif isinstance(v, (list, tuple)):
+             if not nested and (size is None or size == len(v)):
+                 size = len(v)
+                 if all(isinstance(i, dict) for i in v):
+                     for sub_dict in v:
+                         fd, size = _flatten_dict_inner(sub_dict, new_keys, size=size, nested=True)
+                         for fk, fv in fd.items():
+                             items.setdefault(fk, []).append(fv)
+                 else:
+                     items[new_keys] = v
+             else:
+                 warnings.warn(f"Dropping nested list found in '{parent_keys + (k, )}'.")
+         else:
+             items[new_keys] = v
+     return items, size
+
+
+ def _flatten_dict(d: Mapping[str, Any], sep: str, ignore_lists: bool, fully_qualified: bool) -> dict[str, Any]:
+     """
+     Flattens a dictionary and converts values to numeric values when possible.
+
+     Parameters
+     ----------
+     d : dict[str, Any]
+         Dictionary to flatten
+     sep : str
+         String separator to use when concatenating key names
+     ignore_lists : bool
+         Option to skip expanding lists within metadata
+     fully_qualified : bool
+         Option to return dictionary keys full qualified instead of minimized
+
+     Returns
+     -------
+     dict[str, Any]
+         A flattened dictionary
+     """
+     expanded, size = _flatten_dict_inner(d, parent_keys=(), nested=ignore_lists)
+
+     output = {}
+     if fully_qualified:
+         expanded = {sep.join(k): v for k, v in expanded.items()}
+     else:
+         keys = _get_key_indices(expanded)
+         expanded = {sep.join(k[keys[k] :]): v for k, v in expanded.items()}
+     for k, v in expanded.items():
+         cv = _convert_type(v)
+         if isinstance(cv, list) and len(cv) == size:
+             output[k] = cv
+         elif not isinstance(cv, list):
+             output[k] = cv if not size else [cv] * size
+     return output
+
+
+ def _is_metadata_dict_of_dicts(metadata: Mapping) -> bool:
+     """EXPERIMENTAL: Attempt to detect if metadata is a dict of dicts"""
+     # single dict
+     if len(metadata) < 2:
+         return False
+
+     # dict of non dicts
+     keys = list(metadata)
+     if not isinstance(metadata[keys[0]], Mapping):
+         return False
+
+     # dict of dicts with matching keys
+     return set(metadata[keys[0]]) == set(metadata[keys[1]])
+
+
+ def merge_metadata(
+     metadata: Iterable[Mapping[str, Any]],
+     ignore_lists: bool = False,
+     fully_qualified: bool = False,
+     as_numpy: bool = False,
+ ) -> dict[str, list[Any]] | dict[str, NDArray[Any]]:
+     """
+     Merges a collection of metadata dictionaries into a single flattened dictionary of keys and values.
+
+     Nested dictionaries are flattened, and lists are expanded. Nested lists are dropped as the
+     expanding into multiple hierarchical trees is not supported.
+
+     Parameters
+     ----------
+     metadata : Iterable[Mapping[str, Any]]
+         Iterable collection of metadata dictionaries to flatten and merge
+     ignore_lists : bool, default False
+         Option to skip expanding lists within metadata
+     fully_qualified : bool, default False
+         Option to return dictionary keys full qualified instead of minimized
+     as_numpy : bool, default False
+         Option to return results as lists or NumPy arrays
+
+     Returns
+     -------
+     dict[str, list[Any]] | dict[str, NDArray[Any]]
+         A single dictionary containing the flattened data as lists or NumPy arrays
+
+     Note
+     ----
+     Nested lists of values and inconsistent keys are dropped in the merged metadata dictionary
+
+     Example
+     -------
+     >>> list_metadata = [{"common": 1, "target": [{"a": 1, "b": 3}, {"a": 2, "b": 4}], "source": "example"}]
+     >>> merge_metadata(list_metadata)
+     {'common': [1, 1], 'a': [1, 2], 'b': [3, 4], 'source': ['example', 'example']}
+     """
+     merged: dict[str, list[Any]] = {}
+     isect: set[str] = set()
+     union: set[str] = set()
+     keys: list[str] | None = None
+     dicts: list[Mapping[str, Any]]
+
+     # EXPERIMENTAL
+     if isinstance(metadata, Mapping) and _is_metadata_dict_of_dicts(metadata):
+         warnings.warn("Experimental processing for dict of dicts.")
+         keys = [str(k) for k in metadata]
+         dicts = list(metadata.values())
+         ignore_lists = True
+     else:
+         dicts = list(metadata)
+
+     for d in dicts:
+         flattened = _flatten_dict(d, sep="_", ignore_lists=ignore_lists, fully_qualified=fully_qualified)
+         isect = isect.intersection(flattened.keys()) if isect else set(flattened.keys())
+         union = union.union(flattened.keys())
+         for k, v in flattened.items():
+             merged.setdefault(k, []).extend(flattened[k]) if isinstance(v, list) else merged.setdefault(k, []).append(v)
+
+     if len(union) > len(isect):
+         warnings.warn(f"Inconsistent metadata keys found. Dropping {union - isect} from metadata.")
+
+     output: dict[str, Any] = {}
+
+     if keys:
+         output["keys"] = np.array(keys) if as_numpy else keys
+
+     for k in (key for key in merged if key in isect):
+         cv = _convert_type(merged[k])
+         output[k] = np.array(cv) if as_numpy else cv
+
+     return output
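Note: beyond the docstring example above, keys that do not appear in every input dictionary are dropped with a warning, and as_numpy=True returns arrays instead of lists. A hedged sketch based on the implementation above (the sample data and the commented output are illustrative expectations, not package output):

from dataeval.utils.metadata import merge_metadata

metadata = [
    {"scene": "city", "targets": [{"size": 10}, {"size": 12}]},
    {"scene": "desert", "targets": [{"size": 8}, {"size": 9}], "sensor": "EO"},
]
merged = merge_metadata(metadata, as_numpy=True)
# "sensor" is missing from the first dictionary, so it is dropped with a UserWarning;
# list-of-dict values are expanded, so "scene" repeats once per target, giving roughly:
# {'scene': array(['city', 'city', 'desert', 'desert'], dtype='<U6'), 'size': array([10, 12, 8, 9])}
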
dataeval/utils/shared.py CHANGED
@@ -95,7 +95,7 @@ def get_classes_counts(labels: NDArray[np.int_]) -> tuple[int, int]:
      M = len(classes)
      if M < 2:
          raise ValueError("Label vector contains less than 2 classes!")
-     N = np.sum(counts).astype(int)
+     N = int(np.sum(counts))
      return M, N


dataeval/utils/split_dataset.py CHANGED
@@ -144,7 +144,7 @@ def check_groups(group_ids: NDArray[np.int_], num_partitions: int) -> bool:
      ----------
      group_ids : np.ndarray
          Identifies the group to which a sample at the same index belongs.
-     num_partitions: int
+     num_partitions : int
          How many total (train, val) folds will be generated (+1 if also specifying a test fold).

      Warns
@@ -242,12 +242,12 @@ def get_group_ids(metadata: dict[str, Any], group_names: list[str], num_samples:

      Returns
      -------
-     group_ids: np.ndarray
+     group_ids : np.ndarray
          group identifiers from metadata
      """
      features2group = {k: np.array(v) for k, v in metadata.items() if k in group_names}
      if not features2group:
-         return np.zeros(num_samples, dtype=int)
+         return np.zeros(num_samples, dtype=np.int_)
      for name, feature in features2group.items():
          if len(feature) != num_samples:
              raise IndexError(f"""Feature length does not match number of labels.
@@ -300,7 +300,13 @@ def make_splits(
      splits = splitter.split(index, labels)
      for train_idx, eval_idx in splits:
          test_ratio = len(eval_idx) / index.shape[0]
-         split_defs.append({"train": train_idx.astype(int), "eval": eval_idx.astype(int), "eval_frac": test_ratio})
+         split_defs.append(
+             {
+                 "train": train_idx.astype(np.int_),
+                 "eval": eval_idx.astype(np.int_),
+                 "eval_frac": test_ratio,
+             }
+         )
      return split_defs


@@ -318,9 +324,9 @@ def find_best_split(
      split_defs : list[dict]
          List of dictionaries, which specifying train index, validation index, and the ratio of
          validation to all data.
-     stratified: bool
+     stratified : bool
          If True, maintain dataset class balance within each train/val split
-     eval_frac: float
+     eval_frac : float
          Desired fraction of the dataset sequestered for evaluation

      Returns
dataeval/utils/tensorflow/_internal/gmm.py CHANGED
@@ -8,10 +8,16 @@ Licensed under Apache Software License (Apache 2.0)

  from __future__ import annotations

- from typing import NamedTuple
+ from typing import TYPE_CHECKING, NamedTuple

  import numpy as np
- import tensorflow as tf
+
+ from dataeval.utils.lazy import lazyload
+
+ if TYPE_CHECKING:
+     import tensorflow as tf
+ else:
+     tf = lazyload("tensorflow")


  class GaussianMixtureModelParams(NamedTuple):
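
Note: the TYPE_CHECKING/lazyload split shown above keeps the TensorFlow import out of module import time while preserving static type hints. A sketch of how the same pattern behaves in an arbitrary module (zeros_example is a hypothetical helper for illustration, not part of this diff):

from __future__ import annotations

from typing import TYPE_CHECKING

from dataeval.utils.lazy import lazyload

if TYPE_CHECKING:
    import tensorflow as tf      # evaluated only by static type checkers
else:
    tf = lazyload("tensorflow")  # runtime proxy; import deferred until first attribute access


def zeros_example(shape: tuple[int, ...]) -> "tf.Tensor":
    # TensorFlow is actually imported the first time this function runs
    return tf.zeros(shape)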