dataeval 0.72.2__py3-none-any.whl → 0.73.1__py3-none-any.whl
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in the public registry.
- dataeval/__init__.py +3 -3
- dataeval/detectors/__init__.py +1 -1
- dataeval/detectors/drift/__init__.py +1 -1
- dataeval/detectors/drift/base.py +2 -2
- dataeval/detectors/linters/clusterer.py +1 -1
- dataeval/detectors/ood/__init__.py +1 -1
- dataeval/detectors/ood/ae.py +14 -6
- dataeval/detectors/ood/aegmm.py +14 -6
- dataeval/detectors/ood/base.py +9 -3
- dataeval/detectors/ood/llr.py +22 -16
- dataeval/detectors/ood/vae.py +14 -6
- dataeval/detectors/ood/vaegmm.py +14 -6
- dataeval/interop.py +9 -7
- dataeval/metrics/bias/balance.py +50 -44
- dataeval/metrics/bias/coverage.py +38 -6
- dataeval/metrics/bias/diversity.py +117 -65
- dataeval/metrics/bias/metadata.py +225 -60
- dataeval/metrics/bias/parity.py +68 -54
- dataeval/utils/__init__.py +4 -3
- dataeval/utils/lazy.py +26 -0
- dataeval/utils/metadata.py +258 -0
- dataeval/utils/shared.py +1 -1
- dataeval/utils/split_dataset.py +12 -6
- dataeval/utils/tensorflow/_internal/gmm.py +8 -2
- dataeval/utils/tensorflow/_internal/loss.py +20 -11
- dataeval/utils/tensorflow/_internal/{pixelcnn.py → models.py} +371 -77
- dataeval/utils/tensorflow/_internal/trainer.py +12 -5
- dataeval/utils/tensorflow/_internal/utils.py +70 -71
- dataeval/utils/torch/datasets.py +2 -2
- dataeval/workflows/__init__.py +1 -1
- {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/METADATA +3 -3
- {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/RECORD +34 -33
- dataeval/utils/tensorflow/_internal/autoencoder.py +0 -316
- {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/WHEEL +0 -0
dataeval/metrics/bias/parity.py
CHANGED
@@ -11,6 +11,7 @@ from numpy.typing import ArrayLike, NDArray
 from scipy.stats import chi2_contingency, chisquare
 
 from dataeval.interop import to_numpy
+from dataeval.metrics.bias.metadata import CLASS_LABEL, preprocess_metadata
 from dataeval.output import OutputMetadata, set_metadata
 
 TData = TypeVar("TData", np.float64, NDArray[np.float64])
@@ -27,10 +28,13 @@ class ParityOutput(Generic[TData], OutputMetadata):
         chi-squared score(s) of the test
     p_value : np.float64 | NDArray[np.float64]
         p-value(s) of the test
+    metadata_names : list[str] | None
+        Names of each metadata factor
     """
 
     score: TData
     p_value: TData
+    metadata_names: list[str] | None
 
 
 def digitize_factor_bins(continuous_values: NDArray[Any], bins: int, factor_name: str) -> NDArray[np.intp]:
@@ -39,16 +43,16 @@ def digitize_factor_bins(continuous_values: NDArray[Any], bins: int, factor_name
 
     Parameters
     ----------
-    continuous_values: NDArray
+    continuous_values : NDArray
         The values to be digitized.
-    bins: int
+    bins : int
         The number of bins for the discrete values that continuous_values will be digitized into.
-    factor_name: str
+    factor_name : str
         The name of the factor to be digitized.
 
     Returns
     -------
-    NDArray
+    NDArray[np.intp]
         The digitized values
     """
 
@@ -66,17 +70,21 @@ def digitize_factor_bins(continuous_values: NDArray[Any], bins: int, factor_name
 
 
 def format_discretize_factors(
-
+    data: NDArray[Any],
+    names: list[str],
+    is_categorical: list[bool],
+    continuous_factor_bincounts: Mapping[str, int] | None,
 ) -> dict[str, NDArray[Any]]:
     """
     Sets up the internal list of metadata factors.
 
     Parameters
     ----------
-
+    data : NDArray
         The dataset factors, which are per-image attributes including class label and metadata.
-
-
+    names : list[str]
+        The class label
+    continuous_factor_bincounts : Mapping[str, int] or None
         The factors in data_factors that have continuous values and the array of bin counts to
         discretize values into. All factors are treated as having discrete values unless they
         are specified as keys in this dictionary. Each element of this array must occur as a key
@@ -89,30 +97,33 @@ def format_discretize_factors(
         Each key is a metadata factor, whose value is the discrete per-image factor values.
     """
 
-
-
-
-
-
-
+    if continuous_factor_bincounts:
+        invalid_keys = set(continuous_factor_bincounts.keys()) - set(names)
+        if invalid_keys:
+            raise KeyError(
+                f"The continuous factor(s) {invalid_keys} do not exist in data_factors. Delete these "
+                "keys from `continuous_factor_names` or add corresponding entries to `data_factors`."
+            )
 
+    warn = []
     metadata_factors = {}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    for i, name in enumerate(names):
+        if name == CLASS_LABEL:
+            continue
+        if continuous_factor_bincounts and name in continuous_factor_bincounts:
+            metadata_factors[name] = digitize_factor_bins(data[:, i], continuous_factor_bincounts[name], name)
+        elif not is_categorical[i]:
+            warn.append(name)
+            metadata_factors[name] = data[:, i]
+        else:
+            metadata_factors[name] = data[:, i]
+
+    if warn:
+        warnings.warn(
+            f"The following factors appear to be continuous but did not have the desired number of bins specified: \n\
+    {warn}",
+            UserWarning,
+        )
 
     return metadata_factors
 
@@ -126,14 +137,14 @@ def normalize_expected_dist(expected_dist: NDArray[Any], observed_dist: NDArray[
 
     Parameters
     ----------
-    expected_dist :
+    expected_dist : NDArray
         The expected label distribution. This array represents the anticipated distribution of labels.
-    observed_dist :
+    observed_dist : NDArray
         The observed label distribution. This array represents the actual distribution of labels in the dataset.
 
     Returns
     -------
-
+    NDArray
         The normalized expected distribution, scaled to have the same sum as the observed distribution.
 
     Raises
@@ -173,6 +184,8 @@ def validate_dist(label_dist: NDArray[Any], label_name: str) -> None:
     ----------
     label_dist : NDArray
         Array representing label distributions
+    label_name : str
+        String representing label name
 
     Raises
     ------
@@ -213,7 +226,7 @@ def label_parity(
         List of class labels in the expected dataset
     observed_labels : ArrayLike
         List of class labels in the observed dataset
-    num_classes : int
+    num_classes : int or None, default None
        The number of unique classes in the datasets. If not provided, the function will infer it
        from the set of unique labels in expected_labels and observed_labels
 
@@ -247,7 +260,7 @@ def label_parity(
     >>> expected_labels = np_random_gen.choice([0, 1, 2, 3, 4], (100))
     >>> observed_labels = np_random_gen.choice([2, 3, 0, 4, 1], (100))
     >>> label_parity(expected_labels, observed_labels)
-    ParityOutput(score=14.007374204742625, p_value=0.0072715574616218)
+    ParityOutput(score=14.007374204742625, p_value=0.0072715574616218, metadata_names=None)
     """
 
     # Calculate
@@ -278,13 +291,13 @@ def label_parity(
     )
 
     cs, p = chisquare(f_obs=observed_dist, f_exp=expected_dist)
-    return ParityOutput(cs, p)
+    return ParityOutput(cs, p, None)
 
 
 @set_metadata()
 def parity(
     class_labels: ArrayLike,
-
+    metadata: Mapping[str, ArrayLike],
     continuous_factor_bincounts: Mapping[str, int] | None = None,
 ) -> ParityOutput[NDArray[np.float64]]:
     """
@@ -297,14 +310,14 @@ def parity(
 
     Parameters
     ----------
-    class_labels: ArrayLike
+    class_labels : ArrayLike
         List of class labels for each image
-
+    metadata : Mapping[str, ArrayLike]
         The dataset factors, which are per-image metadata attributes.
         Each key of dataset_factors is a factor, whose value is the per-image factor values.
-    continuous_factor_bincounts : Mapping[str, int]
+    continuous_factor_bincounts : Mapping[str, int] or None, default None
         A dictionary specifying the number of bins for discretizing the continuous factors.
-        The keys should correspond to the names of continuous factors in `
+        The keys should correspond to the names of continuous factors in `metadata`,
         and the values should be the number of bins to use for discretization.
         If not provided, no discretization is applied.
 
@@ -337,42 +350,43 @@ def parity(
     Randomly creating some "continuous" and categorical variables using ``np.random.default_rng``
 
     >>> labels = np_random_gen.choice([0, 1, 2], (100))
-    >>>
+    >>> metadata = {
     ...     "age": np_random_gen.choice([25, 30, 35, 45], (100)),
     ...     "income": np_random_gen.choice([50000, 65000, 80000], (100)),
     ...     "gender": np_random_gen.choice(["M", "F"], (100)),
     ... }
     >>> continuous_factor_bincounts = {"age": 4, "income": 3}
-    >>> parity(labels,
-    ParityOutput(score=array([7.35731943, 5.46711299, 0.51506212]), p_value=array([0.28906231, 0.24263543, 0.77295762]))
-    """
+    >>> parity(labels, metadata, continuous_factor_bincounts)
+    ParityOutput(score=array([7.35731943, 5.46711299, 0.51506212]), p_value=array([0.28906231, 0.24263543, 0.77295762]), metadata_names=['age', 'income', 'gender'])
+    """  # noqa: E501
     if len(np.shape(class_labels)) > 1:
         raise ValueError(
             f"Got class labels with {len(np.shape(class_labels))}-dimensional",
             f" shape {np.shape(class_labels)}, but expected a 1-dimensional array.",
         )
 
-
-
+    data, names, is_categorical, _ = preprocess_metadata(class_labels, metadata)
+
+    factors = format_discretize_factors(data, names, is_categorical, continuous_factor_bincounts)
 
-
-
+    # unique class labels
+    class_idx = names.index(CLASS_LABEL)
+    u_cls = np.unique(data[:, class_idx])
 
     chi_scores = np.zeros(len(factors))
     p_values = np.zeros(len(factors))
-    n_cls = len(np.unique(labels))
     not_enough_data = {}
     for i, (current_factor_name, factor_values) in enumerate(factors.items()):
         unique_factor_values = np.unique(factor_values)
-        contingency_matrix = np.zeros((len(unique_factor_values),
+        contingency_matrix = np.zeros((len(unique_factor_values), u_cls.size))
         # Builds a contingency matrix where entry at index (r,c) represents
         # the frequency of current_factor_name achieving value unique_factor_values[r]
         # at a data point with class c.
 
         # TODO: Vectorize this nested for loop
         for fi, factor_value in enumerate(unique_factor_values):
-            for label in
-            with_both = np.bitwise_and((
+            for label in u_cls:
+                with_both = np.bitwise_and((data[:, class_idx] == label), factor_values == factor_value)
                 contingency_matrix[fi, label] = np.sum(with_both)
                 if 0 < contingency_matrix[fi, label] < 5:
                     if current_factor_name not in not_enough_data:
@@ -414,4 +428,4 @@ def parity(
             UserWarning,
         )
 
-    return ParityOutput(chi_scores, p_values)
+    return ParityOutput(chi_scores, p_values, list(metadata.keys()))
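Note: with this release `parity` takes the class labels plus a `metadata` mapping as its second argument, and `ParityOutput` gains `metadata_names`. A minimal usage sketch adapted from the docstring example in the diff above; the random data and seed below are illustrative only:

```python
import numpy as np
from dataeval.metrics.bias.parity import parity

rng = np.random.default_rng(175)  # illustrative seed
labels = rng.choice([0, 1, 2], (100))
metadata = {
    "age": rng.choice([25, 30, 35, 45], (100)),
    "income": rng.choice([50000, 65000, 80000], (100)),
    "gender": rng.choice(["M", "F"], (100)),
}
continuous_factor_bincounts = {"age": 4, "income": 3}

result = parity(labels, metadata, continuous_factor_bincounts)
print(result.score)           # chi-squared score per metadata factor
print(result.p_value)         # p-value per metadata factor
print(result.metadata_names)  # ['age', 'income', 'gender'] (new field in 0.73.x)
```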
dataeval/utils/__init__.py
CHANGED
@@ -5,16 +5,17 @@ metrics. Currently DataEval supports both :term:`TensorFlow` and PyTorch backend
 """
 
 from dataeval import _IS_TENSORFLOW_AVAILABLE, _IS_TORCH_AVAILABLE
+from dataeval.utils.metadata import merge_metadata
 from dataeval.utils.split_dataset import split_dataset
 
-__all__ = ["split_dataset"]
+__all__ = ["split_dataset", "merge_metadata"]
 
-if _IS_TORCH_AVAILABLE:
+if _IS_TORCH_AVAILABLE:
     from dataeval.utils import torch
 
     __all__ += ["torch"]
 
-if _IS_TENSORFLOW_AVAILABLE:
+if _IS_TENSORFLOW_AVAILABLE:
     from dataeval.utils import tensorflow
 
     __all__ += ["tensorflow"]
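With this change `merge_metadata` is exported from `dataeval.utils` alongside `split_dataset`. A quick sketch of the new import path; the input data is illustrative:

```python
from dataeval.utils import merge_metadata

# two per-image metadata dicts merged into one dict of lists
merged = merge_metadata([{"id": 1, "size": 10}, {"id": 2, "size": 12}])
print(merged)  # expected: {'id': [1, 2], 'size': [10, 12]}
```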
dataeval/utils/lazy.py
ADDED
@@ -0,0 +1,26 @@
+from __future__ import annotations
+
+from functools import cached_property
+from importlib import import_module
+from typing import Any
+
+
+class LazyModule:
+    def __init__(self, name: str) -> None:
+        self._name = name
+
+    def __getattr__(self, key: str) -> Any:
+        return getattr(self._module, key)
+
+    @cached_property
+    def _module(self):
+        return import_module(self._name)
+
+
+LAZY_MODULES: dict[str, LazyModule] = {}
+
+
+def lazyload(name: str) -> LazyModule:
+    if name not in LAZY_MODULES:
+        LAZY_MODULES[name] = LazyModule(name)
+    return LAZY_MODULES[name]
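`lazyload` returns a `LazyModule` proxy that defers the real `import_module` call until the first attribute access, keeping optional backends like TensorFlow off the import path. A small sketch of the pattern, mirroring how the TensorFlow internals use it later in this diff; the `tf.zeros` call is only an example:

```python
from typing import TYPE_CHECKING

from dataeval.utils.lazy import lazyload

if TYPE_CHECKING:
    import tensorflow as tf  # real import for type checkers only
else:
    tf = lazyload("tensorflow")  # proxy object; nothing is imported yet


def make_zeros():
    # TensorFlow is imported here, on first attribute access
    return tf.zeros((2, 2))
```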
dataeval/utils/metadata.py
ADDED
@@ -0,0 +1,258 @@
+from __future__ import annotations
+
+__all__ = ["merge_metadata"]
+
+import warnings
+from typing import Any, Iterable, Mapping, TypeVar, overload
+
+import numpy as np
+from numpy.typing import NDArray
+
+T = TypeVar("T")
+
+
+def _try_cast(v: Any, t: type[T]) -> T | None:
+    """Casts a value to a type or returns None if unable"""
+    try:
+        return t(v)  # type: ignore
+    except (TypeError, ValueError):
+        return None
+
+
+@overload
+def _convert_type(data: list[str]) -> list[int] | list[float] | list[str]: ...
+@overload
+def _convert_type(data: str) -> int | float | str: ...
+
+
+def _convert_type(data: list[str] | str) -> list[int] | list[float] | list[str] | int | float | str:
+    """
+    Converts a value or a list of values to the simplest form possible, in preferred order of `int`,
+    `float`, or `string`.
+
+    Parameters
+    ----------
+    data : list[str] | str
+        A list of values or a single value
+
+    Returns
+    -------
+    list[int | float | str] | int | float | str
+        The same values converted to the numerical type if possible
+    """
+    if not isinstance(data, list):
+        value = _try_cast(data, float)
+        return str(data) if value is None else int(value) if value.is_integer() else value
+
+    converted = []
+    TYPE_MAP = {int: 0, float: 1, str: 2}
+    max_type = 0
+    for value in data:
+        value = _convert_type(value)
+        max_type = max(max_type, TYPE_MAP.get(type(value), 2))
+        converted.append(value)
+    for i in range(len(converted)):
+        converted[i] = list(TYPE_MAP)[max_type](converted[i])
+    return converted
+
+
+def _get_key_indices(keys: Iterable[tuple[str, ...]]) -> dict[tuple[str, ...], int]:
+    """
+    Finds indices to minimize unique tuple keys
+
+    Parameters
+    ----------
+    keys : Iterable[tuple[str, ...]]
+        Collection of unique expanded tuple keys
+
+    Returns
+    -------
+    dict[tuple[str, ...], int]
+        Mapping of tuple keys to starting index
+    """
+    indices = {k: -1 for k in keys}
+    ks = list(keys)
+    while len(ks) > 0:
+        seen: dict[tuple[str, ...], list[tuple[str, ...]]] = {}
+        for k in ks:
+            seen.setdefault(k[indices[k] :], []).append(k)
+        ks.clear()
+        for sk in seen.values():
+            if len(sk) > 1:
+                ks.extend(sk)
+                for k in sk:
+                    indices[k] -= 1
+    return indices
+
+
+def _flatten_dict_inner(
+    d: Mapping[str, Any], parent_keys: tuple[str, ...], size: int | None = None, nested: bool = False
+) -> tuple[dict[tuple[str, ...], Any], int | None]:
+    """
+    Recursive internal function for flattening a dictionary.
+
+    Parameters
+    ----------
+    d : dict[str, Any]
+        Dictionary to flatten
+    parent_keys : tuple[str, ...]
+        Parent keys to the current dictionary being flattened
+    size : int or None, default None
+        Tracking int for length of lists
+    nested : bool, default False
+        Tracking if inside a list
+
+    Returns
+    -------
+    tuple[dict[tuple[str, ...], Any], int | None]
+        - [0]: Dictionary of flattened values with the keys reformatted as a hierarchical tuple of strings
+        - [1]: Size, if any, of the current list of values
+    """
+    items: dict[tuple[str, ...], Any] = {}
+    for k, v in d.items():
+        new_keys: tuple[str, ...] = parent_keys + (k,)
+        if isinstance(v, dict):
+            fd, size = _flatten_dict_inner(v, new_keys, size=size, nested=nested)
+            items.update(fd)
+        elif isinstance(v, (list, tuple)):
+            if not nested and (size is None or size == len(v)):
+                size = len(v)
+                if all(isinstance(i, dict) for i in v):
+                    for sub_dict in v:
+                        fd, size = _flatten_dict_inner(sub_dict, new_keys, size=size, nested=True)
+                        for fk, fv in fd.items():
+                            items.setdefault(fk, []).append(fv)
+                else:
+                    items[new_keys] = v
+            else:
+                warnings.warn(f"Dropping nested list found in '{parent_keys + (k, )}'.")
+        else:
+            items[new_keys] = v
+    return items, size
+
+
+def _flatten_dict(d: Mapping[str, Any], sep: str, ignore_lists: bool, fully_qualified: bool) -> dict[str, Any]:
+    """
+    Flattens a dictionary and converts values to numeric values when possible.
+
+    Parameters
+    ----------
+    d : dict[str, Any]
+        Dictionary to flatten
+    sep : str
+        String separator to use when concatenating key names
+    ignore_lists : bool
+        Option to skip expanding lists within metadata
+    fully_qualified : bool
+        Option to return dictionary keys full qualified instead of minimized
+
+    Returns
+    -------
+    dict[str, Any]
+        A flattened dictionary
+    """
+    expanded, size = _flatten_dict_inner(d, parent_keys=(), nested=ignore_lists)
+
+    output = {}
+    if fully_qualified:
+        expanded = {sep.join(k): v for k, v in expanded.items()}
+    else:
+        keys = _get_key_indices(expanded)
+        expanded = {sep.join(k[keys[k] :]): v for k, v in expanded.items()}
+    for k, v in expanded.items():
+        cv = _convert_type(v)
+        if isinstance(cv, list) and len(cv) == size:
+            output[k] = cv
+        elif not isinstance(cv, list):
+            output[k] = cv if not size else [cv] * size
+    return output
+
+
+def _is_metadata_dict_of_dicts(metadata: Mapping) -> bool:
+    """EXPERIMENTAL: Attempt to detect if metadata is a dict of dicts"""
+    # single dict
+    if len(metadata) < 2:
+        return False
+
+    # dict of non dicts
+    keys = list(metadata)
+    if not isinstance(metadata[keys[0]], Mapping):
+        return False
+
+    # dict of dicts with matching keys
+    return set(metadata[keys[0]]) == set(metadata[keys[1]])
+
+
+def merge_metadata(
+    metadata: Iterable[Mapping[str, Any]],
+    ignore_lists: bool = False,
+    fully_qualified: bool = False,
+    as_numpy: bool = False,
+) -> dict[str, list[Any]] | dict[str, NDArray[Any]]:
+    """
+    Merges a collection of metadata dictionaries into a single flattened dictionary of keys and values.
+
+    Nested dictionaries are flattened, and lists are expanded. Nested lists are dropped as the
+    expanding into multiple hierarchical trees is not supported.
+
+    Parameters
+    ----------
+    metadata : Iterable[Mapping[str, Any]]
+        Iterable collection of metadata dictionaries to flatten and merge
+    ignore_lists : bool, default False
+        Option to skip expanding lists within metadata
+    fully_qualified : bool, default False
+        Option to return dictionary keys full qualified instead of minimized
+    as_numpy : bool, default False
+        Option to return results as lists or NumPy arrays
+
+    Returns
+    -------
+    dict[str, list[Any]] | dict[str, NDArray[Any]]
+        A single dictionary containing the flattened data as lists or NumPy arrays
+
+    Note
+    ----
+    Nested lists of values and inconsistent keys are dropped in the merged metadata dictionary
+
+    Example
+    -------
+    >>> list_metadata = [{"common": 1, "target": [{"a": 1, "b": 3}, {"a": 2, "b": 4}], "source": "example"}]
+    >>> merge_metadata(list_metadata)
+    {'common': [1, 1], 'a': [1, 2], 'b': [3, 4], 'source': ['example', 'example']}
+    """
+    merged: dict[str, list[Any]] = {}
+    isect: set[str] = set()
+    union: set[str] = set()
+    keys: list[str] | None = None
+    dicts: list[Mapping[str, Any]]
+
+    # EXPERIMENTAL
+    if isinstance(metadata, Mapping) and _is_metadata_dict_of_dicts(metadata):
+        warnings.warn("Experimental processing for dict of dicts.")
+        keys = [str(k) for k in metadata]
+        dicts = list(metadata.values())
+        ignore_lists = True
+    else:
+        dicts = list(metadata)
+
+    for d in dicts:
+        flattened = _flatten_dict(d, sep="_", ignore_lists=ignore_lists, fully_qualified=fully_qualified)
+        isect = isect.intersection(flattened.keys()) if isect else set(flattened.keys())
+        union = union.union(flattened.keys())
+        for k, v in flattened.items():
+            merged.setdefault(k, []).extend(flattened[k]) if isinstance(v, list) else merged.setdefault(k, []).append(v)
+
+    if len(union) > len(isect):
+        warnings.warn(f"Inconsistent metadata keys found. Dropping {union - isect} from metadata.")
+
+    output: dict[str, Any] = {}
+
+    if keys:
+        output["keys"] = np.array(keys) if as_numpy else keys
+
+    for k in (key for key in merged if key in isect):
+        cv = _convert_type(merged[k])
+        output[k] = np.array(cv) if as_numpy else cv
+
+    return output
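A short sketch of how the new `merge_metadata` flattens nested dictionaries and expands lists of dictionaries, taken from the docstring example above; the `as_numpy` call shows the array-returning variant:

```python
from dataeval.utils.metadata import merge_metadata

list_metadata = [
    {"common": 1, "target": [{"a": 1, "b": 3}, {"a": 2, "b": 4}], "source": "example"},
]

# lists of dicts are expanded; scalar values are broadcast to the list length
print(merge_metadata(list_metadata))
# {'common': [1, 1], 'a': [1, 2], 'b': [3, 4], 'source': ['example', 'example']}

# as_numpy=True returns NumPy arrays instead of lists
arrays = merge_metadata(list_metadata, as_numpy=True)
print(arrays["a"])  # [1 2]
```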
dataeval/utils/shared.py
CHANGED
dataeval/utils/split_dataset.py
CHANGED
@@ -144,7 +144,7 @@ def check_groups(group_ids: NDArray[np.int_], num_partitions: int) -> bool:
     ----------
     group_ids : np.ndarray
         Identifies the group to which a sample at the same index belongs.
-    num_partitions: int
+    num_partitions : int
         How many total (train, val) folds will be generated (+1 if also specifying a test fold).
 
     Warns
@@ -242,12 +242,12 @@ def get_group_ids(metadata: dict[str, Any], group_names: list[str], num_samples:
 
     Returns
     -------
-    group_ids: np.ndarray
+    group_ids : np.ndarray
         group identifiers from metadata
     """
     features2group = {k: np.array(v) for k, v in metadata.items() if k in group_names}
     if not features2group:
-        return np.zeros(num_samples, dtype=
+        return np.zeros(num_samples, dtype=np.int_)
     for name, feature in features2group.items():
         if len(feature) != num_samples:
             raise IndexError(f"""Feature length does not match number of labels.
@@ -300,7 +300,13 @@ def make_splits(
     splits = splitter.split(index, labels)
     for train_idx, eval_idx in splits:
         test_ratio = len(eval_idx) / index.shape[0]
-        split_defs.append(
+        split_defs.append(
+            {
+                "train": train_idx.astype(np.int_),
+                "eval": eval_idx.astype(np.int_),
+                "eval_frac": test_ratio,
+            }
+        )
     return split_defs
 
 
@@ -318,9 +324,9 @@ def find_best_split(
     split_defs : list[dict]
         List of dictionaries, which specifying train index, validation index, and the ratio of
         validation to all data.
-    stratified: bool
+    stratified : bool
         If True, maintain dataset class balance within each train/val split
-    eval_frac: float
+    eval_frac : float
         Desired fraction of the dataset sequestered for evaluation
 
     Returns
dataeval/utils/tensorflow/_internal/gmm.py
CHANGED
@@ -8,10 +8,16 @@ Licensed under Apache Software License (Apache 2.0)
 
 from __future__ import annotations
 
-from typing import NamedTuple
+from typing import TYPE_CHECKING, NamedTuple
 
 import numpy as np
-
+
+from dataeval.utils.lazy import lazyload
+
+if TYPE_CHECKING:
+    import tensorflow as tf
+else:
+    tf = lazyload("tensorflow")
 
 
 class GaussianMixtureModelParams(NamedTuple):
|