dataeval 0.76.0__py3-none-any.whl → 0.81.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- dataeval/__init__.py +3 -3
- dataeval/{output.py → _output.py} +14 -0
- dataeval/config.py +77 -0
- dataeval/detectors/__init__.py +1 -1
- dataeval/detectors/drift/__init__.py +6 -6
- dataeval/detectors/drift/{base.py → _base.py} +41 -30
- dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
- dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
- dataeval/detectors/drift/{mmd.py → _mmd.py} +33 -19
- dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
- dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +23 -7
- dataeval/detectors/drift/updates.py +1 -1
- dataeval/detectors/linters/__init__.py +0 -3
- dataeval/detectors/linters/duplicates.py +17 -8
- dataeval/detectors/linters/outliers.py +52 -43
- dataeval/detectors/ood/ae.py +29 -8
- dataeval/detectors/ood/base.py +5 -4
- dataeval/detectors/ood/metadata_ks_compare.py +1 -1
- dataeval/detectors/ood/mixin.py +20 -5
- dataeval/detectors/ood/output.py +1 -1
- dataeval/detectors/ood/vae.py +73 -0
- dataeval/metadata/__init__.py +5 -0
- dataeval/metadata/_ood.py +238 -0
- dataeval/metrics/__init__.py +1 -1
- dataeval/metrics/bias/__init__.py +5 -4
- dataeval/metrics/bias/{balance.py → _balance.py} +67 -17
- dataeval/metrics/bias/{coverage.py → _coverage.py} +41 -35
- dataeval/metrics/bias/{diversity.py → _diversity.py} +17 -12
- dataeval/metrics/bias/{parity.py → _parity.py} +89 -63
- dataeval/metrics/estimators/__init__.py +14 -4
- dataeval/metrics/estimators/{ber.py → _ber.py} +42 -11
- dataeval/metrics/estimators/_clusterer.py +104 -0
- dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -13
- dataeval/metrics/estimators/{uap.py → _uap.py} +4 -4
- dataeval/metrics/stats/__init__.py +7 -7
- dataeval/metrics/stats/{base.py → _base.py} +52 -16
- dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +6 -9
- dataeval/metrics/stats/{datasetstats.py → _datasetstats.py} +10 -14
- dataeval/metrics/stats/{dimensionstats.py → _dimensionstats.py} +6 -5
- dataeval/metrics/stats/{hashstats.py → _hashstats.py} +6 -6
- dataeval/metrics/stats/{labelstats.py → _labelstats.py} +25 -25
- dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +5 -4
- dataeval/metrics/stats/{visualstats.py → _visualstats.py} +9 -8
- dataeval/typing.py +54 -0
- dataeval/utils/__init__.py +2 -2
- dataeval/utils/_array.py +169 -0
- dataeval/utils/_bin.py +199 -0
- dataeval/utils/_clusterer.py +144 -0
- dataeval/utils/_fast_mst.py +189 -0
- dataeval/utils/{image.py → _image.py} +6 -4
- dataeval/utils/_method.py +18 -0
- dataeval/utils/{shared.py → _mst.py} +3 -65
- dataeval/utils/{plot.py → _plot.py} +4 -4
- dataeval/utils/data/__init__.py +22 -0
- dataeval/utils/data/_embeddings.py +105 -0
- dataeval/utils/data/_images.py +65 -0
- dataeval/utils/data/_metadata.py +352 -0
- dataeval/utils/data/_selection.py +119 -0
- dataeval/utils/{dataset/split.py → data/_split.py} +13 -14
- dataeval/utils/data/_targets.py +73 -0
- dataeval/utils/data/_types.py +58 -0
- dataeval/utils/data/collate.py +103 -0
- dataeval/utils/data/datasets/__init__.py +17 -0
- dataeval/utils/data/datasets/_base.py +254 -0
- dataeval/utils/data/datasets/_cifar10.py +134 -0
- dataeval/utils/data/datasets/_fileio.py +168 -0
- dataeval/utils/data/datasets/_milco.py +153 -0
- dataeval/utils/data/datasets/_mixin.py +56 -0
- dataeval/utils/data/datasets/_mnist.py +183 -0
- dataeval/utils/data/datasets/_ships.py +123 -0
- dataeval/utils/data/datasets/_voc.py +352 -0
- dataeval/utils/data/selections/__init__.py +15 -0
- dataeval/utils/data/selections/_classfilter.py +60 -0
- dataeval/utils/data/selections/_indices.py +26 -0
- dataeval/utils/data/selections/_limit.py +26 -0
- dataeval/utils/data/selections/_reverse.py +18 -0
- dataeval/utils/data/selections/_shuffle.py +29 -0
- dataeval/utils/metadata.py +198 -376
- dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
- dataeval/utils/torch/{internal.py → _internal.py} +21 -51
- dataeval/utils/torch/models.py +43 -2
- dataeval/workflows/sufficiency.py +10 -9
- {dataeval-0.76.0.dist-info → dataeval-0.81.0.dist-info}/METADATA +44 -15
- dataeval-0.81.0.dist-info/RECORD +94 -0
- dataeval/detectors/linters/clusterer.py +0 -512
- dataeval/detectors/linters/merged_stats.py +0 -49
- dataeval/detectors/ood/metadata_least_likely.py +0 -119
- dataeval/interop.py +0 -69
- dataeval/utils/dataset/__init__.py +0 -7
- dataeval/utils/dataset/datasets.py +0 -412
- dataeval/utils/dataset/read.py +0 -63
- dataeval-0.76.0.dist-info/RECORD +0 -67
- /dataeval/{log.py → _log.py} +0 -0
- /dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
- {dataeval-0.76.0.dist-info → dataeval-0.81.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.76.0.dist-info → dataeval-0.81.0.dist-info}/WHEEL +0 -0
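Most of the renames above move implementation modules to underscore-prefixed (private) files. As a rough sketch, not taken from the diff itself, code that imported the old module paths directly would switch to the public package namespaces, assuming those packages re-export the moved symbols in 0.81.0:

    # Hypothetical import adjustment for dataeval 0.81.0; verify against the installed version.
    # 0.76.0 style (module files that are now private):
    #   from dataeval.metrics.bias.balance import balance
    # 0.81.0 style (public package namespace):
    from dataeval.metrics.bias import balance
    from dataeval.utils.metadata import flatten, merge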
dataeval/utils/metadata.py
CHANGED
@@ -1,48 +1,37 @@
 """
-
-:class:`Metadata` objects for use within `DataEval`.
+Utility functions that help organize raw metadata.
 """
 
 from __future__ import annotations
 
-__all__ = ["
+__all__ = ["merge", "flatten"]
 
 import warnings
-from
-from typing import Any, Iterable, Literal, Mapping,
+from enum import Enum
+from typing import Any, Iterable, Literal, Mapping, Sequence, overload
 
 import numpy as np
-from numpy.typing import
-from scipy.stats import wasserstein_distance as wd
+from numpy.typing import NDArray
 
-
-from dataeval.output import Output, set_metadata
+_TYPE_MAP = {int: 0, float: 1, str: 2}
 
-DISCRETE_MIN_WD = 0.054
-CONTINUOUS_MIN_SAMPLE_SIZE = 20
 
-
-
-
-
-def _try_cast(v: Any, t: type[T]) -> T | None:
-    """Casts a value to a type or returns None if unable"""
-    try:
-        return t(v)  # type: ignore
-    except (TypeError, ValueError):
-        return None
+class DropReason(Enum):
+    INCONSISTENT_KEY = "inconsistent_key"
+    INCONSISTENT_SIZE = "inconsistent_size"
+    NESTED_LIST = "nested_list"
 
 
 @overload
-def
+def _simplify_type(data: list[str]) -> list[int] | list[float] | list[str]: ...
 @overload
-def
+def _simplify_type(data: str) -> int | float | str: ...
 
 
-def
+def _simplify_type(data: list[str] | str) -> list[int] | list[float] | list[str] | int | float | str:
     """
-
-    `float`, or `string`.
+    Simplifies a value or a list of values to the simplest form possible,
+    in preferred order of `int`, `float`, or `string`.
 
     Parameters
     ----------
@@ -55,18 +44,20 @@ def _convert_type(data: list[str] | str) -> list[int] | list[float] | list[str]
         The same values converted to the numerical type if possible
     """
     if not isinstance(data, list):
-
+        try:
+            value = float(data)
+        except (TypeError, ValueError):
+            value = None
         return str(data) if value is None else int(value) if value.is_integer() else value
 
     converted = []
-    TYPE_MAP = {int: 0, float: 1, str: 2}
     max_type = 0
     for value in data:
-        value =
-        max_type = max(max_type,
+        value = _simplify_type(value)
+        max_type = max(max_type, _TYPE_MAP.get(type(value), 2))
         converted.append(value)
     for i in range(len(converted)):
-        converted[i] = list(
+        converted[i] = list(_TYPE_MAP)[max_type](converted[i])
     return converted
 
 
@@ -84,7 +75,7 @@ def _get_key_indices(keys: Iterable[tuple[str, ...]]) -> dict[tuple[str, ...], i
     dict[tuple[str, ...], int]
         Mapping of tuple keys to starting index
     """
-    indices =
+    indices = dict.fromkeys(keys, -1)
     ks = list(keys)
     while len(ks) > 0:
         seen: dict[tuple[str, ...], list[tuple[str, ...]]] = {}
@@ -99,8 +90,16 @@ def _get_key_indices(keys: Iterable[tuple[str, ...]]) -> dict[tuple[str, ...], i
     return indices
 
 
+def _sorted_drop_reasons(d: dict[str, set[DropReason]]) -> dict[str, list[str]]:
+    return {k: sorted({vv.value for vv in v}) for k, v in sorted(d.items(), key=lambda item: item[1])}
+
+
 def _flatten_dict_inner(
-    d: Mapping[str, Any],
+    d: Mapping[str, Any],
+    dropped: dict[tuple[str, ...], set[DropReason]],
+    parent_keys: tuple[str, ...],
+    size: int | None = None,
+    nested: bool = False,
 ) -> tuple[dict[tuple[str, ...], Any], int | None]:
     """
     Recursive internal function for flattening a dictionary.
@@ -109,6 +108,8 @@ def _flatten_dict_inner(
     ----------
     d : dict[str, Any]
         Dictionary to flatten
+    dropped: set[tuple[str, ...]]
+        Reference to set of dropped keys from the dictionary
     parent_keys : tuple[str, ...]
         Parent keys to the current dictionary being flattened
     size : int or None, default None
@@ -119,33 +120,64 @@ def _flatten_dict_inner(
     Returns
     -------
     tuple[dict[tuple[str, ...], Any], int | None]
-        - [0]: Dictionary of flattened values with the keys reformatted as a
+        - [0]: Dictionary of flattened values with the keys reformatted as a
+          hierarchical tuple of strings
         - [1]: Size, if any, of the current list of values
     """
     items: dict[tuple[str, ...], Any] = {}
     for k, v in d.items():
         new_keys: tuple[str, ...] = parent_keys + (k,)
+        if isinstance(v, np.ndarray):
+            v = v.tolist()
         if isinstance(v, dict):
-            fd, size = _flatten_dict_inner(v, new_keys, size=size, nested=nested)
+            fd, size = _flatten_dict_inner(v, dropped, new_keys, size=size, nested=nested)
             items.update(fd)
         elif isinstance(v, (list, tuple)):
-            if
+            if nested:
+                dropped.setdefault(parent_keys + (k,), set()).add(DropReason.NESTED_LIST)
+            elif size is not None and size != len(v):
+                dropped.setdefault(parent_keys + (k,), set()).add(DropReason.INCONSISTENT_SIZE)
+            else:
                 size = len(v)
                 if all(isinstance(i, dict) for i in v):
                     for sub_dict in v:
-                        fd, size = _flatten_dict_inner(sub_dict, new_keys, size=size, nested=True)
+                        fd, size = _flatten_dict_inner(sub_dict, dropped, new_keys, size=size, nested=True)
                         for fk, fv in fd.items():
                             items.setdefault(fk, []).append(fv)
                 else:
                     items[new_keys] = v
-            else:
-                warnings.warn(f"Dropping nested list found in '{parent_keys + (k, )}'.")
         else:
             items[new_keys] = v
     return items, size
 
 
-
+@overload
+def flatten(
+    d: Mapping[str, Any],
+    return_dropped: Literal[True],
+    sep: str = "_",
+    ignore_lists: bool = False,
+    fully_qualified: bool = False,
+) -> tuple[dict[str, Any], int, dict[str, list[str]]]: ...
+
+
+@overload
+def flatten(
+    d: Mapping[str, Any],
+    return_dropped: Literal[False] = False,
+    sep: str = "_",
+    ignore_lists: bool = False,
+    fully_qualified: bool = False,
+) -> tuple[dict[str, Any], int]: ...
+
+
+def flatten(
+    d: Mapping[str, Any],
+    return_dropped: bool = False,
+    sep: str = "_",
+    ignore_lists: bool = False,
+    fully_qualified: bool = False,
+):
     """
     Flattens a dictionary and converts values to numeric values when possible.
 
@@ -153,33 +185,54 @@ def flatten(d: Mapping[str, Any], sep: str, ignore_lists: bool, fully_qualified:
     ----------
     d : dict[str, Any]
         Dictionary to flatten
-
+    return_dropped: bool, default False
+        Option to return a dictionary of dropped keys and the reason(s) for dropping
+    sep : str, default "_"
         String separator to use when concatenating key names
-    ignore_lists : bool
+    ignore_lists : bool, default False
         Option to skip expanding lists within metadata
-    fully_qualified : bool
-        Option to return dictionary keys
+    fully_qualified : bool, default False
+        Option to return dictionary keys fully qualified instead of reduced
 
     Returns
     -------
-
-
+    dict[str, Any]
+        Dictionary of flattened values with the keys reformatted as a hierarchical tuple of strings
+    int
+        Size of the values in the flattened dictionary
+    dict[str, list[str]], Optional
+        Dictionary containing dropped keys and reason(s) for dropping
     """
-
+    dropped_inner: dict[tuple[str, ...], set[DropReason]] = {}
+    expanded, size = _flatten_dict_inner(d, dropped=dropped_inner, parent_keys=(), nested=ignore_lists)
 
     output = {}
-    if fully_qualified:
-        expanded = {sep.join(k): v for k, v in expanded.items()}
-    else:
-        keys = _get_key_indices(expanded)
-        expanded = {sep.join(k[keys[k] :]): v for k, v in expanded.items()}
     for k, v in expanded.items():
-        cv =
-        if isinstance(cv, list)
-
+        cv = _simplify_type(v)
+        if isinstance(cv, list):
+            if len(cv) == size:
+                output[k] = cv
+            else:
+                dropped_inner.setdefault(k, set()).add(DropReason.INCONSISTENT_KEY)
        elif not isinstance(cv, list):
             output[k] = cv if not size else [cv] * size
-
+
+    if fully_qualified:
+        output = {sep.join(k): v for k, v in output.items()}
+    else:
+        keys = _get_key_indices(output)
+        output = {sep.join(k[keys[k] :]): v for k, v in output.items()}
+
+    size = size if size is not None else 1
+    dropped = {sep.join(k): v for k, v in dropped_inner.items()}
+
+    if return_dropped:
+        return output, size, _sorted_drop_reasons(dropped)
+    else:
+        if dropped:
+            dropped_items = "\n".join([f" {k}: {v}" for k, v in _sorted_drop_reasons(dropped).items()])
+            warnings.warn(f"Metadata entries were dropped:\n{dropped_items}")
+        return output, size
 
 
 def _is_metadata_dict_of_dicts(metadata: Mapping) -> bool:
@@ -197,48 +250,85 @@ def _is_metadata_dict_of_dicts(metadata: Mapping) -> bool:
     return set(metadata[keys[0]]) == set(metadata[keys[1]])
 
 
+@overload
+def merge(
+    metadata: Iterable[Mapping[str, Any]],
+    return_dropped: Literal[True],
+    ignore_lists: bool = False,
+    fully_qualified: bool = False,
+    return_numpy: bool = False,
+    targets_per_image: Sequence[int] | None = None,
+    image_index_key: str = "_image_index",
+) -> tuple[dict[str, list[Any]] | dict[str, NDArray[Any]], dict[str, list[str]]]: ...
+
+
+@overload
+def merge(
+    metadata: Iterable[Mapping[str, Any]],
+    return_dropped: Literal[False] = False,
+    ignore_lists: bool = False,
+    fully_qualified: bool = False,
+    return_numpy: bool = False,
+    targets_per_image: Sequence[int] | None = None,
+    image_index_key: str = "_image_index",
+) -> dict[str, list[Any]] | dict[str, NDArray[Any]]: ...
+
+
 def merge(
     metadata: Iterable[Mapping[str, Any]],
+    return_dropped: bool = False,
     ignore_lists: bool = False,
     fully_qualified: bool = False,
-
-
+    return_numpy: bool = False,
+    targets_per_image: Sequence[int] | None = None,
+    image_index_key: str = "_image_index",
+):
     """
-    Merges a collection of metadata dictionaries into a single flattened
+    Merges a collection of metadata dictionaries into a single flattened
+    dictionary of keys and values.
 
-    Nested dictionaries are flattened, and lists are expanded. Nested lists are
-    expanding into multiple hierarchical trees is not supported.
+    Nested dictionaries are flattened, and lists are expanded. Nested lists are
+    dropped as the expanding into multiple hierarchical trees is not supported.
+    The function adds an internal "_image_index" key to the metadata dictionary
+    used by the `Metadata` class.
 
     Parameters
     ----------
     metadata : Iterable[Mapping[str, Any]]
         Iterable collection of metadata dictionaries to flatten and merge
+    return_dropped: bool, default False
+        Option to return a dictionary of dropped keys and the reason(s) for dropping
     ignore_lists : bool, default False
         Option to skip expanding lists within metadata
     fully_qualified : bool, default False
         Option to return dictionary keys full qualified instead of minimized
-
+    return_numpy : bool, default False
         Option to return results as lists or NumPy arrays
+    targets_per_image : Sequence[int] or None, default None
+        Number of targets for each image metadata entry
+    image_index_key : str, default "_image_index"
+        User provided metadata key which maps the metadata entry to the source image.
 
     Returns
     -------
-    dict[str, list[Any]]
+    dict[str, list[Any]] | dict[str, NDArray[Any]]
         A single dictionary containing the flattened data as lists or NumPy arrays
-
-
+    dict[str, list[str]], Optional
+        Dictionary containing dropped keys and reason(s) for dropping
 
     Note
     ----
-    Nested lists of values and inconsistent keys are dropped in the merged
+    Nested lists of values and inconsistent keys are dropped in the merged
+    metadata dictionary
 
     Example
     -------
     >>> list_metadata = [{"common": 1, "target": [{"a": 1, "b": 3, "c": 5}, {"a": 2, "b": 4}], "source": "example"}]
-    >>> reorganized_metadata,
+    >>> reorganized_metadata, dropped_keys = merge(list_metadata, return_dropped=True)
     >>> reorganized_metadata
-    {'common': [1, 1], 'a': [1, 2], 'b': [3, 4], 'source': ['example', 'example']}
-    >>>
-
+    {'common': [1, 1], 'a': [1, 2], 'b': [3, 4], 'source': ['example', 'example'], '_image_index': [0, 0]}
+    >>> dropped_keys
+    {'target_c': ['inconsistent_key']}
     """
     merged: dict[str, list[Any]] = {}
     isect: set[str] = set()
@@ -255,327 +345,59 @@ def merge(
     else:
         dicts = list(metadata)
 
-
+    if targets_per_image is not None and len(dicts) != len(targets_per_image):
+        raise ValueError("Number of targets per image must be equal to number of metadata entries.")
+
+    image_repeats = np.zeros(len(dicts), dtype=np.int_)
+    dropped: dict[str, set[DropReason]] = {}
     for i, d in enumerate(dicts):
-        flattened, image_repeats[i] = flatten(
+        flattened, image_repeats[i], dropped_inner = flatten(
+            d, return_dropped=True, ignore_lists=ignore_lists, fully_qualified=fully_qualified
+        )
+        if targets_per_image is not None:
+            # check for mismatch in targets per image and force ignore_lists
+            if not ignore_lists and targets_per_image[i] != image_repeats[i]:
+                flattened, image_repeats[i], dropped_inner = flatten(
+                    d, return_dropped=True, ignore_lists=True, fully_qualified=fully_qualified
+                )
+            if targets_per_image[i] != image_repeats[i]:
+                flattened = {k: [v] * targets_per_image[i] for k, v in flattened.items()}
+                image_repeats[i] = targets_per_image[i]
         isect = isect.intersection(flattened.keys()) if isect else set(flattened.keys())
-        union
+        union.update(flattened.keys())
+        for k, v in dropped_inner.items():
+            dropped.setdefault(k, set()).update({DropReason(vv) for vv in v})
         for k, v in flattened.items():
             merged.setdefault(k, []).extend(flattened[k]) if isinstance(v, list) else merged.setdefault(k, []).append(v)
 
-
-
-
-    output: dict[str, Any] = {}
+    for k in union - isect:
+        dropped.setdefault(k, set()).add(DropReason.INCONSISTENT_KEY)
 
     if image_repeats.sum() == image_repeats.size:
-
+        image_indices = np.arange(image_repeats.size)
     else:
         image_ids = np.arange(image_repeats.size)
         image_data = np.concatenate(
             [np.repeat(image_ids[i], image_repeats[i]) for i in range(image_ids.size)], dtype=np.int_
         )
-        _, image_unsorted = np.unique(image_data,
-
+        _, image_unsorted = np.unique(image_data, return_inverse=True)
+        image_indices = np.sort(image_unsorted)
+
+    output: dict[str, Any] = {}
 
     if keys:
-        output["keys"] = np.array(keys) if
+        output["keys"] = np.array(keys) if return_numpy else keys
 
     for k in (key for key in merged if key in isect):
-        cv =
-        output[k] = np.array(cv) if
-
-
-
-
-@dataclass(frozen=True)
-class Metadata(Output):
-    """
-    Dataclass containing binned metadata from the :func:`preprocess` function.
-
-    Attributes
-    ----------
-    discrete_factor_names : list[str]
-        List containing factor names for the original data that was discrete and the binned continuous data
-    discrete_data : NDArray[np.int]
-        Array containing values for the original data that was discrete and the binned continuous data
-    continuous_factor_names : list[str]
-        List containing factor names for the original continuous data
-    continuous_data : NDArray[np.int or np.double] | None
-        Array containing values for the original continuous data or None if there was no continuous data
-    class_labels : NDArray[np.int]
-        Numerical class labels for the images/objects
-    class_names : NDArray[Any]
-        Array of unique class names (for use with plotting)
-    total_num_factors : int
-        Sum of discrete_factor_names and continuous_factor_names plus 1 for class
-    """
+        cv = _simplify_type(merged[k])
+        output[k] = np.array(cv) if return_numpy else cv
+    if image_index_key not in output:
+        output[image_index_key] = image_indices if return_numpy else image_indices.tolist()
 
-
-
-    continuous_factor_names: list[str]
-    continuous_data: NDArray[np.int_ | np.double] | None
-    class_labels: NDArray[np.int_]
-    class_names: NDArray[Any]
-    total_num_factors: int
-
-
-@set_metadata
-def preprocess(
-    raw_metadata: Iterable[Mapping[str, Any]],
-    class_labels: ArrayLike | str,
-    continuous_factor_bins: Mapping[str, int | Iterable[float]] | None = None,
-    auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = "uniform_width",
-    exclude: Iterable[str] | None = None,
-) -> Metadata:
-    """
-    Restructures the metadata to be in the correct format for the bias functions.
-
-    This identifies whether the incoming metadata is discrete or continuous,
-    and whether the data is already binned or still needs binning.
-    It accepts a list of dictionaries containing the per image metadata and
-    automatically adjusts for multiple targets in an image.
-
-    Parameters
-    ----------
-    raw_metadata : Iterable[Mapping[str, Any]]
-        Iterable collection of metadata dictionaries to flatten and merge.
-    class_labels : ArrayLike or string
-        If arraylike, expects the labels for each image (image classification) or each object (object detection).
-        If the labels are included in the metadata dictionary, pass in the key value.
-    continuous_factor_bins : Mapping[str, int or Iterable[float]] or None, default None
-        User provided dictionary specifying how to bin the continuous metadata factors where the value is either
-        an int to represent the number of bins, or a list of floats representing the edges for each bin.
-    auto_bin_method : "uniform_width" or "uniform_count" or "clusters", default "uniform_width"
-        Method by which the function will automatically bin continuous metadata factors. It is recommended
-        that the user provide the bins through the `continuous_factor_bins`.
-    exclude : Iterable[str] or None, default None
-        User provided collection of metadata keys to exclude when processing metadata.
-
-    Returns
-    -------
-    Metadata
-        Output class containing the binned metadata
-    """
-    # Transform metadata into single, flattened dictionary
-    metadata, image_repeats = merge(raw_metadata)
-
-    continuous_factor_bins = dict(continuous_factor_bins) if continuous_factor_bins else None
-
-    # Drop any excluded metadata keys
-    for k in exclude or ():
-        metadata.pop(k, None)
-        if continuous_factor_bins:
-            continuous_factor_bins.pop(k, None)
-
-    # Get the class label array in numeric form
-    class_array = as_numpy(metadata.pop(class_labels)) if isinstance(class_labels, str) else as_numpy(class_labels)
-    if class_array.ndim > 1:
-        raise ValueError(
-            f"Got class labels with {class_array.ndim}-dimensional "
-            f"shape {class_array.shape}, but expected a 1-dimensional array."
-        )
-    if not np.issubdtype(class_array.dtype, np.int_):
-        unique_classes, numerical_labels = np.unique(class_array, return_inverse=True)
+    if return_dropped:
+        return output, _sorted_drop_reasons(dropped)
     else:
-
-
-
-
-    continuous_metadata = {}
-    discrete_metadata = {}
-    if continuous_factor_bins is not None and continuous_factor_bins != {}:
-        invalid_keys = set(continuous_factor_bins.keys()) - set(metadata.keys())
-        if invalid_keys:
-            raise KeyError(
-                f"The keys - {invalid_keys} - are present in the `continuous_factor_bins` dictionary "
-                "but are not keys in the `metadata` dictionary. Delete these keys from `continuous_factor_bins` "
-                "or add corresponding entries to the `metadata` dictionary."
-            )
-        for factor, bins in continuous_factor_bins.items():
-            discrete_metadata[factor] = _digitize_data(metadata[factor], bins)
-            continuous_metadata[factor] = metadata[factor]
-
-    # Determine category of the rest of the keys
-    remaining_keys = set(metadata.keys()) - set(continuous_metadata.keys())
-    for key in remaining_keys:
-        data = to_numpy(metadata[key])
-        if np.issubdtype(data.dtype, np.number):
-            result = _is_continuous(data, image_repeats)
-            if result:
-                continuous_metadata[key] = data
-            unique_samples, ordinal_data = np.unique(data, return_inverse=True)
-            if unique_samples.size <= np.max([20, data.size * 0.01]):
-                discrete_metadata[key] = ordinal_data
-            else:
-                warnings.warn(
-                    f"A user defined binning was not provided for {key}. "
-                    f"Using the {auto_bin_method} method to discretize the data. "
-                    "It is recommended that the user rerun and supply the desired "
-                    "bins using the continuous_factor_bins parameter.",
-                    UserWarning,
-                )
-                discrete_metadata[key] = _bin_data(data, auto_bin_method)
-        else:
-            _, discrete_metadata[key] = np.unique(data, return_inverse=True)
-
-    # splitting out the dictionaries into the keys and values
-    discrete_factor_names = list(discrete_metadata.keys())
-    discrete_data = np.stack(list(discrete_metadata.values()), axis=-1)
-    continuous_factor_names = list(continuous_metadata.keys())
-    continuous_data = np.stack(list(continuous_metadata.values()), axis=-1) if continuous_metadata else None
-    total_num_factors = len(discrete_factor_names + continuous_factor_names) + 1
-
-    return Metadata(
-        discrete_factor_names,
-        discrete_data,
-        continuous_factor_names,
-        continuous_data,
-        numerical_labels,
-        unique_classes,
-        total_num_factors,
-    )
-
-
-def _digitize_data(data: list[Any] | NDArray[Any], bins: int | Iterable[float]) -> NDArray[np.intp]:
-    """
-    Digitizes a list of values into a given number of bins.
-
-    Parameters
-    ----------
-    data : list | NDArray
-        The values to be digitized.
-    bins : int | Iterable[float]
-        The number of bins or list of bin edges for the discrete values that data will be digitized into.
-
-    Returns
-    -------
-    NDArray[np.intp]
-        The digitized values
-    """
-
-    if not np.all([np.issubdtype(type(n), np.number) for n in data]):
-        raise TypeError(
-            "Encountered a data value with non-numeric type when digitizing a factor. "
-            "Ensure all occurrences of continuous factors are numeric types."
-        )
-    if isinstance(bins, int):
-        _, bin_edges = np.histogram(data, bins=bins)
-        bin_edges[-1] = np.inf
-        bin_edges[0] = -np.inf
-    else:
-        bin_edges = list(bins)
-    return np.digitize(data, bin_edges)
-
-
-def _bin_data(data: NDArray[Any], bin_method: str) -> NDArray[np.int_]:
-    """
-    Bins continuous data through either equal width bins, equal amounts in each bin, or by clusters.
-    """
-    if bin_method == "clusters":
-        # bin_edges = _binning_by_clusters(data)
-        warnings.warn(
-            "Binning by clusters is currently unavailable until changes to the clustering function go through.",
-            UserWarning,
-        )
-        bin_method = "uniform_width"
-
-    # if bin_method != "clusters": # restore this when clusters bin_method is available
-    counts, bin_edges = np.histogram(data, bins="auto")
-    n_bins = counts.size
-    if counts[counts > 0].min() < 10:
-        counter = 20
-        while counts[counts > 0].min() < 10 and n_bins >= 2 and counter > 0:
-            counter -= 1
-            n_bins -= 1
-            counts, bin_edges = np.histogram(data, bins=n_bins)
-
-    if bin_method == "uniform_count":
-        quantiles = np.linspace(0, 100, n_bins + 1)
-        bin_edges = np.asarray(np.percentile(data, quantiles))
-
-    bin_edges[0] = -np.inf  # type: ignore # until the clusters speed up is merged
-    bin_edges[-1] = np.inf  # type: ignore # and the _binning_by_clusters can be uncommented
-    return np.digitize(data, bin_edges)  # type: ignore
-
-
-def _is_continuous(data: NDArray[np.number], image_indicies: NDArray[np.number]) -> bool:
-    """
-    Determines whether the data is continuous or discrete using the Wasserstein distance.
-
-    Given a 1D sample, we consider the intervals between adjacent points. For a continuous distribution,
-    a point is equally likely to lie anywhere in the interval bounded by its two neighbors. Furthermore,
-    we can put all "between neighbor" locations on the same scale of 0 to 1 by subtracting the smaller
-    neighbor and dividing out the length of the interval. (Duplicates are either assigned to zero or
-    ignored, depending on context). These normalized locations will be much more uniformly distributed
-    for continuous data than for discrete, and this gives us a way to distinguish them. Call this the
-    Normalized Near Neighbor distribution (NNN), defined on the interval [0,1].
-
-    The Wasserstein distance is available in scipy.stats.wasserstein_distance. We can use it to measure
-    how close the NNN is to a uniform distribution over [0,1]. We found that as long as a sample has at
-    least 20 points, and furthermore at least half as many points as there are discrete values, we can
-    reliably distinguish discrete from continuous samples by testing that the Wasserstein distance
-    measured from a uniform distribution is greater or less than 0.054, respectively.
-    """
-    # Check if the metadata is image specific
-    _, data_indicies_unsorted = np.unique(data, return_index=True)
-    if data_indicies_unsorted.size == image_indicies.size:
-        data_indicies = np.sort(data_indicies_unsorted)
-        if (data_indicies == image_indicies).all():
-            data = data[data_indicies]
-
-    # OLD METHOD
-    # uvals = np.unique(data)
-    # pct_unique = uvals.size / data.size
-    # return pct_unique < threshold
-
-    n_examples = len(data)
-
-    if n_examples < CONTINUOUS_MIN_SAMPLE_SIZE:
-        warnings.warn(
-            f"All samples look discrete with so few data points (< {CONTINUOUS_MIN_SAMPLE_SIZE})", UserWarning
-        )
-        return False
-
-    # Require at least 3 unique values before bothering with NNN
-    xu = np.unique(data, axis=None)
-    if xu.size < 3:
-        return False
-
-    Xs = np.sort(data)
-
-    X0, X1 = Xs[0:-2], Xs[2:]  # left and right neighbors
-
-    dx = np.zeros(n_examples - 2)  # no dx at end points
-    gtz = (X1 - X0) > 0  # check for dups; dx will be zero for them
-    dx[np.logical_not(gtz)] = 0.0
-
-    dx[gtz] = (Xs[1:-1] - X0)[gtz] / (X1 - X0)[gtz]  # the core idea: dx is NNN samples.
-
-    shift = wd(dx, np.linspace(0, 1, dx.size))  # how far is dx from uniform, for this feature?
-
-    return shift < DISCRETE_MIN_WD  # if NNN is close enough to uniform, consider the sample continuous.
-
-
-def get_counts(data: NDArray[np.int_], min_num_bins: int | None = None) -> NDArray[np.int_]:
-    """
-    Returns columnwise unique counts for discrete data.
-
-    Parameters
-    ----------
-    data : NDArray
-        Array containing integer values for metadata factors
-    min_num_bins : int | None, default None
-        Minimum number of bins for bincount, helps force consistency across runs
-
-    Returns
-    -------
-    NDArray[np.int_]
-        Bin counts per column of data.
-    """
-    max_value = data.max() + 1 if min_num_bins is None else min_num_bins
-    cnt_array = np.zeros((max_value, data.shape[1]), dtype=np.int_)
-    for idx in range(data.shape[1]):
-        cnt_array[:, idx] = np.bincount(data[:, idx], minlength=max_value)
-
-    return cnt_array
+        if dropped:
+            dropped_items = "\n".join([f" {k}: {v}" for k, v in _sorted_drop_reasons(dropped).items()])
+            warnings.warn(f"Metadata entries were dropped:\n{dropped_items}")
+        return output