dataeval 0.73.1__py3-none-any.whl → 0.74.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +3 -9
- dataeval/detectors/__init__.py +2 -10
- dataeval/detectors/drift/base.py +3 -3
- dataeval/detectors/drift/mmd.py +1 -1
- dataeval/detectors/drift/torch.py +1 -101
- dataeval/detectors/linters/clusterer.py +3 -3
- dataeval/detectors/linters/duplicates.py +4 -4
- dataeval/detectors/linters/outliers.py +4 -4
- dataeval/detectors/ood/__init__.py +9 -9
- dataeval/detectors/ood/{ae.py → ae_torch.py} +22 -27
- dataeval/detectors/ood/base.py +63 -113
- dataeval/detectors/ood/base_torch.py +109 -0
- dataeval/detectors/ood/metadata_ks_compare.py +52 -14
- dataeval/interop.py +1 -1
- dataeval/metrics/bias/__init__.py +3 -0
- dataeval/metrics/bias/balance.py +73 -70
- dataeval/metrics/bias/coverage.py +4 -4
- dataeval/metrics/bias/diversity.py +67 -136
- dataeval/metrics/bias/metadata_preprocessing.py +285 -0
- dataeval/metrics/bias/metadata_utils.py +229 -0
- dataeval/metrics/bias/parity.py +51 -161
- dataeval/metrics/estimators/ber.py +3 -3
- dataeval/metrics/estimators/divergence.py +3 -3
- dataeval/metrics/estimators/uap.py +3 -3
- dataeval/metrics/stats/base.py +2 -2
- dataeval/metrics/stats/boxratiostats.py +1 -1
- dataeval/metrics/stats/datasetstats.py +6 -6
- dataeval/metrics/stats/dimensionstats.py +1 -1
- dataeval/metrics/stats/hashstats.py +1 -1
- dataeval/metrics/stats/labelstats.py +3 -3
- dataeval/metrics/stats/pixelstats.py +1 -1
- dataeval/metrics/stats/visualstats.py +1 -1
- dataeval/output.py +77 -53
- dataeval/utils/__init__.py +1 -7
- dataeval/utils/gmm.py +26 -0
- dataeval/utils/metadata.py +29 -9
- dataeval/utils/torch/gmm.py +98 -0
- dataeval/utils/torch/models.py +192 -0
- dataeval/utils/torch/trainer.py +84 -5
- dataeval/utils/torch/utils.py +107 -1
- dataeval/workflows/sufficiency.py +4 -4
- {dataeval-0.73.1.dist-info → dataeval-0.74.1.dist-info}/METADATA +3 -9
- dataeval-0.74.1.dist-info/RECORD +65 -0
- dataeval/detectors/ood/aegmm.py +0 -66
- dataeval/detectors/ood/llr.py +0 -302
- dataeval/detectors/ood/vae.py +0 -97
- dataeval/detectors/ood/vaegmm.py +0 -75
- dataeval/metrics/bias/metadata.py +0 -440
- dataeval/utils/lazy.py +0 -26
- dataeval/utils/tensorflow/__init__.py +0 -19
- dataeval/utils/tensorflow/_internal/gmm.py +0 -123
- dataeval/utils/tensorflow/_internal/loss.py +0 -121
- dataeval/utils/tensorflow/_internal/models.py +0 -1394
- dataeval/utils/tensorflow/_internal/trainer.py +0 -114
- dataeval/utils/tensorflow/_internal/utils.py +0 -256
- dataeval/utils/tensorflow/loss/__init__.py +0 -11
- dataeval-0.73.1.dist-info/RECORD +0 -73
- {dataeval-0.73.1.dist-info → dataeval-0.74.1.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.73.1.dist-info → dataeval-0.74.1.dist-info}/WHEEL +0 -0
dataeval/metrics/bias/diversity.py

```diff
@@ -4,21 +4,15 @@ __all__ = ["DiversityOutput", "diversity"]
 
 import contextlib
 from dataclasses import dataclass
-from typing import Any, Literal, Mapping
+from typing import Any, Literal
 
 import numpy as np
+import scipy as sp
 from numpy.typing import ArrayLike, NDArray
 
-from dataeval.metrics.bias.metadata import (
-    CLASS_LABEL,
-    diversity_bar_plot,
-    entropy,
-    get_counts,
-    get_num_bins,
-    heatmap,
-    preprocess_metadata,
-)
-from dataeval.output import OutputMetadata, set_metadata
+from dataeval.metrics.bias.metadata_preprocessing import MetadataOutput
+from dataeval.metrics.bias.metadata_utils import diversity_bar_plot, get_counts, heatmap
+from dataeval.output import Output, set_metadata
 from dataeval.utils.shared import get_method
 
 with contextlib.suppress(ImportError):
@@ -26,31 +20,31 @@ with contextlib.suppress(ImportError):
 
 
 @dataclass(frozen=True)
-class DiversityOutput(OutputMetadata):
+class DiversityOutput(Output):
     """
     Output class for :func:`diversity` :term:`bias<Bias>` metric
 
     Attributes
     ----------
-    diversity_index : NDArray[np.float64]
+    diversity_index : NDArray[np.double]
        :term:`Diversity` index for classes and factors
-    classwise : NDArray[np.float64]
+    classwise : NDArray[np.double]
        Classwise diversity index [n_class x n_factor]
-    class_list : NDArray[Any]
-        Class labels for each value in the dataset
-    metadata_names : list[str]
+    factor_names : list[str]
        Names of each metadata factor
+    class_list : NDArray[Any]
+        Class labels for each value in the dataset
     """
 
-    diversity_index: NDArray[np.float64]
-    classwise: NDArray[np.float64]
+    diversity_index: NDArray[np.double]
+    classwise: NDArray[np.double]
+    factor_names: list[str]
     class_list: NDArray[Any]
-    metadata_names: list[str]
 
     def plot(
         self,
-        row_labels: ArrayLike | …,
-        col_labels: ArrayLike | …,
+        row_labels: ArrayLike | None = None,
+        col_labels: ArrayLike | None = None,
         plot_classwise: bool = False,
     ) -> Figure:
         """
@@ -69,7 +63,7 @@ class DiversityOutput(OutputMetadata):
         if row_labels is None:
             row_labels = self.class_list
         if col_labels is None:
-            col_labels = self.metadata_names
+            col_labels = self.factor_names
 
         fig = heatmap(
             self.classwise,
@@ -82,7 +76,7 @@ class DiversityOutput(OutputMetadata):
 
         else:
             # Creating label array for heat map axes
-            heat_labels = np.concatenate((["class"], self.metadata_names))
+            heat_labels = np.concatenate((["class"], self.factor_names))
 
             fig = diversity_bar_plot(heat_labels, self.diversity_index)
 
@@ -90,11 +84,9 @@ class DiversityOutput(OutputMetadata):
 
 
 def diversity_shannon(
-    data: NDArray[Any],
-    names: list[str],
-    continuous_factor_bincounts: Mapping[str, int] | None = None,
-    subset_mask: NDArray[np.bool_] | None = None,
-) -> NDArray[np.float64]:
+    counts: NDArray[np.int_],
+    num_bins: NDArray[np.int_],
+) -> NDArray[np.double]:
     """
     Compute :term:`diversity<Diversity>` for discrete/categorical variables and, through standard
     histogram binning, for continuous variables.
@@ -106,62 +98,31 @@ def diversity_shannon(
 
     Parameters
     ----------
-    data : NDArray
-        Array containing …
-    names : list[str]
-        …
-    continuous_factor_bincounts : Mapping[str, int] or None, default None
-        The factors in names that have continuous values and the array of bin counts to
-        discretize values into. All factors are treated as having discrete values unless they
-        are specified as keys in this dictionary. Each element of this array must occur as a key
-        in names.
-    subset_mask : NDArray[np.bool_] or None, default None
-        Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
-
-    Note
-    ----
-    For continuous variables, histogram bins are chosen automatically. See `numpy.histogram` for details.
+    counts : NDArray[np.int_]
+        Array containing bin counts for each factor
+    num_bins : NDArray[np.int_]
+        Number of bins with values for each factor
 
     Returns
     -------
-    diversity_index : NDArray[np.float64]
+    diversity_index : NDArray[np.double]
         Diversity index per column of X
 
     See Also
     --------
-    numpy.histogram
+    scipy.stats.entropy
     """
-    …
-    …
-    …
-    ent_unnormalized = entropy(
-        data,
-        names,
-        continuous_factor_bincounts,
-        normalized=False,
-        subset_mask=subset_mask,
-        hist_cache=hist_cache,
-    )
-    # normalize by global counts rather than classwise counts
-    num_bins = get_num_bins(
-        data,
-        names,
-        continuous_factor_bincounts=continuous_factor_bincounts,
-        subset_mask=subset_mask,
-        hist_cache=hist_cache,
-    )
-    ent_norm = np.empty(ent_unnormalized.shape)
-    ent_norm[num_bins != 1] = ent_unnormalized[num_bins != 1] / np.log(num_bins[num_bins != 1])
+    raw_entropy = sp.stats.entropy(counts, axis=0)
+    ent_norm = np.empty(raw_entropy.shape)
+    ent_norm[num_bins != 1] = raw_entropy[num_bins != 1] / np.log(num_bins[num_bins != 1])
     ent_norm[num_bins == 1] = 0
     return ent_norm
 
 
 def diversity_simpson(
-    data: NDArray[Any],
-    names: list[str],
-    continuous_factor_bincounts: Mapping[str, int] | None = None,
-    subset_mask: NDArray[np.bool_] | None = None,
-) -> NDArray[np.float64]:
+    counts: NDArray[np.int_],
+    num_bins: NDArray[np.int_],
+) -> NDArray[np.double]:
     """
     Compute :term:`diversity<Diversity>` for discrete/categorical variables and, through standard
    histogram binning, for continuous variables.
```
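Note: `diversity_shannon` no longer histograms raw metadata itself; it takes precomputed `counts` and normalizes column-wise Shannon entropy by the log of the occupied-bin count. A minimal sketch of that computation (the counts below are made up; `sp.stats` is reached via `import scipy as sp`, just as in the diff):

```python
import numpy as np
import scipy as sp

counts = np.array([[4, 9], [3, 1], [3, 0]])  # hypothetical bin counts, one column per factor
num_bins = np.array([3, 2])                  # occupied bins per factor

raw_entropy = sp.stats.entropy(counts, axis=0)  # column-wise Shannon entropy (natural log)
ent_norm = np.empty(raw_entropy.shape)
ent_norm[num_bins != 1] = raw_entropy[num_bins != 1] / np.log(num_bins[num_bins != 1])
ent_norm[num_bins == 1] = 0  # a single occupied bin carries no diversity
print(ent_norm)  # values in [0, 1]; 1.0 means perfectly even bin occupancy
```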
```diff
@@ -173,58 +134,38 @@ def diversity_simpson(
 
     Parameters
     ----------
-    data : NDArray
-        Array containing …
-    names : list[str]
-        …
-    continuous_factor_bincounts : Mapping[str, int] or None, default None
-        The factors in names that have continuous values and the array of bin counts to
-        discretize values into. All factors are treated as having discrete values unless they
-        are specified as keys in this dictionary. Each element of this array must occur as a key
-        in names.
-    subset_mask : NDArray[np.bool_] or None, default None
-        Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
+    counts : NDArray[np.int_]
+        Array containing bin counts for each factor
+    num_bins : NDArray[np.int_]
+        Number of bins with values for each factor
 
     Note
     ----
-    For continuous variables, histogram bins are chosen automatically. See
-    numpy.histogram for details.
     If there is only one category, the diversity index takes a value of 0.
 
     Returns
     -------
-    diversity_index : NDArray[np.float64]
+    diversity_index : NDArray[np.double]
         Diversity index per column of X
-
-    See Also
-    --------
-    numpy.histogram
     """
-    …
-    …
-    hist_counts = get_counts(data, names, continuous_factor_bincounts, subset_mask, hist_cache=hist_cache)
-    # normalize by global counts, not classwise counts
-    num_bins = get_num_bins(data, names, continuous_factor_bincounts, hist_cache=hist_cache)
-
-    ev_index = np.empty(len(names))
+    ev_index = np.empty(counts.shape[1])
     # loop over columns for convenience
-    for col, cnts in enumerate(…):
+    for col, cnts in enumerate(counts.T):
         # relative frequencies
         p_i = cnts / np.sum(cnts)
-        # inverse Simpson index
-        s_0 = 1 / np.sum(p_i**2)
+        # inverse Simpson index
+        s_0 = 1 / np.sum(p_i**2)
         if num_bins[col] == 1:
             ev_index[col] = 0
         else:
+            # normalized by number of bins
             ev_index[col] = (s_0 - 1) / (num_bins[col] - 1)
     return ev_index
 
 
-@set_metadata()
+@set_metadata
 def diversity(
-    class_labels: ArrayLike,
-    metadata: Mapping[str, ArrayLike],
-    continuous_factor_bincounts: Mapping[str, int] | None = None,
+    metadata: MetadataOutput,
     method: Literal["simpson", "shannon"] = "simpson",
 ) -> DiversityOutput:
     """
```
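Note: the Simpson branch rescales the inverse Simpson index from [1, num_bins] onto [0, 1]. A worked single-factor example using the same arithmetic as the loop body above (bin counts are made up):

```python
import numpy as np

cnts = np.array([5, 3, 2])             # hypothetical bin counts for one factor
p_i = cnts / np.sum(cnts)              # relative frequencies: [0.5, 0.3, 0.2]
s_0 = 1 / np.sum(p_i**2)               # inverse Simpson index: 1/0.38 ~= 2.632
num_bins = cnts.size
evenness = (s_0 - 1) / (num_bins - 1)  # ~= 0.816; 0 would mean one bin holds everything
print(s_0, evenness)
```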
```diff
@@ -238,23 +179,13 @@ def diversity(
 
     Parameters
     ----------
-    class_labels : ArrayLike
-        …
-    metadata : Mapping[str, ArrayLike]
-        Dict of list of metadata factors for each image
-    continuous_factor_bincounts : Mapping[str, int] or None, default None
-        The factors in metadata that have continuous values and the array of bin counts to
-        discretize values into. All factors are treated as having discrete values unless they
-        are specified as keys in this dictionary. Each element of this array must occur as a key
-        in metadata.
-    method : {"simpson", "shannon"}, default "simpson"
-        Indicates which diversity index should be computed
+    metadata : MetadataOutput
+        Output after running `metadata_preprocessing`
 
     Note
     ----
-    - For continuous variables, histogram bins are chosen automatically. See numpy.histogram for details.
     - The expression is undefined for q=1, but it approaches the Shannon entropy in the limit.
-    - If there is only one category, the diversity index takes a value of …
+    - If there is only one category, the diversity index takes a value of 0.
 
     Returns
     -------
```
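Note on the q=1 remark retained in the docstring above: both indices are normalized Hill numbers. In standard ecology notation (not part of this diff), D_q = (Σ_i p_i^q)^(1/(1-q)), where q = 2 yields the inverse Simpson index used by `diversity_simpson`; the expression is undefined at q = 1, and its q → 1 limit is the exponential of the Shannon entropy underlying `diversity_shannon`.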
```diff
@@ -266,42 +197,42 @@ def diversity(
     -------
     Compute Simpson diversity index of metadata and class labels
 
-    >>> div_simp = diversity(…)
+    >>> div_simp = diversity(metadata, method="simpson")
     >>> div_simp.diversity_index
-    array([0.72413793, 0.…])
+    array([0.72413793, 0.88636364, 0.72413793])
 
     >>> div_simp.classwise
-    array([[0.…],
-           [0.…]])
+    array([[0.69230769, 0.68965517],
+           [0.5       , 0.8       ]])
 
     Compute Shannon diversity index of metadata and class labels
 
-    >>> div_shan = diversity(…)
+    >>> div_shan = diversity(metadata, method="shannon")
     >>> div_shan.diversity_index
-    array([0.8812909 , 0.…])
+    array([0.8812909 , 0.96748876, 0.8812909 ])
 
     >>> div_shan.classwise
-    array([[0.…],
-           [0.…]])
+    array([[0.91651644, 0.86312057],
+           [0.68260619, 0.91829583]])
 
     See Also
     --------
-    numpy.histogram
+    scipy.stats.entropy
     """
     diversity_fn = get_method({"simpson": diversity_simpson, "shannon": diversity_shannon}, method)
-    …
-    …
+    discretized_data = np.hstack((metadata.class_labels[:, np.newaxis], metadata.discrete_data))
+    cnts = get_counts(discretized_data)
+    num_bins = np.bincount(np.nonzero(cnts)[1])
+    diversity_index = diversity_fn(cnts, num_bins)
 
-    class_idx = names.index(CLASS_LABEL)
-    class_lbl = data[:, class_idx]
+    class_lbl = metadata.class_labels
 
     u_classes = np.unique(class_lbl)
-    num_factors = len(names)
-    diversity = np.empty((len(u_classes), num_factors))
-    diversity[:] = np.nan
+    num_factors = len(metadata.discrete_factor_names)
+    classwise_div = np.full((len(u_classes), num_factors), np.nan)
     for idx, cls in enumerate(u_classes):
         subset_mask = class_lbl == cls
-        …
-        …
+        cls_cnts = get_counts(metadata.discrete_data[subset_mask], min_num_bins=cnts.shape[0])
+        classwise_div[idx, :] = diversity_fn(cls_cnts, num_bins[1:])
 
-    return DiversityOutput(diversity_index, …)
+    return DiversityOutput(diversity_index, classwise_div, metadata.discrete_factor_names, metadata.class_names)
```
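Note: the calling convention changes in 0.74: raw metadata and bin counts are no longer passed to each bias metric, but are binned once by `metadata_preprocessing` (new module below). A hedged before/after sketch; it assumes the new function is re-exported from `dataeval.metrics.bias` (consistent with the `__init__.py` change in the file list), and the factor name and bin count are illustrative:

```python
from dataeval.metrics.bias import diversity, metadata_preprocessing

raw_metadata = [{"class": "cat", "altitude": 100.2}, {"class": "dog", "altitude": 151.9}]  # toy input

# 0.73.x (removed API):
# div = diversity(class_labels, metadata, continuous_factor_bincounts={"altitude": 5})

# 0.74.x: preprocess once, then hand the MetadataOutput to each bias metric.
md = metadata_preprocessing(raw_metadata, class_labels="class", continuous_factor_bins={"altitude": 5})
div = diversity(md, method="simpson")
print(div.diversity_index, div.classwise)
```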
dataeval/metrics/bias/metadata_preprocessing.py (new file)

```diff
@@ -0,0 +1,285 @@
+from __future__ import annotations
+
+__all__ = ["MetadataOutput", "metadata_preprocessing"]
+
+import warnings
+from dataclasses import dataclass
+from typing import Any, Iterable, Literal, Mapping, TypeVar
+
+import numpy as np
+from numpy.typing import ArrayLike, NDArray
+from scipy.stats import wasserstein_distance as wd
+
+from dataeval.interop import as_numpy, to_numpy
+from dataeval.output import Output, set_metadata
+from dataeval.utils.metadata import merge_metadata
+
+TNum = TypeVar("TNum", int, float)
+DISCRETE_MIN_WD = 0.054
+CONTINUOUS_MIN_SAMPLE_SIZE = 20
+
+
+@dataclass(frozen=True)
+class MetadataOutput(Output):
+    """
+    Output class for :func:`metadata_binning` function
+
+    Attributes
+    ----------
+    discrete_factor_names : list[str]
+        List containing factor names for the original data that was discrete and the binned continuous data
+    discrete_data : NDArray[np.int]
+        Array containing values for the original data that was discrete and the binned continuous data
+    continuous_factor_names : list[str]
+        List containing factor names for the original continuous data
+    continuous_data : NDArray[np.int or np.double] | None
+        Array containing values for the original continuous data or None if there was no continuous data
+    class_labels : NDArray[np.int]
+        Numerical class labels for the images/objects
+    class_names : NDArray[Any]
+        Array of unique class names (for use with plotting)
+    total_num_factors : int
+        Sum of discrete_factor_names and continuous_factor_names plus 1 for class
+    """
+
+    discrete_factor_names: list[str]
+    discrete_data: NDArray[np.int_]
+    continuous_factor_names: list[str]
+    continuous_data: NDArray[np.int_ | np.double] | None
+    class_labels: NDArray[np.int_]
+    class_names: NDArray[Any]
+    total_num_factors: int
+
+
+@set_metadata
+def metadata_preprocessing(
+    raw_metadata: Iterable[Mapping[str, Any]],
+    class_labels: ArrayLike | str,
+    continuous_factor_bins: Mapping[str, int | list[tuple[TNum, TNum]]] | None = None,
+    auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = "uniform_width",
+    exclude: Iterable[str] | None = None,
+) -> MetadataOutput:
+    """
+    Restructures the metadata to be in the correct format for the bias functions.
+
+    This identifies whether the incoming metadata is discrete or continuous,
+    and whether the data is already binned or still needs binning.
+    It accepts a list of dictionaries containing the per image metadata and
+    automatically adjusts for multiple targets in an image.
+
+    Parameters
+    ----------
+    raw_metadata : Iterable[Mapping[str, Any]]
+        Iterable collection of metadata dictionaries to flatten and merge.
+    class_labels : ArrayLike or string or None
+        If arraylike, expects the labels for each image (image classification) or each object (object detection).
+        If the labels are included in the metadata dictionary, pass in the key value.
+    continuous_factor_bins : Mapping[str, int] or Mapping[str, list[tuple[TNum, TNum]]] or None, default None
+        User provided dictionary specifying how to bin the continuous metadata factors
+    auto_bin_method : "uniform_width" or "uniform_count" or "clusters", default "uniform_width"
+        Method by which the function will automatically bin continuous metadata factors. It is recommended
+        that the user provide the bins through the `continuous_factor_bins`.
+    exclude : Iterable[str] or None, default None
+        User provided collection of metadata keys to exclude when processing metadata.
+
+    Returns
+    -------
+    MetadataOutput
+        Output class containing the binned metadata
+    """
+    # Transform metadata into single, flattened dictionary
+    metadata, image_repeats = merge_metadata(raw_metadata)
+
+    # Drop any excluded metadata keys
+    if exclude:
+        for k in list(metadata):
+            if k in exclude:
+                metadata.pop(k)
+
+    # Get the class label array in numeric form
+    class_array = as_numpy(metadata.pop(class_labels)) if isinstance(class_labels, str) else as_numpy(class_labels)
+    if class_array.ndim > 1:
+        raise ValueError(
+            f"Got class labels with {class_array.ndim}-dimensional "
+            f"shape {class_array.shape}, but expected a 1-dimensional array."
+        )
+    if not np.issubdtype(class_array.dtype, np.int_):
+        unique_classes, numerical_labels = np.unique(class_array, return_inverse=True)
+    else:
+        numerical_labels = class_array
+        unique_classes = np.unique(class_array)
+
+    # Bin according to user supplied bins
+    continuous_metadata = {}
+    discrete_metadata = {}
+    if continuous_factor_bins is not None and continuous_factor_bins != {}:
+        invalid_keys = set(continuous_factor_bins.keys()) - set(metadata.keys())
+        if invalid_keys:
+            raise KeyError(
+                f"The keys - {invalid_keys} - are present in the `continuous_factor_bins` dictionary "
+                "but are not keys in the `metadata` dictionary. Delete these keys from `continuous_factor_bins` "
+                "or add corresponding entries to the `metadata` dictionary."
+            )
+        for factor, grouping in continuous_factor_bins.items():
+            discrete_metadata[factor] = _user_defined_bin(metadata[factor], grouping)
+            continuous_metadata[factor] = metadata[factor]
+
+    # Determine category of the rest of the keys
+    remaining_keys = set(metadata.keys()) - set(continuous_metadata.keys())
+    for key in remaining_keys:
+        data = to_numpy(metadata[key])
+        if np.issubdtype(data.dtype, np.number):
+            result = _is_continuous(data, image_repeats)
+            if result:
+                continuous_metadata[key] = data
+            unique_samples, ordinal_data = np.unique(data, return_inverse=True)
+            if unique_samples.size <= np.max([20, data.size * 0.01]):
+                discrete_metadata[key] = ordinal_data
+            else:
+                warnings.warn(
+                    f"A user defined binning was not provided for {key}. "
+                    f"Using the {auto_bin_method} method to discretize the data. "
+                    "It is recommended that the user rerun and supply the desired "
+                    "bins using the continuous_factor_bins parameter.",
+                    UserWarning,
+                )
+                discrete_metadata[key] = _binning_function(data, auto_bin_method)
+        else:
+            _, discrete_metadata[key] = np.unique(data, return_inverse=True)
+
+    # splitting out the dictionaries into the keys and values
+    discrete_factor_names = list(discrete_metadata.keys())
+    discrete_data = np.stack(list(discrete_metadata.values()), axis=-1)
+    continuous_factor_names = list(continuous_metadata.keys())
+    continuous_data = np.stack(list(continuous_metadata.values()), axis=-1) if continuous_metadata else None
+    total_num_factors = len(discrete_factor_names + continuous_factor_names) + 1
+
+    return MetadataOutput(
+        discrete_factor_names,
+        discrete_data,
+        continuous_factor_names,
+        continuous_data,
+        numerical_labels,
+        unique_classes,
+        total_num_factors,
+    )
```
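Note: a usage sketch for `metadata_preprocessing` (keys and values are hypothetical; the import path is assumed as in the earlier example):

```python
from dataeval.metrics.bias import metadata_preprocessing

raw_metadata = [
    {"class": "cat", "altitude": 102.5, "camera": "A"},
    {"class": "dog", "altitude": 151.9, "camera": "B"},
    {"class": "cat", "altitude": 187.0, "camera": "A"},
]

md = metadata_preprocessing(
    raw_metadata,
    class_labels="class",                    # popped from the metadata and factorized
    continuous_factor_bins={"altitude": 2},  # explicit bins avoid the auto-binning warning
    exclude=["camera"],                      # dropped before any binning
)
print(md.discrete_factor_names)  # ['altitude']
print(md.discrete_data.T)        # binned altitude values (1-based bin ids)
print(md.continuous_data.T)      # the original altitude values are kept as well
print(md.total_num_factors)      # 1 discrete + 1 continuous + 1 for class = 3
```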
```diff
+
+
+def _user_defined_bin(data: list[Any] | NDArray[Any], binning: int | list[tuple[TNum, TNum]]) -> NDArray[np.intp]:
+    """
+    Digitizes a list of values into a given number of bins.
+
+    Parameters
+    ----------
+    data : list | NDArray
+        The values to be digitized.
+    binning : int | list[tuple[TNum, TNum]]
+        The number of bins for the discrete values that data will be digitized into.
+
+    Returns
+    -------
+    NDArray[np.intp]
+        The digitized values
+    """
+
+    if not np.all([np.issubdtype(type(n), np.number) for n in data]):
+        raise TypeError(
+            "Encountered a data value with non-numeric type when digitizing a factor. "
+            "Ensure all occurrences of continuous factors are numeric types."
+        )
+    if type(binning) is int:
+        _, bin_edges = np.histogram(data, bins=binning)
+        bin_edges[-1] = np.inf
+        bin_edges[0] = -np.inf
+    else:
+        bin_edges = binning
+    return np.digitize(data, bin_edges)
```
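Note: `_user_defined_bin` widens the outermost histogram edges to ±inf, so values outside the observed range still land in the first or last bin. A small numpy-only sketch of the integer-`binning` path:

```python
import numpy as np

data = [3.1, 0.2, 9.7, 5.0]
_, bin_edges = np.histogram(data, bins=3)      # edges span [0.2, 9.7]
bin_edges[0], bin_edges[-1] = -np.inf, np.inf  # open-ended outer bins
print(np.digitize(data, bin_edges))            # [1 1 3 2] -- 1-based bin ids
print(np.digitize([-50.0, 99.0], bin_edges))   # [1 3] -- out-of-range values are clamped
```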
```diff
+
+
+def _binning_function(data: NDArray[Any], bin_method: str) -> NDArray[np.int_]:
+    """
+    Bins continuous data through either equal width bins, equal amounts in each bin, or by clusters.
+    """
+    if bin_method == "clusters":
+        # bin_edges = _binning_by_clusters(data)
+        warnings.warn(
+            "Binning by clusters is currently unavailable until changes to the clustering function go through.",
+            UserWarning,
+        )
+        bin_method = "uniform_width"
+
+    if bin_method != "clusters":
+        counts, bin_edges = np.histogram(data, bins="auto")
+        n_bins = counts.size
+        if counts[counts > 0].min() < 10:
+            for _ in range(20):
+                n_bins -= 1
+                counts, bin_edges = np.histogram(data, bins=n_bins)
+                if counts[counts > 0].min() >= 10 or n_bins < 2:
+                    break
+
+        if bin_method == "uniform_count":
+            quantiles = np.linspace(0, 100, n_bins + 1)
+            bin_edges = np.asarray(np.percentile(data, quantiles))
+
+    bin_edges[0] = -np.inf  # type: ignore # until the clusters speed up is merged
+    bin_edges[-1] = np.inf  # type: ignore # and the _binning_by_clusters can be uncommented
+    return np.digitize(data, bin_edges)  # type: ignore
```
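Note: for "uniform_count", `_binning_function` keeps the automatically chosen number of bins but swaps the equal-width edges for evenly spaced percentiles. A sketch of that path (names are local to this example):

```python
import numpy as np

rng = np.random.default_rng(1)
data = rng.exponential(size=200)   # skewed data: equal-width bins would be very uneven

counts, _ = np.histogram(data, bins="auto")
n_bins = counts.size

# "uniform_count": roughly the same number of samples per bin
quantiles = np.linspace(0, 100, n_bins + 1)
bin_edges = np.asarray(np.percentile(data, quantiles))
bin_edges[0], bin_edges[-1] = -np.inf, np.inf

binned = np.digitize(data, bin_edges)
print(np.bincount(binned)[1:])     # near-equal occupancy across all bins
```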
```diff
+
+
+def _is_continuous(data: NDArray[np.number], image_indicies: NDArray[np.number]) -> bool:
+    """
+    Determines whether the data is continuous or discrete using the Wasserstein distance.
+
+    Given a 1D sample, we consider the intervals between adjacent points. For a continuous distribution,
+    a point is equally likely to lie anywhere in the interval bounded by its two neighbors. Furthermore,
+    we can put all "between neighbor" locations on the same scale of 0 to 1 by subtracting the smaller
+    neighbor and dividing out the length of the interval. (Duplicates are either assigned to zero or
+    ignored, depending on context). These normalized locations will be much more uniformly distributed
+    for continuous data than for discrete, and this gives us a way to distinguish them. Call this the
+    Normalized Near Neighbor distribution (NNN), defined on the interval [0,1].
+
+    The Wasserstein distance is available in scipy.stats.wasserstein_distance. We can use it to measure
+    how close the NNN is to a uniform distribution over [0,1]. We found that as long as a sample has at
+    least 20 points, and furthermore at least half as many points as there are discrete values, we can
+    reliably distinguish discrete from continuous samples by testing that the Wasserstein distance
+    measured from a uniform distribution is greater or less than 0.054, respectively.
+    """
+    # Check if the metadata is image specific
+    _, data_indicies_unsorted = np.unique(data, return_index=True)
+    if data_indicies_unsorted.size == image_indicies.size:
+        data_indicies = np.sort(data_indicies_unsorted)
+        if (data_indicies == image_indicies).all():
+            data = data[data_indicies]
+
+    # OLD METHOD
+    # uvals = np.unique(data)
+    # pct_unique = uvals.size / data.size
+    # return pct_unique < threshold
+
+    n_examples = len(data)
+
+    if n_examples < CONTINUOUS_MIN_SAMPLE_SIZE:
+        warnings.warn(
+            f"All samples look discrete with so few data points (< {CONTINUOUS_MIN_SAMPLE_SIZE})", UserWarning
+        )
+        return False
+
+    # Require at least 3 unique values before bothering with NNN
+    xu = np.unique(data, axis=None)
+    if xu.size < 3:
+        return False
+
+    Xs = np.sort(data)
+
+    X0, X1 = Xs[0:-2], Xs[2:]  # left and right neighbors
+
+    dx = np.zeros(n_examples - 2)  # no dx at end points
+    gtz = (X1 - X0) > 0  # check for dups; dx will be zero for them
+    dx[np.logical_not(gtz)] = 0.0
+
+    dx[gtz] = (Xs[1:-1] - X0)[gtz] / (X1 - X0)[gtz]  # the core idea: dx is NNN samples.
+
+    shift = wd(dx, np.linspace(0, 1, dx.size))  # how far is dx from uniform, for this feature?
+
+    return shift < DISCRETE_MIN_WD  # if NNN is close enough to uniform, consider the sample continuous.
```