dataeval 0.72.2__py3-none-any.whl → 0.73.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +3 -3
- dataeval/detectors/__init__.py +1 -1
- dataeval/detectors/drift/__init__.py +1 -1
- dataeval/detectors/drift/base.py +2 -2
- dataeval/detectors/linters/clusterer.py +1 -1
- dataeval/detectors/ood/__init__.py +1 -1
- dataeval/detectors/ood/ae.py +14 -6
- dataeval/detectors/ood/aegmm.py +14 -6
- dataeval/detectors/ood/base.py +9 -3
- dataeval/detectors/ood/llr.py +22 -16
- dataeval/detectors/ood/vae.py +14 -6
- dataeval/detectors/ood/vaegmm.py +14 -6
- dataeval/interop.py +9 -7
- dataeval/metrics/bias/balance.py +50 -44
- dataeval/metrics/bias/coverage.py +38 -6
- dataeval/metrics/bias/diversity.py +117 -65
- dataeval/metrics/bias/metadata.py +225 -60
- dataeval/metrics/bias/parity.py +68 -54
- dataeval/utils/__init__.py +4 -3
- dataeval/utils/lazy.py +26 -0
- dataeval/utils/metadata.py +258 -0
- dataeval/utils/shared.py +1 -1
- dataeval/utils/split_dataset.py +12 -6
- dataeval/utils/tensorflow/_internal/gmm.py +8 -2
- dataeval/utils/tensorflow/_internal/loss.py +20 -11
- dataeval/utils/tensorflow/_internal/{pixelcnn.py → models.py} +371 -77
- dataeval/utils/tensorflow/_internal/trainer.py +12 -5
- dataeval/utils/tensorflow/_internal/utils.py +70 -71
- dataeval/utils/torch/datasets.py +2 -2
- dataeval/workflows/__init__.py +1 -1
- {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/METADATA +3 -3
- {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/RECORD +34 -33
- dataeval/utils/tensorflow/_internal/autoencoder.py +0 -316
- {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/WHEEL +0 -0
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
__all__ = ["CoverageOutput", "coverage"]
|
4
4
|
|
5
|
+
import contextlib
|
5
6
|
import math
|
6
7
|
from dataclasses import dataclass
|
7
8
|
from typing import Literal
|
@@ -11,9 +12,13 @@ from numpy.typing import ArrayLike, NDArray
|
|
11
12
|
from scipy.spatial.distance import pdist, squareform
|
12
13
|
|
13
14
|
from dataeval.interop import to_numpy
|
15
|
+
from dataeval.metrics.bias.metadata import coverage_plot
|
14
16
|
from dataeval.output import OutputMetadata, set_metadata
|
15
17
|
from dataeval.utils.shared import flatten
|
16
18
|
|
19
|
+
with contextlib.suppress(ImportError):
|
20
|
+
from matplotlib.figure import Figure
|
21
|
+
|
17
22
|
|
18
23
|
@dataclass(frozen=True)
|
19
24
|
class CoverageOutput(OutputMetadata):
|
@@ -22,9 +27,9 @@ class CoverageOutput(OutputMetadata):
|
|
22
27
|
|
23
28
|
Attributes
|
24
29
|
----------
|
25
|
-
indices : NDArray
|
30
|
+
indices : NDArray[np.intp]
|
26
31
|
Array of uncovered indices
|
27
|
-
radii : NDArray
|
32
|
+
radii : NDArray[np.float64]
|
28
33
|
Array of critical value radii
|
29
34
|
critical_value : float
|
30
35
|
Radius for :term:`coverage<Coverage>`
|
@@ -34,13 +39,40 @@ class CoverageOutput(OutputMetadata):
|
|
34
39
|
radii: NDArray[np.float64]
|
35
40
|
critical_value: float
|
36
41
|
|
42
|
+
def plot(self, images: ArrayLike, top_k: int = 6) -> Figure:
|
43
|
+
"""
|
44
|
+
Plot the top k images together for visualization
|
45
|
+
|
46
|
+
Parameters
|
47
|
+
----------
|
48
|
+
images : ArrayLike
|
49
|
+
Original images (not embeddings) in (N, C, H, W) or (N, H, W) format
|
50
|
+
top_k : int, default 6
|
51
|
+
Number of images to plot (plotting assumes groups of 3)
|
52
|
+
|
53
|
+
Returns
|
54
|
+
-------
|
55
|
+
matplotlib.figure.Figure
|
56
|
+
"""
|
57
|
+
# Determine which images to plot
|
58
|
+
highest_uncovered_indices = self.indices[:top_k]
|
59
|
+
|
60
|
+
# Grab the images
|
61
|
+
images = to_numpy(images)
|
62
|
+
selected_images = images[highest_uncovered_indices]
|
63
|
+
|
64
|
+
# Plot the images
|
65
|
+
fig = coverage_plot(selected_images, top_k)
|
66
|
+
|
67
|
+
return fig
|
68
|
+
|
37
69
|
|
38
70
|
@set_metadata()
|
39
71
|
def coverage(
|
40
72
|
embeddings: ArrayLike,
|
41
73
|
radius_type: Literal["adaptive", "naive"] = "adaptive",
|
42
74
|
k: int = 20,
|
43
|
-
percent:
|
75
|
+
percent: float = 0.01,
|
44
76
|
) -> CoverageOutput:
|
45
77
|
"""
|
46
78
|
Class for evaluating :term:`coverage<Coverage>` and identifying images/samples that are in undercovered regions.
|
@@ -50,12 +82,12 @@ def coverage(
|
|
50
82
|
embeddings : ArrayLike, shape - (N, P)
|
51
83
|
A dataset in an ArrayLike format.
|
52
84
|
Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
|
53
|
-
radius_type :
|
85
|
+
radius_type : {"adaptive", "naive"}, default "adaptive"
|
54
86
|
The function used to determine radius.
|
55
|
-
k: int, default 20
|
87
|
+
k : int, default 20
|
56
88
|
Number of observations required in order to be covered.
|
57
89
|
[1] suggests that a minimum of 20-50 samples is necessary.
|
58
|
-
percent:
|
90
|
+
percent : float, default 0.01
|
59
91
|
Percent of observations to be considered uncovered. Only applies to adaptive radius.
|
60
92
|
|
61
93
|
Returns
|
@@ -2,16 +2,28 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
__all__ = ["DiversityOutput", "diversity"]
|
4
4
|
|
5
|
+
import contextlib
|
5
6
|
from dataclasses import dataclass
|
6
7
|
from typing import Any, Literal, Mapping
|
7
8
|
|
8
9
|
import numpy as np
|
9
10
|
from numpy.typing import ArrayLike, NDArray
|
10
11
|
|
11
|
-
from dataeval.metrics.bias.metadata import
|
12
|
+
from dataeval.metrics.bias.metadata import (
|
13
|
+
CLASS_LABEL,
|
14
|
+
diversity_bar_plot,
|
15
|
+
entropy,
|
16
|
+
get_counts,
|
17
|
+
get_num_bins,
|
18
|
+
heatmap,
|
19
|
+
preprocess_metadata,
|
20
|
+
)
|
12
21
|
from dataeval.output import OutputMetadata, set_metadata
|
13
22
|
from dataeval.utils.shared import get_method
|
14
23
|
|
24
|
+
with contextlib.suppress(ImportError):
|
25
|
+
from matplotlib.figure import Figure
|
26
|
+
|
15
27
|
|
16
28
|
@dataclass(frozen=True)
|
17
29
|
class DiversityOutput(OutputMetadata):
|
@@ -24,50 +36,63 @@ class DiversityOutput(OutputMetadata):
|
|
24
36
|
:term:`Diversity` index for classes and factors
|
25
37
|
classwise : NDArray[np.float64]
|
26
38
|
Classwise diversity index [n_class x n_factor]
|
27
|
-
class_list: NDArray[np.int64]
|
39
|
+
class_list : NDArray[np.int64]
|
28
40
|
Class labels for each value in the dataset
|
29
|
-
metadata_names: list[str]
|
41
|
+
metadata_names : list[str]
|
30
42
|
Names of each metadata factor
|
31
43
|
"""
|
32
44
|
|
33
45
|
diversity_index: NDArray[np.float64]
|
34
46
|
classwise: NDArray[np.float64]
|
35
|
-
|
36
|
-
class_list: NDArray[np.int64]
|
47
|
+
class_list: NDArray[Any]
|
37
48
|
metadata_names: list[str]
|
38
49
|
|
39
|
-
|
40
|
-
|
41
|
-
|
50
|
+
def plot(
|
51
|
+
self,
|
52
|
+
row_labels: ArrayLike | list[Any] | None = None,
|
53
|
+
col_labels: ArrayLike | list[Any] | None = None,
|
54
|
+
plot_classwise: bool = False,
|
55
|
+
) -> Figure:
|
42
56
|
"""
|
43
57
|
Plot a heatmap of diversity information
|
44
58
|
|
45
59
|
Parameters
|
46
60
|
----------
|
47
|
-
row_labels:
|
48
|
-
Array containing the labels for rows in the histogram
|
49
|
-
col_labels:
|
50
|
-
Array containing the labels for columns in the histogram
|
61
|
+
row_labels : ArrayLike or None, default None
|
62
|
+
List/Array containing the labels for rows in the histogram
|
63
|
+
col_labels : ArrayLike or None, default None
|
64
|
+
List/Array containing the labels for columns in the histogram
|
65
|
+
plot_classwise : bool, default False
|
66
|
+
Whether to plot per-class diversity instead of global diversity
|
51
67
|
"""
|
52
|
-
if
|
53
|
-
row_labels
|
54
|
-
|
55
|
-
col_labels
|
68
|
+
if plot_classwise:
|
69
|
+
if row_labels is None:
|
70
|
+
row_labels = self.class_list
|
71
|
+
if col_labels is None:
|
72
|
+
col_labels = self.metadata_names
|
73
|
+
|
74
|
+
fig = heatmap(
|
75
|
+
self.classwise,
|
76
|
+
row_labels,
|
77
|
+
col_labels,
|
78
|
+
xlabel="Factors",
|
79
|
+
ylabel="Class",
|
80
|
+
cbarlabel=f"Normalized {self.meta()['arguments']['method'].title()} Index",
|
81
|
+
)
|
56
82
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
)
|
83
|
+
else:
|
84
|
+
# Creating label array for heat map axes
|
85
|
+
heat_labels = np.concatenate((["class"], self.metadata_names))
|
86
|
+
|
87
|
+
fig = diversity_bar_plot(heat_labels, self.diversity_index)
|
88
|
+
|
89
|
+
return fig
|
65
90
|
|
66
91
|
|
67
92
|
def diversity_shannon(
|
68
93
|
data: NDArray[Any],
|
69
94
|
names: list[str],
|
70
|
-
|
95
|
+
continuous_factor_bincounts: Mapping[str, int] | None = None,
|
71
96
|
subset_mask: NDArray[np.bool_] | None = None,
|
72
97
|
) -> NDArray[np.float64]:
|
73
98
|
"""
|
@@ -81,14 +106,16 @@ def diversity_shannon(
|
|
81
106
|
|
82
107
|
Parameters
|
83
108
|
----------
|
84
|
-
data: NDArray
|
109
|
+
data : NDArray
|
85
110
|
Array containing numerical values for metadata factors
|
86
|
-
names: list[str]
|
111
|
+
names : list[str]
|
87
112
|
Names of metadata factors -- keys of the metadata dictionary
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
113
|
+
continuous_factor_bincounts : Mapping[str, int] or None, default None
|
114
|
+
The factors in names that have continuous values and the array of bin counts to
|
115
|
+
discretize values into. All factors are treated as having discrete values unless they
|
116
|
+
are specified as keys in this dictionary. Each element of this array must occur as a key
|
117
|
+
in names.
|
118
|
+
subset_mask : NDArray[np.bool_] or None, default None
|
92
119
|
Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
|
93
120
|
|
94
121
|
Note
|
@@ -97,18 +124,32 @@ def diversity_shannon(
|
|
97
124
|
|
98
125
|
Returns
|
99
126
|
-------
|
100
|
-
diversity_index: NDArray
|
127
|
+
diversity_index : NDArray[np.float64]
|
101
128
|
Diversity index per column of X
|
102
129
|
|
103
130
|
See Also
|
104
131
|
--------
|
105
132
|
numpy.histogram
|
106
133
|
"""
|
134
|
+
hist_cache = {}
|
107
135
|
|
108
136
|
# entropy computed using global auto bins so that we can properly normalize
|
109
|
-
ent_unnormalized = entropy(
|
137
|
+
ent_unnormalized = entropy(
|
138
|
+
data,
|
139
|
+
names,
|
140
|
+
continuous_factor_bincounts,
|
141
|
+
normalized=False,
|
142
|
+
subset_mask=subset_mask,
|
143
|
+
hist_cache=hist_cache,
|
144
|
+
)
|
110
145
|
# normalize by global counts rather than classwise counts
|
111
|
-
num_bins = get_num_bins(
|
146
|
+
num_bins = get_num_bins(
|
147
|
+
data,
|
148
|
+
names,
|
149
|
+
continuous_factor_bincounts=continuous_factor_bincounts,
|
150
|
+
subset_mask=subset_mask,
|
151
|
+
hist_cache=hist_cache,
|
152
|
+
)
|
112
153
|
ent_norm = np.empty(ent_unnormalized.shape)
|
113
154
|
ent_norm[num_bins != 1] = ent_unnormalized[num_bins != 1] / np.log(num_bins[num_bins != 1])
|
114
155
|
ent_norm[num_bins == 1] = 0
|
@@ -118,7 +159,7 @@ def diversity_shannon(
|
|
118
159
|
def diversity_simpson(
|
119
160
|
data: NDArray[Any],
|
120
161
|
names: list[str],
|
121
|
-
|
162
|
+
continuous_factor_bincounts: Mapping[str, int] | None = None,
|
122
163
|
subset_mask: NDArray[np.bool_] | None = None,
|
123
164
|
) -> NDArray[np.float64]:
|
124
165
|
"""
|
@@ -132,14 +173,16 @@ def diversity_simpson(
|
|
132
173
|
|
133
174
|
Parameters
|
134
175
|
----------
|
135
|
-
data: NDArray
|
176
|
+
data : NDArray
|
136
177
|
Array containing numerical values for metadata factors
|
137
|
-
names: list[str]
|
178
|
+
names : list[str]
|
138
179
|
Names of metadata factors -- keys of the metadata dictionary
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
180
|
+
continuous_factor_bincounts : Mapping[str, int] or None, default None
|
181
|
+
The factors in names that have continuous values and the array of bin counts to
|
182
|
+
discretize values into. All factors are treated as having discrete values unless they
|
183
|
+
are specified as keys in this dictionary. Each element of this array must occur as a key
|
184
|
+
in names.
|
185
|
+
subset_mask : NDArray[np.bool_] or None, default None
|
143
186
|
Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
|
144
187
|
|
145
188
|
Note
|
@@ -150,35 +193,39 @@ def diversity_simpson(
|
|
150
193
|
|
151
194
|
Returns
|
152
195
|
-------
|
153
|
-
NDArray
|
196
|
+
diversity_index : NDArray[np.float64]
|
154
197
|
Diversity index per column of X
|
155
198
|
|
156
199
|
See Also
|
157
200
|
--------
|
158
201
|
numpy.histogram
|
159
202
|
"""
|
203
|
+
hist_cache = {}
|
160
204
|
|
161
|
-
hist_counts
|
205
|
+
hist_counts = get_counts(data, names, continuous_factor_bincounts, subset_mask, hist_cache=hist_cache)
|
162
206
|
# normalize by global counts, not classwise counts
|
163
|
-
num_bins = get_num_bins(data, names,
|
207
|
+
num_bins = get_num_bins(data, names, continuous_factor_bincounts, hist_cache=hist_cache)
|
164
208
|
|
165
209
|
ev_index = np.empty(len(names))
|
166
210
|
# loop over columns for convenience
|
167
211
|
for col, cnts in enumerate(hist_counts.values()):
|
168
212
|
# relative frequencies
|
169
|
-
p_i = cnts /
|
213
|
+
p_i = cnts / np.sum(cnts)
|
170
214
|
# inverse Simpson index normalized by (number of bins)
|
171
|
-
s_0 = 1 / np.sum(p_i**2) / num_bins[col]
|
215
|
+
s_0 = 1 / np.sum(p_i**2) # / num_bins[col]
|
172
216
|
if num_bins[col] == 1:
|
173
217
|
ev_index[col] = 0
|
174
218
|
else:
|
175
|
-
ev_index[col] = (s_0
|
219
|
+
ev_index[col] = (s_0 - 1) / (num_bins[col] - 1)
|
176
220
|
return ev_index
|
177
221
|
|
178
222
|
|
179
223
|
@set_metadata()
|
180
224
|
def diversity(
|
181
|
-
class_labels: ArrayLike,
|
225
|
+
class_labels: ArrayLike,
|
226
|
+
metadata: Mapping[str, ArrayLike],
|
227
|
+
continuous_factor_bincounts: Mapping[str, int] | None = None,
|
228
|
+
method: Literal["simpson", "shannon"] = "simpson",
|
182
229
|
) -> DiversityOutput:
|
183
230
|
"""
|
184
231
|
Compute :term:`diversity<Diversity>` and classwise diversity for discrete/categorical variables and,
|
@@ -191,11 +238,16 @@ def diversity(
|
|
191
238
|
|
192
239
|
Parameters
|
193
240
|
----------
|
194
|
-
class_labels: ArrayLike
|
241
|
+
class_labels : ArrayLike
|
195
242
|
List of class labels for each image
|
196
|
-
metadata: Mapping[str, ArrayLike]
|
243
|
+
metadata : Mapping[str, ArrayLike]
|
197
244
|
Dict of list of metadata factors for each image
|
198
|
-
|
245
|
+
continuous_factor_bincounts : Mapping[str, int] or None, default None
|
246
|
+
The factors in metadata that have continuous values and the array of bin counts to
|
247
|
+
discretize values into. All factors are treated as having discrete values unless they
|
248
|
+
are specified as keys in this dictionary. Each element of this array must occur as a key
|
249
|
+
in metadata.
|
250
|
+
method : {"simpson", "shannon"}, default "simpson"
|
199
251
|
Indicates which diversity index should be computed
|
200
252
|
|
201
253
|
Note
|
@@ -214,34 +266,34 @@ def diversity(
|
|
214
266
|
-------
|
215
267
|
Compute Simpson diversity index of metadata and class labels
|
216
268
|
|
217
|
-
>>> div_simp = diversity(class_labels, metadata, method="simpson")
|
269
|
+
>>> div_simp = diversity(class_labels, metadata, continuous_factor_bincounts, method="simpson")
|
218
270
|
>>> div_simp.diversity_index
|
219
|
-
array([0.
|
271
|
+
array([0.72413793, 0.72413793, 0.88636364])
|
220
272
|
|
221
273
|
>>> div_simp.classwise
|
222
|
-
array([[0.
|
223
|
-
[0.
|
274
|
+
array([[0.68965517, 0.69230769],
|
275
|
+
[0.8 , 1. ]])
|
224
276
|
|
225
277
|
Compute Shannon diversity index of metadata and class labels
|
226
278
|
|
227
|
-
>>> div_shan = diversity(class_labels, metadata, method="shannon")
|
279
|
+
>>> div_shan = diversity(class_labels, metadata, continuous_factor_bincounts, method="shannon")
|
228
280
|
>>> div_shan.diversity_index
|
229
|
-
array([0.
|
281
|
+
array([0.8812909 , 0.8812909 , 0.96748876])
|
230
282
|
|
231
283
|
>>> div_shan.classwise
|
232
|
-
array([[0.
|
233
|
-
[0.
|
284
|
+
array([[0.86312057, 0.91651644],
|
285
|
+
[0.91829583, 1. ]])
|
234
286
|
|
235
287
|
See Also
|
236
288
|
--------
|
237
289
|
numpy.histogram
|
238
290
|
"""
|
239
291
|
diversity_fn = get_method({"simpson": diversity_simpson, "shannon": diversity_shannon}, method)
|
240
|
-
data, names,
|
241
|
-
diversity_index = diversity_fn(data, names,
|
292
|
+
data, names, _, unique_labels = preprocess_metadata(class_labels, metadata)
|
293
|
+
diversity_index = diversity_fn(data, names, continuous_factor_bincounts)
|
242
294
|
|
243
|
-
class_idx = names.index(
|
244
|
-
class_lbl =
|
295
|
+
class_idx = names.index(CLASS_LABEL)
|
296
|
+
class_lbl = data[:, class_idx]
|
245
297
|
|
246
298
|
u_classes = np.unique(class_lbl)
|
247
299
|
num_factors = len(names)
|
@@ -249,7 +301,7 @@ def diversity(
|
|
249
301
|
diversity[:] = np.nan
|
250
302
|
for idx, cls in enumerate(u_classes):
|
251
303
|
subset_mask = class_lbl == cls
|
252
|
-
diversity[idx, :] = diversity_fn(data, names,
|
304
|
+
diversity[idx, :] = diversity_fn(data, names, continuous_factor_bincounts, subset_mask)
|
253
305
|
div_no_class = np.concatenate((diversity[:, :class_idx], diversity[:, (class_idx + 1) :]), axis=1)
|
254
306
|
|
255
|
-
return DiversityOutput(diversity_index, div_no_class,
|
307
|
+
return DiversityOutput(diversity_index, div_no_class, unique_labels, list(metadata.keys()))
|