dataeval 0.64.0__py3-none-any.whl → 0.66.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +13 -9
- dataeval/_internal/detectors/clusterer.py +63 -49
- dataeval/_internal/detectors/drift/base.py +248 -51
- dataeval/_internal/detectors/drift/cvm.py +28 -26
- dataeval/_internal/detectors/drift/ks.py +31 -28
- dataeval/_internal/detectors/drift/mmd.py +62 -42
- dataeval/_internal/detectors/drift/torch.py +69 -60
- dataeval/_internal/detectors/drift/uncertainty.py +32 -32
- dataeval/_internal/detectors/duplicates.py +67 -31
- dataeval/_internal/detectors/ood/ae.py +15 -29
- dataeval/_internal/detectors/ood/aegmm.py +33 -27
- dataeval/_internal/detectors/ood/base.py +86 -47
- dataeval/_internal/detectors/ood/llr.py +34 -31
- dataeval/_internal/detectors/ood/vae.py +32 -31
- dataeval/_internal/detectors/ood/vaegmm.py +34 -28
- dataeval/_internal/detectors/{linter.py → outliers.py} +60 -38
- dataeval/_internal/flags.py +44 -21
- dataeval/_internal/interop.py +5 -3
- dataeval/_internal/metrics/balance.py +42 -5
- dataeval/_internal/metrics/ber.py +11 -8
- dataeval/_internal/metrics/coverage.py +15 -8
- dataeval/_internal/metrics/divergence.py +41 -7
- dataeval/_internal/metrics/diversity.py +57 -19
- dataeval/_internal/metrics/parity.py +141 -66
- dataeval/_internal/metrics/stats.py +330 -313
- dataeval/_internal/metrics/uap.py +33 -4
- dataeval/_internal/metrics/utils.py +79 -40
- dataeval/_internal/models/pytorch/autoencoder.py +127 -22
- dataeval/_internal/models/tensorflow/autoencoder.py +33 -30
- dataeval/_internal/models/tensorflow/gmm.py +4 -2
- dataeval/_internal/models/tensorflow/losses.py +17 -13
- dataeval/_internal/models/tensorflow/pixelcnn.py +19 -18
- dataeval/_internal/models/tensorflow/trainer.py +10 -7
- dataeval/_internal/models/tensorflow/utils.py +23 -20
- dataeval/_internal/output.py +85 -0
- dataeval/_internal/utils.py +5 -3
- dataeval/_internal/workflows/sufficiency.py +122 -121
- dataeval/detectors/__init__.py +6 -25
- dataeval/detectors/drift/__init__.py +16 -0
- dataeval/detectors/drift/kernels/__init__.py +6 -0
- dataeval/detectors/drift/updates/__init__.py +3 -0
- dataeval/detectors/linters/__init__.py +5 -0
- dataeval/detectors/ood/__init__.py +11 -0
- dataeval/flags/__init__.py +2 -2
- dataeval/metrics/__init__.py +2 -26
- dataeval/metrics/bias/__init__.py +14 -0
- dataeval/metrics/estimators/__init__.py +9 -0
- dataeval/metrics/stats/__init__.py +6 -0
- dataeval/tensorflow/__init__.py +3 -0
- dataeval/tensorflow/loss/__init__.py +3 -0
- dataeval/tensorflow/models/__init__.py +5 -0
- dataeval/tensorflow/recon/__init__.py +3 -0
- dataeval/torch/__init__.py +3 -0
- dataeval/{models/torch → torch/models}/__init__.py +1 -2
- dataeval/torch/trainer/__init__.py +3 -0
- dataeval/utils/__init__.py +3 -6
- dataeval/workflows/__init__.py +2 -4
- {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/METADATA +1 -1
- dataeval-0.66.0.dist-info/RECORD +72 -0
- dataeval/_internal/metrics/base.py +0 -10
- dataeval/models/__init__.py +0 -15
- dataeval/models/tensorflow/__init__.py +0 -6
- dataeval-0.64.0.dist-info/RECORD +0 -60
- {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/WHEEL +0 -0
@@ -1,48 +1,41 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
1
3
|
import warnings
|
2
|
-
from
|
4
|
+
from dataclasses import dataclass
|
5
|
+
from typing import Generic, Mapping, TypeVar
|
3
6
|
|
4
7
|
import numpy as np
|
5
8
|
from numpy.typing import ArrayLike, NDArray
|
6
9
|
from scipy.stats import chi2_contingency, chisquare
|
7
10
|
|
8
11
|
from dataeval._internal.interop import to_numpy
|
12
|
+
from dataeval._internal.output import OutputMetadata, set_metadata
|
9
13
|
|
10
|
-
|
11
|
-
class ParityOutput(NamedTuple):
|
12
|
-
"""
|
13
|
-
Attributes
|
14
|
-
----------
|
15
|
-
score : np.float64
|
16
|
-
chi-squared value of the test
|
17
|
-
p_value : np.float64
|
18
|
-
p-value of the test
|
19
|
-
"""
|
20
|
-
|
21
|
-
score: np.float64
|
22
|
-
p_value: np.float64
|
14
|
+
TData = TypeVar("TData", np.float64, NDArray[np.float64])
|
23
15
|
|
24
16
|
|
25
|
-
|
17
|
+
@dataclass(frozen=True)
|
18
|
+
class ParityOutput(Generic[TData], OutputMetadata):
|
26
19
|
"""
|
27
20
|
Attributes
|
28
21
|
----------
|
29
|
-
|
30
|
-
chi-squared
|
31
|
-
|
32
|
-
p-
|
22
|
+
score : np.float64 | NDArray[np.float64]
|
23
|
+
chi-squared score(s) of the test
|
24
|
+
p_value : np.float64 | NDArray[np.float64]
|
25
|
+
p-value(s) of the test
|
33
26
|
"""
|
34
27
|
|
35
|
-
score:
|
36
|
-
p_value:
|
28
|
+
score: TData
|
29
|
+
p_value: TData
|
37
30
|
|
38
31
|
|
39
|
-
def digitize_factor_bins(continuous_values:
|
32
|
+
def digitize_factor_bins(continuous_values: NDArray, bins: int, factor_name: str) -> NDArray:
|
40
33
|
"""
|
41
34
|
Digitizes a list of values into a given number of bins.
|
42
35
|
|
43
36
|
Parameters
|
44
37
|
----------
|
45
|
-
continuous_values:
|
38
|
+
continuous_values: NDArray
|
46
39
|
The values to be digitized.
|
47
40
|
bins: int
|
48
41
|
The number of bins for the discrete values that continuous_values will be digitized into.
|
@@ -51,10 +44,10 @@ def digitize_factor_bins(continuous_values: np.ndarray, bins: int, factor_name:
|
|
51
44
|
|
52
45
|
Returns
|
53
46
|
-------
|
54
|
-
|
47
|
+
NDArray
|
55
48
|
The digitized values
|
56
|
-
|
57
49
|
"""
|
50
|
+
|
58
51
|
if not np.all([np.issubdtype(type(n), np.number) for n in continuous_values]):
|
59
52
|
raise TypeError(
|
60
53
|
f"Encountered a non-numeric value for factor {factor_name}, but the factor"
|
@@ -69,14 +62,14 @@ def digitize_factor_bins(continuous_values: np.ndarray, bins: int, factor_name:
|
|
69
62
|
|
70
63
|
|
71
64
|
def format_discretize_factors(
|
72
|
-
data_factors: dict[str,
|
73
|
-
) ->
|
65
|
+
data_factors: dict[str, NDArray], continuous_factor_bincounts: dict[str, int]
|
66
|
+
) -> tuple[dict[str, NDArray], NDArray]:
|
74
67
|
"""
|
75
68
|
Sets up the internal list of metadata factors.
|
76
69
|
|
77
70
|
Parameters
|
78
71
|
----------
|
79
|
-
data_factors: Dict[str,
|
72
|
+
data_factors: Dict[str, NDArray]
|
80
73
|
The dataset factors, which are per-image attributes including class label and metadata.
|
81
74
|
Each key of dataset_factors is a factor, whose value is the per-image factor values.
|
82
75
|
continuous_factor_bincounts : Dict[str, int]
|
@@ -87,12 +80,12 @@ def format_discretize_factors(
|
|
87
80
|
|
88
81
|
Returns
|
89
82
|
-------
|
90
|
-
Dict[str,
|
91
|
-
Intrinsic per-image metadata information with the formatting that input data_factors uses.
|
92
|
-
|
93
|
-
|
94
|
-
Per-image labels, whose ith element is the label for the ith element of the dataset.
|
83
|
+
Tuple[Dict[str, NDArray], NDArray]
|
84
|
+
- Intrinsic per-image metadata information with the formatting that input data_factors uses.
|
85
|
+
Each key is a metadata factor, whose value is the discrete per-image factor values.
|
86
|
+
- Per-image labels, whose ith element is the label for the ith element of the dataset.
|
95
87
|
"""
|
88
|
+
|
96
89
|
invalid_keys = set(continuous_factor_bincounts.keys()) - set(data_factors.keys())
|
97
90
|
if invalid_keys:
|
98
91
|
raise KeyError(
|
@@ -123,7 +116,36 @@ def format_discretize_factors(
|
|
123
116
|
return metadata_factors, labels
|
124
117
|
|
125
118
|
|
126
|
-
def normalize_expected_dist(expected_dist:
|
119
|
+
def normalize_expected_dist(expected_dist: NDArray, observed_dist: NDArray) -> NDArray:
|
120
|
+
"""
|
121
|
+
Normalize the expected label distribution to match the total number of labels in the observed distribution.
|
122
|
+
|
123
|
+
This function adjusts the expected distribution so that its sum equals the sum of the observed distribution.
|
124
|
+
If the expected distribution is all zeros, an error is raised.
|
125
|
+
|
126
|
+
Parameters
|
127
|
+
----------
|
128
|
+
expected_dist : np.ndarray
|
129
|
+
The expected label distribution. This array represents the anticipated distribution of labels.
|
130
|
+
observed_dist : np.ndarray
|
131
|
+
The observed label distribution. This array represents the actual distribution of labels in the dataset.
|
132
|
+
|
133
|
+
Returns
|
134
|
+
-------
|
135
|
+
np.ndarray
|
136
|
+
The normalized expected distribution, scaled to have the same sum as the observed distribution.
|
137
|
+
|
138
|
+
Raises
|
139
|
+
------
|
140
|
+
ValueError
|
141
|
+
If the expected distribution is all zeros.
|
142
|
+
|
143
|
+
Notes
|
144
|
+
-----
|
145
|
+
The function ensures that the total number of labels in the expected distribution matches the total
|
146
|
+
number of labels in the observed distribution by scaling the expected distribution.
|
147
|
+
"""
|
148
|
+
|
127
149
|
exp_sum = np.sum(expected_dist)
|
128
150
|
obs_sum = np.sum(observed_dist)
|
129
151
|
|
@@ -141,14 +163,14 @@ def normalize_expected_dist(expected_dist: np.ndarray, observed_dist: np.ndarray
|
|
141
163
|
return expected_dist
|
142
164
|
|
143
165
|
|
144
|
-
def validate_dist(label_dist:
|
166
|
+
def validate_dist(label_dist: NDArray, label_name: str):
|
145
167
|
"""
|
146
168
|
Verifies that the given label distribution has labels and checks if
|
147
169
|
any labels have frequencies less than 5.
|
148
170
|
|
149
171
|
Parameters
|
150
172
|
----------
|
151
|
-
label_dist :
|
173
|
+
label_dist : NDArray
|
152
174
|
Array representing label distributions
|
153
175
|
|
154
176
|
Raises
|
@@ -158,6 +180,7 @@ def validate_dist(label_dist: np.ndarray, label_name: str):
|
|
158
180
|
Warning
|
159
181
|
If any elements of label_dist are less than 5
|
160
182
|
"""
|
183
|
+
|
161
184
|
if not len(label_dist):
|
162
185
|
raise ValueError(f"No labels found in the {label_name} dataset")
|
163
186
|
if np.any(label_dist < 5):
|
@@ -166,24 +189,20 @@ def validate_dist(label_dist: np.ndarray, label_name: str):
|
|
166
189
|
" dataset have frequencies less than 5. This may lead"
|
167
190
|
" to invalid chi-squared evaluation."
|
168
191
|
)
|
169
|
-
warnings.warn(
|
170
|
-
f"Labels {np.where(label_dist<5)[0]} in {label_name}"
|
171
|
-
" dataset have frequencies less than 5. This may lead"
|
172
|
-
" to invalid chi-squared evaluation."
|
173
|
-
)
|
174
192
|
|
175
193
|
|
176
|
-
|
194
|
+
@set_metadata("dataeval.metrics")
|
195
|
+
def label_parity(
|
177
196
|
expected_labels: ArrayLike,
|
178
197
|
observed_labels: ArrayLike,
|
179
|
-
num_classes:
|
180
|
-
) -> ParityOutput:
|
198
|
+
num_classes: int | None = None,
|
199
|
+
) -> ParityOutput[np.float64]:
|
181
200
|
"""
|
182
|
-
|
183
|
-
tests the null hypothesis that the observed data has the expected frequencies.
|
201
|
+
Calculate the chi-square statistic to assess the parity between expected and observed label distributions.
|
184
202
|
|
185
|
-
This function
|
186
|
-
|
203
|
+
This function computes the frequency distribution of classes in both expected and observed labels, normalizes
|
204
|
+
the expected distribution to match the total number of observed labels, and then calculates the chi-square
|
205
|
+
statistic to determine if there is a significant difference between the two distributions.
|
187
206
|
|
188
207
|
Parameters
|
189
208
|
----------
|
@@ -191,9 +210,9 @@ def parity(
|
|
191
210
|
List of class labels in the expected dataset
|
192
211
|
observed_labels : ArrayLike
|
193
212
|
List of class labels in the observed dataset
|
194
|
-
num_classes :
|
195
|
-
The number of unique classes in the datasets. If
|
196
|
-
|
213
|
+
num_classes : int | None, default None
|
214
|
+
The number of unique classes in the datasets. If not provided, the function will infer it
|
215
|
+
from the set of unique labels in expected_labels and observed_labels
|
197
216
|
|
198
217
|
Returns
|
199
218
|
-------
|
@@ -203,8 +222,31 @@ def parity(
|
|
203
222
|
Raises
|
204
223
|
------
|
205
224
|
ValueError
|
206
|
-
If
|
225
|
+
If expected label distribution is empty, is all zeros, or if there is a mismatch in the number
|
226
|
+
of unique classes between the observed and expected distributions.
|
227
|
+
|
228
|
+
|
229
|
+
Notes
|
230
|
+
-----
|
231
|
+
- Providing ``num_classes`` can be helpful if there are classes with zero instances in one of the distributions.
|
232
|
+
- The function first validates the observed distribution and normalizes the expected distribution so that it
|
233
|
+
has the same total number of labels as the observed distribution.
|
234
|
+
- It then performs a chi-square test to determine if there is a statistically significant difference between
|
235
|
+
the observed and expected label distributions.
|
236
|
+
- This function acts as an interface to the scipy.stats.chisquare method, which is documented at
|
237
|
+
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html
|
238
|
+
|
239
|
+
|
240
|
+
Examples
|
241
|
+
--------
|
242
|
+
Randomly creating some label distributions using ``np.random.default_rng``
|
243
|
+
|
244
|
+
>>> expected_labels = np_random_gen.choice([0, 1, 2, 3, 4], (100))
|
245
|
+
>>> observed_labels = np_random_gen.choice([2, 3, 0, 4, 1], (100))
|
246
|
+
>>> label_parity(expected_labels, observed_labels)
|
247
|
+
ParityOutput(score=14.007374204742625, p_value=0.0072715574616218)
|
207
248
|
"""
|
249
|
+
|
208
250
|
# Calculate
|
209
251
|
if not num_classes:
|
210
252
|
num_classes = 0
|
@@ -236,27 +278,28 @@ def parity(
|
|
236
278
|
return ParityOutput(cs, p)
|
237
279
|
|
238
280
|
|
239
|
-
|
281
|
+
@set_metadata("dataeval.metrics")
|
282
|
+
def parity(
|
240
283
|
data_factors: Mapping[str, ArrayLike],
|
241
|
-
continuous_factor_bincounts:
|
242
|
-
) ->
|
284
|
+
continuous_factor_bincounts: dict[str, int] | None = None,
|
285
|
+
) -> ParityOutput[NDArray[np.float64]]:
|
243
286
|
"""
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
287
|
+
Calculate chi-square statistics to assess the relationship between multiple factors and class labels.
|
288
|
+
|
289
|
+
This function computes the chi-square statistic for each metadata factor to determine if there is
|
290
|
+
a significant relationship between the factor values and class labels. The function handles both categorical
|
291
|
+
and discretized continuous factors.
|
249
292
|
|
250
293
|
Parameters
|
251
294
|
----------
|
252
295
|
data_factors: Mapping[str, ArrayLike]
|
253
296
|
The dataset factors, which are per-image attributes including class label and metadata.
|
254
297
|
Each key of dataset_factors is a factor, whose value is the per-image factor values.
|
255
|
-
continuous_factor_bincounts :
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
298
|
+
continuous_factor_bincounts : Dict[str, int] | None, default None
|
299
|
+
A dictionary specifying the number of bins for discretizing the continuous factors.
|
300
|
+
The keys should correspond to the names of continuous factors in `data_factors`,
|
301
|
+
and the values should be the number of bins to use for discretization.
|
302
|
+
If not provided, no discretization is applied.
|
260
303
|
|
261
304
|
Returns
|
262
305
|
-------
|
@@ -264,7 +307,39 @@ def parity_metadata(
|
|
264
307
|
Arrays of length (num_factors) whose (i)th element corresponds to the
|
265
308
|
chi-square score and p-value for the relationship between factor i and
|
266
309
|
the class labels in the dataset.
|
310
|
+
|
311
|
+
Raises
|
312
|
+
------
|
313
|
+
Warning
|
314
|
+
If any cell in the contingency matrix has a value between 0 and 5, a warning is issued because this can
|
315
|
+
lead to inaccurate chi-square calculations. It is recommended to ensure that each label co-occurs with
|
316
|
+
factor values either 0 times or at least 5 times. Alternatively, continuous-valued factors can be digitized
|
317
|
+
into fewer bins.
|
318
|
+
|
319
|
+
Notes
|
320
|
+
-----
|
321
|
+
- Each key of the ``continuous_factor_bincounts`` dictionary must occur as a key in data_factors.
|
322
|
+
- A high score with a low p-value suggests that a metadata factor is strongly correlated with a class label.
|
323
|
+
- The function creates a contingency matrix for each factor, where each entry represents the frequency of a
|
324
|
+
specific factor value co-occurring with a particular class label.
|
325
|
+
- Rows containing only zeros in the contingency matrix are removed before performing the chi-square test
|
326
|
+
to prevent errors in the calculation.
|
327
|
+
|
328
|
+
Examples
|
329
|
+
--------
|
330
|
+
Randomly creating some "continuous" and categorical variables using ``np.random.default_rng``
|
331
|
+
|
332
|
+
>>> data_factors = {
|
333
|
+
... "age": np_random_gen.choice([25, 30, 35, 45], (100)),
|
334
|
+
... "income": np_random_gen.choice([50000, 65000, 80000], (100)),
|
335
|
+
... "gender": np_random_gen.choice(["M", "F"], (100)),
|
336
|
+
... "class": np_random_gen.choice([0, 1, 2], (100)),
|
337
|
+
... }
|
338
|
+
>>> continuous_factor_bincounts = {"age": 4, "income": 3}
|
339
|
+
>>> parity(data_factors, continuous_factor_bincounts)
|
340
|
+
ParityOutput(score=array([2.82329785, 1.60625584, 1.38377236]), p_value=array([0.83067563, 0.80766733, 0.5006309 ]))
|
267
341
|
"""
|
342
|
+
|
268
343
|
data_factors_np = {k: to_numpy(v) for k, v in data_factors.items()}
|
269
344
|
continuous_factor_bincounts = continuous_factor_bincounts if continuous_factor_bincounts else {}
|
270
345
|
|
@@ -306,4 +381,4 @@ def parity_metadata(
|
|
306
381
|
chi_scores[i] = chi2
|
307
382
|
p_values[i] = p
|
308
383
|
|
309
|
-
return
|
384
|
+
return ParityOutput(chi_scores, p_values)
|