dataeval 0.63.0__py3-none-any.whl → 0.65.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. dataeval/__init__.py +4 -4
  2. dataeval/_internal/detectors/clusterer.py +47 -34
  3. dataeval/_internal/detectors/drift/base.py +53 -35
  4. dataeval/_internal/detectors/drift/cvm.py +5 -4
  5. dataeval/_internal/detectors/drift/ks.py +7 -6
  6. dataeval/_internal/detectors/drift/mmd.py +39 -19
  7. dataeval/_internal/detectors/drift/torch.py +6 -5
  8. dataeval/_internal/detectors/drift/uncertainty.py +7 -8
  9. dataeval/_internal/detectors/duplicates.py +57 -30
  10. dataeval/_internal/detectors/linter.py +40 -24
  11. dataeval/_internal/detectors/ood/ae.py +2 -1
  12. dataeval/_internal/detectors/ood/aegmm.py +2 -1
  13. dataeval/_internal/detectors/ood/base.py +37 -15
  14. dataeval/_internal/detectors/ood/llr.py +9 -8
  15. dataeval/_internal/detectors/ood/vae.py +2 -1
  16. dataeval/_internal/detectors/ood/vaegmm.py +2 -1
  17. dataeval/_internal/flags.py +42 -21
  18. dataeval/_internal/interop.py +3 -12
  19. dataeval/_internal/metrics/balance.py +188 -0
  20. dataeval/_internal/metrics/ber.py +123 -48
  21. dataeval/_internal/metrics/coverage.py +90 -74
  22. dataeval/_internal/metrics/divergence.py +101 -67
  23. dataeval/_internal/metrics/diversity.py +211 -0
  24. dataeval/_internal/metrics/parity.py +287 -155
  25. dataeval/_internal/metrics/stats.py +198 -317
  26. dataeval/_internal/metrics/uap.py +40 -29
  27. dataeval/_internal/metrics/utils.py +430 -0
  28. dataeval/_internal/models/tensorflow/losses.py +3 -3
  29. dataeval/_internal/models/tensorflow/trainer.py +3 -2
  30. dataeval/_internal/models/tensorflow/utils.py +4 -3
  31. dataeval/_internal/output.py +82 -0
  32. dataeval/_internal/utils.py +64 -0
  33. dataeval/_internal/workflows/sufficiency.py +96 -107
  34. dataeval/flags/__init__.py +2 -2
  35. dataeval/metrics/__init__.py +26 -7
  36. dataeval/utils/__init__.py +9 -0
  37. {dataeval-0.63.0.dist-info → dataeval-0.65.0.dist-info}/METADATA +1 -1
  38. dataeval-0.65.0.dist-info/RECORD +60 -0
  39. dataeval/_internal/functional/__init__.py +0 -0
  40. dataeval/_internal/functional/ber.py +0 -63
  41. dataeval/_internal/functional/coverage.py +0 -75
  42. dataeval/_internal/functional/divergence.py +0 -16
  43. dataeval/_internal/functional/hash.py +0 -79
  44. dataeval/_internal/functional/metadata.py +0 -136
  45. dataeval/_internal/functional/metadataparity.py +0 -190
  46. dataeval/_internal/functional/uap.py +0 -6
  47. dataeval/_internal/functional/utils.py +0 -158
  48. dataeval/_internal/maite/__init__.py +0 -0
  49. dataeval/_internal/maite/utils.py +0 -30
  50. dataeval/_internal/metrics/base.py +0 -92
  51. dataeval/_internal/metrics/metadata.py +0 -610
  52. dataeval/_internal/metrics/metadataparity.py +0 -67
  53. dataeval-0.63.0.dist-info/RECORD +0 -68
  54. {dataeval-0.63.0.dist-info → dataeval-0.65.0.dist-info}/LICENSE.txt +0 -0
  55. {dataeval-0.63.0.dist-info → dataeval-0.65.0.dist-info}/WHEEL +0 -0
dataeval/_internal/metrics/parity.py
@@ -1,164 +1,296 @@
  import warnings
- from typing import Optional, Tuple
+ from dataclasses import dataclass
+ from typing import Dict, Generic, Mapping, Optional, Tuple, TypeVar

  import numpy as np
- import scipy
+ from numpy.typing import ArrayLike, NDArray
+ from scipy.stats import chi2_contingency, chisquare

+ from dataeval._internal.interop import to_numpy
+ from dataeval._internal.output import OutputMetadata, set_metadata

- class Parity:
+ TData = TypeVar("TData", np.float64, NDArray[np.float64])
+
+
+ @dataclass(frozen=True)
+ class ParityOutput(Generic[TData], OutputMetadata):
+     """
+     Attributes
+     ----------
+     score : np.float64 | NDArray[np.float64]
+         chi-squared score(s) of the test
+     p_value : np.float64 | NDArray[np.float64]
+         p-value(s) of the test
+     """
+
+     score: TData
+     p_value: TData
+
+
+ def digitize_factor_bins(continuous_values: NDArray, bins: int, factor_name: str) -> NDArray:
+     """
+     Digitizes a list of values into a given number of bins.
+
+     Parameters
+     ----------
+     continuous_values : NDArray
+         The values to be digitized.
+     bins : int
+         The number of bins for the discrete values that continuous_values will be digitized into.
+     factor_name : str
+         The name of the factor to be digitized.
+
+     Returns
+     -------
+     NDArray
+         The digitized values
+
+     """
+     if not np.all([np.issubdtype(type(n), np.number) for n in continuous_values]):
+         raise TypeError(
+             f"Encountered a non-numeric value for factor {factor_name}, but the factor"
+             " was specified to be continuous. Ensure all occurrences of this factor are numeric types,"
+             f" or do not specify {factor_name} as a continuous factor."
+         )
+
+     _, bin_edges = np.histogram(continuous_values, bins=bins)
+     bin_edges[-1] = np.inf
+     bin_edges[0] = -np.inf
+     return np.digitize(continuous_values, bin_edges)
+
+
+ def format_discretize_factors(
+     data_factors: Dict[str, NDArray], continuous_factor_bincounts: Dict[str, int]
+ ) -> Tuple[Dict[str, NDArray], NDArray]:
      """
-     Class for evaluating statistics of observed and expected class labels, including:
+     Sets up the internal list of metadata factors.
+
+     Parameters
+     ----------
+     data_factors : Dict[str, NDArray]
+         The dataset factors, which are per-image attributes including class label and metadata.
+         Each key of data_factors is a factor, whose value is the per-image factor values.
+     continuous_factor_bincounts : Dict[str, int]
+         The factors in data_factors that have continuous values, and the number of bins to
+         discretize those values into. All factors are treated as having discrete values unless
+         they are specified as keys in this dictionary. Each key of this dictionary must also
+         occur as a key in data_factors.
+
+     Returns
+     -------
+     Tuple[Dict[str, NDArray], NDArray]
+         - Discretized per-image metadata factors, in the same format as the input data_factors.
+           Each key is a metadata factor, whose value is the discrete per-image factor values.
+         - Per-image labels, whose ith element is the label for the ith element of the dataset.
+     """
+     invalid_keys = set(continuous_factor_bincounts.keys()) - set(data_factors.keys())
+     if invalid_keys:
+         raise KeyError(
+             f"The continuous factor(s) {invalid_keys} do not exist in data_factors. Delete these "
+             "keys from `continuous_factor_names` or add corresponding entries to `data_factors`."
+         )
+
+     metadata_factors = {}
+
+     # make sure each factor has the same number of entries
+     lengths = []
+     for arr in data_factors.values():
+         lengths.append(arr.shape)
+
+     if lengths[1:] != lengths[:-1]:
+         raise ValueError(f"The lengths of each entry in the dictionary are not equal. Found lengths {lengths}")
+
+     labels = data_factors["class"]
+
+     metadata_factors = {
+         name: val
+         if name not in continuous_factor_bincounts
+         else digitize_factor_bins(val, continuous_factor_bincounts[name], name)
+         for name, val in data_factors.items()
+         if name != "class"
+     }
+
+     return metadata_factors, labels
+
+
+ def normalize_expected_dist(expected_dist: NDArray, observed_dist: NDArray) -> NDArray:
+     exp_sum = np.sum(expected_dist)
+     obs_sum = np.sum(observed_dist)
+
+     if exp_sum == 0:
+         raise ValueError(
+             f"Expected label distribution {expected_dist} is all zeros. "
+             "Ensure that the expected distribution is set to a list "
+             "with at least one nonzero element"
+         )
+
+     # Renormalize expected distribution to have the same total number of labels as the observed dataset
+     if exp_sum != obs_sum:
+         expected_dist = expected_dist * obs_sum / exp_sum
+
+     return expected_dist
+

-     - Chi Squared test for statistical independence between expected and observed labels
+ def validate_dist(label_dist: NDArray, label_name: str):
      """
+     Verifies that the given label distribution has labels and checks if
+     any labels have frequencies less than 5.
+
+     Parameters
+     ----------
+     label_dist : NDArray
+         Array representing label distributions
+
+     Raises
+     ------
+     ValueError
+         If label_dist is empty
+     Warning
+         If any elements of label_dist are less than 5
+     """
+     if not len(label_dist):
+         raise ValueError(f"No labels found in the {label_name} dataset")
+     if np.any(label_dist < 5):
+         warnings.warn(
+             f"Labels {np.where(label_dist<5)[0]} in {label_name}"
+             " dataset have frequencies less than 5. This may lead"
+             " to invalid chi-squared evaluation."
+         )
+
+
+ @set_metadata("dataeval.metrics")
+ def parity(
+     expected_labels: ArrayLike,
+     observed_labels: ArrayLike,
+     num_classes: Optional[int] = None,
+ ) -> ParityOutput[np.float64]:
+     """
+     Perform a one-way chi-squared test between observation frequencies and expected frequencies that
+     tests the null hypothesis that the observed data has the expected frequencies.
+
+     This function acts as an interface to the scipy.stats.chisquare method, which is documented at
+     https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html
+
+     Parameters
+     ----------
+     expected_labels : ArrayLike
+         List of class labels in the expected dataset
+     observed_labels : ArrayLike
+         List of class labels in the observed dataset
+     num_classes : Optional[int]
+         The number of unique classes in the datasets. If this is not specified, it will
+         be inferred from the set of unique labels in expected_labels and observed_labels
+
+     Returns
+     -------
+     ParityOutput[np.float64]
+         chi-squared score and p-value of the test
+
+     Raises
+     ------
+     ValueError
+         If expected_labels or observed_labels is empty
+     """
+     # Calculate
+     if not num_classes:
+         num_classes = 0
+
+     # Calculate the class frequencies associated with the datasets
+     observed_dist = np.bincount(to_numpy(observed_labels), minlength=num_classes)
+     expected_dist = np.bincount(to_numpy(expected_labels), minlength=num_classes)
+
+     # Validate
+     validate_dist(observed_dist, "observed")
+
+     # Normalize
+     expected_dist = normalize_expected_dist(expected_dist, observed_dist)
+
+     # Validate normalized expected distribution
+     validate_dist(expected_dist, f"expected for {np.sum(observed_dist)} observations")
+
+     if len(observed_dist) != len(expected_dist):
+         raise ValueError(
+             f"Found {len(observed_dist)} unique classes in observed label distribution, "
+             f"but found {len(expected_dist)} unique classes in expected label distribution. "
+             "This can happen when some class ids have zero instances in one dataset but "
+             "not in the other. Try setting the num_classes parameter to the known "
+             "number of unique class ids, so that classes with "
+             "zero instances are still included in the distributions."
+         )
+
+     cs, p = chisquare(f_obs=observed_dist, f_exp=expected_dist)
+     return ParityOutput(cs, p)
+
+
+ @set_metadata("dataeval.metrics")
+ def parity_metadata(
+     data_factors: Mapping[str, ArrayLike],
+     continuous_factor_bincounts: Optional[Dict[str, int]] = None,
+ ) -> ParityOutput[NDArray[np.float64]]:
+     """
+     Evaluates the statistical independence of metadata factors from class labels.
+     This performs a chi-square test, which provides a score and a p-value for
+     statistical independence between each pair of a metadata factor and a class label.
+     A high score with a low p-value suggests that a metadata factor is strongly
+     correlated with a class label.
+
+     Parameters
+     ----------
+     data_factors : Mapping[str, ArrayLike]
+         The dataset factors, which are per-image attributes including class label and metadata.
+         Each key of data_factors is a factor, whose value is the per-image factor values.
+     continuous_factor_bincounts : Optional[Dict[str, int]], default None
+         The factors in data_factors that have continuous values, and the number of bins to
+         discretize those values into. All factors are treated as having discrete values unless
+         they are specified as keys in this dictionary. Each key of this dictionary must also
+         occur as a key in data_factors.
+
+     Returns
+     -------
+     ParityOutput[NDArray[np.float64]]
+         Arrays of length (num_factors) whose (i)th element corresponds to the
+         chi-square score and p-value for the relationship between factor i and
+         the class labels in the dataset.
+     """
+     data_factors_np = {k: to_numpy(v) for k, v in data_factors.items()}
+     continuous_factor_bincounts = continuous_factor_bincounts if continuous_factor_bincounts else {}
+
+     factors, labels = format_discretize_factors(data_factors_np, continuous_factor_bincounts)
+
+     chi_scores = np.zeros(len(factors))
+     p_values = np.zeros(len(factors))
+     n_cls = len(np.unique(labels))
+     for i, (current_factor_name, factor_values) in enumerate(factors.items()):
+         unique_factor_values = np.unique(factor_values)
+         contingency_matrix = np.zeros((len(unique_factor_values), n_cls))
+         # Builds a contingency matrix where entry at index (r,c) represents
+         # the frequency of current_factor_name achieving value unique_factor_values[r]
+         # at a data point with class c.
+
+         # TODO: Vectorize this nested for loop
+         for fi, factor_value in enumerate(unique_factor_values):
+             for label in range(n_cls):
+                 with_both = np.bitwise_and((labels == label), factor_values == factor_value)
+                 contingency_matrix[fi, label] = np.sum(with_both)
+                 if 0 < contingency_matrix[fi, label] < 5:
+                     warnings.warn(
+                         f"Factor {current_factor_name} value {factor_value} co-occurs "
+                         f"only {contingency_matrix[fi, label]} times with label {label}. "
+                         "This can cause inaccurate chi_square calculation. Recommend "
+                         "ensuring each label occurs either 0 times or at least 5 times. "
+                         "Alternatively, digitize any continuous-valued factors "
+                         "into fewer bins."
+                     )
+
+         # This deletes rows containing only zeros,
+         # because scipy.stats.chi2_contingency fails when there are rows containing only zeros.
+         rowsums = np.sum(contingency_matrix, axis=1)
+         rowmask = np.where(rowsums)
+         contingency_matrix = contingency_matrix[rowmask]
+
+         chi2, p, _, _ = chi2_contingency(contingency_matrix)
+
+         chi_scores[i] = chi2
+         p_values[i] = p

-     def _normalize_expected_dist(self, expected_dist: np.ndarray, observed_dist: np.ndarray) -> np.ndarray:
-         exp_sum = np.sum(expected_dist)
-         obs_sum = np.sum(observed_dist)
-
-         if exp_sum == 0:
-             raise ValueError(
-                 f"Expected label distribution {expected_dist} is all zeros. "
-                 "Ensure that Parity.expected_dist is set to a list "
-                 "with at least one nonzero element"
-             )
-
-         # Renormalize expected distribution to have the same total number of labels as the observed dataset
-         if exp_sum != obs_sum:
-             expected_dist = expected_dist * obs_sum / exp_sum
-
-         return expected_dist
-
-     def _calculate_label_dist(self, labels: np.ndarray, num_classes: int) -> np.ndarray:
-         """
-         Calculate the class frequencies associated with a dataset
-
-         Parameters
-         ----------
-         labels : np.ndarray
-             List of class labels in a dataset
-         num_classes : int
-             The number of unique classes in the datasets
-
-         Returns
-         -------
-         label_dist : np.ndarray
-             Array representing label distributions
-         """
-         label_dist = np.bincount(labels, minlength=num_classes)
-         return label_dist
-
-     def _validate_class_balance(self, expected_dist: np.ndarray, observed_dist: np.ndarray):
-         """
-         Check if the numbers of unique classes in the datasets are unequal
-
-         Parameters
-         ----------
-         expected_dist : np.ndarray
-             Array representing expected label distributions
-         observed_dist : np.ndarray
-             Array representing observed label distributions
-
-         Raises
-         ------
-         ValueError
-             When exp_ld and obs_ld do not have the same number of classes
-         """
-         exp_n_cls = len(expected_dist)
-         obs_n_cls = len(observed_dist)
-         if exp_n_cls != obs_n_cls:
-             raise ValueError(
-                 f"Found {obs_n_cls} unique classes in observed label distribution, "
-                 f"but found {exp_n_cls} unique classes in expected label distribution,"
-                 "This can happen when some class ids have zero instances in one dataset but "
-                 "not in the other. When initializing Parity, "
-                 "try setting the num_classes parameter to the known number of unique class ids, "
-                 "so that classes with zero instances are still included in the distributions."
-             )
-
-     def _validate_dist(self, label_dist: np.ndarray, label_name: str):
-         """
-         Verifies that the given label distribution has labels and checks if
-         any labels have frequencies less than 5.
-
-         Parameters
-         ----------
-         label_dist : np.ndarray
-             Array representing label distributions
-
-         Raises
-         ------
-         ValueError
-             If label_dist is empty
-         Warning
-             If any elements of label_dist are less than 5
-         """
-         if not len(label_dist):
-             raise ValueError(f"No labels found in the {label_name} dataset")
-         if np.any(label_dist < 5):
-             warnings.warn(
-                 f"Labels {np.where(label_dist<5)[0]} in {label_name}"
-                 " dataset have frequencies less than 5. This may lead"
-                 " to invalid chi-squared evaluation."
-             )
-             warnings.warn(
-                 f"Labels {np.where(label_dist<5)[0]} in {label_name}"
-                 " dataset have frequencies less than 5. This may lead"
-                 " to invalid chi-squared evaluation."
-             )
-
-     def evaluate(
-         self, expected_labels: np.ndarray, observed_labels: np.ndarray, num_classes: Optional[int] = None
-     ) -> Tuple[np.float64, np.float64]:
-         """
-         Perform a one-way chi-squared test between observation frequencies and expected frequencies that
-         tests the null hypothesis that the observed data has the expected frequencies.
-
-         This function acts as an interface to the scipy.stats.chisquare method, which is documented at
-         https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html
-         https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html
-
-         Parameters
-         ----------
-         expected_labels : np.ndarray
-             List of class labels in the expected dataset
-         observed_labels : np.ndarray
-             List of class labels in the observed dataset
-         num_classes : Optional[int]
-             The number of unique classes in the datasets. If this is not specified, it will
-             be inferred from the set of unique labels in expected_labels and observed_labels
-
-         Returns
-         -------
-         np.float64
-             chi-squared value of the test
-         np.float64
-             p-value of the test
-
-         Raises
-         ------
-         ValueError
-             If x is empty
-         """
-         # Calculate
-         if not num_classes:
-             num_classes = 0
-
-         observed_dist = self._calculate_label_dist(observed_labels, num_classes)
-         expected_dist = self._calculate_label_dist(expected_labels, num_classes)
-
-         # Validate
-         self._validate_dist(observed_dist, "observed")
-
-         # Normalize
-         expected_dist = self._normalize_expected_dist(expected_dist, observed_dist)
-
-         # Validate normalized expected distribution
-         self._validate_dist(expected_dist, f"expected for {np.sum(observed_dist)} observations")
-         self._validate_class_balance(expected_dist, observed_dist)
-
-         cs_result = scipy.stats.chisquare(f_obs=observed_dist, f_exp=expected_dist)
-
-         chisquared = cs_result.statistic
-         p_value = cs_result.pvalue
-         return chisquared, p_value
+     return ParityOutput(chi_scores, p_values)
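
Usage sketch for the new functional API defined above. The functions live in dataeval/_internal/metrics/parity.py; whether they are also re-exported from the public dataeval.metrics namespace is not shown in this hunk, so the import below uses the internal path. The sample labels and factor dictionary are illustrative only.

    import numpy as np

    from dataeval._internal.metrics.parity import parity, parity_metadata

    # One-way chi-squared test: do the observed label frequencies match the expected ones?
    expected_labels = np.repeat([0, 1, 2], 10)          # 10 instances per class
    observed_labels = np.repeat([0, 1, 2], [12, 9, 9])  # slightly skewed counts

    out = parity(expected_labels, observed_labels, num_classes=3)
    print(out.score, out.p_value)  # ParityOutput fields, read as attributes

    # Chi-squared independence test between each metadata factor and the class label.
    # "class" is a required key; "altitude" is continuous and is digitized into 3 bins.
    rng = np.random.default_rng(0)
    factors = {
        "class": rng.integers(0, 3, size=300),
        "sensor": rng.integers(0, 2, size=300),
        "altitude": rng.normal(1000.0, 100.0, size=300),
    }
    meta_out = parity_metadata(factors, continuous_factor_bincounts={"altitude": 3})
    print(meta_out.score, meta_out.p_value)  # one score/p-value per non-"class" factor

Unlike the removed Parity.evaluate, which returned a (chi-squared, p-value) tuple, both functions return a frozen ParityOutput dataclass, so results are read as attributes rather than unpacked.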