dataeval 0.63.0__py3-none-any.whl → 0.64.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. dataeval/__init__.py +3 -3
  2. dataeval/_internal/detectors/clusterer.py +2 -1
  3. dataeval/_internal/detectors/drift/base.py +2 -1
  4. dataeval/_internal/detectors/drift/cvm.py +2 -1
  5. dataeval/_internal/detectors/drift/ks.py +2 -1
  6. dataeval/_internal/detectors/drift/mmd.py +4 -3
  7. dataeval/_internal/detectors/drift/uncertainty.py +1 -2
  8. dataeval/_internal/detectors/duplicates.py +2 -1
  9. dataeval/_internal/detectors/linter.py +1 -1
  10. dataeval/_internal/detectors/ood/ae.py +2 -1
  11. dataeval/_internal/detectors/ood/aegmm.py +2 -1
  12. dataeval/_internal/detectors/ood/base.py +2 -1
  13. dataeval/_internal/detectors/ood/llr.py +3 -2
  14. dataeval/_internal/detectors/ood/vae.py +2 -1
  15. dataeval/_internal/detectors/ood/vaegmm.py +2 -1
  16. dataeval/_internal/interop.py +2 -11
  17. dataeval/_internal/metrics/balance.py +180 -0
  18. dataeval/_internal/metrics/base.py +1 -83
  19. dataeval/_internal/metrics/ber.py +122 -48
  20. dataeval/_internal/metrics/coverage.py +83 -74
  21. dataeval/_internal/metrics/divergence.py +67 -67
  22. dataeval/_internal/metrics/diversity.py +206 -0
  23. dataeval/_internal/metrics/parity.py +300 -155
  24. dataeval/_internal/metrics/stats.py +7 -5
  25. dataeval/_internal/metrics/uap.py +37 -29
  26. dataeval/_internal/metrics/utils.py +393 -0
  27. dataeval/_internal/utils.py +64 -0
  28. dataeval/metrics/__init__.py +25 -6
  29. dataeval/utils/__init__.py +9 -0
  30. {dataeval-0.63.0.dist-info → dataeval-0.64.0.dist-info}/METADATA +1 -1
  31. dataeval-0.64.0.dist-info/RECORD +60 -0
  32. dataeval/_internal/functional/__init__.py +0 -0
  33. dataeval/_internal/functional/ber.py +0 -63
  34. dataeval/_internal/functional/coverage.py +0 -75
  35. dataeval/_internal/functional/divergence.py +0 -16
  36. dataeval/_internal/functional/hash.py +0 -79
  37. dataeval/_internal/functional/metadata.py +0 -136
  38. dataeval/_internal/functional/metadataparity.py +0 -190
  39. dataeval/_internal/functional/uap.py +0 -6
  40. dataeval/_internal/functional/utils.py +0 -158
  41. dataeval/_internal/maite/__init__.py +0 -0
  42. dataeval/_internal/maite/utils.py +0 -30
  43. dataeval/_internal/metrics/metadata.py +0 -610
  44. dataeval/_internal/metrics/metadataparity.py +0 -67
  45. dataeval-0.63.0.dist-info/RECORD +0 -68
  46. {dataeval-0.63.0.dist-info → dataeval-0.64.0.dist-info}/LICENSE.txt +0 -0
  47. {dataeval-0.63.0.dist-info → dataeval-0.64.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,206 @@
1
+ from typing import Dict, List, Literal, NamedTuple, Optional, Sequence
2
+
3
+ import numpy as np
4
+ from numpy.typing import NDArray
5
+
6
+ from dataeval._internal.metrics.utils import entropy, get_counts, get_method, get_num_bins, preprocess_metadata
7
+
8
+
9
class DiversityOutput(NamedTuple):
    """
    Result container returned by `diversity` and `diversity_classwise`.

    Attributes
    ----------
    diversity_index : NDArray[np.float64]
        Diversity index for classes and factors
    """

    # Single field; `diversity` stores a per-factor vector here, while
    # `diversity_classwise` stores a [n_class x n_factor] matrix.
    diversity_index: NDArray[np.float64]
18
+
19
+
20
def diversity_shannon(
    data: np.ndarray,
    names: List[str],
    is_categorical: List[bool],
    subset_mask: Optional[np.ndarray] = None,
) -> np.ndarray:
    """
    Compute diversity for discrete/categorical variables and, through standard
    histogram binning, for continuous variables.

    We define diversity as a normalized form of the Shannon entropy.

    diversity = 1 implies that samples are evenly distributed across a particular factor
    diversity = 0 implies that all samples belong to one category/bin

    Parameters
    ----------
    data: np.ndarray
        Factor values; forwarded unchanged to `entropy` and `get_num_bins`
    names: List[str]
        Factor names; forwarded unchanged to `entropy` and `get_num_bins`
    is_categorical: List[bool]
        Per-factor categorical flags; forwarded unchanged to `entropy` and `get_num_bins`
    subset_mask: Optional[np.ndarray[bool]]
        Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts

    Notes
    -----
    For continuous variables, histogram bins are chosen automatically. See `numpy.histogram` for details.

    A factor with exactly one bin would be normalized by ``log(1) == 0`` and
    produce NaN. Such factors are normalized by ``log(2)`` instead, which
    yields the documented value of 0 whenever their entropy is 0.

    Returns
    -------
    diversity_index: np.ndarray
        Diversity index per column of X

    See Also
    --------
    numpy.histogram
    """

    # entropy computed using global auto bins so that we can properly normalize
    ent_unnormalized = entropy(data, names, is_categorical, normalized=False, subset_mask=subset_mask)
    # NOTE(review): the original comment says "normalize by global counts rather
    # than classwise counts", yet subset_mask is forwarded here (diversity_simpson
    # does not forward it) -- confirm which normalization is intended.
    # assumes get_num_bins returns one integer bin count per factor -- TODO confirm
    num_bins = np.asarray(get_num_bins(data, names, is_categorical=is_categorical, subset_mask=subset_mask))
    # log(1) == 0 would make the ratio 0/0; substitute a dummy second bin so a
    # single-bin factor normalizes to 0 instead of NaN.
    num_bins = np.where(num_bins == 1, 2, num_bins)
    return ent_unnormalized / np.log(num_bins)
59
+
60
+
61
def diversity_simpson(
    data: np.ndarray,
    names: List[str],
    is_categorical: List[bool],
    subset_mask: Optional[np.ndarray] = None,
) -> np.ndarray:
    """
    Inverse Simpson diversity index per factor, normalized by the bin count.

    Discrete/categorical factors are counted directly; continuous factors are
    counted through standard histogram binning.

    diversity = 1 implies that samples are evenly distributed across a particular factor
    diversity = 1/num_categories implies that all samples belong to one category/bin

    Parameters
    ----------
    data: np.ndarray
        Factor values; forwarded unchanged to `get_counts` and `get_num_bins`
    names: List[str]
        Factor names; its length fixes the length of the returned array
    is_categorical: List[bool]
        Per-factor categorical flags; forwarded unchanged to the helpers
    subset_mask: Optional[np.ndarray[bool]]
        Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts

    Notes
    -----
    For continuous variables, histogram bins are chosen automatically. See
    numpy.histogram for details.
    If there is only one category, the diversity index takes a value of 1 =
    1/N = 1/1. Entropy will take a value of 0.

    Returns
    -------
    np.ndarray
        Diversity index per column of X

    See Also
    --------
    numpy.histogram
    """

    # Only the counts are used; the second value returned by get_counts is discarded.
    counts_by_factor, _ = get_counts(data, names, is_categorical, subset_mask)
    # The mask is deliberately not forwarded: normalization uses the bin count
    # of the whole data set, not of the (classwise) subset.
    bins_per_factor = get_num_bins(data, names, is_categorical)

    result = np.empty(len(names))
    position = 0
    for counts in counts_by_factor.values():
        # relative frequency of each bin within this factor
        frequencies = counts / counts.sum()
        inverse_simpson = 1 / np.sum(frequencies**2)
        result[position] = inverse_simpson / bins_per_factor[position]
        position += 1

    return result
114
+
115
+
116
# Dispatch table from the public `method` name to the function implementing it;
# `diversity` and `diversity_classwise` resolve their `method` argument through it.
DIVERSITY_FN_MAP = {"simpson": diversity_simpson, "shannon": diversity_shannon}
117
+
118
+
119
def diversity(
    class_labels: Sequence[int], metadata: List[Dict], method: Literal["shannon", "simpson"] = "simpson"
) -> DiversityOutput:
    """
    Compute the diversity index of the class labels and every metadata factor.

    Discrete/categorical variables are counted directly; continuous variables
    are counted through standard histogram binning.

    diversity = 1 implies that samples are evenly distributed across a particular factor
    diversity = 0 implies that all samples belong to one category/bin

    Parameters
    ----------
    class_labels: Sequence[int]
        List of class labels for each image
    metadata: List[Dict]
        List of metadata factors for each image
    method: Literal["shannon", "simpson"], default "simpson"
        string variable indicating which diversity index should be used.
        Permissible values include "simpson" and "shannon"

    Notes
    -----
    - For continuous variables, histogram bins are chosen automatically. See numpy.histogram for details.

    Returns
    -------
    DiversityOutput
        Diversity index per factor produced by `preprocess_metadata`

    See Also
    --------
    numpy.histogram
    """
    # Resolve the method name first, before the metadata is preprocessed.
    selected_fn = get_method(DIVERSITY_FN_MAP, method)
    factor_data, factor_names, categorical_flags = preprocess_metadata(class_labels, metadata)
    # No subset mask: the index is computed over every sample.
    raw_index = selected_fn(factor_data, factor_names, categorical_flags, None)
    return DiversityOutput(raw_index.astype(np.float64))
156
+
157
+
158
def diversity_classwise(
    class_labels: Sequence[int], metadata: List[Dict], method: Literal["shannon", "simpson"] = "simpson"
) -> DiversityOutput:
    """
    Compute the diversity index of every metadata factor separately per class.

    Discrete/categorical variables are counted directly; continuous variables
    are counted through standard histogram binning. For each unique class
    label, the selected diversity function is evaluated on the samples of that
    class only. The column belonging to the class label itself is dropped
    from the result.

    diversity = 1 implies that samples are evenly distributed across a particular factor
    diversity = 1/num_categories implies that all samples belong to one category/bin

    Parameters
    ----------
    class_labels: Sequence[int]
        List of class labels for each image
    metadata: List[Dict]
        List of metadata factors for each image
    method: Literal["shannon", "simpson"], default "simpson"
        Which diversity index to use; resolved through DIVERSITY_FN_MAP

    Notes
    -----
    - For continuous variables, histogram bins are chosen automatically. See numpy.histogram for details.
    - If there is only one category, the diversity index takes a value of 1 = 1/N = 1/1. Entropy will take a value of 0.

    Returns
    -------
    DiversityOutput
        Diversity index [n_class x n_factor]

    See Also
    --------
    numpy.histogram
    """
    selected_fn = get_method(DIVERSITY_FN_MAP, method)
    data, names, is_categorical = preprocess_metadata(class_labels, metadata)

    # Locate the class-label column so it can be used for masking and later removed.
    class_column = names.index("class_label")
    labels = data[:, class_column]
    classes = np.unique(labels)

    # One row per class, one column per factor (class column still included here).
    per_class = np.full((len(classes), len(names)), np.nan)
    for row, cls in enumerate(classes):
        per_class[row, :] = selected_fn(data, names, is_categorical, labels == cls)

    # Diversity of the class label within a single class is not reported.
    return DiversityOutput(np.delete(per_class, class_column, axis=1))
@@ -1,164 +1,309 @@
1
1
  import warnings
2
- from typing import Optional, Tuple
2
+ from typing import Dict, Mapping, NamedTuple, Optional, Tuple
3
3
 
4
4
  import numpy as np
5
- import scipy
5
+ from numpy.typing import ArrayLike, NDArray
6
+ from scipy.stats import chi2_contingency, chisquare
6
7
 
8
+ from dataeval._internal.interop import to_numpy
7
9
 
8
- class Parity:
10
+
11
class ParityOutput(NamedTuple):
    """
    Result container returned by `parity`.

    Attributes
    ----------
    score : np.float64
        chi-squared value of the test
    p_value : np.float64
        p-value of the test
    """

    # Field order matters: `parity` constructs this positionally as (chi-squared, p).
    score: np.float64
    p_value: np.float64
23
+
24
+
25
class ParityMetadataOutput(NamedTuple):
    """
    Result container returned by `parity_metadata`.

    Attributes
    ----------
    score : NDArray[np.float64]
        chi-squared values of the test, one per metadata factor
    p_value : NDArray[np.float64]
        p-values of the test, one per metadata factor
    """

    # Field order matters: `parity_metadata` constructs this positionally as
    # (chi-squared scores, p-values).
    score: NDArray[np.float64]
    p_value: NDArray[np.float64]
37
+
38
+
39
def digitize_factor_bins(continuous_values: np.ndarray, bins: int, factor_name: str):
    """
    Map each continuous value to the index of the histogram bin it falls in.

    Parameters
    ----------
    continuous_values: np.ndarray
        The values to be digitized.
    bins: int
        The number of bins for the discrete values that continuous_values will be digitized into.
    factor_name: str
        The name of the factor to be digitized; only used in the error message.

    Returns
    -------
    np.ndarray
        The digitized values

    Raises
    ------
    TypeError
        If any element of continuous_values is not of a numeric type
    """
    every_value_numeric = all(np.issubdtype(type(value), np.number) for value in continuous_values)
    if not every_value_numeric:
        raise TypeError(
            f"Encountered a non-numeric value for factor {factor_name}, but the factor"
            " was specified to be continuous. Ensure all occurrences of this factor are numeric types,"
            f" or do not specify {factor_name} as a continuous factor."
        )

    # Only the bin edges are needed; the histogram counts are discarded.
    edges = np.histogram(continuous_values, bins=bins)[1]
    # Open both outer edges so the minimum and maximum land inside the first
    # and last bin instead of on (or past) a boundary.
    edges[0], edges[-1] = -np.inf, np.inf
    return np.digitize(continuous_values, edges)
69
+
70
+
71
def format_discretize_factors(
    data_factors: dict[str, np.ndarray], continuous_factor_bincounts: Dict[str, int]
) -> Tuple[dict, np.ndarray]:
    """
    Split the class labels from the metadata factors and discretize continuous factors.

    Parameters
    ----------
    data_factors: Dict[str, np.ndarray]
        The dataset factors, which are per-image attributes including class label and metadata.
        Each key of data_factors is a factor, whose value is the per-image factor values.
        The class labels must be stored under the key "class".
    continuous_factor_bincounts : Dict[str, int]
        The factors in data_factors that have continuous values, mapped to the number of bins to
        discretize their values into. All factors are treated as having discrete values unless they
        are specified as keys in this dictionary. Each key must occur as a key in data_factors.

    Returns
    -------
    Dict[str, np.ndarray]
        Per-image metadata information with the formatting that input data_factors uses,
        without the "class" entry. Each key is a metadata factor, whose value is the discrete
        per-image factor values.
    np.ndarray
        Per-image labels, whose ith element is the label for the ith element of the dataset.

    Raises
    ------
    KeyError
        If continuous_factor_bincounts names a factor missing from data_factors,
        or if data_factors has no "class" entry
    ValueError
        If the entries of data_factors do not all have the same shape
    """
    invalid_keys = set(continuous_factor_bincounts) - set(data_factors)
    if invalid_keys:
        raise KeyError(
            f"The continuous factor(s) {invalid_keys} do not exist in data_factors. Delete these "
            "keys from `continuous_factor_bincounts` or add corresponding entries to `data_factors`."
        )

    # make sure each factor has the same number of entries
    lengths = [arr.shape for arr in data_factors.values()]
    if any(shape != lengths[0] for shape in lengths[1:]):
        raise ValueError("The lengths of each entry in the dictionary are not equal." f" Found lengths {lengths}")

    # Fail with an explanatory message rather than a bare KeyError('class').
    if "class" not in data_factors:
        raise KeyError("data_factors must contain a 'class' entry holding the per-image class labels.")
    labels = data_factors["class"]

    # Every factor except the class label; continuous ones are replaced by bin indices.
    metadata_factors = {
        name: val
        if name not in continuous_factor_bincounts
        else digitize_factor_bins(val, continuous_factor_bincounts[name], name)
        for name, val in data_factors.items()
        if name != "class"
    }

    return metadata_factors, labels
124
+
125
+
126
def normalize_expected_dist(expected_dist: np.ndarray, observed_dist: np.ndarray) -> np.ndarray:
    """
    Rescale the expected distribution so that it sums to the observed total.

    Parameters
    ----------
    expected_dist : np.ndarray
        Array representing expected label distributions
    observed_dist : np.ndarray
        Array representing observed label distributions

    Returns
    -------
    np.ndarray
        expected_dist itself when the totals already match, otherwise a
        rescaled copy whose sum equals the sum of observed_dist

    Raises
    ------
    ValueError
        If expected_dist sums to zero, since it cannot be rescaled
    """
    exp_sum = np.sum(expected_dist)
    obs_sum = np.sum(observed_dist)

    if exp_sum == 0:
        raise ValueError(
            f"Expected label distribution {expected_dist} is all zeros. "
            "Ensure that the expected distribution has "
            "at least one nonzero element"
        )

    # Renormalize expected distribution to have the same total number of labels as the observed dataset
    if exp_sum != obs_sum:
        expected_dist = expected_dist * obs_sum / exp_sum

    return expected_dist
142
+
143
+
144
def validate_dist(label_dist: np.ndarray, label_name: str) -> None:
    """
    Verifies that the given label distribution has labels and checks if
    any labels have frequencies less than 5.

    Parameters
    ----------
    label_dist : np.ndarray
        Array representing label distributions
    label_name : str
        Description of the dataset the distribution belongs to; only used
        in the error and warning messages

    Raises
    ------
    ValueError
        If label_dist is empty

    Warns
    -----
    UserWarning
        Once, if any elements of label_dist are less than 5
    """
    if not len(label_dist):
        raise ValueError(f"No labels found in the {label_name} dataset")

    low_frequency = label_dist < 5
    if np.any(low_frequency):
        # Emitted exactly once; previously the same warning was issued twice
        # by a duplicated statement.
        warnings.warn(
            f"Labels {np.where(low_frequency)[0]} in {label_name}"
            " dataset have frequencies less than 5. This may lead"
            " to invalid chi-squared evaluation."
        )
174
+
175
+
176
def parity(
    expected_labels: ArrayLike,
    observed_labels: ArrayLike,
    num_classes: Optional[int] = None,
) -> ParityOutput:
    """
    Perform a one-way chi-squared test between observation frequencies and expected frequencies that
    tests the null hypothesis that the observed data has the expected frequencies.

    This function acts as an interface to the scipy.stats.chisquare method, which is documented at
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html

    Parameters
    ----------
    expected_labels : ArrayLike
        List of class labels in the expected dataset
    observed_labels : ArrayLike
        List of class labels in the observed dataset
    num_classes : Optional[int]
        The number of unique classes in the datasets. If this is not specified, it will
        be inferred from the set of unique labels in expected_labels and observed_labels

    Returns
    -------
    ParityOutput
        chi-squared score and p-value of the test

    Raises
    ------
    ValueError
        If either label distribution is empty, if the expected distribution
        is all zeros, or if the two distributions have different lengths

    Warns
    -----
    UserWarning
        If any class has a frequency below 5 in the observed or the
        normalized expected distribution
    """
    # With no class count given, let bincount size each distribution from its own labels.
    if not num_classes:
        num_classes = 0

    # Calculate the class frequencies associated with the datasets
    observed_dist = np.bincount(to_numpy(observed_labels), minlength=num_classes)
    expected_dist = np.bincount(to_numpy(expected_labels), minlength=num_classes)

    # Validate
    validate_dist(observed_dist, "observed")

    # Normalize
    expected_dist = normalize_expected_dist(expected_dist, observed_dist)

    # Validate normalized expected distribution
    validate_dist(expected_dist, f"expected for {np.sum(observed_dist)} observations")

    if len(observed_dist) != len(expected_dist):
        raise ValueError(
            f"Found {len(observed_dist)} unique classes in observed label distribution, "
            f"but found {len(expected_dist)} unique classes in expected label distribution. "
            "This can happen when some class ids have zero instances in one dataset but "
            "not in the other. When calling parity, try setting the num_classes "
            "parameter to the known number of unique class ids, so that classes with "
            "zero instances are still included in the distributions."
        )

    cs, p = chisquare(f_obs=observed_dist, f_exp=expected_dist)
    return ParityOutput(cs, p)
237
+
238
+
239
def parity_metadata(
    data_factors: Mapping[str, ArrayLike],
    continuous_factor_bincounts: Optional[Dict[str, int]] = None,
) -> ParityMetadataOutput:
    """
    Evaluates the statistical independence of metadata factors from class labels.
    This performs a chi-square test, which provides a score and a p-value for
    statistical independence between each pair of a metadata factor and a class label.
    A high score with a low p-value suggests that a metadata factor is strongly
    correlated with a class label.

    Parameters
    ----------
    data_factors: Mapping[str, ArrayLike]
        The dataset factors, which are per-image attributes including class label and metadata.
        Each key of data_factors is a factor, whose value is the per-image factor values.
    continuous_factor_bincounts : Optional[Dict[str, int]], default None
        The factors in data_factors that have continuous values, mapped to the number of bins to
        discretize their values into. All factors are treated as having discrete values unless they
        are specified as keys in this dictionary. Each key must occur as a key in data_factors.

    Returns
    -------
    ParityMetadataOutput
        Arrays of length (num_factors) whose (i)th element corresponds to the
        chi-square score and p-value for the relationship between factor i and
        the class labels in the dataset.

    Warns
    -----
    UserWarning
        For every (factor value, label) pair that co-occurs between 1 and 4 times
    """
    data_factors_np = {k: to_numpy(v) for k, v in data_factors.items()}
    continuous_factor_bincounts = continuous_factor_bincounts if continuous_factor_bincounts else {}

    factors, labels = format_discretize_factors(data_factors_np, continuous_factor_bincounts)

    chi_scores = np.zeros(len(factors))
    p_values = np.zeros(len(factors))
    # Columns are indexed by the label values actually present, so labels need
    # not be the contiguous integers 0..n_cls-1.
    unique_labels = np.unique(labels)
    for i, (current_factor_name, factor_values) in enumerate(factors.items()):
        unique_factor_values = np.unique(factor_values)
        contingency_matrix = np.zeros((len(unique_factor_values), len(unique_labels)))
        # Builds a contingency matrix where entry at index (r,c) represents
        # the frequency of current_factor_name achieving value unique_factor_values[r]
        # at a data point with class unique_labels[c].

        # TODO: Vectorize this nested for loop
        for fi, factor_value in enumerate(unique_factor_values):
            # Invariant across the label loop, so computed once per factor value.
            has_factor_value = factor_values == factor_value
            for li, label in enumerate(unique_labels):
                with_both = np.bitwise_and((labels == label), has_factor_value)
                contingency_matrix[fi, li] = np.sum(with_both)
                if 0 < contingency_matrix[fi, li] < 5:
                    warnings.warn(
                        f"Factor {current_factor_name} value {factor_value} co-occurs "
                        f"only {contingency_matrix[fi, li]} times with label {label}. "
                        "This can cause inaccurate chi_square calculation. Recommend "
                        "ensuring each label occurs either 0 times or at least 5 times. "
                        "Alternatively, digitize any continuous-valued factors "
                        "into fewer bins."
                    )

        # This deletes rows containing only zeros,
        # because scipy.stats.chi2_contingency fails when there are rows containing only zeros.
        rowsums = np.sum(contingency_matrix, axis=1)
        rowmask = np.where(rowsums)
        contingency_matrix = contingency_matrix[rowmask]

        chi2, p, _, _ = chi2_contingency(contingency_matrix)

        chi_scores[i] = chi2
        p_values[i] = p

    return ParityMetadataOutput(chi_scores, p_values)
@@ -1,14 +1,15 @@
1
+ from abc import abstractmethod
1
2
  from enum import Flag
2
3
  from typing import Any, Callable, Dict, Generic, Iterable, List, Optional, Sequence, TypeVar, Union
3
4
 
4
5
  import numpy as np
6
+ from numpy.typing import ArrayLike
5
7
  from scipy.stats import entropy, kurtosis, skew
6
8
 
7
9
  from dataeval._internal.flags import ImageHash, ImageProperty, ImageStatistics, ImageStatsFlags, ImageVisuals
8
- from dataeval._internal.functional.hash import pchash, xxhash
9
- from dataeval._internal.functional.utils import edge_filter, get_bitdepth, normalize_image_shape, rescale
10
- from dataeval._internal.interop import ArrayLike, to_numpy_iter
11
- from dataeval._internal.metrics.base import EvaluateMixin, MetricMixin
10
+ from dataeval._internal.interop import to_numpy_iter
11
+ from dataeval._internal.metrics.base import EvaluateMixin
12
+ from dataeval._internal.metrics.utils import edge_filter, get_bitdepth, normalize_image_shape, pchash, rescale, xxhash
12
13
 
13
14
  QUARTILES = (0, 25, 50, 75, 100)
14
15
 
@@ -16,11 +17,12 @@ TBatch = TypeVar("TBatch", bound=Sequence[ArrayLike])
16
17
  TFlag = TypeVar("TFlag", bound=Flag)
17
18
 
18
19
 
19
- class BaseStatsMetric(EvaluateMixin, MetricMixin, Generic[TBatch, TFlag]):
20
+ class BaseStatsMetric(EvaluateMixin, Generic[TBatch, TFlag]):
20
21
  def __init__(self, flags: TFlag):
21
22
  self.flags = flags
22
23
  self.results = []
23
24
 
25
+ @abstractmethod
24
26
  def update(self, images: TBatch) -> None:
25
27
  """
26
28
  Updates internal metric cache for later calculation