dataeval 0.61.0__py3-none-any.whl → 0.64.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. dataeval/__init__.py +3 -3
  2. dataeval/_internal/detectors/clusterer.py +45 -16
  3. dataeval/_internal/detectors/drift/base.py +15 -12
  4. dataeval/_internal/detectors/drift/cvm.py +12 -8
  5. dataeval/_internal/detectors/drift/ks.py +7 -3
  6. dataeval/_internal/detectors/drift/mmd.py +15 -12
  7. dataeval/_internal/detectors/drift/uncertainty.py +6 -5
  8. dataeval/_internal/detectors/duplicates.py +35 -11
  9. dataeval/_internal/detectors/linter.py +85 -16
  10. dataeval/_internal/detectors/ood/ae.py +7 -5
  11. dataeval/_internal/detectors/ood/aegmm.py +6 -5
  12. dataeval/_internal/detectors/ood/base.py +15 -13
  13. dataeval/_internal/detectors/ood/llr.py +8 -5
  14. dataeval/_internal/detectors/ood/vae.py +6 -4
  15. dataeval/_internal/detectors/ood/vaegmm.py +6 -4
  16. dataeval/_internal/interop.py +43 -0
  17. dataeval/_internal/metrics/balance.py +180 -0
  18. dataeval/_internal/metrics/base.py +2 -84
  19. dataeval/_internal/metrics/ber.py +77 -53
  20. dataeval/_internal/metrics/coverage.py +80 -55
  21. dataeval/_internal/metrics/divergence.py +62 -54
  22. dataeval/_internal/metrics/diversity.py +206 -0
  23. dataeval/_internal/metrics/parity.py +292 -163
  24. dataeval/_internal/metrics/stats.py +48 -35
  25. dataeval/_internal/metrics/uap.py +31 -26
  26. dataeval/_internal/metrics/utils.py +237 -2
  27. dataeval/_internal/utils.py +64 -0
  28. dataeval/_internal/workflows/__init__.py +0 -0
  29. dataeval/metrics/__init__.py +25 -5
  30. dataeval/utils/__init__.py +9 -0
  31. {dataeval-0.61.0.dist-info → dataeval-0.64.0.dist-info}/METADATA +1 -2
  32. dataeval-0.64.0.dist-info/RECORD +60 -0
  33. dataeval/_internal/metrics/hash.py +0 -79
  34. dataeval-0.61.0.dist-info/RECORD +0 -55
  35. {dataeval-0.61.0.dist-info → dataeval-0.64.0.dist-info}/LICENSE.txt +0 -0
  36. {dataeval-0.61.0.dist-info → dataeval-0.64.0.dist-info}/WHEEL +0 -0
dataeval/_internal/metrics/diversity.py
@@ -0,0 +1,206 @@
+ from typing import Dict, List, Literal, NamedTuple, Optional, Sequence
+
+ import numpy as np
+ from numpy.typing import NDArray
+
+ from dataeval._internal.metrics.utils import entropy, get_counts, get_method, get_num_bins, preprocess_metadata
+
+
+ class DiversityOutput(NamedTuple):
+ """
+ Attributes
+ ----------
+ diversity_index : NDArray[np.float64]
+ Diversity index for classes and factors
+ """
+
+ diversity_index: NDArray[np.float64]
+
+
+ def diversity_shannon(
+ data: np.ndarray,
+ names: List[str],
+ is_categorical: List[bool],
+ subset_mask: Optional[np.ndarray] = None,
+ ) -> np.ndarray:
+ """
+ Compute diversity for discrete/categorical variables and, through standard
+ histogram binning, for continuous variables.
+
+ We define diversity as a normalized form of the Shannon entropy.
+
+ diversity = 1 implies that samples are evenly distributed across a particular factor
+ diversity = 0 implies that all samples belong to one category/bin
+
+ Parameters
+ ----------
+ subset_mask: Optional[np.ndarray[bool]]
+ Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
+
+ Notes
+ -----
+ For continuous variables, histogram bins are chosen automatically. See `numpy.histogram` for details.
+
+ Returns
+ -------
+ diversity_index: np.ndarray
+ Diversity index per column of X
+
+ See Also
+ --------
+ numpy.histogram
+ """
+
+ # entropy computed using global auto bins so that we can properly normalize
+ ent_unnormalized = entropy(data, names, is_categorical, normalized=False, subset_mask=subset_mask)
+ # normalize by global counts rather than classwise counts
+ num_bins = get_num_bins(data, names, is_categorical=is_categorical, subset_mask=subset_mask)
+ return ent_unnormalized / np.log(num_bins)
+
+
+ def diversity_simpson(
+ data: np.ndarray,
+ names: List[str],
+ is_categorical: List[bool],
+ subset_mask: Optional[np.ndarray] = None,
+ ) -> np.ndarray:
+ """
+ Compute diversity for discrete/categorical variables and, through standard
+ histogram binning, for continuous variables.
+
+ We define diversity as a normalized form of the inverse Simpson diversity
+ index.
+
+ diversity = 1 implies that samples are evenly distributed across a particular factor
+ diversity = 1/num_categories implies that all samples belong to one category/bin
+
+ Parameters
+ ----------
+ subset_mask: Optional[np.ndarray[bool]]
+ Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
+
+ Notes
+ -----
+ For continuous variables, histogram bins are chosen automatically. See
+ numpy.histogram for details.
+ The expression is undefined for q=1, but it approaches the Shannon entropy
+ in the limit.
+ If there is only one category, the diversity index takes a value of 1 =
+ 1/N = 1/1. Entropy will take a value of 0.
+
+ Returns
+ -------
+ np.ndarray
+ Diversity index per column of X
+
+ See Also
+ --------
+ numpy.histogram
+ """
+
+ hist_counts, _ = get_counts(data, names, is_categorical, subset_mask)
+ # normalize by global counts, not classwise counts
+ num_bins = get_num_bins(data, names, is_categorical)
+
+ ev_index = np.empty(len(names))
+ # loop over columns for convenience
+ for col, cnts in enumerate(hist_counts.values()):
+ # relative frequencies
+ p_i = cnts / cnts.sum()
+ # inverse Simpson index normalized by (number of bins)
+ ev_index[col] = 1 / np.sum(p_i**2) / num_bins[col]
+
+ return ev_index
+
+
+ DIVERSITY_FN_MAP = {"simpson": diversity_simpson, "shannon": diversity_shannon}
+
+
+ def diversity(
+ class_labels: Sequence[int], metadata: List[Dict], method: Literal["shannon", "simpson"] = "simpson"
+ ) -> DiversityOutput:
+ """
+ Compute diversity for discrete/categorical variables and, through standard
+ histogram binning, for continuous variables.
+
+ diversity = 1 implies that samples are evenly distributed across a particular factor
+ diversity = 0 implies that all samples belong to one category/bin
+
+ Parameters
+ ----------
+ class_labels: Sequence[int]
+ List of class labels for each image
+ metadata: List[Dict]
+ List of metadata factors for each image
+ metric: Literal["shannon", "simpson"], default "simpson"
+ string variable indicating which diversity index should be used.
+ Permissible values include "simpson" and "shannon"
+
+ Notes
+ -----
+ - For continuous variables, histogram bins are chosen automatically. See numpy.histogram for details.
+
+ Returns
+ -------
+ DiversityOutput
+ Diversity index per column of self.data or each factor in self.names
+
+ See Also
+ --------
+ numpy.histogram
+ """
+ diversity_fn = get_method(DIVERSITY_FN_MAP, method)
+ data, names, is_categorical = preprocess_metadata(class_labels, metadata)
+ diversity_index = diversity_fn(data, names, is_categorical, None).astype(np.float64)
+ return DiversityOutput(diversity_index)
+
+
+ def diversity_classwise(
+ class_labels: Sequence[int], metadata: List[Dict], method: Literal["shannon", "simpson"] = "simpson"
+ ) -> DiversityOutput:
+ """
+ Compute diversity for discrete/categorical variables and, through standard
+ histogram binning, for continuous variables.
+
+ We define diversity as a normalized form of the inverse Simpson diversity
+ index.
+
+ diversity = 1 implies that samples are evenly distributed across a particular factor
+ diversity = 1/num_categories implies that all samples belong to one category/bin
+
+ Parameters
+ ----------
+ class_labels: Sequence[int]
+ List of class labels for each image
+ metadata: List[Dict]
+ List of metadata factors for each image
+
+ Notes
+ -----
+ - For continuous variables, histogram bins are chosen automatically. See numpy.histogram for details.
+ - The expression is undefined for q=1, but it approaches the Shannon entropy in the limit.
+ - If there is only one category, the diversity index takes a value of 1 = 1/N = 1/1. Entropy will take a value of 0.
+
+ Returns
+ -------
+ DiversityOutput
+ Diversity index [n_class x n_factor]
+
+ See Also
+ --------
+ numpy.histogram
+ """
+ diversity_fn = get_method(DIVERSITY_FN_MAP, method)
+ data, names, is_categorical = preprocess_metadata(class_labels, metadata)
+ class_idx = names.index("class_label")
+ class_lbl = data[:, class_idx]
+
+ u_classes = np.unique(class_lbl)
+ num_factors = len(names)
+ diversity = np.empty((len(u_classes), num_factors))
+ diversity[:] = np.nan
+ for idx, cls in enumerate(u_classes):
+ subset_mask = class_lbl == cls
+ diversity[idx, :] = diversity_fn(data, names, is_categorical, subset_mask)
+ div_no_class = np.concatenate((diversity[:, :class_idx], diversity[:, (class_idx + 1) :]), axis=1)
+ return DiversityOutput(div_no_class)
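
The new diversity.py module exposes the metric as plain functions rather than a class. Below is a minimal usage sketch, assuming the internal import path shown in this hunk (the public re-export under dataeval.metrics is changed elsewhere in this release and is not shown here); the labels and metadata values are toy data for illustration only.

```python
# Usage sketch (not part of the diff): exercising the new functional diversity API.
# Assumes the internal module path from the hunk above; factor values are illustrative.
from dataeval._internal.metrics.diversity import diversity, diversity_classwise

class_labels = [0, 0, 1, 1, 2, 2]  # one class label per image
metadata = [                       # one factor dict per image (integer-coded time_of_day)
    {"time_of_day": 0, "altitude": 120.0},
    {"time_of_day": 1, "altitude": 130.5},
    {"time_of_day": 0, "altitude": 145.0},
    {"time_of_day": 0, "altitude": 110.0},
    {"time_of_day": 1, "altitude": 150.0},
    {"time_of_day": 0, "altitude": 125.0},
]

# Dataset-wide diversity: one value per factor (class_label included)
out = diversity(class_labels, metadata, method="simpson")
print(out.diversity_index)

# Per-class diversity: shape (n_classes, n_factors) with the class_label column dropped
per_class = diversity_classwise(class_labels, metadata, method="shannon")
print(per_class.diversity_index)
```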
dataeval/_internal/metrics/parity.py
@@ -1,180 +1,309 @@
  import warnings
- from typing import Optional, Tuple
+ from typing import Dict, Mapping, NamedTuple, Optional, Tuple

  import numpy as np
- import scipy
+ from numpy.typing import ArrayLike, NDArray
+ from scipy.stats import chi2_contingency, chisquare

+ from dataeval._internal.interop import to_numpy

- class Parity:
+
+ class ParityOutput(NamedTuple):
+ """
+ Attributes
+ ----------
+ score : np.float64
+ chi-squared value of the test
+ p_value : np.float64
+ p-value of the test
+ """
+
+ score: np.float64
+ p_value: np.float64
+
+
+ class ParityMetadataOutput(NamedTuple):
+ """
+ Attributes
+ ----------
+ scores : NDArray[np.float64]
+ chi-squared values of the test
+ p_values : NDArray[np.float64]
+ p-values of the test
+ """
+
+ score: NDArray[np.float64]
+ p_value: NDArray[np.float64]
+
+
+ def digitize_factor_bins(continuous_values: np.ndarray, bins: int, factor_name: str):
+ """
+ Digitizes a list of values into a given number of bins.
+
+ Parameters
+ ----------
+ continuous_values: np.ndarray
+ The values to be digitized.
+ bins: int
+ The number of bins for the discrete values that continuous_values will be digitized into.
+ factor_name: str
+ The name of the factor to be digitized.
+
+ Returns
+ -------
+ np.ndarray
+ The digitized values
+
+ """
+ if not np.all([np.issubdtype(type(n), np.number) for n in continuous_values]):
+ raise TypeError(
+ f"Encountered a non-numeric value for factor {factor_name}, but the factor"
+ " was specified to be continuous. Ensure all occurrences of this factor are numeric types,"
+ f" or do not specify {factor_name} as a continuous factor."
+ )
+
+ _, bin_edges = np.histogram(continuous_values, bins=bins)
+ bin_edges[-1] = np.inf
+ bin_edges[0] = -np.inf
+ return np.digitize(continuous_values, bin_edges)
+
+
+ def format_discretize_factors(
+ data_factors: dict[str, np.ndarray], continuous_factor_bincounts: Dict[str, int]
+ ) -> Tuple[dict, np.ndarray]:
+ """
+ Sets up the internal list of metadata factors.
+
+ Parameters
+ ----------
+ data_factors: Dict[str, np.ndarray]
+ The dataset factors, which are per-image attributes including class label and metadata.
+ Each key of dataset_factors is a factor, whose value is the per-image factor values.
+ continuous_factor_bincounts : Dict[str, int]
+ The factors in data_factors that have continuous values and the array of bin counts to
+ discretize values into. All factors are treated as having discrete values unless they
+ are specified as keys in this dictionary. Each element of this array must occur as a key
+ in data_factors.
+
+ Returns
+ -------
+ Dict[str, np.ndarray]
+ Intrinsic per-image metadata information with the formatting that input data_factors uses.
+ Each key is a metadata factor, whose value is the discrete per-image factor values.
+ np.ndarray
+ Per-image labels, whose ith element is the label for the ith element of the dataset.
+ """
+ invalid_keys = set(continuous_factor_bincounts.keys()) - set(data_factors.keys())
+ if invalid_keys:
+ raise KeyError(
+ f"The continuous factor(s) {invalid_keys} do not exist in data_factors. Delete these "
+ "keys from `continuous_factor_names` or add corresponding entries to `data_factors`."
+ )
+
+ metadata_factors = {}
+
+ # make sure each factor has the same number of entries
+ lengths = []
+ for arr in data_factors.values():
+ lengths.append(arr.shape)
+
+ if lengths[1:] != lengths[:-1]:
+ raise ValueError("The lengths of each entry in the dictionary are not equal." f" Found lengths {lengths}")
+
+ labels = data_factors["class"]
+
+ metadata_factors = {
+ name: val
+ if name not in continuous_factor_bincounts
+ else digitize_factor_bins(val, continuous_factor_bincounts[name], name)
+ for name, val in data_factors.items()
+ if name != "class"
+ }
+
+ return metadata_factors, labels
+
+
+ def normalize_expected_dist(expected_dist: np.ndarray, observed_dist: np.ndarray) -> np.ndarray:
+ exp_sum = np.sum(expected_dist)
+ obs_sum = np.sum(observed_dist)
+
+ if exp_sum == 0:
+ raise ValueError(
+ f"Expected label distribution {expected_dist} is all zeros. "
+ "Ensure that Parity.expected_dist is set to a list "
+ "with at least one nonzero element"
+ )
+
+ # Renormalize expected distribution to have the same total number of labels as the observed dataset
+ if exp_sum != obs_sum:
+ expected_dist = expected_dist * obs_sum / exp_sum
+
+ return expected_dist
+
+
+ def validate_dist(label_dist: np.ndarray, label_name: str):
+ """
+ Verifies that the given label distribution has labels and checks if
+ any labels have frequencies less than 5.
+
+ Parameters
+ ----------
+ label_dist : np.ndarray
+ Array representing label distributions
+
+ Raises
+ ------
+ ValueError
+ If label_dist is empty
+ Warning
+ If any elements of label_dist are less than 5
+ """
+ if not len(label_dist):
+ raise ValueError(f"No labels found in the {label_name} dataset")
+ if np.any(label_dist < 5):
+ warnings.warn(
+ f"Labels {np.where(label_dist<5)[0]} in {label_name}"
+ " dataset have frequencies less than 5. This may lead"
+ " to invalid chi-squared evaluation."
+ )
+ warnings.warn(
+ f"Labels {np.where(label_dist<5)[0]} in {label_name}"
+ " dataset have frequencies less than 5. This may lead"
+ " to invalid chi-squared evaluation."
+ )
+
+
+ def parity(
+ expected_labels: ArrayLike,
+ observed_labels: ArrayLike,
+ num_classes: Optional[int] = None,
+ ) -> ParityOutput:
  """
- Class for evaluating statistics of observed and expected class labels, including:
+ Perform a one-way chi-squared test between observation frequencies and expected frequencies that
+ tests the null hypothesis that the observed data has the expected frequencies.

- - Chi Squared test for statistical independence between expected and observed labels
+ This function acts as an interface to the scipy.stats.chisquare method, which is documented at
+ https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html

  Parameters
  ----------
- expected_labels : np.ndarray
+ expected_labels : ArrayLike
  List of class labels in the expected dataset
- observed_labels : np.ndarray
+ observed_labels : ArrayLike
  List of class labels in the observed dataset
  num_classes : Optional[int]
  The number of unique classes in the datasets. If this is not specified, it will
  be inferred from the set of unique labels in expected_labels and observed_labels
+
+ Returns
+ -------
+ ParityOutput[np.float64]
+ chi-squared score and p-value of the test
+
+ Raises
+ ------
+ ValueError
+ If x is empty
+ """
+ # Calculate
+ if not num_classes:
+ num_classes = 0
+
+ # Calculate the class frequencies associated with the datasets
+ observed_dist = np.bincount(to_numpy(observed_labels), minlength=num_classes)
+ expected_dist = np.bincount(to_numpy(expected_labels), minlength=num_classes)
+
+ # Validate
+ validate_dist(observed_dist, "observed")
+
+ # Normalize
+ expected_dist = normalize_expected_dist(expected_dist, observed_dist)
+
+ # Validate normalized expected distribution
+ validate_dist(expected_dist, f"expected for {np.sum(observed_dist)} observations")
+
+ if len(observed_dist) != len(expected_dist):
+ raise ValueError(
+ f"Found {len(observed_dist)} unique classes in observed label distribution, "
+ f"but found {len(expected_dist)} unique classes in expected label distribution. "
+ "This can happen when some class ids have zero instances in one dataset but "
+ "not in the other. When initializing Parity, try setting the num_classes "
+ "parameter to the known number of unique class ids, so that classes with "
+ "zero instances are still included in the distributions."
+ )
+
+ cs, p = chisquare(f_obs=observed_dist, f_exp=expected_dist)
+ return ParityOutput(cs, p)
+
+
+ def parity_metadata(
+ data_factors: Mapping[str, ArrayLike],
+ continuous_factor_bincounts: Optional[Dict[str, int]] = None,
+ ) -> ParityMetadataOutput:
+ """
+ Evaluates the statistical independence of metadata factors from class labels.
+ This performs a chi-square test, which provides a score and a p-value for
+ statistical independence between each pair of a metadata factor and a class label.
+ A high score with a low p-value suggests that a metadata factor is strongly
+ correlated with a class label.
+
+ Parameters
+ ----------
+ data_factors: Mapping[str, ArrayLike]
+ The dataset factors, which are per-image attributes including class label and metadata.
+ Each key of dataset_factors is a factor, whose value is the per-image factor values.
+ continuous_factor_bincounts : Optional[Dict[str, int]], default None
+ The factors in data_factors that have continuous values and the array of bin counts to
+ discretize values into. All factors are treated as having discrete values unless they
+ are specified as keys in this dictionary. Each element of this array must occur as a key
+ in data_factors.
+
+ Returns
+ -------
+ ParityOutput[NDArray[np.float64]]
+ Arrays of length (num_factors) whose (i)th element corresponds to the
+ chi-square score and p-value for the relationship between factor i and
+ the class labels in the dataset.
  """
+ data_factors_np = {k: to_numpy(v) for k, v in data_factors.items()}
+ continuous_factor_bincounts = continuous_factor_bincounts if continuous_factor_bincounts else {}
+
+ factors, labels = format_discretize_factors(data_factors_np, continuous_factor_bincounts)
+
+ chi_scores = np.zeros(len(factors))
+ p_values = np.zeros(len(factors))
+ n_cls = len(np.unique(labels))
+ for i, (current_factor_name, factor_values) in enumerate(factors.items()):
+ unique_factor_values = np.unique(factor_values)
+ contingency_matrix = np.zeros((len(unique_factor_values), n_cls))
+ # Builds a contingency matrix where entry at index (r,c) represents
+ # the frequency of current_factor_name achieving value unique_factor_values[r]
+ # at a data point with class c.
+
+ # TODO: Vectorize this nested for loop
+ for fi, factor_value in enumerate(unique_factor_values):
+ for label in range(n_cls):
+ with_both = np.bitwise_and((labels == label), factor_values == factor_value)
+ contingency_matrix[fi, label] = np.sum(with_both)
+ if 0 < contingency_matrix[fi, label] < 5:
+ warnings.warn(
+ f"Factor {current_factor_name} value {factor_value} co-occurs "
+ f"only {contingency_matrix[fi, label]} times with label {label}. "
+ "This can cause inaccurate chi_square calculation. Recommend"
+ "ensuring each label occurs either 0 times or at least 5 times. "
+ "Alternatively, digitize any continuous-valued factors "
+ "into fewer bins."
+ )
+
+ # This deletes rows containing only zeros,
+ # because scipy.stats.chi2_contingency fails when there are rows containing only zeros.
+ rowsums = np.sum(contingency_matrix, axis=1)
+ rowmask = np.where(rowsums)
+ contingency_matrix = contingency_matrix[rowmask]
+
+ chi2, p, _, _ = chi2_contingency(contingency_matrix)
+
+ chi_scores[i] = chi2
+ p_values[i] = p

- def __init__(self, expected_labels: np.ndarray, observed_labels: np.ndarray, num_classes: Optional[int] = None):
- self.set_labels(expected_labels, observed_labels, num_classes)
-
- def set_labels(self, expected_labels: np.ndarray, observed_labels: np.ndarray, num_classes: Optional[int] = None):
- """
- Calculates the label distributions for expected and observed labels
- and performs validation on the results.
-
- Parameters
- ----------
- expected_labels : np.ndarray
- List of class labels in the expected dataset
- observed_labels : np.ndarray
- List of class labels in the observed dataset
- num_classes : Optional[int]
- The number of unique classes in the datasets. If this is not specified, it will
- be inferred from the set of unique labels in expected_labels and observed_labels
-
- Raises
- ------
- ValueError
- If x is empty
- """
- self.num_classes = num_classes
-
- # Calculate
- observed_dist = self._calculate_label_dist(observed_labels)
- expected_dist = self._calculate_label_dist(expected_labels)
-
- # Validate
- self._validate_dist(observed_dist, "observed")
-
- # Normalize
- expected_dist = self._normalize_expected_dist(expected_dist, observed_dist)
-
- # Validate normalized expected distribution
- self._validate_dist(expected_dist, f"expected for {np.sum(observed_dist)} observations")
- self._validate_class_balance(expected_dist, observed_dist)
-
- self._observed_dist = observed_dist
- self._expected_dist = expected_dist
-
- def _normalize_expected_dist(self, expected_dist: np.ndarray, observed_dist: np.ndarray) -> np.ndarray:
- exp_sum = np.sum(expected_dist)
- obs_sum = np.sum(observed_dist)
-
- if exp_sum == 0:
- raise ValueError(
- f"Expected label distribution {expected_dist} is all zeros. "
- "Ensure that Parity.expected_dist is set to a list "
- "with at least one nonzero element"
- )
-
- # Renormalize expected distribution to have the same total number of labels as the observed dataset
- if exp_sum != obs_sum:
- expected_dist = expected_dist * obs_sum / exp_sum
-
- return expected_dist
-
- def _calculate_label_dist(self, labels: np.ndarray) -> np.ndarray:
- """
- Calculate the class frequencies associated with a dataset
-
- Parameters
- ----------
- labels : np.ndarray
- List of class labels in a dataset
-
- Returns
- -------
- label_dist : np.ndarray
- Array representing label distributions
- """
- label_dist = np.bincount(labels, minlength=(self.num_classes if self.num_classes else 0))
- return label_dist
-
- def _validate_class_balance(self, expected_dist: np.ndarray, observed_dist: np.ndarray):
- """
- Check if the numbers of unique classes in the datasets are unequal
-
- Parameters
- ----------
- expected_dist : np.ndarray
- Array representing expected label distributions
- observed_dist : np.ndarray
- Array representing observed label distributions
-
- Raises
- ------
- ValueError
- When exp_ld and obs_ld do not have the same number of classes
- """
- exp_n_cls = len(expected_dist)
- obs_n_cls = len(observed_dist)
- if exp_n_cls != obs_n_cls:
- raise ValueError(
- f"Found {obs_n_cls} unique classes in observed label distribution, "
- f"but found {exp_n_cls} unique classes in expected label distribution,"
- "This can happen when some class ids have zero instances in one dataset but "
- "not in the other. When initializing Parity, "
- "try setting the num_classes parameter to the known number of unique class ids, "
- "so that classes with zero instances are still included in the distributions."
- )
-
- def _validate_dist(self, label_dist: np.ndarray, label_name: str):
- """
- Verifies that the given label distribution has labels and checks if
- any labels have frequencies less than 5.
-
- Parameters
- ----------
- label_dist : np.ndarray
- Array representing label distributions
-
- Raises
- ------
- ValueError
- If label_dist is empty
- Warning
- If any elements of label_dist are less than 5
- """
- if not len(label_dist):
- raise ValueError(f"No labels found in the {label_name} dataset")
- if np.any(label_dist < 5):
- warnings.warn(
- f"Labels {np.where(label_dist<5)[0]} in {label_name}"
- " dataset have frequencies less than 5. This may lead"
- " to invalid chi-squared evaluation."
- )
- warnings.warn(
- f"Labels {np.where(label_dist<5)[0]} in {label_name}"
- " dataset have frequencies less than 5. This may lead"
- " to invalid chi-squared evaluation."
- )
-
- def evaluate(self) -> Tuple[np.float64, np.float64]:
- """
- Perform a one-way chi-squared test between observation frequencies and expected frequencies that
- tests the null hypothesis that the observed data has the expected frequencies.
-
- This function acts as an interface to the scipy.stats.chisquare method, which is documented at
- https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html
- https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html
-
- Returns
- -------
- np.float64
- chi-squared value of the test
- np.float64
- p-value of the test
- """
- cs_result = scipy.stats.chisquare(f_obs=self._observed_dist, f_exp=self._expected_dist)
-
- chisquared = cs_result.statistic
- p_value = cs_result.pvalue
- return chisquared, p_value
+ return ParityMetadataOutput(chi_scores, p_values)
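
This hunk replaces the class-based Parity API with two functions. A minimal migration sketch follows, assuming the internal module path shown in the hunk (the public dataeval.metrics exports also changed in this release but are not shown here); the arrays are toy data, small enough that the low-frequency warnings defined above will fire.

```python
# Migration sketch (not part of the diff): the removed Parity class becomes two functions.
# Import path follows the hunk above; arrays below are illustrative only.
import numpy as np
from dataeval._internal.metrics.parity import parity, parity_metadata

expected_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2])
observed_labels = np.array([0, 0, 1, 1, 1, 1, 2, 2, 2, 2])

# Before: Parity(expected_labels, observed_labels).evaluate()
# After: a single call returning a NamedTuple with .score and .p_value
result = parity(expected_labels, observed_labels, num_classes=3)
print(result.score, result.p_value)

# Chi-square independence test of each metadata factor against the class label.
# "class" is required; "altitude" is a continuous factor discretized into 2 bins.
data_factors = {
    "class": observed_labels,
    "altitude": np.array([1.2, 3.4, 0.5, 2.2, 4.1, 3.3, 0.9, 2.8, 1.7, 3.9]),
}
md = parity_metadata(data_factors, continuous_factor_bincounts={"altitude": 2})
print(md.score, md.p_value)  # one chi-square score and p-value per factor
```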