dataeval 0.63.0__py3-none-any.whl → 0.64.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +3 -3
- dataeval/_internal/detectors/clusterer.py +2 -1
- dataeval/_internal/detectors/drift/base.py +2 -1
- dataeval/_internal/detectors/drift/cvm.py +2 -1
- dataeval/_internal/detectors/drift/ks.py +2 -1
- dataeval/_internal/detectors/drift/mmd.py +4 -3
- dataeval/_internal/detectors/drift/uncertainty.py +1 -2
- dataeval/_internal/detectors/duplicates.py +2 -1
- dataeval/_internal/detectors/linter.py +1 -1
- dataeval/_internal/detectors/ood/ae.py +2 -1
- dataeval/_internal/detectors/ood/aegmm.py +2 -1
- dataeval/_internal/detectors/ood/base.py +2 -1
- dataeval/_internal/detectors/ood/llr.py +3 -2
- dataeval/_internal/detectors/ood/vae.py +2 -1
- dataeval/_internal/detectors/ood/vaegmm.py +2 -1
- dataeval/_internal/interop.py +2 -11
- dataeval/_internal/metrics/balance.py +180 -0
- dataeval/_internal/metrics/base.py +1 -83
- dataeval/_internal/metrics/ber.py +122 -48
- dataeval/_internal/metrics/coverage.py +83 -74
- dataeval/_internal/metrics/divergence.py +67 -67
- dataeval/_internal/metrics/diversity.py +206 -0
- dataeval/_internal/metrics/parity.py +300 -155
- dataeval/_internal/metrics/stats.py +7 -5
- dataeval/_internal/metrics/uap.py +37 -29
- dataeval/_internal/metrics/utils.py +393 -0
- dataeval/_internal/utils.py +64 -0
- dataeval/metrics/__init__.py +25 -6
- dataeval/utils/__init__.py +9 -0
- {dataeval-0.63.0.dist-info → dataeval-0.64.0.dist-info}/METADATA +1 -1
- dataeval-0.64.0.dist-info/RECORD +60 -0
- dataeval/_internal/functional/__init__.py +0 -0
- dataeval/_internal/functional/ber.py +0 -63
- dataeval/_internal/functional/coverage.py +0 -75
- dataeval/_internal/functional/divergence.py +0 -16
- dataeval/_internal/functional/hash.py +0 -79
- dataeval/_internal/functional/metadata.py +0 -136
- dataeval/_internal/functional/metadataparity.py +0 -190
- dataeval/_internal/functional/uap.py +0 -6
- dataeval/_internal/functional/utils.py +0 -158
- dataeval/_internal/maite/__init__.py +0 -0
- dataeval/_internal/maite/utils.py +0 -30
- dataeval/_internal/metrics/metadata.py +0 -610
- dataeval/_internal/metrics/metadataparity.py +0 -67
- dataeval-0.63.0.dist-info/RECORD +0 -68
- {dataeval-0.63.0.dist-info → dataeval-0.64.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.63.0.dist-info → dataeval-0.64.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,206 @@
|
|
1
|
+
from typing import Dict, List, Literal, NamedTuple, Optional, Sequence
|
2
|
+
|
3
|
+
import numpy as np
|
4
|
+
from numpy.typing import NDArray
|
5
|
+
|
6
|
+
from dataeval._internal.metrics.utils import entropy, get_counts, get_method, get_num_bins, preprocess_metadata
|
7
|
+
|
8
|
+
|
9
|
+
class DiversityOutput(NamedTuple):
    """
    Result container returned by `diversity` and `diversity_classwise`.

    Attributes
    ----------
    diversity_index : NDArray[np.float64]
        Diversity index for classes and factors. `diversity` stores one value per
        factor; `diversity_classwise` stores an [n_class x n_factor] array.
    """

    diversity_index: NDArray[np.float64]
|
18
|
+
|
19
|
+
|
20
|
+
def diversity_shannon(
    data: np.ndarray,
    names: List[str],
    is_categorical: List[bool],
    subset_mask: Optional[np.ndarray] = None,
) -> np.ndarray:
    """
    Compute diversity for discrete/categorical variables and, through standard
    histogram binning, for continuous variables.

    We define diversity as a normalized form of the Shannon entropy.

    diversity = 1 implies that samples are evenly distributed across a particular factor
    diversity = 0 implies that all samples belong to one category/bin

    Parameters
    ----------
    data: np.ndarray
        Factor values; callers in this module pass a 2-D array with one column per entry of `names`
    names: List[str]
        Names of the factors
    is_categorical: List[bool]
        Flags forwarded to the binning helpers (presumably one per factor -- confirm against
        `preprocess_metadata`)
    subset_mask: Optional[np.ndarray[bool]]
        Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts

    Notes
    -----
    For continuous variables, histogram bins are chosen automatically. See `numpy.histogram` for details.

    A factor with a single bin has zero entropy and a zero normalizer (log(1) == 0); its diversity
    is reported as 0 rather than the NaN that 0 / 0 would produce.

    Returns
    -------
    diversity_index: np.ndarray
        Diversity index per column of `data`

    See Also
    --------
    numpy.histogram
    """

    # entropy computed using global auto bins so that we can properly normalize
    ent_unnormalized = entropy(data, names, is_categorical, normalized=False, subset_mask=subset_mask)
    # log(num_bins) is the largest entropy attainable with that many bins, hence the normalizer.
    # NOTE(review): subset_mask is forwarded here, whereas diversity_simpson calls get_num_bins
    # without it -- confirm which normalization (global vs. classwise) is intended.
    num_bins = np.asarray(get_num_bins(data, names, is_categorical=is_categorical, subset_mask=subset_mask))

    # Silence the divide/invalid warnings for the degenerate entries; they are overwritten below.
    with np.errstate(divide="ignore", invalid="ignore"):
        normalized = ent_unnormalized / np.log(num_bins)

    # Fewer than two bins means every sample sits in the same bin, i.e. no diversity at all.
    return np.where(num_bins > 1, normalized, 0.0)
|
59
|
+
|
60
|
+
|
61
|
+
def diversity_simpson(
    data: np.ndarray,
    names: List[str],
    is_categorical: List[bool],
    subset_mask: Optional[np.ndarray] = None,
) -> np.ndarray:
    """
    Compute the normalized inverse Simpson diversity index of every factor.

    Discrete/categorical factors are counted directly; continuous factors are
    first discretized with standard histogram binning.

    diversity = 1 implies that samples are evenly distributed across a particular factor
    diversity = 1/num_categories implies that all samples belong to one category/bin

    Parameters
    ----------
    data: np.ndarray
        Factor values; callers in this module pass a 2-D array with one column per entry of `names`
    names: List[str]
        Names of the factors
    is_categorical: List[bool]
        Flags forwarded to the binning helpers
    subset_mask: Optional[np.ndarray[bool]]
        Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts

    Notes
    -----
    For continuous variables, histogram bins are chosen automatically. See
    numpy.histogram for details.
    If there is only one category, the diversity index takes a value of 1 =
    1/N = 1/1.

    Returns
    -------
    np.ndarray
        Diversity index per factor, in the order of `names`

    See Also
    --------
    numpy.histogram
    """

    counts_by_factor, _ = get_counts(data, names, is_categorical, subset_mask)
    # the normalizer is requested without subset_mask (global bins, not classwise bins)
    bins_per_factor = get_num_bins(data, names, is_categorical)

    index_per_factor = np.empty(len(names))
    for position, factor_counts in enumerate(counts_by_factor.values()):
        # share of the (masked) samples that falls into each bin
        frequencies = factor_counts / factor_counts.sum()
        inverse_simpson = 1 / np.sum(frequencies**2)
        # dividing by the bin count maps the index into [1/num_bins, 1]
        index_per_factor[position] = inverse_simpson / bins_per_factor[position]

    return index_per_factor
|
114
|
+
|
115
|
+
|
116
|
+
# Maps the `method` argument of `diversity` / `diversity_classwise` to the function that computes that index.
DIVERSITY_FN_MAP = {"simpson": diversity_simpson, "shannon": diversity_shannon}
|
117
|
+
|
118
|
+
|
119
|
+
def diversity(
    class_labels: Sequence[int], metadata: List[Dict], method: Literal["shannon", "simpson"] = "simpson"
) -> DiversityOutput:
    """
    Compute the diversity index of the class labels and of every metadata factor.

    Discrete/categorical variables are counted directly; continuous variables
    are discretized with standard histogram binning.

    diversity = 1 implies that samples are evenly distributed across a particular factor.
    The lowest value, reached when all samples belong to one category/bin, is 0 for
    "shannon" and 1/num_categories for "simpson".

    Parameters
    ----------
    class_labels: Sequence[int]
        List of class labels for each image
    metadata: List[Dict]
        List of metadata factors for each image
    method: Literal["shannon", "simpson"], default "simpson"
        string variable indicating which diversity index should be used.
        Permissible values include "simpson" and "shannon"

    Notes
    -----
    - For continuous variables, histogram bins are chosen automatically. See numpy.histogram for details.

    Returns
    -------
    DiversityOutput
        Diversity index for each factor produced by `preprocess_metadata`

    See Also
    --------
    numpy.histogram
    """
    # resolve the method first so an unsupported name is reported before any preprocessing
    index_fn = get_method(DIVERSITY_FN_MAP, method)
    data, names, is_categorical = preprocess_metadata(class_labels, metadata)
    # no subset mask: every sample contributes
    per_factor = index_fn(data, names, is_categorical, None)
    return DiversityOutput(per_factor.astype(np.float64))
|
156
|
+
|
157
|
+
|
158
|
+
def diversity_classwise(
    class_labels: Sequence[int], metadata: List[Dict], method: Literal["shannon", "simpson"] = "simpson"
) -> DiversityOutput:
    """
    Compute the diversity index of every metadata factor separately for each class.

    Discrete/categorical variables are counted directly; continuous variables
    are discretized with standard histogram binning. For each class, only the
    samples carrying that class label are counted.

    diversity = 1 implies that samples are evenly distributed across a particular factor.
    The lowest value, reached when all samples belong to one category/bin, is 0 for
    "shannon" and 1/num_categories for "simpson".

    Parameters
    ----------
    class_labels: Sequence[int]
        List of class labels for each image
    metadata: List[Dict]
        List of metadata factors for each image
    method: Literal["shannon", "simpson"], default "simpson"
        Which diversity index to compute

    Notes
    -----
    - For continuous variables, histogram bins are chosen automatically. See numpy.histogram for details.
    - If there is only one category, the Simpson index takes a value of 1 = 1/N = 1/1.

    Returns
    -------
    DiversityOutput
        Diversity index [n_class x n_factor]; the class-label column itself is removed

    See Also
    --------
    numpy.histogram
    """
    index_fn = get_method(DIVERSITY_FN_MAP, method)
    data, names, is_categorical = preprocess_metadata(class_labels, metadata)
    label_column = names.index("class_label")
    labels = data[:, label_column]

    classes = np.unique(labels)
    # one row per class, one column per factor (class label included for now)
    per_class = np.full((len(classes), len(names)), np.nan)
    for row, cls in enumerate(classes):
        per_class[row, :] = index_fn(data, names, is_categorical, labels == cls)

    # the diversity of the class label within a single class carries no information, so drop it
    return DiversityOutput(np.delete(per_class, label_column, axis=1))
|
@@ -1,164 +1,309 @@
|
|
1
1
|
import warnings
|
2
|
-
from typing import Optional, Tuple
|
2
|
+
from typing import Dict, Mapping, NamedTuple, Optional, Tuple
|
3
3
|
|
4
4
|
import numpy as np
|
5
|
-
import
|
5
|
+
from numpy.typing import ArrayLike, NDArray
|
6
|
+
from scipy.stats import chi2_contingency, chisquare
|
6
7
|
|
8
|
+
from dataeval._internal.interop import to_numpy
|
7
9
|
|
8
|
-
|
10
|
+
|
11
|
+
class ParityOutput(NamedTuple):
    """
    Result of the one-way chi-squared test performed by `parity`.

    Attributes
    ----------
    score : np.float64
        chi-squared value of the test
    p_value : np.float64
        p-value of the test
    """

    score: np.float64
    p_value: np.float64
|
23
|
+
|
24
|
+
|
25
|
+
class ParityMetadataOutput(NamedTuple):
    """
    Result of the per-factor chi-squared tests performed by `parity_metadata`.

    Attributes
    ----------
    score : NDArray[np.float64]
        chi-squared values of the test, one per metadata factor
    p_value : NDArray[np.float64]
        p-values of the test, one per metadata factor
    """

    score: NDArray[np.float64]
    p_value: NDArray[np.float64]
|
37
|
+
|
38
|
+
|
39
|
+
def digitize_factor_bins(continuous_values: np.ndarray, bins: int, factor_name: str):
    """
    Digitizes a list of values into a given number of bins.

    Parameters
    ----------
    continuous_values: np.ndarray
        The values to be digitized.
    bins: int
        The number of bins for the discrete values that continuous_values will be digitized into.
    factor_name: str
        The name of the factor to be digitized.

    Returns
    -------
    np.ndarray
        The digitized values

    Raises
    ------
    TypeError
        If any element of continuous_values is not a numeric type
    """
    # A 1-D array with a numeric dtype can only hold numeric scalars, so the per-element
    # Python check is needed only for other inputs (e.g. object arrays).
    has_numeric_dtype = (
        isinstance(continuous_values, np.ndarray)
        and continuous_values.ndim == 1
        and np.issubdtype(continuous_values.dtype, np.number)
    )
    # `all` over a generator stops at the first offending element.
    if not has_numeric_dtype and not all(np.issubdtype(type(n), np.number) for n in continuous_values):
        raise TypeError(
            f"Encountered a non-numeric value for factor {factor_name}, but the factor"
            " was specified to be continuous. Ensure all occurrences of this factor are numeric types,"
            f" or do not specify {factor_name} as a continuous factor."
        )

    _, bin_edges = np.histogram(continuous_values, bins=bins)
    # Widen the outer edges to infinity so the maximum value (which equals the last histogram
    # edge) falls into the last bin instead of past it.
    bin_edges[-1] = np.inf
    bin_edges[0] = -np.inf
    return np.digitize(continuous_values, bin_edges)
|
69
|
+
|
70
|
+
|
71
|
+
def format_discretize_factors(
    data_factors: Dict[str, np.ndarray], continuous_factor_bincounts: Dict[str, int]
) -> Tuple[Dict[str, np.ndarray], np.ndarray]:
    """
    Sets up the internal list of metadata factors.

    Parameters
    ----------
    data_factors: Dict[str, np.ndarray]
        The dataset factors, which are per-image attributes including class label and metadata.
        Each key of data_factors is a factor, whose value is the per-image factor values.
        The class labels must be stored under the key "class".
    continuous_factor_bincounts : Dict[str, int]
        The factors in data_factors that have continuous values, mapped to the number of bins to
        discretize their values into. All factors are treated as having discrete values unless they
        are specified as keys in this dictionary. Each key of this dictionary must occur as a key
        in data_factors.

    Returns
    -------
    Dict[str, np.ndarray]
        Per-image metadata information with the formatting that input data_factors uses.
        Each key is a metadata factor (the "class" entry is excluded), whose value is the
        discrete per-image factor values.
    np.ndarray
        Per-image labels, whose ith element is the label for the ith element of the dataset.

    Raises
    ------
    KeyError
        If continuous_factor_bincounts names a factor that is absent from data_factors,
        or if data_factors has no "class" entry
    ValueError
        If the factor arrays do not all have the same shape
    """
    invalid_keys = set(continuous_factor_bincounts) - set(data_factors)
    if invalid_keys:
        raise KeyError(
            f"The continuous factor(s) {invalid_keys} do not exist in data_factors. Delete these "
            "keys from `continuous_factor_bincounts` or add corresponding entries to `data_factors`."
        )

    # make sure each factor has the same number of entries
    lengths = [arr.shape for arr in data_factors.values()]
    # comparing the list with itself shifted by one checks that all neighbours are equal
    if lengths[1:] != lengths[:-1]:
        raise ValueError(f"The lengths of each entry in the dictionary are not equal. Found lengths {lengths}")

    if "class" not in data_factors:
        raise KeyError('data_factors must contain a "class" entry holding the per-image class labels.')
    labels = data_factors["class"]

    # every factor except the class label; continuous ones are replaced by their bin indices
    metadata_factors = {
        name: digitize_factor_bins(val, continuous_factor_bincounts[name], name)
        if name in continuous_factor_bincounts
        else val
        for name, val in data_factors.items()
        if name != "class"
    }

    return metadata_factors, labels
|
124
|
+
|
125
|
+
|
126
|
+
def normalize_expected_dist(expected_dist: np.ndarray, observed_dist: np.ndarray) -> np.ndarray:
    """
    Rescale the expected label distribution so that it sums to the observed total.

    Parameters
    ----------
    expected_dist : np.ndarray
        Array representing expected label distributions
    observed_dist : np.ndarray
        Array representing observed label distributions

    Returns
    -------
    np.ndarray
        `expected_dist` itself when both totals already agree, otherwise a rescaled copy

    Raises
    ------
    ValueError
        If expected_dist sums to zero
    """
    expected_total = np.sum(expected_dist)
    observed_total = np.sum(observed_dist)

    if expected_total == 0:
        raise ValueError(
            f"Expected label distribution {expected_dist} is all zeros. "
            "Ensure that Parity.expected_dist is set to a list "
            "with at least one nonzero element"
        )

    # Totals already match: nothing to rescale
    if expected_total == observed_total:
        return expected_dist

    # Renormalize expected distribution to have the same total number of labels as the observed dataset
    return expected_dist * observed_total / expected_total
|
142
|
+
|
143
|
+
|
144
|
+
def validate_dist(label_dist: np.ndarray, label_name: str):
    """
    Verifies that the given label distribution has labels and checks if
    any labels have frequencies less than 5.

    Parameters
    ----------
    label_dist : np.ndarray
        Array representing label distributions
    label_name : str
        Name of the dataset the distribution belongs to; only used in the messages

    Raises
    ------
    ValueError
        If label_dist is empty
    Warning
        If any elements of label_dist are less than 5
    """
    if not len(label_dist):
        raise ValueError(f"No labels found in the {label_name} dataset")

    # indices (i.e. class ids) of the under-represented labels
    low_frequency_labels = np.where(label_dist < 5)[0]
    if low_frequency_labels.size:
        # warn exactly once; the same warning used to be issued twice in a row
        warnings.warn(
            f"Labels {low_frequency_labels} in {label_name}"
            " dataset have frequencies less than 5. This may lead"
            " to invalid chi-squared evaluation."
        )
|
174
|
+
|
175
|
+
|
176
|
+
def parity(
    expected_labels: ArrayLike,
    observed_labels: ArrayLike,
    num_classes: Optional[int] = None,
) -> ParityOutput:
    """
    Run a one-way chi-squared test of the observed class frequencies against the
    expected class frequencies; the null hypothesis is that the observed data has
    the expected frequencies.

    This function acts as an interface to the scipy.stats.chisquare method, which is documented at
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html

    Parameters
    ----------
    expected_labels : ArrayLike
        List of class labels in the expected dataset
    observed_labels : ArrayLike
        List of class labels in the observed dataset
    num_classes : Optional[int]
        The number of unique classes in the datasets. If this is not specified, it will
        be inferred from the set of unique labels in expected_labels and observed_labels

    Returns
    -------
    ParityOutput
        chi-squared score and p-value of the test

    Raises
    ------
    ValueError
        If either label distribution is empty, if the expected distribution is all zeros,
        or if the two distributions cover a different number of classes
    """
    # A missing (or zero) class count lets bincount size each histogram from the labels themselves
    min_length = num_classes or 0

    # Calculate the class frequencies associated with the datasets
    observed_dist = np.bincount(to_numpy(observed_labels), minlength=min_length)
    expected_dist = np.bincount(to_numpy(expected_labels), minlength=min_length)

    validate_dist(observed_dist, "observed")

    # Put the expected counts on the same scale as the observed counts, then validate them too
    expected_dist = normalize_expected_dist(expected_dist, observed_dist)
    validate_dist(expected_dist, f"expected for {np.sum(observed_dist)} observations")

    if len(observed_dist) != len(expected_dist):
        raise ValueError(
            f"Found {len(observed_dist)} unique classes in observed label distribution, "
            f"but found {len(expected_dist)} unique classes in expected label distribution. "
            "This can happen when some class ids have zero instances in one dataset but "
            "not in the other. When initializing Parity, try setting the num_classes "
            "parameter to the known number of unique class ids, so that classes with "
            "zero instances are still included in the distributions."
        )

    # chisquare yields the (statistic, p-value) pair, which maps onto ParityOutput's fields
    return ParityOutput(*chisquare(f_obs=observed_dist, f_exp=expected_dist))
|
237
|
+
|
238
|
+
|
239
|
+
def parity_metadata(
    data_factors: Mapping[str, ArrayLike],
    continuous_factor_bincounts: Optional[Dict[str, int]] = None,
) -> ParityMetadataOutput:
    """
    Evaluates the statistical independence of metadata factors from class labels.
    This performs a chi-square test, which provides a score and a p-value for
    statistical independence between each pair of a metadata factor and a class label.
    A high score with a low p-value suggests that a metadata factor is strongly
    correlated with a class label.

    Parameters
    ----------
    data_factors: Mapping[str, ArrayLike]
        The dataset factors, which are per-image attributes including class label and metadata.
        Each key of data_factors is a factor, whose value is the per-image factor values.
        The class labels are read from the key "class".
    continuous_factor_bincounts : Optional[Dict[str, int]], default None
        The factors in data_factors that have continuous values, mapped to the number of bins to
        discretize their values into. All factors are treated as having discrete values unless they
        are specified as keys in this dictionary. Each key of this dictionary must occur as a key
        in data_factors.

    Returns
    -------
    ParityMetadataOutput
        Arrays of length (num_factors) whose (i)th element corresponds to the
        chi-square score and p-value for the relationship between factor i and
        the class labels in the dataset.

    Warns
    -----
    Warning
        For every (factor value, label) pair that co-occurs between 1 and 4 times
    """
    data_factors_np = {k: to_numpy(v) for k, v in data_factors.items()}
    continuous_factor_bincounts = continuous_factor_bincounts if continuous_factor_bincounts else {}

    factors, labels = format_discretize_factors(data_factors_np, continuous_factor_bincounts)

    chi_scores = np.zeros(len(factors))
    p_values = np.zeros(len(factors))

    # Compare against the label values that actually occur. Iterating over range(n_classes)
    # is only correct when the labels happen to be exactly 0..n_classes-1.
    unique_labels = np.unique(labels)
    # The per-label masks do not depend on the factor, so build them once.
    label_masks = [labels == label for label in unique_labels]

    for i, (current_factor_name, factor_values) in enumerate(factors.items()):
        unique_factor_values = np.unique(factor_values)
        contingency_matrix = np.zeros((len(unique_factor_values), len(unique_labels)))
        # Builds a contingency matrix where entry at index (r,c) represents
        # the frequency of current_factor_name achieving value unique_factor_values[r]
        # at a data point with class unique_labels[c].
        for fi, factor_value in enumerate(unique_factor_values):
            value_mask = factor_values == factor_value
            for li, (label, label_mask) in enumerate(zip(unique_labels, label_masks)):
                contingency_matrix[fi, li] = np.sum(np.bitwise_and(label_mask, value_mask))
                if 0 < contingency_matrix[fi, li] < 5:
                    warnings.warn(
                        f"Factor {current_factor_name} value {factor_value} co-occurs "
                        f"only {contingency_matrix[fi, li]} times with label {label}. "
                        "This can cause inaccurate chi_square calculation. Recommend "
                        "ensuring each label occurs either 0 times or at least 5 times. "
                        "Alternatively, digitize any continuous-valued factors "
                        "into fewer bins."
                    )

        # This deletes rows containing only zeros,
        # because scipy.stats.chi2_contingency fails when there are rows containing only zeros.
        rowsums = np.sum(contingency_matrix, axis=1)
        rowmask = np.where(rowsums)
        contingency_matrix = contingency_matrix[rowmask]

        chi2, p, _, _ = chi2_contingency(contingency_matrix)

        chi_scores[i] = chi2
        p_values[i] = p

    return ParityMetadataOutput(chi_scores, p_values)
|
@@ -1,14 +1,15 @@
|
|
1
|
+
from abc import abstractmethod
|
1
2
|
from enum import Flag
|
2
3
|
from typing import Any, Callable, Dict, Generic, Iterable, List, Optional, Sequence, TypeVar, Union
|
3
4
|
|
4
5
|
import numpy as np
|
6
|
+
from numpy.typing import ArrayLike
|
5
7
|
from scipy.stats import entropy, kurtosis, skew
|
6
8
|
|
7
9
|
from dataeval._internal.flags import ImageHash, ImageProperty, ImageStatistics, ImageStatsFlags, ImageVisuals
|
8
|
-
from dataeval._internal.
|
9
|
-
from dataeval._internal.
|
10
|
-
from dataeval._internal.
|
11
|
-
from dataeval._internal.metrics.base import EvaluateMixin, MetricMixin
|
10
|
+
from dataeval._internal.interop import to_numpy_iter
|
11
|
+
from dataeval._internal.metrics.base import EvaluateMixin
|
12
|
+
from dataeval._internal.metrics.utils import edge_filter, get_bitdepth, normalize_image_shape, pchash, rescale, xxhash
|
12
13
|
|
13
14
|
QUARTILES = (0, 25, 50, 75, 100)
|
14
15
|
|
@@ -16,11 +17,12 @@ TBatch = TypeVar("TBatch", bound=Sequence[ArrayLike])
|
|
16
17
|
TFlag = TypeVar("TFlag", bound=Flag)
|
17
18
|
|
18
19
|
|
19
|
-
class BaseStatsMetric(EvaluateMixin,
|
20
|
+
class BaseStatsMetric(EvaluateMixin, Generic[TBatch, TFlag]):
|
20
21
|
def __init__(self, flags: TFlag):
|
21
22
|
self.flags = flags
|
22
23
|
self.results = []
|
23
24
|
|
25
|
+
@abstractmethod
|
24
26
|
def update(self, images: TBatch) -> None:
|
25
27
|
"""
|
26
28
|
Updates internal metric cache for later calculation
|