dataeval 0.63.0__py3-none-any.whl → 0.65.0__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- dataeval/__init__.py +4 -4
- dataeval/_internal/detectors/clusterer.py +47 -34
- dataeval/_internal/detectors/drift/base.py +53 -35
- dataeval/_internal/detectors/drift/cvm.py +5 -4
- dataeval/_internal/detectors/drift/ks.py +7 -6
- dataeval/_internal/detectors/drift/mmd.py +39 -19
- dataeval/_internal/detectors/drift/torch.py +6 -5
- dataeval/_internal/detectors/drift/uncertainty.py +7 -8
- dataeval/_internal/detectors/duplicates.py +57 -30
- dataeval/_internal/detectors/linter.py +40 -24
- dataeval/_internal/detectors/ood/ae.py +2 -1
- dataeval/_internal/detectors/ood/aegmm.py +2 -1
- dataeval/_internal/detectors/ood/base.py +37 -15
- dataeval/_internal/detectors/ood/llr.py +9 -8
- dataeval/_internal/detectors/ood/vae.py +2 -1
- dataeval/_internal/detectors/ood/vaegmm.py +2 -1
- dataeval/_internal/flags.py +42 -21
- dataeval/_internal/interop.py +3 -12
- dataeval/_internal/metrics/balance.py +188 -0
- dataeval/_internal/metrics/ber.py +123 -48
- dataeval/_internal/metrics/coverage.py +90 -74
- dataeval/_internal/metrics/divergence.py +101 -67
- dataeval/_internal/metrics/diversity.py +211 -0
- dataeval/_internal/metrics/parity.py +287 -155
- dataeval/_internal/metrics/stats.py +198 -317
- dataeval/_internal/metrics/uap.py +40 -29
- dataeval/_internal/metrics/utils.py +430 -0
- dataeval/_internal/models/tensorflow/losses.py +3 -3
- dataeval/_internal/models/tensorflow/trainer.py +3 -2
- dataeval/_internal/models/tensorflow/utils.py +4 -3
- dataeval/_internal/output.py +82 -0
- dataeval/_internal/utils.py +64 -0
- dataeval/_internal/workflows/sufficiency.py +96 -107
- dataeval/flags/__init__.py +2 -2
- dataeval/metrics/__init__.py +26 -7
- dataeval/utils/__init__.py +9 -0
- {dataeval-0.63.0.dist-info → dataeval-0.65.0.dist-info}/METADATA +1 -1
- dataeval-0.65.0.dist-info/RECORD +60 -0
- dataeval/_internal/functional/__init__.py +0 -0
- dataeval/_internal/functional/ber.py +0 -63
- dataeval/_internal/functional/coverage.py +0 -75
- dataeval/_internal/functional/divergence.py +0 -16
- dataeval/_internal/functional/hash.py +0 -79
- dataeval/_internal/functional/metadata.py +0 -136
- dataeval/_internal/functional/metadataparity.py +0 -190
- dataeval/_internal/functional/uap.py +0 -6
- dataeval/_internal/functional/utils.py +0 -158
- dataeval/_internal/maite/__init__.py +0 -0
- dataeval/_internal/maite/utils.py +0 -30
- dataeval/_internal/metrics/base.py +0 -92
- dataeval/_internal/metrics/metadata.py +0 -610
- dataeval/_internal/metrics/metadataparity.py +0 -67
- dataeval-0.63.0.dist-info/RECORD +0 -68
- {dataeval-0.63.0.dist-info → dataeval-0.65.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.63.0.dist-info → dataeval-0.65.0.dist-info}/WHEEL +0 -0
dataeval/_internal/metrics/divergence.py
@@ -3,27 +3,104 @@ This module contains the implementation of HP Divergence
 using the Fast Nearest Neighbor and Minimum Spanning Tree algorithms
 """
 
-from
+from dataclasses import dataclass
+from typing import Literal
 
 import numpy as np
+from numpy.typing import ArrayLike, NDArray
 
-from dataeval._internal.
-from dataeval._internal.
-from dataeval._internal.
+from dataeval._internal.interop import to_numpy
+from dataeval._internal.metrics.utils import compute_neighbors, get_method, minimum_spanning_tree
+from dataeval._internal.output import OutputMetadata, set_metadata
 
-
-
+
+@dataclass(frozen=True)
+class DivergenceOutput(OutputMetadata):
+    """
+    Attributes
+    ----------
+    divergence : float
+        Divergence value calculated between 2 datasets ranging between 0.0 and 1.0
+    errors : int
+        The number of differing edges between the datasets
+    """
+
+    divergence: float
+    errors: int
+
+
+def divergence_mst(data: NDArray, labels: NDArray) -> int:
+    """
+    Calculates the estimated label errors based on the minimum spanning tree
+
+    Parameters
+    ----------
+    data : NDArray, shape - (N, ... )
+        Input images to be grouped
+    labels : NDArray
+        Corresponding labels for each data point
+
+    Returns
+    -------
+    int
+        Number of label errors when creating the minimum spanning tree
+    """
+    mst = minimum_spanning_tree(data).toarray()
+    edgelist = np.transpose(np.nonzero(mst))
+    errors = np.sum(labels[edgelist[:, 0]] != labels[edgelist[:, 1]])
+    return errors
 
 
-
+def divergence_fnn(data: NDArray, labels: NDArray) -> int:
     """
-    Calculates the estimated
+    Calculates the estimated label errors based on their nearest neighbors
 
     Parameters
     ----------
-
+    data : NDArray, shape - (N, ... )
+        Input images to be grouped
+    labels : NDArray
+        Corresponding labels for each data point
+
+    Returns
+    -------
+    int
+        Number of label errors when finding nearest neighbors
+    """
+    nn_indices = compute_neighbors(data, data)
+    errors = np.sum(np.abs(labels[nn_indices] - labels))
+    return errors
+
+
+DIVERGENCE_FN_MAP = {"FNN": divergence_fnn, "MST": divergence_mst}
+
+
+@set_metadata("dataeval.metrics")
+def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST"] = "FNN") -> DivergenceOutput:
+    """
+    Calculates the divergence and any errors between the datasets
+
+    Parameters
+    ----------
+    data_a : ArrayLike, shape - (N, P)
+        A dataset in an ArrayLike format to compare.
+        Function expects the data to have 2 dimensions, N number of observations in a P-dimensionial space.
+    data_b : ArrayLike, shape - (N, P)
+        A dataset in an ArrayLike format to compare.
+        Function expects the data to have 2 dimensions, N number of observations in a P-dimensionial space.
+    method : Literal["MST, "FNN"], default "FNN"
         Method used to estimate dataset divergence
 
+    Returns
+    -------
+    DivergenceOutput
+        The divergence value (0.0..1.0) and the number of differing edges between the datasets
+
+    Notes
+    -----
+    The divergence value indicates how similar the 2 datasets are
+    with 0 indicating approximately identical data distributions.
+
     Warning
     -------
     MST is very slow in this implementation, this is unlike matlab where
@@ -40,63 +117,20 @@ class Divergence(EvaluateMixin, MethodsMixin[_METHODS, _FUNCTION]):
 
     Examples
     --------
-
+    Evaluate the datasets:
 
-    >>>
-
-    Specify the method:
-
-    >>> divert = Divergence(method="FNN")
+    >>> divergence(datasetA, datasetB)
+    DivergenceOutput(divergence=0.28, errors=36.0)
     """
-
-
-
-
-
-
-
-
-
-
-
-
-        Parameters
-        ----------
-        data_a : ArrayLike, shape - (N, P)
-            A dataset in an ArrayLike format to compare.
-            Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
-        data_b : ArrayLike, shape - (N, P)
-            A dataset in an ArrayLike format to compare.
-            Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
-
-        Returns
-        -------
-        Dict[str, Any]
-            divergence : float
-                divergence value between 0.0 and 1.0
-            error : int
-                the number of differing edges between the datasets
-
-        Notes
-        -----
-        The divergence value indicates how similar the 2 datasets are
-        with 0 indicating approximately identical data distributions.
-
-        Examples
-        --------
-        Evaluate the datasets:
-
-        >>> divert.evaluate(datasetA, datasetB)
-        {'divergence': 0.28, 'error': 36.0}
-        """
-        a = to_numpy(data_a)
-        b = to_numpy(data_b)
-        N = a.shape[0]
-        M = b.shape[0]
-
-        stacked_data = np.vstack((a, b))
-        labels = np.vstack([np.zeros([N, 1]), np.ones([M, 1])])
-
-        errors = self._method(stacked_data, labels)
-        dp = max(0.0, 1 - ((M + N) / (2 * M * N)) * errors)
-        return {"divergence": dp, "error": errors}
+    div_fn = get_method(DIVERGENCE_FN_MAP, method)
+    a = to_numpy(data_a)
+    b = to_numpy(data_b)
+    N = a.shape[0]
+    M = b.shape[0]
+
+    stacked_data = np.vstack((a, b))
+    labels = np.vstack([np.zeros([N, 1]), np.ones([M, 1])])
+
+    errors = div_fn(stacked_data, labels)
+    dp = max(0.0, 1 - ((M + N) / (2 * M * N)) * errors)
+    return DivergenceOutput(dp, errors)
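The hunks above replace the 0.63.0 class-based API (`Divergence(...).evaluate(...)`, which returned a plain dict) with a functional `divergence()` that returns a frozen `DivergenceOutput` dataclass. The sketch below shows what migrating a caller might look like; the `dataeval.metrics` import path and the synthetic embeddings are assumptions for illustration, only the signatures and docstring examples come from this diff.

```python
# Migration sketch (assumptions: `dataeval.metrics` re-exports `divergence`,
# and the synthetic (N, P) arrays below stand in for real embeddings).
import numpy as np

rng = np.random.default_rng(0)
datasetA = rng.normal(size=(128, 32))  # N observations in a P-dimensional space
datasetB = rng.normal(size=(128, 32))

# dataeval 0.63.0 (removed class-based API):
#   divert = Divergence(method="FNN")
#   result = divert.evaluate(datasetA, datasetB)  # -> {"divergence": ..., "error": ...}

# dataeval 0.65.0 (functional API added in this diff):
from dataeval.metrics import divergence  # assumed public re-export

out = divergence(datasetA, datasetB, method="FNN")
print(out.divergence, out.errors)  # fields of the DivergenceOutput dataclass
```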
dataeval/_internal/metrics/diversity.py (new file)
@@ -0,0 +1,211 @@
+from dataclasses import dataclass
+from typing import Dict, List, Literal, Optional, Sequence
+
+import numpy as np
+from numpy.typing import NDArray
+
+from dataeval._internal.metrics.utils import entropy, get_counts, get_method, get_num_bins, preprocess_metadata
+from dataeval._internal.output import OutputMetadata, set_metadata
+
+
+@dataclass(frozen=True)
+class DiversityOutput(OutputMetadata):
+    """
+    Attributes
+    ----------
+    diversity_index : NDArray[np.float64]
+        Diversity index for classes and factors
+    """
+
+    diversity_index: NDArray[np.float64]
+
+
+def diversity_shannon(
+    data: NDArray,
+    names: List[str],
+    is_categorical: List[bool],
+    subset_mask: Optional[NDArray[np.bool_]] = None,
+) -> NDArray:
+    """
+    Compute diversity for discrete/categorical variables and, through standard
+    histogram binning, for continuous variables.
+
+    We define diversity as a normalized form of the Shannon entropy.
+
+    diversity = 1 implies that samples are evenly distributed across a particular factor
+    diversity = 0 implies that all samples belong to one category/bin
+
+    Parameters
+    ----------
+    subset_mask: Optional[NDArray[np.bool_]]
+        Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
+
+    Notes
+    -----
+    For continuous variables, histogram bins are chosen automatically. See `numpy.histogram` for details.
+
+    Returns
+    -------
+    diversity_index: NDArray
+        Diversity index per column of X
+
+    See Also
+    --------
+    numpy.histogram
+    """
+
+    # entropy computed using global auto bins so that we can properly normalize
+    ent_unnormalized = entropy(data, names, is_categorical, normalized=False, subset_mask=subset_mask)
+    # normalize by global counts rather than classwise counts
+    num_bins = get_num_bins(data, names, is_categorical=is_categorical, subset_mask=subset_mask)
+    return ent_unnormalized / np.log(num_bins)
+
+
+def diversity_simpson(
+    data: NDArray,
+    names: List[str],
+    is_categorical: List[bool],
+    subset_mask: Optional[NDArray[np.bool_]] = None,
+) -> NDArray:
+    """
+    Compute diversity for discrete/categorical variables and, through standard
+    histogram binning, for continuous variables.
+
+    We define diversity as a normalized form of the inverse Simpson diversity
+    index.
+
+    diversity = 1 implies that samples are evenly distributed across a particular factor
+    diversity = 1/num_categories implies that all samples belong to one category/bin
+
+    Parameters
+    ----------
+    subset_mask: Optional[NDArray[np.bool_]]
+        Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
+
+    Notes
+    -----
+    For continuous variables, histogram bins are chosen automatically. See
+    numpy.histogram for details.
+    The expression is undefined for q=1, but it approaches the Shannon entropy
+    in the limit.
+    If there is only one category, the diversity index takes a value of 1 =
+    1/N = 1/1. Entropy will take a value of 0.
+
+    Returns
+    -------
+    NDArray
+        Diversity index per column of X
+
+    See Also
+    --------
+    numpy.histogram
+    """
+
+    hist_counts, _ = get_counts(data, names, is_categorical, subset_mask)
+    # normalize by global counts, not classwise counts
+    num_bins = get_num_bins(data, names, is_categorical)
+
+    ev_index = np.empty(len(names))
+    # loop over columns for convenience
+    for col, cnts in enumerate(hist_counts.values()):
+        # relative frequencies
+        p_i = cnts / cnts.sum()
+        # inverse Simpson index normalized by (number of bins)
+        ev_index[col] = 1 / np.sum(p_i**2) / num_bins[col]
+
+    return ev_index
+
+
+DIVERSITY_FN_MAP = {"simpson": diversity_simpson, "shannon": diversity_shannon}
+
+
+@set_metadata("dataeval.metrics")
+def diversity(
+    class_labels: Sequence[int], metadata: List[Dict], method: Literal["shannon", "simpson"] = "simpson"
+) -> DiversityOutput:
+    """
+    Compute diversity for discrete/categorical variables and, through standard
+    histogram binning, for continuous variables.
+
+    diversity = 1 implies that samples are evenly distributed across a particular factor
+    diversity = 0 implies that all samples belong to one category/bin
+
+    Parameters
+    ----------
+    class_labels: Sequence[int]
+        List of class labels for each image
+    metadata: List[Dict]
+        List of metadata factors for each image
+    metric: Literal["shannon", "simpson"], default "simpson"
+        string variable indicating which diversity index should be used.
+        Permissible values include "simpson" and "shannon"
+
+    Notes
+    -----
+    - For continuous variables, histogram bins are chosen automatically. See numpy.histogram for details.
+
+    Returns
+    -------
+    DiversityOutput
+        Diversity index per column of self.data or each factor in self.names
+
+    See Also
+    --------
+    numpy.histogram
+    """
+    diversity_fn = get_method(DIVERSITY_FN_MAP, method)
+    data, names, is_categorical = preprocess_metadata(class_labels, metadata)
+    diversity_index = diversity_fn(data, names, is_categorical, None).astype(np.float64)
+    return DiversityOutput(diversity_index)
+
+
+@set_metadata("dataeval.metrics")
+def diversity_classwise(
+    class_labels: Sequence[int], metadata: List[Dict], method: Literal["shannon", "simpson"] = "simpson"
+) -> DiversityOutput:
+    """
+    Compute diversity for discrete/categorical variables and, through standard
+    histogram binning, for continuous variables.
+
+    We define diversity as a normalized form of the inverse Simpson diversity
+    index.
+
+    diversity = 1 implies that samples are evenly distributed across a particular factor
+    diversity = 1/num_categories implies that all samples belong to one category/bin
+
+    Parameters
+    ----------
+    class_labels: Sequence[int]
+        List of class labels for each image
+    metadata: List[Dict]
+        List of metadata factors for each image
+
+    Notes
+    -----
+    - For continuous variables, histogram bins are chosen automatically. See numpy.histogram for details.
+    - The expression is undefined for q=1, but it approaches the Shannon entropy in the limit.
+    - If there is only one category, the diversity index takes a value of 1 = 1/N = 1/1. Entropy will take a value of 0.
+
+    Returns
+    -------
+    DiversityOutput
+        Diversity index [n_class x n_factor]
+
+    See Also
+    --------
+    numpy.histogram
+    """
+    diversity_fn = get_method(DIVERSITY_FN_MAP, method)
+    data, names, is_categorical = preprocess_metadata(class_labels, metadata)
+    class_idx = names.index("class_label")
+    class_lbl = data[:, class_idx]
+
+    u_classes = np.unique(class_lbl)
+    num_factors = len(names)
+    diversity = np.empty((len(u_classes), num_factors))
+    diversity[:] = np.nan
+    for idx, cls in enumerate(u_classes):
+        subset_mask = class_lbl == cls
+        diversity[idx, :] = diversity_fn(data, names, is_categorical, subset_mask)
+    div_no_class = np.concatenate((diversity[:, :class_idx], diversity[:, (class_idx + 1) :]), axis=1)
+    return DiversityOutput(div_no_class)
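Since diversity.py is a new module in 0.65.0, a short usage sketch may help; the `dataeval.metrics` import path and the toy labels/metadata are assumptions for illustration, only the function signatures above come from this diff.

```python
# Usage sketch (assumptions: `dataeval.metrics` re-exports the new functions;
# the class labels and per-image metadata factors below are toy values).
from dataeval.metrics import diversity, diversity_classwise  # assumed re-exports

class_labels = [0, 0, 1, 1, 2, 2]  # one class label per image
metadata = [                       # one dict of metadata factors per image
    {"time": "day", "altitude": 100},
    {"time": "day", "altitude": 120},
    {"time": "night", "altitude": 100},
    {"time": "day", "altitude": 140},
    {"time": "night", "altitude": 110},
    {"time": "night", "altitude": 130},
]

# Per-factor diversity index (1.0 = evenly distributed across a factor)
print(diversity(class_labels, metadata, method="shannon").diversity_index)

# Per-class, per-factor diversity index (class_label column dropped from the result)
print(diversity_classwise(class_labels, metadata, method="simpson").diversity_index)
```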