dataeval 0.63.0__py3-none-any.whl → 0.65.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +4 -4
- dataeval/_internal/detectors/clusterer.py +47 -34
- dataeval/_internal/detectors/drift/base.py +53 -35
- dataeval/_internal/detectors/drift/cvm.py +5 -4
- dataeval/_internal/detectors/drift/ks.py +7 -6
- dataeval/_internal/detectors/drift/mmd.py +39 -19
- dataeval/_internal/detectors/drift/torch.py +6 -5
- dataeval/_internal/detectors/drift/uncertainty.py +7 -8
- dataeval/_internal/detectors/duplicates.py +57 -30
- dataeval/_internal/detectors/linter.py +40 -24
- dataeval/_internal/detectors/ood/ae.py +2 -1
- dataeval/_internal/detectors/ood/aegmm.py +2 -1
- dataeval/_internal/detectors/ood/base.py +37 -15
- dataeval/_internal/detectors/ood/llr.py +9 -8
- dataeval/_internal/detectors/ood/vae.py +2 -1
- dataeval/_internal/detectors/ood/vaegmm.py +2 -1
- dataeval/_internal/flags.py +42 -21
- dataeval/_internal/interop.py +3 -12
- dataeval/_internal/metrics/balance.py +188 -0
- dataeval/_internal/metrics/ber.py +123 -48
- dataeval/_internal/metrics/coverage.py +90 -74
- dataeval/_internal/metrics/divergence.py +101 -67
- dataeval/_internal/metrics/diversity.py +211 -0
- dataeval/_internal/metrics/parity.py +287 -155
- dataeval/_internal/metrics/stats.py +198 -317
- dataeval/_internal/metrics/uap.py +40 -29
- dataeval/_internal/metrics/utils.py +430 -0
- dataeval/_internal/models/tensorflow/losses.py +3 -3
- dataeval/_internal/models/tensorflow/trainer.py +3 -2
- dataeval/_internal/models/tensorflow/utils.py +4 -3
- dataeval/_internal/output.py +82 -0
- dataeval/_internal/utils.py +64 -0
- dataeval/_internal/workflows/sufficiency.py +96 -107
- dataeval/flags/__init__.py +2 -2
- dataeval/metrics/__init__.py +26 -7
- dataeval/utils/__init__.py +9 -0
- {dataeval-0.63.0.dist-info → dataeval-0.65.0.dist-info}/METADATA +1 -1
- dataeval-0.65.0.dist-info/RECORD +60 -0
- dataeval/_internal/functional/__init__.py +0 -0
- dataeval/_internal/functional/ber.py +0 -63
- dataeval/_internal/functional/coverage.py +0 -75
- dataeval/_internal/functional/divergence.py +0 -16
- dataeval/_internal/functional/hash.py +0 -79
- dataeval/_internal/functional/metadata.py +0 -136
- dataeval/_internal/functional/metadataparity.py +0 -190
- dataeval/_internal/functional/uap.py +0 -6
- dataeval/_internal/functional/utils.py +0 -158
- dataeval/_internal/maite/__init__.py +0 -0
- dataeval/_internal/maite/utils.py +0 -30
- dataeval/_internal/metrics/base.py +0 -92
- dataeval/_internal/metrics/metadata.py +0 -610
- dataeval/_internal/metrics/metadataparity.py +0 -67
- dataeval-0.63.0.dist-info/RECORD +0 -68
- {dataeval-0.63.0.dist-info → dataeval-0.65.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.63.0.dist-info → dataeval-0.65.0.dist-info}/WHEEL +0 -0
dataeval/_internal/metrics/base.py
@@ -1,92 +0,0 @@
-from abc import ABC, abstractmethod
-from typing import Callable, Dict, Generic, List, TypeVar
-
-TOutput = TypeVar("TOutput", bound=dict)
-TMethods = TypeVar("TMethods")
-TCallable = TypeVar("TCallable", bound=Callable)
-
-
-class MetricMixin(ABC, Generic[TOutput]):
-    @abstractmethod
-    def update(self, *args, **kwargs): ...
-
-    @abstractmethod
-    def compute(self) -> TOutput: ...
-
-    @abstractmethod
-    def reset(self): ...
-
-
-class EvaluateMixin(ABC, Generic[TOutput]):
-    @abstractmethod
-    def evaluate(self, *args, **kwargs) -> TOutput:
-        """Abstract method to calculate metric based off of constructor parameters"""
-
-
-class MethodsMixin(ABC, Generic[TMethods, TCallable]):
-    """
-    Use this mixin to define a mapping of functions to method names which
-    can be queried by the user and called internally with the appropriate
-    method name as the key.
-
-    Explicitly defining the Callable generic helps with type safety and
-    hinting for function signatures and recommended but optional.
-
-    e.g.:
-
-    def _mult(x: float, y: float) -> float:
-        return x * y
-
-    class MyMetric(MethodsMixin[Callable[float, float], float]):
-
-        def _methods(cls) -> Dict[str, Callable[float, float], float]:
-            return {
-                "ADD": lambda x, y: x + y,
-                "MULT": _mult,
-                ...
-            }
-
-    Then during evaluate, you can call the method specified with the getter.
-
-    e.g.:
-
-    def evaluate(self):
-        return self._method(x, y)
-
-    The resulting class can be used like so.
-
-    m = MyMetric(1.0, 2.0, "ADD")
-    m.evaluate() # returns 3.0
-    m.method # returns "ADD"
-    MyMetric.methods() # returns "['ADD', 'MULT']
-    m.method = "MULT"
-    m.evaluate() # returns 2.0
-    """
-
-    @classmethod
-    @abstractmethod
-    def _methods(cls) -> Dict[str, TCallable]:
-        """Abstract method returning available method functions for class"""
-
-    @property
-    def _method(self) -> TCallable:
-        return self._methods()[self.method]
-
-    @classmethod
-    def methods(cls) -> List[str]:
-        return list(cls._methods().keys())
-
-    @property
-    def method(self) -> str:
-        return self._method_key
-
-    @method.setter
-    def method(self, value: TMethods):
-        self._set_method(value)
-
-    def _set_method(self, value: TMethods):
-        """This setter is to fix pyright incorrect detection of
-        incorrectly overriding the 'method' property"""
-        if value not in self.methods():
-            raise KeyError(f"Specified method not available for class ({self.methods()}).")
-        self._method_key = value
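Note that the docstring example in the removed `MethodsMixin` does not type-check as written: `MethodsMixin[Callable[float, float], float]` and `Dict[str, Callable[float, float], float]` are invalid annotations (`Callable` takes a bracketed argument list, and `Dict` takes exactly two parameters). A minimal, runnable sketch of the same name-to-function dispatch pattern, using hypothetical names rather than dataeval API, might look like this:

```python
from typing import Callable, Dict, List

# Hypothetical illustration of the removed MethodsMixin dispatch pattern;
# MyMetric and _mult are not part of dataeval.

def _mult(x: float, y: float) -> float:
    return x * y

class MyMetric:
    def __init__(self, x: float, y: float, method: str) -> None:
        self.x, self.y = x, y
        self.method = method  # goes through the validating setter below

    @property
    def method(self) -> str:
        return self._method_key

    @method.setter
    def method(self, value: str) -> None:
        if value not in self.methods():
            raise KeyError(f"Specified method not available for class ({self.methods()}).")
        self._method_key = value

    @classmethod
    def _methods(cls) -> Dict[str, Callable[[float, float], float]]:
        # name -> function mapping queried by the user and used for dispatch
        return {"ADD": lambda x, y: x + y, "MULT": _mult}

    @classmethod
    def methods(cls) -> List[str]:
        return list(cls._methods().keys())

    def evaluate(self) -> float:
        return self._methods()[self.method](self.x, self.y)

m = MyMetric(1.0, 2.0, "ADD")
assert m.evaluate() == 3.0            # ADD -> 1.0 + 2.0
m.method = "MULT"
assert m.evaluate() == 2.0            # MULT -> 1.0 * 2.0
assert MyMetric.methods() == ["ADD", "MULT"]
```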
dataeval/_internal/metrics/metadata.py
@@ -1,610 +0,0 @@
-import warnings
-from typing import Dict, List
-
-import numpy as np
-import torch
-from numpy.typing import ArrayLike, NDArray
-from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
-from torchmetrics import Metric
-
-from dataeval._internal.functional.metadata import _entropy, _get_counts, _get_num_bins, _infer_categorical
-
-
-def str_to_int(d: Dict) -> Dict:
-    """
-    Map columns of dict that are not numeric (e.g. string) to numeric values
-    that mutual information and diversity functions can accommodate. Each
-    unique string receives a unique integer value.
-
-    Parameters
-    ----------
-    d: Dict
-        Dictionary of ndarray feature values or descriptors.
-
-    Returns
-    -------
-    Dict
-        Dictionary with same keys and non-numeric values mapped to numeric values.
-    """
-    for key, val in d.items():
-        val = val.numpy() if torch.is_tensor(val) else val
-        val = np.array(val) if isinstance(val, list) else val
-        # if not numeric
-        if not np.issubdtype(val.dtype, np.number):
-            _, mapped_vals = np.unique(val, return_inverse=True)
-            d[key] = mapped_vals
-    return d
-
-
-def list_to_dict(list_of_dicts: List[Dict]) -> Dict:
-    """
-    Converts list of dicts to dict of ndarrays
-
-    Parameters
-    ----------
-    list_of_dicts: List[Dict]
-        list of dictionaries, typically of metadata factors
-
-    Returns
-    -------
-    Dict[np.ndarray]
-        dictionary whose columns are np.ndarray
-    """
-    return {k: np.array([dic[k] for dic in list_of_dicts]) for k in list_of_dicts[0]}
-
-
-class BaseBiasMetric(Metric):
-    """
-    Base class for bias metrics with common functionality for consuming
-    metadata---subclasses torchmetrics.Metric
-
-    Attributes
-    ---------
-    data: np.ndarray
-        Array of metadata factors; string variables are converted to integers
-    names: List[str]
-        List of the names of metadata factor variables
-    is_categorical: List
-        List of boolean flags for categorical features. Mutual information is
-        computed differently for categorical/discrete and continuous variables
-    num_factors: int
-        Number of metadata factors in the dataset
-    num_samples: int
-        Number of samples in the dataset
-    """
-
-    def __init__(self):
-        super().__init__()
-        self.names = []
-        self.data = np.empty(0)
-        self.is_categorical = []
-
-        # torchmetric 'compute' function operates on these states
-        self.add_state("metadata", default=[], dist_reduce_fx="cat")
-        self.add_state("class_label", default=[], dist_reduce_fx="cat")
-
-        self.num_factors = 0
-        self.num_samples = 0
-
-    def update(self, class_label: ArrayLike, metadata: List[Dict]):
-        self.metadata.extend(metadata)
-        self.class_label.append(class_label)
-
-    def _collect_data(self):
-        metadata_dict = {"class_label": np.concatenate(self.class_label).astype(int)}
-        metadata_dict = {**metadata_dict, **list_to_dict(self.metadata)}
-
-        # convert string variables to int
-        metadata_dict = str_to_int(metadata_dict)
-        self.data = np.stack(list(metadata_dict.values()), axis=-1)
-        self.names = list(metadata_dict.keys())
-
-        self.is_categorical = [_infer_categorical(metadata_dict[var], 0.25)[0] for var in self.names]
-
-        # class_label is also in self.names
-        self.num_factors = len(self.names)
-        self.num_samples = len(self.metadata)
-
-
-class BaseBalanceMetric(BaseBiasMetric):
-    """
-    Base class for balance (mutual information) metrics. Contains input
-    validation for balance metrics.
-    """
-
-    def __init__(self, num_neighbors: int):
-        super().__init__()
-        if not isinstance(num_neighbors, (int, float)):
-            raise TypeError(
-                f"Variable {num_neighbors} is not real-valued numeric type."
-                "num_neighbors should be an int, greater than 0 and less than"
-                "the number of samples in the dataset"
-            )
-        if num_neighbors < 1:
-            raise ValueError(
-                f"Invalid value for {num_neighbors}."
-                "Choose a value greater than 0 and less than number of samples"
-                "in the dataset."
-            )
-        if isinstance(num_neighbors, float):
-            num_neighbors = int(num_neighbors)
-            warnings.warn(f"Variable {num_neighbors} is currently type float and will be truncated to type int.")
-
-        self.num_neighbors = num_neighbors
-
-
-class Balance(BaseBalanceMetric):
-    """
-    Metadata balance measures distributional correlation between metadata
-    factors and class label to identify opportunities for shortcut learning or
-    sampling bias in the dataset.
-
-    Parameters
-    ----------
-    num_neighbors: int
-        number of nearest neighbors used for the computation of
-
-    Attributes
-    ---------
-    data: np.ndarray
-        Array of metadata factors; string variables are converted to integers
-    names: List[str]
-        List of the names of metadata factor variables
-    is_categorical: List
-        List of boolean flags for categorical features. Mutual information is
-        computed differently for categorical/discrete and continuous variables
-    num_factors: int
-        Number of metadata factors in the dataset
-    num_samples: int
-        Number of samples in the dataset
-
-    Notes
-    -----
-    We use mutual_info_classif from sklearn since class label is categorical
-    mutual_info_classif outputs are consistent up to O(1e-4) and depend on
-    a random seed.
-    MI is computed differently for categorical and continuous variables,
-    and we attempt to infer whether a variable is categorical by the
-    fraction of unique values in the dataset.
-
-    See Also
-    --------
-    sklearn.feature_selection.mutual_info_classif
-    sklearn.feature_selection.mutual_info_regression
-    sklearn.metrics.mutual_info_score
-    """
-
-    def __init__(self, num_neighbors: int = 5):
-        super().__init__(num_neighbors=num_neighbors)
-
-    def compute(self) -> NDArray:
-        """
-        Mutual information (MI) between factors (class label, metadata, label/image properties)
-
-        Parameters
-        ----------
-        num_neighbors: int
-            Number of nearest neighbors to use for computing MI between discrete
-            and continuous variables.
-
-        Returns
-        -------
-        NDArray
-            (num_factors+1) x (num_factors+1) estimate of mutual information
-            between num_factors metadata factors and class label. Symmetry is enforced.
-
-        See Also
-        --------
-        sklearn.feature_selection.mutual_info_classif
-        sklearn.feature_selection.mutual_info_regression
-        sklearn.metrics.mutual_info_score
-        """
-        self._collect_data()
-        mi = np.empty((self.num_factors, self.num_factors))
-        mi[:] = np.nan
-
-        for idx, tgt_var in enumerate(self.names):
-            tgt = self.data[:, idx]
-
-            if self.is_categorical[idx]:
-                # categorical target
-                mi[idx, :] = mutual_info_classif(
-                    self.data,
-                    tgt,
-                    discrete_features=self.is_categorical,  # type: ignore
-                    n_neighbors=self.num_neighbors,
-                )
-            else:
-                # continuous variables
-                mi[idx, :] = mutual_info_regression(
-                    self.data,
-                    tgt,
-                    discrete_features=self.is_categorical,  # type: ignore
-                    n_neighbors=self.num_neighbors,
-                )
-
-        ent_all = _entropy(self.data, self.names, self.is_categorical, normalized=False)
-        norm_factor = 0.5 * np.add.outer(ent_all, ent_all) + 1e-6
-        # in principle MI should be symmetric, but it is not in practice.
-        nmi = 0.5 * (mi + mi.T) / norm_factor
-
-        return nmi
-
-
-class BalanceClasswise(BaseBalanceMetric):
-    """
-    Computes mutual information (analogous to correlation) between metadata
-    factors (class label, metadata, label/image properties) with individual
-    class labels.
-
-    Parameters
-    ----------
-    num_neighbors: int
-        Number of nearest neighbors to use for computing MI between discrete
-        and continuous variables.
-
-    Attributes
-    ----------
-    num_neighbors: int
-        Number of nearest neighbors to use for computing MI between discrete
-        and continuous variables.
-    data: np.ndarray
-        Array of metadata factors; string variables are converted to integers
-    names: List[str]
-        List of the names of metadata factor variables
-    is_categorical: List
-        List of boolean flags for categorical features. Mutual information is
-        computed differently for categorical/discrete and continuous variables
-    num_factors: int
-        Number of metadata factors in the dataset
-    num_samples: int
-        Number of samples in the dataset
-    """
-
-    def __init__(self, num_neighbors: int = 5):
-        super().__init__(num_neighbors)
-
-    def compute(self) -> NDArray:
-        """
-        Compute mutual information between metadata factors (class label, metadata,
-        label/image properties) with individual class labels.
-
-        Parameters
-        ----------
-        num_neighbors: int
-            Number of nearest neighbors to use for computing MI between discrete
-            and continuous variables.
-
-        Notes
-        -----
-        We use mutual_info_classif from sklearn since class label is categorical
-        mutual_info_classif outputs are consistent up to O(1e-4) and depend on
-        a random seed
-        MI is computed differently for categorical and continuous variables,
-        so we have to specify with self.is_categorical.
-
-        Returns
-        -------
-        NDArray
-            (num_classes x num_factors) estimate of mutual information between
-            num_factors metadata factors and individual class labels.
-
-        See Also
-        --------
-        sklearn.feature_selection.mutual_info_classif
-        sklearn.feature_selection.mutual_info_regression
-        sklearn.metrics.mutual_info_score
-        compute_mutual_information
-        """
-
-        self._collect_data()
-        # unique class labels
-        class_idx = self.names.index("class_label")
-        class_data = self.data[:, class_idx]
-        u_cls = np.unique(class_data)
-        num_classes = len(u_cls)
-
-        data_no_class = np.concatenate((self.data[:, :class_idx], self.data[:, (class_idx + 1) :]), axis=1)
-
-        # assume class is a factor
-        mi = np.empty((num_classes, self.num_factors - 1))
-        mi[:] = np.nan
-
-        # categorical variables, excluding class label
-        cat_mask = np.concatenate(
-            (self.is_categorical[:class_idx], self.is_categorical[(class_idx + 1) :]), axis=0
-        ).astype(int)
-
-        # classification MI for discrete/categorical features
-        for idx, cls in enumerate(u_cls):
-            tgt = class_data == cls
-            # units: nat
-            mi[idx, :] = mutual_info_classif(
-                data_no_class,
-                tgt,
-                discrete_features=cat_mask,  # type: ignore
-                n_neighbors=self.num_neighbors,
-            )
-
-        # let this recompute for all features including class label
-        ent_all = _entropy(self.data, self.names, self.is_categorical)
-        ent_tgt = ent_all[class_idx]
-        ent_all = np.concatenate((ent_all[:class_idx], ent_all[(class_idx + 1) :]), axis=0)
-        norm_factor = 0.5 * np.add.outer(ent_tgt, ent_all) + 1e-6
-        nmi = mi / norm_factor
-        return nmi
-
-
-class BaseDiversityMetric(BaseBiasMetric):
-    """
-    Base class for Diversity and ClasswiseDiversity metrics.
-
-    Parameters
-    ----------
-    metric: str
-        string variable indicating which diversity index should be used.
-        Permissible values include "simpson" and "shannon"
-
-    Attributes
-    ----------
-    metric: str
-        string variable indicating which diversity index should be used.
-        Permissible values include "simpson" and "shannon"
-    data: np.ndarray
-        Array of metadata factors; string variables are converted to integers
-    names: List[str]
-        List of the names of metadata factor variables
-    is_categorical: List
-        List of boolean flags for categorical features. Mutual information is
-        computed differently for categorical/discrete and continuous variables
-    num_factors: int
-        Number of metadata factors in the dataset
-    num_samples: int
-        Number of samples in the dataset
-    """
-
-    def __init__(self, metric: str):
-        super().__init__()
-        allowed_metrics = ["simpson", "shannon"]
-        if metric.lower() not in allowed_metrics:
-            raise ValueError(f"metric '{metric}' should be one of {allowed_metrics}")
-        self.metric = metric
-
-    def _diversity_simpson(self, subset_mask: np.ndarray = np.empty(shape=0)) -> np.ndarray:
-        """
-        Compute diversity for discrete/categorical variables and, through standard
-        histogram binning, for continuous variables.
-
-        We define diversity as a normalized form of the inverse Simpson diversity
-        index.
-
-        diversity = 1 implies that samples are evenly distributed across a particular factor
-        diversity = 1/num_categories implies that all samples belong to one category/bin
-
-        Parameters
-        ----------
-        subset_mask: Optional[np.ndarray[bool]]
-            Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
-
-        Notes
-        -----
-        For continuous variables, histogram bins are chosen automatically. See
-        numpy.histogram for details.
-        The expression is undefined for q=1, but it approaches the Shannon entropy
-        in the limit.
-        If there is only one category, the diversity index takes a value of 1 =
-        1/N = 1/1. Entropy will take a value of 0.
-
-        Returns
-        -------
-        np.ndarray
-            Diversity index per column of X
-
-        See Also
-        --------
-        numpy.histogram
-        """
-
-        # hist_counts,_ = _get_counts(subset_mask)
-        hist_counts, _ = _get_counts(self.data, self.names, self.is_categorical, subset_mask)
-        # normalize by global counts, not classwise counts
-        num_bins = _get_num_bins(self.data, self.names, self.is_categorical)
-
-        ev_index = np.empty(self.num_factors)
-        # loop over columns for convenience
-        for col, cnts in enumerate(hist_counts.values()):
-            # relative frequencies
-            p_i = cnts / cnts.sum()
-            # inverse Simpson index normalized by (number of bins)
-            ev_index[col] = 1 / np.sum(p_i**2) / num_bins[col]
-
-        return ev_index
-
-    def _diversity_shannon(self, subset_mask: np.ndarray = np.empty(shape=0)) -> np.ndarray:
-        """
-        Compute diversity for discrete/categorical variables and, through standard
-        histogram binning, for continuous variables.
-
-        We define diversity as a normalized form of the Shannon entropy.
-
-        diversity = 1 implies that samples are evenly distributed across a particular factor
-        diversity = 0 implies that all samples belong to one category/bin
-
-        Parameters
-        ----------
-        subset_mask: Optional[np.ndarray[bool]]
-            Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
-
-        Notes
-        -----
-        - For continuous variables, histogram bins are chosen automatically. See
-        numpy.histogram for details.
-
-        Returns
-        -------
-        diversity_index: np.ndarray
-            Diversity index per column of X
-
-        See Also
-        --------
-        numpy.histogram
-        """
-
-        # entropy computed using global auto bins so that we can properly normalize
-        ent_unnormalized = _entropy(
-            self.data, self.names, self.is_categorical, normalized=False, subset_mask=subset_mask
-        )
-        # normalize by global counts rather than classwise counts
-        num_bins = _get_num_bins(self.data, self.names, is_categorical=self.is_categorical, subset_mask=subset_mask)
-        return ent_unnormalized / np.log(num_bins)
-
-
-class DiversityClasswise(BaseDiversityMetric):
-    """
-    Classwise diversity index: evenness of the distribution of metadata factors
-    per class.
-
-    Parameters
-    ----------
-    metric: str
-        string variable indicating which diversity index should be used.
-        Permissible values include "simpson" and "shannon"
-
-    Attributes
-    ----------
-    metric: str
-        string variable indicating which diversity index should be used.
-        Permissible values include "simpson" and "shannon"
-    data: np.ndarray
-        Array of metadata factors; string variables are converted to integers
-    names: List[str]
-        List of the names of metadata factor variables
-    is_categorical: List
-        List of boolean flags for categorical features. Mutual information is
-        computed differently for categorical/discrete and continuous variables
-    num_factors: int
-        Number of metadata factors in the dataset
-    num_samples: int
-        Number of samples in the dataset
-
-    """
-
-    def __init__(self, metric="simpson"):
-        super().__init__(metric=metric)
-
-    def compute(self):
-        """
-        Compute diversity for discrete/categorical variables and, through standard
-        histogram binning, for continuous variables.
-
-        We define diversity as a normalized form of the inverse Simpson diversity
-        index.
-
-        diversity = 1 implies that samples are evenly distributed across a particular factor
-        diversity = 1/num_categories implies that all samples belong to one category/bin
-
-        Notes
-        -----
-        For continuous variables, histogram bins are chosen automatically. See
-        numpy.histogram for details.
-        The expression is undefined for q=1, but it approaches the Shannon entropy
-        in the limit.
-        If there is only one category, the diversity index takes a value of 1 =
-        1/N = 1/1. Entropy will take a value of 0.
-
-        Returns
-        -------
-        np.ndarray
-            Diversity index [n_class x n_factor]
-
-        See Also
-        --------
-        diversity_simpson
-        diversity_shannon
-        numpy.histogram
-        """
-        self._collect_data()
-
-        class_idx = self.names.index("class_label")
-        class_labels = self.data[:, class_idx]
-
-        u_classes = np.unique(class_labels)
-        num_factors = len(self.names)
-        diversity = np.empty((len(u_classes), num_factors))
-        diversity[:] = np.nan
-        for idx, cls in enumerate(u_classes):
-            subset_mask = class_labels == cls
-            if self.metric == "simpson":
-                diversity[idx, :] = self._diversity_simpson(subset_mask)
-            elif self.metric == "shannon":
-                diversity[idx, :] = self._diversity_shannon(subset_mask)
-        div_no_class = np.concatenate((diversity[:, :class_idx], diversity[:, (class_idx + 1) :]), axis=1)
-        return div_no_class
-
-
-class Diversity(BaseDiversityMetric):
-    """
-    Diversity index: evenness of the distribution of metadata factors to
-    identify imbalance or undersampled data categories.
-
-    Parameters
-    ----------
-    metric: str
-        string variable indicating which diversity index should be used.
-        Permissible values include "simpson" and "shannon"
-
-    Attributes
-    ----------
-    metric: str
-        string variable indicating which diversity index should be used.
-        Permissible values include "simpson" and "shannon"
-    data: np.ndarray
-        Array of metadata factors; string variables are converted to integers
-    names: List[str]
-        List of the names of metadata factor variables
-    is_categorical: List
-        List of boolean flags for categorical features. Mutual information is
-        computed differently for categorical/discrete and continuous variables
-    num_factors: int
-        Number of metadata factors in the dataset
-    num_samples: int
-        Number of samples in the dataset
-    """
-
-    def __init__(self, metric="simpson"):
-        super().__init__(metric=metric)
-
-    def compute(self):
-        """
-        Compute diversity for discrete/categorical variables and, through standard
-        histogram binning, for continuous variables.
-
-        diversity = 1 implies that samples are evenly distributed across a particular factor
-        diversity = 0 implies that all samples belong to one category/bin
-
-        Parameters
-        ----------
-        metric: str
-            The type of diversity index to return, currently ["simpson",
-            "shannon"]
-
-        Notes
-        -----
-        - For continuous variables, histogram bins are chosen automatically. See
-        numpy.histogram for details.
-
-        Returns
-        -------
-        diversity_index: np.ndarray
-            Diversity index per column of self.data or each factor in self.names
-
-        See Also
-        --------
-        numpy.histogram
-
-        """
-        self._collect_data()
-        if self.metric.lower() == "simpson":
-            return self._diversity_simpson()
-        elif self.metric.lower() == "shannon":
-            return self._diversity_shannon()
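The normalization step in the removed `Balance.compute` is easy to miss inside the class machinery: the raw MI matrix from sklearn is symmetrized and divided by the pairwise mean of factor entropies. A standalone sketch of just that step follows; the `mi` and `ent` values here are random stand-ins for sklearn's MI output and the `_entropy` result, not real data.

```python
import numpy as np

# Stand-ins: mi is a raw (num_factors x num_factors) mutual information
# matrix in nats; ent holds the per-factor entropies.
rng = np.random.default_rng(0)
mi = rng.random((4, 4))
ent = rng.random(4) + 0.5

# Pairwise mean entropy as the normalizer; 1e-6 guards against
# zero-entropy (constant) factors.
norm_factor = 0.5 * np.add.outer(ent, ent) + 1e-6

# MI should be symmetric in principle but is not in practice, so the
# removed code averages mi with its transpose before normalizing.
nmi = 0.5 * (mi + mi.T) / norm_factor
print(nmi.shape)  # (4, 4)
```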
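Similarly, the two diversity indices behind `Diversity` and `DiversityClasswise` reduce to a few lines once a factor's histogram counts are in hand. A sketch for a single factor, with made-up counts:

```python
import numpy as np

# Made-up histogram counts for one metadata factor (4 bins).
counts = np.array([10.0, 5.0, 3.0, 2.0])
p = counts / counts.sum()  # relative frequency per bin
num_bins = len(counts)

# Normalized inverse Simpson index: 1 when samples are evenly
# distributed, 1/num_bins when all samples fall in one bin.
simpson = 1 / np.sum(p**2) / num_bins

# Normalized Shannon entropy: 1 when evenly distributed, 0 when
# all samples fall in one bin.
shannon = -np.sum(p * np.log(p)) / np.log(num_bins)
```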