dataeval 0.72.2__py3-none-any.whl → 0.73.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +1 -1
- dataeval/detectors/ood/ae.py +14 -6
- dataeval/detectors/ood/aegmm.py +14 -6
- dataeval/detectors/ood/base.py +9 -3
- dataeval/detectors/ood/llr.py +22 -16
- dataeval/detectors/ood/vae.py +14 -6
- dataeval/detectors/ood/vaegmm.py +14 -6
- dataeval/interop.py +9 -7
- dataeval/metrics/bias/balance.py +25 -29
- dataeval/metrics/bias/coverage.py +35 -3
- dataeval/metrics/bias/diversity.py +50 -27
- dataeval/metrics/bias/metadata.py +99 -16
- dataeval/metrics/bias/parity.py +43 -35
- dataeval/utils/__init__.py +2 -1
- dataeval/utils/lazy.py +26 -0
- dataeval/utils/metadata.py +258 -0
- dataeval/utils/tensorflow/_internal/gmm.py +8 -2
- dataeval/utils/tensorflow/_internal/loss.py +20 -11
- dataeval/utils/tensorflow/_internal/{pixelcnn.py → models.py} +371 -77
- dataeval/utils/tensorflow/_internal/trainer.py +12 -5
- dataeval/utils/tensorflow/_internal/utils.py +70 -71
- {dataeval-0.72.2.dist-info → dataeval-0.73.0.dist-info}/METADATA +3 -3
- {dataeval-0.72.2.dist-info → dataeval-0.73.0.dist-info}/RECORD +25 -24
- dataeval/utils/tensorflow/_internal/autoencoder.py +0 -316
- {dataeval-0.72.2.dist-info → dataeval-0.73.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.72.2.dist-info → dataeval-0.73.0.dist-info}/WHEEL +0 -0
dataeval/__init__.py
CHANGED
dataeval/detectors/ood/ae.py
CHANGED
@@ -10,18 +10,26 @@ from __future__ import annotations
 
 __all__ = ["OOD_AE"]
 
-from typing import Callable
+from typing import TYPE_CHECKING, Callable
 
 import numpy as np
-import tensorflow as tf
-import tf_keras as keras
 from numpy.typing import ArrayLike
 
 from dataeval.detectors.ood.base import OODBase, OODScoreOutput
 from dataeval.interop import as_numpy
-from dataeval.utils.
+from dataeval.utils.lazy import lazyload
 from dataeval.utils.tensorflow._internal.utils import predict_batch
 
+if TYPE_CHECKING:
+    import tensorflow as tf
+    import tf_keras as keras
+
+    import dataeval.utils.tensorflow._internal.models as tf_models
+else:
+    tf = lazyload("tensorflow")
+    keras = lazyload("tf_keras")
+    tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
+
 
 class OOD_AE(OODBase):
     """
@@ -33,7 +41,7 @@ class OOD_AE(OODBase):
         An :term:`autoencoder<Autoencoder>` model.
     """
 
-    def __init__(self, model: AE) -> None:
+    def __init__(self, model: tf_models.AE) -> None:
         super().__init__(model)
 
     def fit(
@@ -41,7 +49,7 @@ class OOD_AE(OODBase):
         x_ref: ArrayLike,
         threshold_perc: float = 100.0,
         loss_fn: Callable[..., tf.Tensor] | None = None,
-        optimizer: keras.optimizers.Optimizer =
+        optimizer: keras.optimizers.Optimizer | None = None,
         epochs: int = 20,
         batch_size: int = 64,
         verbose: bool = True,
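Note: the new dataeval/utils/lazy.py (+26 lines) supplies the lazyload helper used above, but its body does not appear in this diff. The following is only a minimal sketch of how such a helper can be written with importlib; the proxy class name and caching details are assumptions, not the actual implementation.

    # Illustrative only: a proxy that defers the real import until first
    # attribute access, so the cost of "import tensorflow" is paid lazily.
    from importlib import import_module
    from types import ModuleType


    class _LazyModule:
        def __init__(self, name: str) -> None:
            self._name = name    # dotted module path, imported on demand
            self._module = None  # cached module after the first access

        def __getattr__(self, attr: str):
            if self._module is None:
                self._module = import_module(self._name)  # real import happens here
            return getattr(self._module, attr)


    def lazyload(name: str) -> ModuleType:
        return _LazyModule(name)  # type: ignore[return-value]
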
dataeval/detectors/ood/aegmm.py
CHANGED
@@ -10,19 +10,27 @@ from __future__ import annotations
 
 __all__ = ["OOD_AEGMM"]
 
-from typing import Callable
+from typing import TYPE_CHECKING, Callable
 
-import tensorflow as tf
-import tf_keras as keras
 from numpy.typing import ArrayLike
 
 from dataeval.detectors.ood.base import OODGMMBase, OODScoreOutput
 from dataeval.interop import to_numpy
-from dataeval.utils.
+from dataeval.utils.lazy import lazyload
 from dataeval.utils.tensorflow._internal.gmm import gmm_energy
 from dataeval.utils.tensorflow._internal.loss import LossGMM
 from dataeval.utils.tensorflow._internal.utils import predict_batch
 
+if TYPE_CHECKING:
+    import tensorflow as tf
+    import tf_keras as keras
+
+    import dataeval.utils.tensorflow._internal.models as tf_models
+else:
+    tf = lazyload("tensorflow")
+    keras = lazyload("tf_keras")
+    tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
+
 
 class OOD_AEGMM(OODGMMBase):
     """
@@ -34,7 +42,7 @@ class OOD_AEGMM(OODGMMBase):
         An AEGMM model.
     """
 
-    def __init__(self, model: AEGMM) -> None:
+    def __init__(self, model: tf_models.AEGMM) -> None:
         super().__init__(model)
 
     def fit(
@@ -42,7 +50,7 @@ class OOD_AEGMM(OODGMMBase):
         x_ref: ArrayLike,
         threshold_perc: float = 100.0,
         loss_fn: Callable[..., tf.Tensor] | None = None,
-        optimizer: keras.optimizers.Optimizer =
+        optimizer: keras.optimizers.Optimizer | None = None,
         epochs: int = 20,
         batch_size: int = 64,
         verbose: bool = True,
dataeval/detectors/ood/base.py
CHANGED
@@ -12,18 +12,24 @@ __all__ = ["OODOutput", "OODScoreOutput"]
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Callable, Literal, cast
+from typing import TYPE_CHECKING, Callable, Literal, cast
 
 import numpy as np
-import tensorflow as tf
-import tf_keras as keras
 from numpy.typing import ArrayLike, NDArray
 
 from dataeval.interop import to_numpy
 from dataeval.output import OutputMetadata, set_metadata
+from dataeval.utils.lazy import lazyload
 from dataeval.utils.tensorflow._internal.gmm import GaussianMixtureModelParams, gmm_params
 from dataeval.utils.tensorflow._internal.trainer import trainer
 
+if TYPE_CHECKING:
+    import tensorflow as tf
+    import tf_keras as keras
+else:
+    tf = lazyload("tensorflow")
+    keras = lazyload("tf_keras")
+
 
 @dataclass(frozen=True)
 class OODOutput(OutputMetadata):
dataeval/detectors/ood/llr.py
CHANGED
@@ -11,25 +11,31 @@ from __future__ import annotations
 
 __all__ = ["OOD_LLR"]
 
 from functools import partial
-from typing import Callable
+from typing import TYPE_CHECKING, Callable
 
 import numpy as np
-import tensorflow as tf
-import tf_keras as keras
 from numpy.typing import ArrayLike, NDArray
-from tf_keras.layers import Input
-from tf_keras.models import Model
 
 from dataeval.detectors.ood.base import OODBase, OODScoreOutput
 from dataeval.interop import to_numpy
-from dataeval.utils.
+from dataeval.utils.lazy import lazyload
 from dataeval.utils.tensorflow._internal.trainer import trainer
 from dataeval.utils.tensorflow._internal.utils import predict_batch
 
+if TYPE_CHECKING:
+    import tensorflow as tf
+    import tf_keras as keras
+
+    import dataeval.utils.tensorflow._internal.models as tf_models
+else:
+    tf = lazyload("tensorflow")
+    keras = lazyload("tf_keras")
+    tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
+
 
 def _build_model(
-    dist: PixelCNN, input_shape: tuple | None = None, filepath: str | None = None
-) -> tuple[keras.Model, PixelCNN]:
+    dist: tf_models.PixelCNN, input_shape: tuple | None = None, filepath: str | None = None
+) -> tuple[keras.Model, tf_models.PixelCNN]:
     """
     Create keras.Model from TF distribution.
 
@@ -46,9 +52,9 @@ def _build_model(
     -------
     TensorFlow model.
     """
-    x_in = Input(shape=input_shape)
+    x_in = keras.layers.Input(shape=input_shape)
     log_prob = dist.log_prob(x_in)
-    model = Model(inputs=x_in, outputs=log_prob)
+    model = keras.models.Model(inputs=x_in, outputs=log_prob)
     model.add_loss(-tf.reduce_mean(log_prob))
     if isinstance(filepath, str):
         model.load_weights(filepath)
@@ -109,13 +115,13 @@ class OOD_LLR(OODBase):
 
     def __init__(
        self,
-        model: PixelCNN,
-        model_background: PixelCNN | None = None,
+        model: tf_models.PixelCNN,
+        model_background: tf_models.PixelCNN | None = None,
        log_prob: Callable | None = None,
        sequential: bool = False,
    ) -> None:
-        self.dist_s: PixelCNN = model
-        self.dist_b: PixelCNN = (
+        self.dist_s: tf_models.PixelCNN = model
+        self.dist_b: tf_models.PixelCNN = (
            model.copy()
            if hasattr(model, "copy")
            else keras.models.clone_model(model)
@@ -135,7 +141,7 @@ class OOD_LLR(OODBase):
        x_ref: ArrayLike,
        threshold_perc: float = 100.0,
        loss_fn: Callable | None = None,
-        optimizer: keras.optimizers.Optimizer =
+        optimizer: keras.optimizers.Optimizer | None = None,
        epochs: int = 20,
        batch_size: int = 64,
        verbose: bool = True,
@@ -176,7 +182,7 @@ class OOD_LLR(OODBase):
        """
        x_ref = to_numpy(x_ref)
        input_shape = x_ref.shape[1:]
-        optimizer =
+        optimizer = keras.optimizers.Adam() if optimizer is None else optimizer
        # Separate into two separate optimizers, one for semantic model and one for background model
        optimizer_s = optimizer
        optimizer_b = optimizer.__class__.from_config(optimizer.get_config())
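The last hunk above also shows why fit() clones the optimizer: tf_keras optimizers carry per-variable slot state, so the semantic and background models each need an independent instance. A standalone illustration of the get_config/from_config round trip used in the code (the Adam choice and learning-rate value here are examples only):

    import tf_keras as keras

    optimizer_s = keras.optimizers.Adam(learning_rate=1e-3)
    # from_config rebuilds the optimizer from its hyperparameter dict, giving
    # an independent instance with no shared slot state (momentum, etc.)
    optimizer_b = optimizer_s.__class__.from_config(optimizer_s.get_config())
    assert optimizer_b is not optimizer_s
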
dataeval/detectors/ood/vae.py
CHANGED
@@ -10,19 +10,27 @@ from __future__ import annotations
 
 __all__ = ["OOD_VAE"]
 
-from typing import Callable
+from typing import TYPE_CHECKING, Callable
 
 import numpy as np
-import tensorflow as tf
-import tf_keras as keras
 from numpy.typing import ArrayLike
 
 from dataeval.detectors.ood.base import OODBase, OODScoreOutput
 from dataeval.interop import to_numpy
-from dataeval.utils.
+from dataeval.utils.lazy import lazyload
 from dataeval.utils.tensorflow._internal.loss import Elbo
 from dataeval.utils.tensorflow._internal.utils import predict_batch
 
+if TYPE_CHECKING:
+    import tensorflow as tf
+    import tf_keras as keras
+
+    import dataeval.utils.tensorflow._internal.models as tf_models
+else:
+    tf = lazyload("tensorflow")
+    keras = lazyload("tf_keras")
+    tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
+
 
 class OOD_VAE(OODBase):
     """
@@ -51,7 +59,7 @@ class OOD_VAE(OODBase):
     >>> result = metric.predict(dataset, ood_type="feature")
     """
 
-    def __init__(self, model: VAE, samples: int = 10) -> None:
+    def __init__(self, model: tf_models.VAE, samples: int = 10) -> None:
         super().__init__(model)
         self.samples = samples
 
@@ -60,7 +68,7 @@ class OOD_VAE(OODBase):
        x_ref: ArrayLike,
        threshold_perc: float = 100.0,
        loss_fn: Callable[..., tf.Tensor] = Elbo(0.05),
-        optimizer: keras.optimizers.Optimizer =
+        optimizer: keras.optimizers.Optimizer | None = None,
        epochs: int = 20,
        batch_size: int = 64,
        verbose: bool = True,
dataeval/detectors/ood/vaegmm.py
CHANGED
@@ -10,20 +10,28 @@ from __future__ import annotations
 
 __all__ = ["OOD_VAEGMM"]
 
-from typing import Callable
+from typing import TYPE_CHECKING, Callable
 
 import numpy as np
-import tensorflow as tf
-import tf_keras as keras
 from numpy.typing import ArrayLike
 
 from dataeval.detectors.ood.base import OODGMMBase, OODScoreOutput
 from dataeval.interop import to_numpy
-from dataeval.utils.
+from dataeval.utils.lazy import lazyload
 from dataeval.utils.tensorflow._internal.gmm import gmm_energy
 from dataeval.utils.tensorflow._internal.loss import Elbo, LossGMM
 from dataeval.utils.tensorflow._internal.utils import predict_batch
 
+if TYPE_CHECKING:
+    import tensorflow as tf
+    import tf_keras as keras
+
+    import dataeval.utils.tensorflow._internal.models as tf_models
+else:
+    tf = lazyload("tensorflow")
+    keras = lazyload("tf_keras")
+    tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
+
 
 class OOD_VAEGMM(OODGMMBase):
     """
@@ -37,7 +45,7 @@ class OOD_VAEGMM(OODGMMBase):
         Number of samples sampled to evaluate each instance.
     """
 
-    def __init__(self, model: VAEGMM, samples: int = 10) -> None:
+    def __init__(self, model: tf_models.VAEGMM, samples: int = 10) -> None:
         super().__init__(model)
         self.samples = samples
 
@@ -46,7 +54,7 @@ class OOD_VAEGMM(OODGMMBase):
        x_ref: ArrayLike,
        threshold_perc: float = 100.0,
        loss_fn: Callable[..., tf.Tensor] = LossGMM(elbo=Elbo(0.05)),
-        optimizer: keras.optimizers.Optimizer =
+        optimizer: keras.optimizers.Optimizer | None = None,
        epochs: int = 20,
        batch_size: int = 64,
        verbose: bool = True,
dataeval/interop.py
CHANGED
@@ -37,13 +37,15 @@ def to_numpy(array: ArrayLike | None, copy: bool = True) -> NDArray[Any]:
     if isinstance(array, np.ndarray):
         return array.copy() if copy else array
 
-
-
-
-
-
-    if
-
+    if array.__class__.__module__.startswith("tensorflow"):
+        tf = _try_import("tensorflow")
+        if tf and tf.is_tensor(array):
+            return array.numpy().copy() if copy else array.numpy()  # type: ignore
+
+    if array.__class__.__module__.startswith("torch"):
+        torch = _try_import("torch")
+        if torch and isinstance(array, torch.Tensor):
+            return array.detach().cpu().numpy().copy() if copy else array.detach().cpu().numpy()  # type: ignore
 
     return np.array(array, copy=copy)
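to_numpy now checks array.__class__.__module__ before touching either framework, so tensorflow or torch is only imported when the incoming object already originated from it. The _try_import helper it calls lives in dataeval/interop.py but its body is not shown in this diff; a plausible sketch, assuming it returns the module when available and None otherwise:

    from importlib import import_module


    def _try_import(module_name):
        # Guarded import: never raises when the optional dependency is absent
        try:
            return import_module(module_name)
        except ImportError:
            return None
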
dataeval/metrics/bias/balance.py
CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 __all__ = ["BalanceOutput", "balance"]
 
+import contextlib
 import warnings
 from dataclasses import dataclass
 from typing import Any, Mapping
@@ -13,6 +14,9 @@ from sklearn.feature_selection import mutual_info_classif, mutual_info_regressio
 from dataeval.metrics.bias.metadata import entropy, heatmap, preprocess_metadata
 from dataeval.output import OutputMetadata, set_metadata
 
+with contextlib.suppress(ImportError):
+    from matplotlib.figure import Figure
+
 
 @dataclass(frozen=True)
 class BalanceOutput(OutputMetadata):
@@ -27,8 +31,8 @@ class BalanceOutput(OutputMetadata):
         Estimate of inter/intra-factor mutual information
     classwise : NDArray[np.float64]
         Estimate of mutual information between metadata factors and individual class labels
-    class_list: NDArray
-
+    class_list: NDArray
+        Array of the class labels present in the dataset
     metadata_names: list[str]
         Names of each metadata factor
     """
@@ -36,36 +40,34 @@ class BalanceOutput(OutputMetadata):
     balance: NDArray[np.float64]
     factors: NDArray[np.float64]
     classwise: NDArray[np.float64]
-
-    class_list: NDArray[np.int64]
+    class_list: NDArray[Any]
     metadata_names: list[str]
 
     def plot(
         self,
-        row_labels: NDArray[Any] | None = None,
-        col_labels: NDArray[Any] | None = None,
+        row_labels: list[Any] | NDArray[Any] | None = None,
+        col_labels: list[Any] | NDArray[Any] | None = None,
         plot_classwise: bool = False,
-    ) ->
+    ) -> Figure:
         """
         Plot a heatmap of balance information
 
         Parameters
         ----------
-        row_labels:
-            Array containing the labels for rows in the histogram
-        col_labels:
-            Array containing the labels for columns in the histogram
-        plot_classwise: bool, default False
+        row_labels : ArrayLike | None, default None
+            List/Array containing the labels for rows in the histogram
+        col_labels : ArrayLike | None, default None
+            List/Array containing the labels for columns in the histogram
+        plot_classwise : bool, default False
             Whether to plot per-class balance instead of global balance
-
         """
         if plot_classwise:
             if row_labels is None:
-                row_labels =
+                row_labels = self.class_list
             if col_labels is None:
                 col_labels = np.concatenate((["class"], self.metadata_names))
 
-            heatmap(
+            fig = heatmap(
                 self.classwise,
                 row_labels,
                 col_labels,
@@ -74,6 +76,7 @@ class BalanceOutput(OutputMetadata):
                 cbarlabel="Normalized Mutual Information",
             )
         else:
+            # Combine balance and factors results
             data = np.concatenate([self.balance[np.newaxis, 1:], self.factors], axis=0)
             # Create a mask for the upper triangle of the symmetrical array, ignoring the diagonal
             mask = np.triu(data + 1, k=0) < 1
@@ -87,12 +90,9 @@ class BalanceOutput(OutputMetadata):
             if col_labels is None:
                 col_labels = heat_labels[1:]
 
-            heatmap(
-
-
-                col_labels,
-                cbarlabel="Normalized Mutual Information",
-            )
+            fig = heatmap(heat_data, row_labels, col_labels, cbarlabel="Normalized Mutual Information")
+
+        return fig
 
 
 def validate_num_neighbors(num_neighbors: int) -> int:
@@ -172,14 +172,11 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
     sklearn.metrics.mutual_info_score
     """
     num_neighbors = validate_num_neighbors(num_neighbors)
-    data, names, is_categorical = preprocess_metadata(class_labels, metadata)
+    data, names, is_categorical, unique_labels = preprocess_metadata(class_labels, metadata)
     num_factors = len(names)
     mi = np.empty((num_factors, num_factors))
     mi[:] = np.nan
 
-    class_idx = names.index("class_label")
-    class_lbl = np.array(data[:, class_idx], dtype=int)
-
     for idx in range(num_factors):
         tgt = data[:, idx].astype(int)
 
@@ -209,8 +206,7 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
 
     # unique class labels
     class_idx = names.index("class_label")
-
-    u_cls = np.unique(class_data)
+    u_cls = np.unique(data[:, class_idx])
     num_classes = len(u_cls)
 
     # assume class is a factor
@@ -220,7 +216,7 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
     # categorical variables, excluding class label
     cat_mask = np.concatenate((is_categorical[:class_idx], is_categorical[(class_idx + 1) :]), axis=0).astype(int)
 
-    tgt_bin = np.stack([
+    tgt_bin = np.stack([data[:, class_idx] == cls for cls in u_cls]).T.astype(int)
     ent_tgt_bin = entropy(
         tgt_bin, names=[str(idx) for idx in range(num_classes)], is_categorical=[True for idx in range(num_classes)]
     )
@@ -240,4 +236,4 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
     norm_factor = 0.5 * np.add.outer(ent_tgt_bin, ent_all) + 1e-6
     classwise = classwise_mi / norm_factor
 
-    return BalanceOutput(balance, factors, classwise,
+    return BalanceOutput(balance, factors, classwise, unique_labels, list(metadata.keys()))
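The tgt_bin construction above one-hot encodes the class column so that mutual information can be computed against each class indicator separately. A standalone illustration of the same expression:

    import numpy as np

    labels = np.array([0, 2, 1, 2, 0])  # example class column
    u_cls = np.unique(labels)           # array([0, 1, 2])
    tgt_bin = np.stack([labels == cls for cls in u_cls]).T.astype(int)
    # tgt_bin has shape (n_samples, n_classes); column j is the 0/1 indicator of class u_cls[j]
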
dataeval/metrics/bias/coverage.py
CHANGED
@@ -2,18 +2,23 @@ from __future__ import annotations
 
 __all__ = ["CoverageOutput", "coverage"]
 
+import contextlib
 import math
 from dataclasses import dataclass
-from typing import Literal
+from typing import Any, Literal
 
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 from scipy.spatial.distance import pdist, squareform
 
 from dataeval.interop import to_numpy
+from dataeval.metrics.bias.metadata import coverage_plot
 from dataeval.output import OutputMetadata, set_metadata
 from dataeval.utils.shared import flatten
 
+with contextlib.suppress(ImportError):
+    from matplotlib.figure import Figure
+
 
 @dataclass(frozen=True)
 class CoverageOutput(OutputMetadata):
@@ -34,13 +39,40 @@ class CoverageOutput(OutputMetadata):
     radii: NDArray[np.float64]
     critical_value: float
 
+    def plot(
+        self,
+        images: NDArray[Any],
+        top_k: int = 6,
+    ) -> Figure:
+        """
+        Plot the top k images together for visualization
+
+        Parameters
+        ----------
+        images : ArrayLike
+            Original images (not embeddings) in (N, C, H, W) or (N, H, W) format
+        top_k : int, default 6
+            Number of images to plot (plotting assumes groups of 3)
+        """
+        # Determine which images to plot
+        highest_uncovered_indices = self.indices[:top_k]
+
+        # Grab the images
+        images = to_numpy(images)
+        selected_images = images[highest_uncovered_indices]
+
+        # Plot the images
+        fig = coverage_plot(selected_images, top_k)
+
+        return fig
+
 
 @set_metadata()
 def coverage(
     embeddings: ArrayLike,
     radius_type: Literal["adaptive", "naive"] = "adaptive",
     k: int = 20,
-    percent:
+    percent: float = 0.01,
 ) -> CoverageOutput:
     """
     Class for evaluating :term:`coverage<Coverage>` and identifying images/samples that are in undercovered regions.
@@ -55,7 +87,7 @@ def coverage(
     k: int, default 20
         Number of observations required in order to be covered.
         [1] suggests that a minimum of 20-50 samples is necessary.
-    percent:
+    percent: float, default 0.01
         Percent of observations to be considered uncovered. Only applies to adaptive radius.
 
     Returns