dataeval 0.72.2__py3-none-any.whl → 0.73.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +3 -3
- dataeval/detectors/__init__.py +1 -1
- dataeval/detectors/drift/__init__.py +1 -1
- dataeval/detectors/drift/base.py +2 -2
- dataeval/detectors/linters/clusterer.py +1 -1
- dataeval/detectors/ood/__init__.py +1 -1
- dataeval/detectors/ood/ae.py +14 -6
- dataeval/detectors/ood/aegmm.py +14 -6
- dataeval/detectors/ood/base.py +9 -3
- dataeval/detectors/ood/llr.py +22 -16
- dataeval/detectors/ood/vae.py +14 -6
- dataeval/detectors/ood/vaegmm.py +14 -6
- dataeval/interop.py +9 -7
- dataeval/metrics/bias/balance.py +50 -44
- dataeval/metrics/bias/coverage.py +38 -6
- dataeval/metrics/bias/diversity.py +117 -65
- dataeval/metrics/bias/metadata.py +225 -60
- dataeval/metrics/bias/parity.py +68 -54
- dataeval/utils/__init__.py +4 -3
- dataeval/utils/lazy.py +26 -0
- dataeval/utils/metadata.py +258 -0
- dataeval/utils/shared.py +1 -1
- dataeval/utils/split_dataset.py +12 -6
- dataeval/utils/tensorflow/_internal/gmm.py +8 -2
- dataeval/utils/tensorflow/_internal/loss.py +20 -11
- dataeval/utils/tensorflow/_internal/{pixelcnn.py → models.py} +371 -77
- dataeval/utils/tensorflow/_internal/trainer.py +12 -5
- dataeval/utils/tensorflow/_internal/utils.py +70 -71
- dataeval/utils/torch/datasets.py +2 -2
- dataeval/workflows/__init__.py +1 -1
- {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/METADATA +3 -3
- {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/RECORD +34 -33
- dataeval/utils/tensorflow/_internal/autoencoder.py +0 -316
- {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/WHEEL +0 -0
dataeval/__init__.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
__version__ = "0.72.2"
|
1
|
+
__version__ = "0.73.1"
|
2
2
|
|
3
3
|
from importlib.util import find_spec
|
4
4
|
|
@@ -12,12 +12,12 @@ from dataeval import detectors, metrics # noqa: E402
|
|
12
12
|
|
13
13
|
__all__ = ["detectors", "metrics"]
|
14
14
|
|
15
|
-
if _IS_TORCH_AVAILABLE:
|
15
|
+
if _IS_TORCH_AVAILABLE:
|
16
16
|
from dataeval import workflows
|
17
17
|
|
18
18
|
__all__ += ["workflows"]
|
19
19
|
|
20
|
-
if _IS_TENSORFLOW_AVAILABLE or _IS_TORCH_AVAILABLE:
|
20
|
+
if _IS_TENSORFLOW_AVAILABLE or _IS_TORCH_AVAILABLE:
|
21
21
|
from dataeval import utils
|
22
22
|
|
23
23
|
__all__ += ["utils"]
|
dataeval/detectors/__init__.py
CHANGED
@@ -10,7 +10,7 @@ from dataeval.detectors.drift.ks import DriftKS
|
|
10
10
|
|
11
11
|
__all__ = ["DriftCVM", "DriftKS", "DriftOutput", "updates"]
|
12
12
|
|
13
|
-
if _IS_TORCH_AVAILABLE:
|
13
|
+
if _IS_TORCH_AVAILABLE:
|
14
14
|
from dataeval.detectors.drift.mmd import DriftMMD, DriftMMDOutput
|
15
15
|
from dataeval.detectors.drift.torch import preprocess_drift
|
16
16
|
from dataeval.detectors.drift.uncertainty import DriftUncertainty
|
dataeval/detectors/drift/base.py
CHANGED
@@ -18,7 +18,7 @@ from typing import Any, Callable, Literal, TypeVar
|
|
18
18
|
import numpy as np
|
19
19
|
from numpy.typing import ArrayLike, NDArray
|
20
20
|
|
21
|
-
from dataeval.interop import as_numpy
|
21
|
+
from dataeval.interop import as_numpy
|
22
22
|
from dataeval.output import OutputMetadata, set_metadata
|
23
23
|
|
24
24
|
R = TypeVar("R")
|
@@ -196,7 +196,7 @@ class BaseDrift:
|
|
196
196
|
if correction not in ["bonferroni", "fdr"]:
|
197
197
|
raise ValueError("`correction` must be `bonferroni` or `fdr`.")
|
198
198
|
|
199
|
-
self._x_ref =
|
199
|
+
self._x_ref = as_numpy(x_ref)
|
200
200
|
self.x_ref_preprocessed: bool = x_ref_preprocessed
|
201
201
|
|
202
202
|
# Other attributes
|
dataeval/detectors/linters/clusterer.py
CHANGED
@@ -480,7 +480,7 @@ class Clusterer:
|
|
480
480
|
samples = self.clusters[level][cluster_id].samples
|
481
481
|
if len(samples) >= self._min_num_samples_per_cluster:
|
482
482
|
duplicates_std.append(self.clusters[level][cluster_id].dist_std)
|
483
|
-
diag_mask = np.ones_like(self._sqdmat, dtype=
|
483
|
+
diag_mask = np.ones_like(self._sqdmat, dtype=np.bool_)
|
484
484
|
np.fill_diagonal(diag_mask, 0)
|
485
485
|
diag_mask = np.triu(diag_mask)
|
486
486
|
|
dataeval/detectors/ood/__init__.py
CHANGED
@@ -4,7 +4,7 @@ Out-of-distribution (OOD)` detectors identify data that is different from the da
|
|
4
4
|
|
5
5
|
from dataeval import _IS_TENSORFLOW_AVAILABLE
|
6
6
|
|
7
|
-
if _IS_TENSORFLOW_AVAILABLE:
|
7
|
+
if _IS_TENSORFLOW_AVAILABLE:
|
8
8
|
from dataeval.detectors.ood.ae import OOD_AE
|
9
9
|
from dataeval.detectors.ood.aegmm import OOD_AEGMM
|
10
10
|
from dataeval.detectors.ood.base import OODOutput, OODScoreOutput
|
dataeval/detectors/ood/ae.py
CHANGED
@@ -10,18 +10,26 @@ from __future__ import annotations
|
|
10
10
|
|
11
11
|
__all__ = ["OOD_AE"]
|
12
12
|
|
13
|
-
from typing import Callable
|
13
|
+
from typing import TYPE_CHECKING, Callable
|
14
14
|
|
15
15
|
import numpy as np
|
16
|
-
import tensorflow as tf
|
17
|
-
import tf_keras as keras
|
18
16
|
from numpy.typing import ArrayLike
|
19
17
|
|
20
18
|
from dataeval.detectors.ood.base import OODBase, OODScoreOutput
|
21
19
|
from dataeval.interop import as_numpy
|
22
|
-
from dataeval.utils.
|
20
|
+
from dataeval.utils.lazy import lazyload
|
23
21
|
from dataeval.utils.tensorflow._internal.utils import predict_batch
|
24
22
|
|
23
|
+
if TYPE_CHECKING:
|
24
|
+
import tensorflow as tf
|
25
|
+
import tf_keras as keras
|
26
|
+
|
27
|
+
import dataeval.utils.tensorflow._internal.models as tf_models
|
28
|
+
else:
|
29
|
+
tf = lazyload("tensorflow")
|
30
|
+
keras = lazyload("tf_keras")
|
31
|
+
tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
|
32
|
+
|
25
33
|
|
26
34
|
class OOD_AE(OODBase):
|
27
35
|
"""
|
@@ -33,7 +41,7 @@ class OOD_AE(OODBase):
|
|
33
41
|
An :term:`autoencoder<Autoencoder>` model.
|
34
42
|
"""
|
35
43
|
|
36
|
-
def __init__(self, model: AE) -> None:
|
44
|
+
def __init__(self, model: tf_models.AE) -> None:
|
37
45
|
super().__init__(model)
|
38
46
|
|
39
47
|
def fit(
|
@@ -41,7 +49,7 @@ class OOD_AE(OODBase):
|
|
41
49
|
x_ref: ArrayLike,
|
42
50
|
threshold_perc: float = 100.0,
|
43
51
|
loss_fn: Callable[..., tf.Tensor] | None = None,
|
44
|
-
optimizer: keras.optimizers.Optimizer =
|
52
|
+
optimizer: keras.optimizers.Optimizer | None = None,
|
45
53
|
epochs: int = 20,
|
46
54
|
batch_size: int = 64,
|
47
55
|
verbose: bool = True,
|
dataeval/detectors/ood/aegmm.py
CHANGED
@@ -10,19 +10,27 @@ from __future__ import annotations
|
|
10
10
|
|
11
11
|
__all__ = ["OOD_AEGMM"]
|
12
12
|
|
13
|
-
from typing import Callable
|
13
|
+
from typing import TYPE_CHECKING, Callable
|
14
14
|
|
15
|
-
import tensorflow as tf
|
16
|
-
import tf_keras as keras
|
17
15
|
from numpy.typing import ArrayLike
|
18
16
|
|
19
17
|
from dataeval.detectors.ood.base import OODGMMBase, OODScoreOutput
|
20
18
|
from dataeval.interop import to_numpy
|
21
|
-
from dataeval.utils.
|
19
|
+
from dataeval.utils.lazy import lazyload
|
22
20
|
from dataeval.utils.tensorflow._internal.gmm import gmm_energy
|
23
21
|
from dataeval.utils.tensorflow._internal.loss import LossGMM
|
24
22
|
from dataeval.utils.tensorflow._internal.utils import predict_batch
|
25
23
|
|
24
|
+
if TYPE_CHECKING:
|
25
|
+
import tensorflow as tf
|
26
|
+
import tf_keras as keras
|
27
|
+
|
28
|
+
import dataeval.utils.tensorflow._internal.models as tf_models
|
29
|
+
else:
|
30
|
+
tf = lazyload("tensorflow")
|
31
|
+
keras = lazyload("tf_keras")
|
32
|
+
tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
|
33
|
+
|
26
34
|
|
27
35
|
class OOD_AEGMM(OODGMMBase):
|
28
36
|
"""
|
@@ -34,7 +42,7 @@ class OOD_AEGMM(OODGMMBase):
|
|
34
42
|
An AEGMM model.
|
35
43
|
"""
|
36
44
|
|
37
|
-
def __init__(self, model: AEGMM) -> None:
|
45
|
+
def __init__(self, model: tf_models.AEGMM) -> None:
|
38
46
|
super().__init__(model)
|
39
47
|
|
40
48
|
def fit(
|
@@ -42,7 +50,7 @@ class OOD_AEGMM(OODGMMBase):
|
|
42
50
|
x_ref: ArrayLike,
|
43
51
|
threshold_perc: float = 100.0,
|
44
52
|
loss_fn: Callable[..., tf.Tensor] | None = None,
|
45
|
-
optimizer: keras.optimizers.Optimizer =
|
53
|
+
optimizer: keras.optimizers.Optimizer | None = None,
|
46
54
|
epochs: int = 20,
|
47
55
|
batch_size: int = 64,
|
48
56
|
verbose: bool = True,
|
dataeval/detectors/ood/base.py
CHANGED
@@ -12,18 +12,24 @@ __all__ = ["OODOutput", "OODScoreOutput"]
|
|
12
12
|
|
13
13
|
from abc import ABC, abstractmethod
|
14
14
|
from dataclasses import dataclass
|
15
|
-
from typing import Callable, Literal, cast
|
15
|
+
from typing import TYPE_CHECKING, Callable, Literal, cast
|
16
16
|
|
17
17
|
import numpy as np
|
18
|
-
import tensorflow as tf
|
19
|
-
import tf_keras as keras
|
20
18
|
from numpy.typing import ArrayLike, NDArray
|
21
19
|
|
22
20
|
from dataeval.interop import to_numpy
|
23
21
|
from dataeval.output import OutputMetadata, set_metadata
|
22
|
+
from dataeval.utils.lazy import lazyload
|
24
23
|
from dataeval.utils.tensorflow._internal.gmm import GaussianMixtureModelParams, gmm_params
|
25
24
|
from dataeval.utils.tensorflow._internal.trainer import trainer
|
26
25
|
|
26
|
+
if TYPE_CHECKING:
|
27
|
+
import tensorflow as tf
|
28
|
+
import tf_keras as keras
|
29
|
+
else:
|
30
|
+
tf = lazyload("tensorflow")
|
31
|
+
keras = lazyload("tf_keras")
|
32
|
+
|
27
33
|
|
28
34
|
@dataclass(frozen=True)
|
29
35
|
class OODOutput(OutputMetadata):
|
dataeval/detectors/ood/llr.py
CHANGED
@@ -11,25 +11,31 @@ from __future__ import annotations
|
|
11
11
|
__all__ = ["OOD_LLR"]
|
12
12
|
|
13
13
|
from functools import partial
|
14
|
-
from typing import Callable
|
14
|
+
from typing import TYPE_CHECKING, Callable
|
15
15
|
|
16
16
|
import numpy as np
|
17
|
-
import tensorflow as tf
|
18
|
-
import tf_keras as keras
|
19
17
|
from numpy.typing import ArrayLike, NDArray
|
20
|
-
from tf_keras.layers import Input
|
21
|
-
from tf_keras.models import Model
|
22
18
|
|
23
19
|
from dataeval.detectors.ood.base import OODBase, OODScoreOutput
|
24
20
|
from dataeval.interop import to_numpy
|
25
|
-
from dataeval.utils.
|
21
|
+
from dataeval.utils.lazy import lazyload
|
26
22
|
from dataeval.utils.tensorflow._internal.trainer import trainer
|
27
23
|
from dataeval.utils.tensorflow._internal.utils import predict_batch
|
28
24
|
|
25
|
+
if TYPE_CHECKING:
|
26
|
+
import tensorflow as tf
|
27
|
+
import tf_keras as keras
|
28
|
+
|
29
|
+
import dataeval.utils.tensorflow._internal.models as tf_models
|
30
|
+
else:
|
31
|
+
tf = lazyload("tensorflow")
|
32
|
+
keras = lazyload("tf_keras")
|
33
|
+
tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
|
34
|
+
|
29
35
|
|
30
36
|
def _build_model(
|
31
|
-
dist: PixelCNN, input_shape: tuple | None = None, filepath: str | None = None
|
32
|
-
) -> tuple[keras.Model, PixelCNN]:
|
37
|
+
dist: tf_models.PixelCNN, input_shape: tuple | None = None, filepath: str | None = None
|
38
|
+
) -> tuple[keras.Model, tf_models.PixelCNN]:
|
33
39
|
"""
|
34
40
|
Create keras.Model from TF distribution.
|
35
41
|
|
@@ -46,9 +52,9 @@ def _build_model(
|
|
46
52
|
-------
|
47
53
|
TensorFlow model.
|
48
54
|
"""
|
49
|
-
x_in = Input(shape=input_shape)
|
55
|
+
x_in = keras.layers.Input(shape=input_shape)
|
50
56
|
log_prob = dist.log_prob(x_in)
|
51
|
-
model = Model(inputs=x_in, outputs=log_prob)
|
57
|
+
model = keras.models.Model(inputs=x_in, outputs=log_prob)
|
52
58
|
model.add_loss(-tf.reduce_mean(log_prob))
|
53
59
|
if isinstance(filepath, str):
|
54
60
|
model.load_weights(filepath)
|
@@ -109,13 +115,13 @@ class OOD_LLR(OODBase):
|
|
109
115
|
|
110
116
|
def __init__(
|
111
117
|
self,
|
112
|
-
model: PixelCNN,
|
113
|
-
model_background: PixelCNN | None = None,
|
118
|
+
model: tf_models.PixelCNN,
|
119
|
+
model_background: tf_models.PixelCNN | None = None,
|
114
120
|
log_prob: Callable | None = None,
|
115
121
|
sequential: bool = False,
|
116
122
|
) -> None:
|
117
|
-
self.dist_s: PixelCNN = model
|
118
|
-
self.dist_b: PixelCNN = (
|
123
|
+
self.dist_s: tf_models.PixelCNN = model
|
124
|
+
self.dist_b: tf_models.PixelCNN = (
|
119
125
|
model.copy()
|
120
126
|
if hasattr(model, "copy")
|
121
127
|
else keras.models.clone_model(model)
|
@@ -135,7 +141,7 @@ class OOD_LLR(OODBase):
|
|
135
141
|
x_ref: ArrayLike,
|
136
142
|
threshold_perc: float = 100.0,
|
137
143
|
loss_fn: Callable | None = None,
|
138
|
-
optimizer: keras.optimizers.Optimizer =
|
144
|
+
optimizer: keras.optimizers.Optimizer | None = None,
|
139
145
|
epochs: int = 20,
|
140
146
|
batch_size: int = 64,
|
141
147
|
verbose: bool = True,
|
@@ -176,7 +182,7 @@ class OOD_LLR(OODBase):
|
|
176
182
|
"""
|
177
183
|
x_ref = to_numpy(x_ref)
|
178
184
|
input_shape = x_ref.shape[1:]
|
179
|
-
optimizer =
|
185
|
+
optimizer = keras.optimizers.Adam() if optimizer is None else optimizer
|
180
186
|
# Separate into two separate optimizers, one for semantic model and one for background model
|
181
187
|
optimizer_s = optimizer
|
182
188
|
optimizer_b = optimizer.__class__.from_config(optimizer.get_config())
|
dataeval/detectors/ood/vae.py
CHANGED
@@ -10,19 +10,27 @@ from __future__ import annotations
|
|
10
10
|
|
11
11
|
__all__ = ["OOD_VAE"]
|
12
12
|
|
13
|
-
from typing import Callable
|
13
|
+
from typing import TYPE_CHECKING, Callable
|
14
14
|
|
15
15
|
import numpy as np
|
16
|
-
import tensorflow as tf
|
17
|
-
import tf_keras as keras
|
18
16
|
from numpy.typing import ArrayLike
|
19
17
|
|
20
18
|
from dataeval.detectors.ood.base import OODBase, OODScoreOutput
|
21
19
|
from dataeval.interop import to_numpy
|
22
|
-
from dataeval.utils.
|
20
|
+
from dataeval.utils.lazy import lazyload
|
23
21
|
from dataeval.utils.tensorflow._internal.loss import Elbo
|
24
22
|
from dataeval.utils.tensorflow._internal.utils import predict_batch
|
25
23
|
|
24
|
+
if TYPE_CHECKING:
|
25
|
+
import tensorflow as tf
|
26
|
+
import tf_keras as keras
|
27
|
+
|
28
|
+
import dataeval.utils.tensorflow._internal.models as tf_models
|
29
|
+
else:
|
30
|
+
tf = lazyload("tensorflow")
|
31
|
+
keras = lazyload("tf_keras")
|
32
|
+
tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
|
33
|
+
|
26
34
|
|
27
35
|
class OOD_VAE(OODBase):
|
28
36
|
"""
|
@@ -51,7 +59,7 @@ class OOD_VAE(OODBase):
|
|
51
59
|
>>> result = metric.predict(dataset, ood_type="feature")
|
52
60
|
"""
|
53
61
|
|
54
|
-
def __init__(self, model: VAE, samples: int = 10) -> None:
|
62
|
+
def __init__(self, model: tf_models.VAE, samples: int = 10) -> None:
|
55
63
|
super().__init__(model)
|
56
64
|
self.samples = samples
|
57
65
|
|
@@ -60,7 +68,7 @@ class OOD_VAE(OODBase):
|
|
60
68
|
x_ref: ArrayLike,
|
61
69
|
threshold_perc: float = 100.0,
|
62
70
|
loss_fn: Callable[..., tf.Tensor] = Elbo(0.05),
|
63
|
-
optimizer: keras.optimizers.Optimizer =
|
71
|
+
optimizer: keras.optimizers.Optimizer | None = None,
|
64
72
|
epochs: int = 20,
|
65
73
|
batch_size: int = 64,
|
66
74
|
verbose: bool = True,
|
dataeval/detectors/ood/vaegmm.py
CHANGED
@@ -10,20 +10,28 @@ from __future__ import annotations
|
|
10
10
|
|
11
11
|
__all__ = ["OOD_VAEGMM"]
|
12
12
|
|
13
|
-
from typing import Callable
|
13
|
+
from typing import TYPE_CHECKING, Callable
|
14
14
|
|
15
15
|
import numpy as np
|
16
|
-
import tensorflow as tf
|
17
|
-
import tf_keras as keras
|
18
16
|
from numpy.typing import ArrayLike
|
19
17
|
|
20
18
|
from dataeval.detectors.ood.base import OODGMMBase, OODScoreOutput
|
21
19
|
from dataeval.interop import to_numpy
|
22
|
-
from dataeval.utils.
|
20
|
+
from dataeval.utils.lazy import lazyload
|
23
21
|
from dataeval.utils.tensorflow._internal.gmm import gmm_energy
|
24
22
|
from dataeval.utils.tensorflow._internal.loss import Elbo, LossGMM
|
25
23
|
from dataeval.utils.tensorflow._internal.utils import predict_batch
|
26
24
|
|
25
|
+
if TYPE_CHECKING:
|
26
|
+
import tensorflow as tf
|
27
|
+
import tf_keras as keras
|
28
|
+
|
29
|
+
import dataeval.utils.tensorflow._internal.models as tf_models
|
30
|
+
else:
|
31
|
+
tf = lazyload("tensorflow")
|
32
|
+
keras = lazyload("tf_keras")
|
33
|
+
tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
|
34
|
+
|
27
35
|
|
28
36
|
class OOD_VAEGMM(OODGMMBase):
|
29
37
|
"""
|
@@ -37,7 +45,7 @@ class OOD_VAEGMM(OODGMMBase):
|
|
37
45
|
Number of samples sampled to evaluate each instance.
|
38
46
|
"""
|
39
47
|
|
40
|
-
def __init__(self, model: VAEGMM, samples: int = 10) -> None:
|
48
|
+
def __init__(self, model: tf_models.VAEGMM, samples: int = 10) -> None:
|
41
49
|
super().__init__(model)
|
42
50
|
self.samples = samples
|
43
51
|
|
@@ -46,7 +54,7 @@ class OOD_VAEGMM(OODGMMBase):
|
|
46
54
|
x_ref: ArrayLike,
|
47
55
|
threshold_perc: float = 100.0,
|
48
56
|
loss_fn: Callable[..., tf.Tensor] = LossGMM(elbo=Elbo(0.05)),
|
49
|
-
optimizer: keras.optimizers.Optimizer =
|
57
|
+
optimizer: keras.optimizers.Optimizer | None = None,
|
50
58
|
epochs: int = 20,
|
51
59
|
batch_size: int = 64,
|
52
60
|
verbose: bool = True,
|
dataeval/interop.py
CHANGED
@@ -37,13 +37,15 @@ def to_numpy(array: ArrayLike | None, copy: bool = True) -> NDArray[Any]:
|
|
37
37
|
if isinstance(array, np.ndarray):
|
38
38
|
return array.copy() if copy else array
|
39
39
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
if
|
46
|
-
|
40
|
+
if array.__class__.__module__.startswith("tensorflow"):
|
41
|
+
tf = _try_import("tensorflow")
|
42
|
+
if tf and tf.is_tensor(array):
|
43
|
+
return array.numpy().copy() if copy else array.numpy() # type: ignore
|
44
|
+
|
45
|
+
if array.__class__.__module__.startswith("torch"):
|
46
|
+
torch = _try_import("torch")
|
47
|
+
if torch and isinstance(array, torch.Tensor):
|
48
|
+
return array.detach().cpu().numpy().copy() if copy else array.detach().cpu().numpy() # type: ignore
|
47
49
|
|
48
50
|
return np.array(array, copy=copy)
|
49
51
|
|
dataeval/metrics/bias/balance.py
CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
__all__ = ["BalanceOutput", "balance"]
|
4
4
|
|
5
|
+
import contextlib
|
5
6
|
import warnings
|
6
7
|
from dataclasses import dataclass
|
7
8
|
from typing import Any, Mapping
|
@@ -10,9 +11,12 @@ import numpy as np
|
|
10
11
|
from numpy.typing import ArrayLike, NDArray
|
11
12
|
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
|
12
13
|
|
13
|
-
from dataeval.metrics.bias.metadata import entropy, heatmap, preprocess_metadata
|
14
|
+
from dataeval.metrics.bias.metadata import CLASS_LABEL, entropy, heatmap, preprocess_metadata
|
14
15
|
from dataeval.output import OutputMetadata, set_metadata
|
15
16
|
|
17
|
+
with contextlib.suppress(ImportError):
|
18
|
+
from matplotlib.figure import Figure
|
19
|
+
|
16
20
|
|
17
21
|
@dataclass(frozen=True)
|
18
22
|
class BalanceOutput(OutputMetadata):
|
@@ -27,45 +31,43 @@ class BalanceOutput(OutputMetadata):
|
|
27
31
|
Estimate of inter/intra-factor mutual information
|
28
32
|
classwise : NDArray[np.float64]
|
29
33
|
Estimate of mutual information between metadata factors and individual class labels
|
30
|
-
class_list: NDArray
|
31
|
-
|
32
|
-
metadata_names: list[str]
|
34
|
+
class_list : NDArray
|
35
|
+
Array of the class labels present in the dataset
|
36
|
+
metadata_names : list[str]
|
33
37
|
Names of each metadata factor
|
34
38
|
"""
|
35
39
|
|
36
40
|
balance: NDArray[np.float64]
|
37
41
|
factors: NDArray[np.float64]
|
38
42
|
classwise: NDArray[np.float64]
|
39
|
-
|
40
|
-
class_list: NDArray[np.int64]
|
43
|
+
class_list: NDArray[Any]
|
41
44
|
metadata_names: list[str]
|
42
45
|
|
43
46
|
def plot(
|
44
47
|
self,
|
45
|
-
row_labels: NDArray[Any] | None = None,
|
46
|
-
col_labels: NDArray[Any] | None = None,
|
48
|
+
row_labels: list[Any] | NDArray[Any] | None = None,
|
49
|
+
col_labels: list[Any] | NDArray[Any] | None = None,
|
47
50
|
plot_classwise: bool = False,
|
48
|
-
) ->
|
51
|
+
) -> Figure:
|
49
52
|
"""
|
50
53
|
Plot a heatmap of balance information
|
51
54
|
|
52
55
|
Parameters
|
53
56
|
----------
|
54
|
-
row_labels:
|
55
|
-
Array containing the labels for rows in the histogram
|
56
|
-
col_labels:
|
57
|
-
Array containing the labels for columns in the histogram
|
58
|
-
plot_classwise: bool, default False
|
57
|
+
row_labels : ArrayLike or None, default None
|
58
|
+
List/Array containing the labels for rows in the histogram
|
59
|
+
col_labels : ArrayLike or None, default None
|
60
|
+
List/Array containing the labels for columns in the histogram
|
61
|
+
plot_classwise : bool, default False
|
59
62
|
Whether to plot per-class balance instead of global balance
|
60
|
-
|
61
63
|
"""
|
62
64
|
if plot_classwise:
|
63
65
|
if row_labels is None:
|
64
|
-
row_labels =
|
66
|
+
row_labels = self.class_list
|
65
67
|
if col_labels is None:
|
66
68
|
col_labels = np.concatenate((["class"], self.metadata_names))
|
67
69
|
|
68
|
-
heatmap(
|
70
|
+
fig = heatmap(
|
69
71
|
self.classwise,
|
70
72
|
row_labels,
|
71
73
|
col_labels,
|
@@ -74,6 +76,7 @@ class BalanceOutput(OutputMetadata):
|
|
74
76
|
cbarlabel="Normalized Mutual Information",
|
75
77
|
)
|
76
78
|
else:
|
79
|
+
# Combine balance and factors results
|
77
80
|
data = np.concatenate([self.balance[np.newaxis, 1:], self.factors], axis=0)
|
78
81
|
# Create a mask for the upper triangle of the symmetrical array, ignoring the diagonal
|
79
82
|
mask = np.triu(data + 1, k=0) < 1
|
@@ -87,12 +90,9 @@ class BalanceOutput(OutputMetadata):
|
|
87
90
|
if col_labels is None:
|
88
91
|
col_labels = heat_labels[1:]
|
89
92
|
|
90
|
-
heatmap(
|
91
|
-
|
92
|
-
|
93
|
-
col_labels,
|
94
|
-
cbarlabel="Normalized Mutual Information",
|
95
|
-
)
|
93
|
+
fig = heatmap(heat_data, row_labels, col_labels, cbarlabel="Normalized Mutual Information")
|
94
|
+
|
95
|
+
return fig
|
96
96
|
|
97
97
|
|
98
98
|
def validate_num_neighbors(num_neighbors: int) -> int:
|
@@ -116,19 +116,29 @@ def validate_num_neighbors(num_neighbors: int) -> int:
|
|
116
116
|
|
117
117
|
|
118
118
|
@set_metadata("dataeval.metrics")
|
119
|
-
def balance(
|
119
|
+
def balance(
|
120
|
+
class_labels: ArrayLike,
|
121
|
+
metadata: Mapping[str, ArrayLike],
|
122
|
+
num_neighbors: int = 5,
|
123
|
+
continuous_factor_bincounts: Mapping[str, int] | None = None,
|
124
|
+
) -> BalanceOutput:
|
120
125
|
"""
|
121
126
|
Mutual information (MI) between factors (class label, metadata, label/image properties)
|
122
127
|
|
123
128
|
Parameters
|
124
129
|
----------
|
125
|
-
class_labels: ArrayLike
|
130
|
+
class_labels : ArrayLike
|
126
131
|
List of class labels for each image
|
127
|
-
metadata: Mapping[str, ArrayLike]
|
132
|
+
metadata : Mapping[str, ArrayLike]
|
128
133
|
Dict of lists of metadata factors for each image
|
129
|
-
num_neighbors: int, default 5
|
134
|
+
num_neighbors : int, default 5
|
130
135
|
Number of nearest neighbors to use for computing MI between discrete
|
131
136
|
and continuous variables.
|
137
|
+
continuous_factor_bincounts : Mapping[str, int] or None, default None
|
138
|
+
The factors in metadata that have continuous values and the array of bin counts to
|
139
|
+
discretize values into. All factors are treated as having discrete values unless they
|
140
|
+
are specified as keys in this dictionary. Each element of this array must occur as a key
|
141
|
+
in metadata.
|
132
142
|
|
133
143
|
Returns
|
134
144
|
-------
|
@@ -148,7 +158,7 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
|
|
148
158
|
-------
|
149
159
|
Return balance (mutual information) of factors with class_labels
|
150
160
|
|
151
|
-
>>> bal = balance(class_labels, metadata)
|
161
|
+
>>> bal = balance(class_labels, metadata, continuous_factor_bincounts=continuous_factor_bincounts)
|
152
162
|
>>> bal.balance
|
153
163
|
array([0.99999822, 0.13363788, 0.04505382, 0.02994455])
|
154
164
|
|
@@ -165,6 +175,7 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
|
|
165
175
|
array([[0.99999822, 0.13363788, 0. , 0. ],
|
166
176
|
[0.99999822, 0.13363788, 0. , 0. ]])
|
167
177
|
|
178
|
+
|
168
179
|
See Also
|
169
180
|
--------
|
170
181
|
sklearn.feature_selection.mutual_info_classif
|
@@ -172,18 +183,15 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
|
|
172
183
|
sklearn.metrics.mutual_info_score
|
173
184
|
"""
|
174
185
|
num_neighbors = validate_num_neighbors(num_neighbors)
|
175
|
-
data, names, is_categorical = preprocess_metadata(class_labels, metadata)
|
186
|
+
data, names, is_categorical, unique_labels = preprocess_metadata(class_labels, metadata)
|
176
187
|
num_factors = len(names)
|
177
188
|
mi = np.empty((num_factors, num_factors))
|
178
189
|
mi[:] = np.nan
|
179
190
|
|
180
|
-
class_idx = names.index("class_label")
|
181
|
-
class_lbl = np.array(data[:, class_idx], dtype=int)
|
182
|
-
|
183
191
|
for idx in range(num_factors):
|
184
|
-
tgt = data[:, idx].astype(
|
192
|
+
tgt = data[:, idx].astype(np.intp)
|
185
193
|
|
186
|
-
if
|
194
|
+
if continuous_factor_bincounts and names[idx] not in continuous_factor_bincounts:
|
187
195
|
mi[idx, :] = mutual_info_classif(
|
188
196
|
data,
|
189
197
|
tgt,
|
@@ -200,7 +208,7 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
|
|
200
208
|
random_state=0,
|
201
209
|
)
|
202
210
|
|
203
|
-
ent_all = entropy(data, names,
|
211
|
+
ent_all = entropy(data, names, continuous_factor_bincounts, normalized=False)
|
204
212
|
norm_factor = 0.5 * np.add.outer(ent_all, ent_all) + 1e-6
|
205
213
|
# in principle MI should be symmetric, but it is not in practice.
|
206
214
|
nmi = 0.5 * (mi + mi.T) / norm_factor
|
@@ -208,9 +216,8 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
|
|
208
216
|
factors = nmi[1:, 1:]
|
209
217
|
|
210
218
|
# unique class labels
|
211
|
-
class_idx = names.index(
|
212
|
-
|
213
|
-
u_cls = np.unique(class_data)
|
219
|
+
class_idx = names.index(CLASS_LABEL)
|
220
|
+
u_cls = np.unique(data[:, class_idx])
|
214
221
|
num_classes = len(u_cls)
|
215
222
|
|
216
223
|
# assume class is a factor
|
@@ -218,12 +225,11 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
|
|
218
225
|
classwise_mi[:] = np.nan
|
219
226
|
|
220
227
|
# categorical variables, excluding class label
|
221
|
-
cat_mask = np.concatenate((is_categorical[:class_idx], is_categorical[(class_idx + 1) :]), axis=0).astype(
|
228
|
+
cat_mask = np.concatenate((is_categorical[:class_idx], is_categorical[(class_idx + 1) :]), axis=0).astype(np.intp)
|
222
229
|
|
223
|
-
tgt_bin = np.stack([
|
224
|
-
|
225
|
-
|
226
|
-
)
|
230
|
+
tgt_bin = np.stack([data[:, class_idx] == cls for cls in u_cls]).T.astype(np.intp)
|
231
|
+
names = [str(idx) for idx in range(num_classes)]
|
232
|
+
ent_tgt_bin = entropy(tgt_bin, names, continuous_factor_bincounts)
|
227
233
|
|
228
234
|
# classification MI for discrete/categorical features
|
229
235
|
for idx in range(num_classes):
|
@@ -240,4 +246,4 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
|
|
240
246
|
norm_factor = 0.5 * np.add.outer(ent_tgt_bin, ent_all) + 1e-6
|
241
247
|
classwise = classwise_mi / norm_factor
|
242
248
|
|
243
|
-
return BalanceOutput(balance, factors, classwise,
|
249
|
+
return BalanceOutput(balance, factors, classwise, unique_labels, list(metadata.keys()))
|