dataeval 0.72.2__py3-none-any.whl → 0.73.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. dataeval/__init__.py +3 -3
  2. dataeval/detectors/__init__.py +1 -1
  3. dataeval/detectors/drift/__init__.py +1 -1
  4. dataeval/detectors/drift/base.py +2 -2
  5. dataeval/detectors/linters/clusterer.py +1 -1
  6. dataeval/detectors/ood/__init__.py +1 -1
  7. dataeval/detectors/ood/ae.py +14 -6
  8. dataeval/detectors/ood/aegmm.py +14 -6
  9. dataeval/detectors/ood/base.py +9 -3
  10. dataeval/detectors/ood/llr.py +22 -16
  11. dataeval/detectors/ood/vae.py +14 -6
  12. dataeval/detectors/ood/vaegmm.py +14 -6
  13. dataeval/interop.py +9 -7
  14. dataeval/metrics/bias/balance.py +50 -44
  15. dataeval/metrics/bias/coverage.py +38 -6
  16. dataeval/metrics/bias/diversity.py +117 -65
  17. dataeval/metrics/bias/metadata.py +225 -60
  18. dataeval/metrics/bias/parity.py +68 -54
  19. dataeval/utils/__init__.py +4 -3
  20. dataeval/utils/lazy.py +26 -0
  21. dataeval/utils/metadata.py +258 -0
  22. dataeval/utils/shared.py +1 -1
  23. dataeval/utils/split_dataset.py +12 -6
  24. dataeval/utils/tensorflow/_internal/gmm.py +8 -2
  25. dataeval/utils/tensorflow/_internal/loss.py +20 -11
  26. dataeval/utils/tensorflow/_internal/{pixelcnn.py → models.py} +371 -77
  27. dataeval/utils/tensorflow/_internal/trainer.py +12 -5
  28. dataeval/utils/tensorflow/_internal/utils.py +70 -71
  29. dataeval/utils/torch/datasets.py +2 -2
  30. dataeval/workflows/__init__.py +1 -1
  31. {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/METADATA +3 -3
  32. {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/RECORD +34 -33
  33. dataeval/utils/tensorflow/_internal/autoencoder.py +0 -316
  34. {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/LICENSE.txt +0 -0
  35. {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/WHEEL +0 -0
dataeval/__init__.py CHANGED
@@ -1,4 +1,4 @@
-__version__ = "0.72.2"
+__version__ = "0.73.1"
 
 from importlib.util import find_spec
 
@@ -12,12 +12,12 @@ from dataeval import detectors, metrics  # noqa: E402
 
 __all__ = ["detectors", "metrics"]
 
-if _IS_TORCH_AVAILABLE:  # pragma: no cover
+if _IS_TORCH_AVAILABLE:
     from dataeval import workflows
 
     __all__ += ["workflows"]
 
-if _IS_TENSORFLOW_AVAILABLE or _IS_TORCH_AVAILABLE:  # pragma: no cover
+if _IS_TENSORFLOW_AVAILABLE or _IS_TORCH_AVAILABLE:
     from dataeval import utils
 
     __all__ += ["utils"]
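Note: the `_IS_TORCH_AVAILABLE` and `_IS_TENSORFLOW_AVAILABLE` flags gating these imports are defined above the excerpted hunks and are not part of this diff. Given the `from importlib.util import find_spec` context line, they almost certainly follow the standard availability-probe pattern; a minimal sketch under that assumption:

    from importlib.util import find_spec

    # Probe for optional backends without importing them: find_spec only reads
    # package metadata, so this stays cheap even when the library is installed.
    _IS_TORCH_AVAILABLE = find_spec("torch") is not None
    _IS_TENSORFLOW_AVAILABLE = find_spec("tensorflow") is not None

Dropping the `# pragma: no cover` markers also means these optional-import branches are now counted in coverage measurement.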
dataeval/detectors/__init__.py CHANGED
@@ -7,7 +7,7 @@ from dataeval.detectors import drift, linters
 
 __all__ = ["drift", "linters"]
 
-if _IS_TENSORFLOW_AVAILABLE:  # pragma: no cover
+if _IS_TENSORFLOW_AVAILABLE:
     from dataeval.detectors import ood
 
     __all__ += ["ood"]
dataeval/detectors/drift/__init__.py CHANGED
@@ -10,7 +10,7 @@ from dataeval.detectors.drift.ks import DriftKS
 
 __all__ = ["DriftCVM", "DriftKS", "DriftOutput", "updates"]
 
-if _IS_TORCH_AVAILABLE:  # pragma: no cover
+if _IS_TORCH_AVAILABLE:
     from dataeval.detectors.drift.mmd import DriftMMD, DriftMMDOutput
     from dataeval.detectors.drift.torch import preprocess_drift
     from dataeval.detectors.drift.uncertainty import DriftUncertainty
dataeval/detectors/drift/base.py CHANGED
@@ -18,7 +18,7 @@ from typing import Any, Callable, Literal, TypeVar
 
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 
-from dataeval.interop import as_numpy, to_numpy
+from dataeval.interop import as_numpy
 from dataeval.output import OutputMetadata, set_metadata
 
 R = TypeVar("R")
@@ -196,7 +196,7 @@ class BaseDrift:
         if correction not in ["bonferroni", "fdr"]:
             raise ValueError("`correction` must be `bonferroni` or `fdr`.")
 
-        self._x_ref = to_numpy(x_ref)
+        self._x_ref = as_numpy(x_ref)
         self.x_ref_preprocessed: bool = x_ref_preprocessed
 
         # Other attributes
dataeval/detectors/linters/clusterer.py CHANGED
@@ -480,7 +480,7 @@ class Clusterer:
                 samples = self.clusters[level][cluster_id].samples
                 if len(samples) >= self._min_num_samples_per_cluster:
                     duplicates_std.append(self.clusters[level][cluster_id].dist_std)
-        diag_mask = np.ones_like(self._sqdmat, dtype=bool)
+        diag_mask = np.ones_like(self._sqdmat, dtype=np.bool_)
         np.fill_diagonal(diag_mask, 0)
         diag_mask = np.triu(diag_mask)
 
dataeval/detectors/ood/__init__.py CHANGED
@@ -4,7 +4,7 @@ Out-of-distribution (OOD)` detectors identify data that is different from the da
 
 from dataeval import _IS_TENSORFLOW_AVAILABLE
 
-if _IS_TENSORFLOW_AVAILABLE:  # pragma: no cover
+if _IS_TENSORFLOW_AVAILABLE:
     from dataeval.detectors.ood.ae import OOD_AE
     from dataeval.detectors.ood.aegmm import OOD_AEGMM
     from dataeval.detectors.ood.base import OODOutput, OODScoreOutput
dataeval/detectors/ood/ae.py CHANGED
@@ -10,18 +10,26 @@ from __future__ import annotations
 
 __all__ = ["OOD_AE"]
 
-from typing import Callable
+from typing import TYPE_CHECKING, Callable
 
 import numpy as np
-import tensorflow as tf
-import tf_keras as keras
 from numpy.typing import ArrayLike
 
 from dataeval.detectors.ood.base import OODBase, OODScoreOutput
 from dataeval.interop import as_numpy
-from dataeval.utils.tensorflow._internal.autoencoder import AE
+from dataeval.utils.lazy import lazyload
 from dataeval.utils.tensorflow._internal.utils import predict_batch
 
+if TYPE_CHECKING:
+    import tensorflow as tf
+    import tf_keras as keras
+
+    import dataeval.utils.tensorflow._internal.models as tf_models
+else:
+    tf = lazyload("tensorflow")
+    keras = lazyload("tf_keras")
+    tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
+
 
 class OOD_AE(OODBase):
     """
@@ -33,7 +41,7 @@ class OOD_AE(OODBase):
         An :term:`autoencoder<Autoencoder>` model.
     """
 
-    def __init__(self, model: AE) -> None:
+    def __init__(self, model: tf_models.AE) -> None:
         super().__init__(model)
 
     def fit(
@@ -41,7 +49,7 @@ class OOD_AE(OODBase):
         x_ref: ArrayLike,
         threshold_perc: float = 100.0,
         loss_fn: Callable[..., tf.Tensor] | None = None,
-        optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
+        optimizer: keras.optimizers.Optimizer | None = None,
         epochs: int = 20,
         batch_size: int = 64,
         verbose: bool = True,
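This file introduces the `TYPE_CHECKING`/`lazyload` split that repeats across the OOD detectors below: the real imports stay visible to static type checkers, while at runtime `tf`, `keras`, and `tf_models` are stood in by lazy proxies, so importing the detector module no longer imports TensorFlow. The new `dataeval/utils/lazy.py` (+26 lines) is not shown in this diff, so the following is only a plausible sketch of such a helper, not the package's actual implementation:

    from importlib import import_module
    from types import ModuleType


    class _LazyModule(ModuleType):
        """Stand-in that defers the real import until first attribute access."""

        def __init__(self, name: str) -> None:
            super().__init__(name)
            self._module = None

        def __getattr__(self, attr: str):
            if self._module is None:
                self._module = import_module(self.__name__)  # real import happens here
            return getattr(self._module, attr)


    def lazyload(name: str) -> ModuleType:
        return _LazyModule(name)

Under this assumption, the module body imports nothing heavy at import time, and TensorFlow only loads when `fit` or `score` first touches one of the proxies.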
dataeval/detectors/ood/aegmm.py CHANGED
@@ -10,19 +10,27 @@ from __future__ import annotations
 
 __all__ = ["OOD_AEGMM"]
 
-from typing import Callable
+from typing import TYPE_CHECKING, Callable
 
-import tensorflow as tf
-import tf_keras as keras
 from numpy.typing import ArrayLike
 
 from dataeval.detectors.ood.base import OODGMMBase, OODScoreOutput
 from dataeval.interop import to_numpy
-from dataeval.utils.tensorflow._internal.autoencoder import AEGMM
+from dataeval.utils.lazy import lazyload
 from dataeval.utils.tensorflow._internal.gmm import gmm_energy
 from dataeval.utils.tensorflow._internal.loss import LossGMM
 from dataeval.utils.tensorflow._internal.utils import predict_batch
 
+if TYPE_CHECKING:
+    import tensorflow as tf
+    import tf_keras as keras
+
+    import dataeval.utils.tensorflow._internal.models as tf_models
+else:
+    tf = lazyload("tensorflow")
+    keras = lazyload("tf_keras")
+    tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
+
 
 class OOD_AEGMM(OODGMMBase):
     """
@@ -34,7 +42,7 @@ class OOD_AEGMM(OODGMMBase):
         An AEGMM model.
     """
 
-    def __init__(self, model: AEGMM) -> None:
+    def __init__(self, model: tf_models.AEGMM) -> None:
         super().__init__(model)
 
     def fit(
@@ -42,7 +50,7 @@ class OOD_AEGMM(OODGMMBase):
         x_ref: ArrayLike,
         threshold_perc: float = 100.0,
         loss_fn: Callable[..., tf.Tensor] | None = None,
-        optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
+        optimizer: keras.optimizers.Optimizer | None = None,
         epochs: int = 20,
         batch_size: int = 64,
         verbose: bool = True,
dataeval/detectors/ood/base.py CHANGED
@@ -12,18 +12,24 @@ __all__ = ["OODOutput", "OODScoreOutput"]
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Callable, Literal, cast
+from typing import TYPE_CHECKING, Callable, Literal, cast
 
 import numpy as np
-import tensorflow as tf
-import tf_keras as keras
 from numpy.typing import ArrayLike, NDArray
 
 from dataeval.interop import to_numpy
 from dataeval.output import OutputMetadata, set_metadata
+from dataeval.utils.lazy import lazyload
 from dataeval.utils.tensorflow._internal.gmm import GaussianMixtureModelParams, gmm_params
 from dataeval.utils.tensorflow._internal.trainer import trainer
 
+if TYPE_CHECKING:
+    import tensorflow as tf
+    import tf_keras as keras
+else:
+    tf = lazyload("tensorflow")
+    keras = lazyload("tf_keras")
+
 
 @dataclass(frozen=True)
 class OODOutput(OutputMetadata):
dataeval/detectors/ood/llr.py CHANGED
@@ -11,25 +11,31 @@ from __future__ import annotations
 __all__ = ["OOD_LLR"]
 
 from functools import partial
-from typing import Callable
+from typing import TYPE_CHECKING, Callable
 
 import numpy as np
-import tensorflow as tf
-import tf_keras as keras
 from numpy.typing import ArrayLike, NDArray
-from tf_keras.layers import Input
-from tf_keras.models import Model
 
 from dataeval.detectors.ood.base import OODBase, OODScoreOutput
 from dataeval.interop import to_numpy
-from dataeval.utils.tensorflow._internal.pixelcnn import PixelCNN
+from dataeval.utils.lazy import lazyload
 from dataeval.utils.tensorflow._internal.trainer import trainer
 from dataeval.utils.tensorflow._internal.utils import predict_batch
 
+if TYPE_CHECKING:
+    import tensorflow as tf
+    import tf_keras as keras
+
+    import dataeval.utils.tensorflow._internal.models as tf_models
+else:
+    tf = lazyload("tensorflow")
+    keras = lazyload("tf_keras")
+    tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
+
 
 def _build_model(
-    dist: PixelCNN, input_shape: tuple | None = None, filepath: str | None = None
-) -> tuple[keras.Model, PixelCNN]:
+    dist: tf_models.PixelCNN, input_shape: tuple | None = None, filepath: str | None = None
+) -> tuple[keras.Model, tf_models.PixelCNN]:
     """
     Create keras.Model from TF distribution.
 
@@ -46,9 +52,9 @@ def _build_model(
     -------
     TensorFlow model.
     """
-    x_in = Input(shape=input_shape)
+    x_in = keras.layers.Input(shape=input_shape)
     log_prob = dist.log_prob(x_in)
-    model = Model(inputs=x_in, outputs=log_prob)
+    model = keras.models.Model(inputs=x_in, outputs=log_prob)
     model.add_loss(-tf.reduce_mean(log_prob))
     if isinstance(filepath, str):
         model.load_weights(filepath)
@@ -109,13 +115,13 @@ class OOD_LLR(OODBase):
 
     def __init__(
         self,
-        model: PixelCNN,
-        model_background: PixelCNN | None = None,
+        model: tf_models.PixelCNN,
+        model_background: tf_models.PixelCNN | None = None,
         log_prob: Callable | None = None,
         sequential: bool = False,
     ) -> None:
-        self.dist_s: PixelCNN = model
-        self.dist_b: PixelCNN = (
+        self.dist_s: tf_models.PixelCNN = model
+        self.dist_b: tf_models.PixelCNN = (
            model.copy()
            if hasattr(model, "copy")
            else keras.models.clone_model(model)
@@ -135,7 +141,7 @@ class OOD_LLR(OODBase):
         x_ref: ArrayLike,
         threshold_perc: float = 100.0,
         loss_fn: Callable | None = None,
-        optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
+        optimizer: keras.optimizers.Optimizer | None = None,
         epochs: int = 20,
         batch_size: int = 64,
         verbose: bool = True,
@@ -176,7 +182,7 @@ class OOD_LLR(OODBase):
         """
         x_ref = to_numpy(x_ref)
         input_shape = x_ref.shape[1:]
-        optimizer = optimizer() if isinstance(optimizer, type) else optimizer
+        optimizer = keras.optimizers.Adam() if optimizer is None else optimizer
         # Separate into two separate optimizers, one for semantic model and one for background model
         optimizer_s = optimizer
         optimizer_b = optimizer.__class__.from_config(optimizer.get_config())
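Two related changes run through this file and the other detectors: `keras.layers.Input`/`keras.models.Model` replace the direct `tf_keras` imports (so the lazy `keras` proxy is the only entry point), and every `fit` signature swaps the class-valued default `optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam` for `None`, deferring instantiation to the call as in the last hunk above. A minimal sketch of the default-handling idiom, with placeholder classes standing in for the tf_keras ones:

    from __future__ import annotations


    class Optimizer: ...        # placeholder for keras.optimizers.Optimizer
    class Adam(Optimizer): ...  # placeholder for keras.optimizers.Adam


    def fit(optimizer: Optimizer | None = None) -> Optimizer:
        # A fresh optimizer per call: the default no longer evaluates
        # keras.optimizers.Adam at import time (which would force the keras
        # import just to define the signature), and no optimizer state is
        # accidentally shared between successive fit() calls.
        return Adam() if optimizer is None else optimizer

Note that the background model still gets its own copy via `optimizer.__class__.from_config(optimizer.get_config())`, so the semantic and background models never share optimizer state.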
dataeval/detectors/ood/vae.py CHANGED
@@ -10,19 +10,27 @@ from __future__ import annotations
 
 __all__ = ["OOD_VAE"]
 
-from typing import Callable
+from typing import TYPE_CHECKING, Callable
 
 import numpy as np
-import tensorflow as tf
-import tf_keras as keras
 from numpy.typing import ArrayLike
 
 from dataeval.detectors.ood.base import OODBase, OODScoreOutput
 from dataeval.interop import to_numpy
-from dataeval.utils.tensorflow._internal.autoencoder import VAE
+from dataeval.utils.lazy import lazyload
 from dataeval.utils.tensorflow._internal.loss import Elbo
 from dataeval.utils.tensorflow._internal.utils import predict_batch
 
+if TYPE_CHECKING:
+    import tensorflow as tf
+    import tf_keras as keras
+
+    import dataeval.utils.tensorflow._internal.models as tf_models
+else:
+    tf = lazyload("tensorflow")
+    keras = lazyload("tf_keras")
+    tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
+
 
 class OOD_VAE(OODBase):
     """
@@ -51,7 +59,7 @@ class OOD_VAE(OODBase):
     >>> result = metric.predict(dataset, ood_type="feature")
     """
 
-    def __init__(self, model: VAE, samples: int = 10) -> None:
+    def __init__(self, model: tf_models.VAE, samples: int = 10) -> None:
         super().__init__(model)
         self.samples = samples
 
@@ -60,7 +68,7 @@ class OOD_VAE(OODBase):
         x_ref: ArrayLike,
         threshold_perc: float = 100.0,
         loss_fn: Callable[..., tf.Tensor] = Elbo(0.05),
-        optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
+        optimizer: keras.optimizers.Optimizer | None = None,
         epochs: int = 20,
         batch_size: int = 64,
         verbose: bool = True,
dataeval/detectors/ood/vaegmm.py CHANGED
@@ -10,20 +10,28 @@ from __future__ import annotations
 
 __all__ = ["OOD_VAEGMM"]
 
-from typing import Callable
+from typing import TYPE_CHECKING, Callable
 
 import numpy as np
-import tensorflow as tf
-import tf_keras as keras
 from numpy.typing import ArrayLike
 
 from dataeval.detectors.ood.base import OODGMMBase, OODScoreOutput
 from dataeval.interop import to_numpy
-from dataeval.utils.tensorflow._internal.autoencoder import VAEGMM
+from dataeval.utils.lazy import lazyload
 from dataeval.utils.tensorflow._internal.gmm import gmm_energy
 from dataeval.utils.tensorflow._internal.loss import Elbo, LossGMM
 from dataeval.utils.tensorflow._internal.utils import predict_batch
 
+if TYPE_CHECKING:
+    import tensorflow as tf
+    import tf_keras as keras
+
+    import dataeval.utils.tensorflow._internal.models as tf_models
+else:
+    tf = lazyload("tensorflow")
+    keras = lazyload("tf_keras")
+    tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
+
 
 class OOD_VAEGMM(OODGMMBase):
     """
@@ -37,7 +45,7 @@ class OOD_VAEGMM(OODGMMBase):
         Number of samples sampled to evaluate each instance.
     """
 
-    def __init__(self, model: VAEGMM, samples: int = 10) -> None:
+    def __init__(self, model: tf_models.VAEGMM, samples: int = 10) -> None:
         super().__init__(model)
         self.samples = samples
 
@@ -46,7 +54,7 @@ class OOD_VAEGMM(OODGMMBase):
         x_ref: ArrayLike,
         threshold_perc: float = 100.0,
         loss_fn: Callable[..., tf.Tensor] = LossGMM(elbo=Elbo(0.05)),
-        optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
+        optimizer: keras.optimizers.Optimizer | None = None,
         epochs: int = 20,
         batch_size: int = 64,
         verbose: bool = True,
dataeval/interop.py CHANGED
@@ -37,13 +37,15 @@ def to_numpy(array: ArrayLike | None, copy: bool = True) -> NDArray[Any]:
     if isinstance(array, np.ndarray):
         return array.copy() if copy else array
 
-    tf = _try_import("tensorflow")
-    if tf and tf.is_tensor(array):
-        return array.numpy().copy() if copy else array.numpy()  # type: ignore
-
-    torch = _try_import("torch")
-    if torch and isinstance(array, torch.Tensor):
-        return array.detach().cpu().numpy().copy() if copy else array.detach().cpu().numpy()  # type: ignore
+    if array.__class__.__module__.startswith("tensorflow"):
+        tf = _try_import("tensorflow")
+        if tf and tf.is_tensor(array):
+            return array.numpy().copy() if copy else array.numpy()  # type: ignore
+
+    if array.__class__.__module__.startswith("torch"):
+        torch = _try_import("torch")
+        if torch and isinstance(array, torch.Tensor):
+            return array.detach().cpu().numpy().copy() if copy else array.detach().cpu().numpy()  # type: ignore
 
     return np.array(array, copy=copy)
 
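The new guards dispatch on `array.__class__.__module__` before touching `_try_import`, so converting plain lists, tuples, or ndarrays never attempts a TensorFlow or PyTorch import. A small illustration of what the prefix check sees:

    import numpy as np

    def module_of(obj: object) -> str:
        # to_numpy keys its dispatch off this string's prefix
        return obj.__class__.__module__

    print(module_of([1, 2, 3]))   # builtins -> falls through to np.array()
    print(module_of(np.ones(3)))  # numpy    -> caught by the isinstance check above
    # A tf.Tensor reports a module starting with "tensorflow", and a
    # torch.Tensor one starting with "torch", which is what the guards test.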
dataeval/metrics/bias/balance.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 __all__ = ["BalanceOutput", "balance"]
 
+import contextlib
 import warnings
 from dataclasses import dataclass
 from typing import Any, Mapping
@@ -10,9 +11,12 @@ import numpy as np
 from numpy.typing import ArrayLike, NDArray
 from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
 
-from dataeval.metrics.bias.metadata import entropy, heatmap, preprocess_metadata
+from dataeval.metrics.bias.metadata import CLASS_LABEL, entropy, heatmap, preprocess_metadata
 from dataeval.output import OutputMetadata, set_metadata
 
+with contextlib.suppress(ImportError):
+    from matplotlib.figure import Figure
+
 
 @dataclass(frozen=True)
 class BalanceOutput(OutputMetadata):
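The suppressed import cooperates with `from __future__ import annotations` (visible in the first hunk header above): `BalanceOutput.plot` can be annotated `-> Figure` even when matplotlib is not installed, because the annotation is stored as a string and never evaluated at class-definition time. A minimal sketch of the same optional-dependency pattern:

    from __future__ import annotations

    import contextlib

    with contextlib.suppress(ImportError):
        from matplotlib.figure import Figure  # only needed when plotting


    def plot() -> Figure:
        # Defining plot() is safe without matplotlib; only calling it fails
        # (a NameError here, or an ImportError deeper in the real code).
        return Figure()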
@@ -27,45 +31,43 @@ class BalanceOutput(OutputMetadata):
         Estimate of inter/intra-factor mutual information
     classwise : NDArray[np.float64]
         Estimate of mutual information between metadata factors and individual class labels
-    class_list: NDArray[np.int64]
-        Class labels for each value in the dataset
-    metadata_names: list[str]
+    class_list : NDArray
+        Array of the class labels present in the dataset
+    metadata_names : list[str]
         Names of each metadata factor
     """
 
     balance: NDArray[np.float64]
     factors: NDArray[np.float64]
     classwise: NDArray[np.float64]
-
-    class_list: NDArray[np.int64]
+    class_list: NDArray[Any]
     metadata_names: list[str]
 
     def plot(
         self,
-        row_labels: NDArray[Any] | None = None,
-        col_labels: NDArray[Any] | None = None,
+        row_labels: list[Any] | NDArray[Any] | None = None,
+        col_labels: list[Any] | NDArray[Any] | None = None,
         plot_classwise: bool = False,
-    ) -> None:
+    ) -> Figure:
         """
         Plot a heatmap of balance information
 
         Parameters
         ----------
-        row_labels: NDArray | None, default None
-            Array containing the labels for rows in the histogram
-        col_labels: NDArray | None, default None
-            Array containing the labels for columns in the histogram
-        plot_classwise: bool, default False
+        row_labels : ArrayLike or None, default None
+            List/Array containing the labels for rows in the histogram
+        col_labels : ArrayLike or None, default None
+            List/Array containing the labels for columns in the histogram
+        plot_classwise : bool, default False
             Whether to plot per-class balance instead of global balance
-
         """
         if plot_classwise:
             if row_labels is None:
-                row_labels = np.unique(self.class_list)
+                row_labels = self.class_list
             if col_labels is None:
                 col_labels = np.concatenate((["class"], self.metadata_names))
 
-            heatmap(
+            fig = heatmap(
                 self.classwise,
                 row_labels,
                 col_labels,
@@ -74,6 +76,7 @@ class BalanceOutput(OutputMetadata):
                 cbarlabel="Normalized Mutual Information",
             )
         else:
+            # Combine balance and factors results
             data = np.concatenate([self.balance[np.newaxis, 1:], self.factors], axis=0)
             # Create a mask for the upper triangle of the symmetrical array, ignoring the diagonal
             mask = np.triu(data + 1, k=0) < 1
@@ -87,12 +90,9 @@ class BalanceOutput(OutputMetadata):
             if col_labels is None:
                 col_labels = heat_labels[1:]
 
-            heatmap(
-                heat_data,
-                row_labels,
-                col_labels,
-                cbarlabel="Normalized Mutual Information",
-            )
+            fig = heatmap(heat_data, row_labels, col_labels, cbarlabel="Normalized Mutual Information")
+
+        return fig
 
 
 def validate_num_neighbors(num_neighbors: int) -> int:
@@ -116,19 +116,29 @@ def validate_num_neighbors(num_neighbors: int) -> int:
 
 
 @set_metadata("dataeval.metrics")
-def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neighbors: int = 5) -> BalanceOutput:
+def balance(
+    class_labels: ArrayLike,
+    metadata: Mapping[str, ArrayLike],
+    num_neighbors: int = 5,
+    continuous_factor_bincounts: Mapping[str, int] | None = None,
+) -> BalanceOutput:
     """
     Mutual information (MI) between factors (class label, metadata, label/image properties)
 
     Parameters
     ----------
-    class_labels: ArrayLike
+    class_labels : ArrayLike
         List of class labels for each image
-    metadata: Mapping[str, ArrayLike]
+    metadata : Mapping[str, ArrayLike]
         Dict of lists of metadata factors for each image
-    num_neighbors: int, default 5
+    num_neighbors : int, default 5
         Number of nearest neighbors to use for computing MI between discrete
         and continuous variables.
+    continuous_factor_bincounts : Mapping[str, int] or None, default None
+        The factors in metadata that have continuous values and the array of bin counts to
+        discretize values into. All factors are treated as having discrete values unless they
+        are specified as keys in this dictionary. Each element of this array must occur as a key
+        in metadata.
 
     Returns
     -------
@@ -148,7 +158,7 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
     -------
     Return balance (mutual information) of factors with class_labels
 
-    >>> bal = balance(class_labels, metadata)
+    >>> bal = balance(class_labels, metadata, continuous_factor_bincounts=continuous_factor_bincounts)
     >>> bal.balance
     array([0.99999822, 0.13363788, 0.04505382, 0.02994455])
 
@@ -165,6 +175,7 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
     array([[0.99999822, 0.13363788, 0.        , 0.        ],
            [0.99999822, 0.13363788, 0.        , 0.        ]])
 
+
     See Also
     --------
     sklearn.feature_selection.mutual_info_classif
@@ -172,18 +183,15 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
     sklearn.metrics.mutual_info_score
     """
     num_neighbors = validate_num_neighbors(num_neighbors)
-    data, names, is_categorical = preprocess_metadata(class_labels, metadata)
+    data, names, is_categorical, unique_labels = preprocess_metadata(class_labels, metadata)
     num_factors = len(names)
     mi = np.empty((num_factors, num_factors))
     mi[:] = np.nan
 
-    class_idx = names.index("class_label")
-    class_lbl = np.array(data[:, class_idx], dtype=int)
-
     for idx in range(num_factors):
-        tgt = data[:, idx].astype(int)
+        tgt = data[:, idx].astype(np.intp)
 
-        if is_categorical[idx]:
+        if continuous_factor_bincounts and names[idx] not in continuous_factor_bincounts:
             mi[idx, :] = mutual_info_classif(
                 data,
                 tgt,
@@ -200,7 +208,7 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
                 random_state=0,
             )
 
-    ent_all = entropy(data, names, is_categorical, normalized=False)
+    ent_all = entropy(data, names, continuous_factor_bincounts, normalized=False)
     norm_factor = 0.5 * np.add.outer(ent_all, ent_all) + 1e-6
     # in principle MI should be symmetric, but it is not in practice.
     nmi = 0.5 * (mi + mi.T) / norm_factor
@@ -208,9 +216,8 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
     factors = nmi[1:, 1:]
 
     # unique class labels
-    class_idx = names.index("class_label")
-    class_data = data[:, class_idx].astype(int)
-    u_cls = np.unique(class_data)
+    class_idx = names.index(CLASS_LABEL)
+    u_cls = np.unique(data[:, class_idx])
     num_classes = len(u_cls)
 
     # assume class is a factor
@@ -218,12 +225,11 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
     classwise_mi[:] = np.nan
 
     # categorical variables, excluding class label
-    cat_mask = np.concatenate((is_categorical[:class_idx], is_categorical[(class_idx + 1) :]), axis=0).astype(int)
+    cat_mask = np.concatenate((is_categorical[:class_idx], is_categorical[(class_idx + 1) :]), axis=0).astype(np.intp)
 
-    tgt_bin = np.stack([class_data == cls for cls in u_cls]).T.astype(int)
-    ent_tgt_bin = entropy(
-        tgt_bin, names=[str(idx) for idx in range(num_classes)], is_categorical=[True for idx in range(num_classes)]
-    )
+    tgt_bin = np.stack([data[:, class_idx] == cls for cls in u_cls]).T.astype(np.intp)
+    names = [str(idx) for idx in range(num_classes)]
+    ent_tgt_bin = entropy(tgt_bin, names, continuous_factor_bincounts)
 
     # classification MI for discrete/categorical features
     for idx in range(num_classes):
@@ -240,4 +246,4 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
     norm_factor = 0.5 * np.add.outer(ent_tgt_bin, ent_all) + 1e-6
     classwise = classwise_mi / norm_factor
 
-    return BalanceOutput(balance, factors, classwise, class_lbl, list(metadata.keys()))
+    return BalanceOutput(balance, factors, classwise, unique_labels, list(metadata.keys()))
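Taken together, `balance` in 0.73.1 stops inferring categorical factors from the data (`is_categorical`) and instead takes an explicit `continuous_factor_bincounts` mapping, and `BalanceOutput.class_list` now holds the unique class labels rather than the per-sample label array. A hedged usage sketch — the factor names and values are invented, and the import path assumes the package's usual `dataeval.metrics.bias` namespace:

    import numpy as np
    from dataeval.metrics.bias import balance

    rng = np.random.default_rng(0)
    class_labels = rng.integers(0, 2, size=100)
    metadata = {
        "angle": rng.uniform(0.0, 90.0, size=100),  # continuous -> binned
        "location": rng.integers(0, 4, size=100),   # discrete by default
    }

    # Only "angle" is declared continuous; it is discretized into 5 bins.
    bal = balance(class_labels, metadata, continuous_factor_bincounts={"angle": 5})
    print(bal.balance)         # NMI of each factor with the class label
    print(bal.class_list)      # unique class labels (new in this release)
    print(bal.metadata_names)  # ["angle", "location"]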