dataeval 0.61.0__py3-none-any.whl → 0.64.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. dataeval/__init__.py +3 -3
  2. dataeval/_internal/detectors/clusterer.py +45 -16
  3. dataeval/_internal/detectors/drift/base.py +15 -12
  4. dataeval/_internal/detectors/drift/cvm.py +12 -8
  5. dataeval/_internal/detectors/drift/ks.py +7 -3
  6. dataeval/_internal/detectors/drift/mmd.py +15 -12
  7. dataeval/_internal/detectors/drift/uncertainty.py +6 -5
  8. dataeval/_internal/detectors/duplicates.py +35 -11
  9. dataeval/_internal/detectors/linter.py +85 -16
  10. dataeval/_internal/detectors/ood/ae.py +7 -5
  11. dataeval/_internal/detectors/ood/aegmm.py +6 -5
  12. dataeval/_internal/detectors/ood/base.py +15 -13
  13. dataeval/_internal/detectors/ood/llr.py +8 -5
  14. dataeval/_internal/detectors/ood/vae.py +6 -4
  15. dataeval/_internal/detectors/ood/vaegmm.py +6 -4
  16. dataeval/_internal/interop.py +43 -0
  17. dataeval/_internal/metrics/balance.py +180 -0
  18. dataeval/_internal/metrics/base.py +2 -84
  19. dataeval/_internal/metrics/ber.py +77 -53
  20. dataeval/_internal/metrics/coverage.py +80 -55
  21. dataeval/_internal/metrics/divergence.py +62 -54
  22. dataeval/_internal/metrics/diversity.py +206 -0
  23. dataeval/_internal/metrics/parity.py +292 -163
  24. dataeval/_internal/metrics/stats.py +48 -35
  25. dataeval/_internal/metrics/uap.py +31 -26
  26. dataeval/_internal/metrics/utils.py +237 -2
  27. dataeval/_internal/utils.py +64 -0
  28. dataeval/_internal/workflows/__init__.py +0 -0
  29. dataeval/metrics/__init__.py +25 -5
  30. dataeval/utils/__init__.py +9 -0
  31. {dataeval-0.61.0.dist-info → dataeval-0.64.0.dist-info}/METADATA +1 -2
  32. dataeval-0.64.0.dist-info/RECORD +60 -0
  33. dataeval/_internal/metrics/hash.py +0 -79
  34. dataeval-0.61.0.dist-info/RECORD +0 -55
  35. {dataeval-0.61.0.dist-info → dataeval-0.64.0.dist-info}/LICENSE.txt +0 -0
  36. {dataeval-0.61.0.dist-info → dataeval-0.64.0.dist-info}/WHEEL +0 -0
dataeval/_internal/detectors/linter.py
@@ -1,6 +1,7 @@
- from typing import Literal, Optional, Sequence, Union
+ from typing import Iterable, Literal, Optional, Sequence, Union

  import numpy as np
+ from numpy.typing import ArrayLike

  from dataeval._internal.flags import ImageProperty, ImageVisuals, LinterFlags
  from dataeval._internal.metrics.stats import ImageStats
@@ -30,25 +31,79 @@ def _get_outlier_mask(


  class Linter:
-     """
-     Calculates statistical outliers of a dataset using various statistical
-     tests applied to each image
+     r"""
+     Calculates statistical outliers of a dataset using various statistical tests applied to each image
+
+     Parameters
+     ----------
+     flags : [ImageProperty | ImageStatistics | ImageVisuals], default None
+         Metric(s) to calculate for each image - calculates all metrics if None
+     outlier_method : ["modzscore" | "zscore" | "iqr"], optional - default "modzscore"
+         Statistical method used to identify outliers
+     outlier_threshold : float, optional - default None
+         Threshold value for the given ``outlier_method``, above which data is considered an outlier.
+         Uses the method-specific default if `None`
+
+     Attributes
+     ----------
+     stats : ImageStats
+         Class to hold the value of each metric for each image
+
+     See Also
+     --------
+     Duplicates
+
+     Notes
+     -----
+     There are 3 different statistical methods:
+
+     - zscore
+     - modzscore
+     - iqr
+
+     | The z score method is based on the difference between the data point and the mean of the data.
+       The default threshold value for `zscore` is 3.
+     | Z score = :math:`|x_i - \mu| / \sigma`
+
+     | The modified z score method is based on the difference between the data point and the median of the data.
+       The default threshold value for `modzscore` is 3.5.
+     | Modified z score = :math:`0.6745 * |x_i - \tilde{x}| / MAD`, where :math:`MAD` is the median absolute deviation
+
+     | The interquartile range method is based on the difference between the data point and
+       the difference between the 75th and 25th quartile. The default threshold value for `iqr` is 1.5.
+     | Interquartile range = :math:`threshold * (Q_3 - Q_1)`
+
+     Examples
+     --------
+     Initialize the Linter class:
+
+     >>> lint = Linter()
+
+     Specifying specific metrics to analyze:
+
+     >>> lint = Linter(flags=[ImageProperty.SIZE, ImageVisuals.ALL])
+
+     Specifying an outlier method:
+
+     >>> lint = Linter(outlier_method="iqr")
+
+     Specifying an outlier method and threshold:
+
+     >>> lint = Linter(outlier_method="zscore", outlier_threshold=2.5)
      """

      def __init__(
          self,
-         images: np.ndarray,
          flags: Optional[Union[LinterFlags, Sequence[LinterFlags]]] = None,
+         outlier_method: Literal["zscore", "modzscore", "iqr"] = "modzscore",
+         outlier_threshold: Optional[float] = None,
      ):
          flags = flags if flags is not None else (ImageProperty.ALL, ImageVisuals.ALL)
          self.stats = ImageStats(flags)
-         self.images = images
+         self.outlier_method: Literal["zscore", "modzscore", "iqr"] = outlier_method
+         self.outlier_threshold = outlier_threshold

-     def _get_outliers(
-         self,
-         outlier_method: Literal["zscore", "modzscore", "iqr"] = "modzscore",
-         outlier_threshold: Optional[float] = None,
-     ) -> dict:
+     def _get_outliers(self) -> dict:
          flagged_images = {}

          for stat, values in self.results.items():
@@ -56,23 +111,37 @@ class Linter:
                  continue

              if values.ndim == 1 and np.std(values) != 0:
-                 mask = _get_outlier_mask(values, outlier_method, outlier_threshold)
+                 mask = _get_outlier_mask(values, self.outlier_method, self.outlier_threshold)
                  indices = np.flatnonzero(mask)
                  for i, value in zip(indices, values[mask]):
                      flagged_images.setdefault(i, {}).update({stat: np.round(value, 2)})

          return dict(sorted(flagged_images.items()))

-     def evaluate(self) -> dict:
+     def evaluate(self, images: Iterable[ArrayLike]) -> dict:
          """
-         Returns indices of outliers with and the issues identified for each
+         Returns indices of outliers with the issues identified for each
+
+         Parameters
+         ----------
+         images : Iterable[ArrayLike], shape - (N, C, H, W)
+             A dataset in an ArrayLike format.
+             Function expects the data to have 3 dimensions, CxHxW.

          Returns
          -------
          Dict[int, Dict[str, float]]
-             Dictionary containing the indices of outliers and a dictionary issues and calculated values
+             Dictionary containing the indices of outliers and a dictionary showing
+             the issues and calculated values for the given index.
+
+         Example
+         -------
+         Evaluate the dataset:
+
+         >>> lint.evaluate(images)
+         {18: {'brightness': 0.78}, 25: {'brightness': 0.98}}
          """
          self.stats.reset()
-         self.stats.update(self.images)
+         self.stats.update(images)
          self.results = self.stats.compute()
          return self._get_outliers()
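
The three `outlier_method` options map directly onto the formulas in the new docstring. As a reading aid, here is a minimal sketch of what `_get_outlier_mask` plausibly computes; the function body is not shown in this diff, so the names and defaults below are inferred from the docstring, not copied from the source:

import numpy as np

def get_outlier_mask(values, method="modzscore", threshold=None):
    """Boolean mask of outliers in a 1-D array of per-image statistics."""
    if method == "zscore":
        threshold = 3.0 if threshold is None else threshold
        # distance from the mean, in standard deviations
        return np.abs(values - np.mean(values)) / np.std(values) > threshold
    if method == "modzscore":
        threshold = 3.5 if threshold is None else threshold
        abs_diff = np.abs(values - np.median(values))
        mad = np.median(abs_diff)  # median absolute deviation
        return 0.6745 * abs_diff / mad > threshold
    if method == "iqr":
        threshold = 1.5 if threshold is None else threshold
        q1, q3 = np.percentile(values, [25, 75])
        margin = threshold * (q3 - q1)
        return (values < q1 - margin) | (values > q3 + margin)
    raise ValueError(f"Unknown outlier method: {method}")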
dataeval/_internal/detectors/ood/ae.py
@@ -10,8 +10,10 @@ from typing import Callable

  import keras
  import numpy as np
+ from numpy.typing import ArrayLike

  from dataeval._internal.detectors.ood.base import OODBase, OODScore
+ from dataeval._internal.interop import to_numpy
  from dataeval._internal.models.tensorflow.autoencoder import AE
  from dataeval._internal.models.tensorflow.utils import predict_batch

@@ -30,7 +32,7 @@ class OOD_AE(OODBase):

      def fit(
          self,
-         x_ref: np.ndarray,
+         x_ref: ArrayLike,
          threshold_perc: float = 100.0,
          loss_fn: Callable = keras.losses.MeanSquaredError(),
          optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
@@ -43,7 +45,7 @@ class OOD_AE(OODBase):

          Parameters
          ----------
-         x_ref : np.ndarray
+         x_ref : ArrayLike
              Training batch.
          threshold_perc : float, default 100.0
              Percentage of reference data that is normal.
@@ -58,10 +60,10 @@ class OOD_AE(OODBase):
          verbose : bool, default True
              Whether to print training progress.
          """
-         super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
+         super().fit(to_numpy(x_ref), threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)

-     def score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScore:
-         self._validate(X)
+     def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
+         self._validate(X := to_numpy(X))

          # reconstruct instances
          X_recon = predict_batch(X, self.model, batch_size=batch_size)
dataeval/_internal/detectors/ood/aegmm.py
@@ -9,9 +9,10 @@ Licensed under Apache Software License (Apache 2.0)
  from typing import Callable

  import keras
- import numpy as np
+ from numpy.typing import ArrayLike

  from dataeval._internal.detectors.ood.base import OODGMMBase, OODScore
+ from dataeval._internal.interop import to_numpy
  from dataeval._internal.models.tensorflow.autoencoder import AEGMM
  from dataeval._internal.models.tensorflow.gmm import gmm_energy
  from dataeval._internal.models.tensorflow.losses import LossGMM
@@ -32,7 +33,7 @@ class OOD_AEGMM(OODGMMBase):

      def fit(
          self,
-         x_ref: np.ndarray,
+         x_ref: ArrayLike,
          threshold_perc: float = 100.0,
          loss_fn: Callable = LossGMM(),
          optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
@@ -45,7 +46,7 @@ class OOD_AEGMM(OODGMMBase):

          Parameters
          ----------
-         x_ref : np.ndarray
+         x_ref : ArrayLike
              Training batch.
          threshold_perc : float, default 100.0
              Percentage of reference data that is normal.
@@ -62,8 +63,8 @@ class OOD_AEGMM(OODGMMBase):
          """
          super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)

-     def score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScore:
-         self._validate(X)
+     def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
+         self._validate(X := to_numpy(X))
          _, z, _ = predict_batch(X, self.model, batch_size=batch_size)
          energy, _ = gmm_energy(z, self.gmm_params, return_mean=False)
          return OODScore(energy.numpy())  # type: ignore
dataeval/_internal/detectors/ood/base.py
@@ -12,7 +12,9 @@ from typing import Callable, Dict, List, Literal, NamedTuple, Optional, Tuple, c
  import keras
  import numpy as np
  import tensorflow as tf
+ from numpy.typing import ArrayLike

+ from dataeval._internal.interop import to_numpy
  from dataeval._internal.models.tensorflow.gmm import GaussianMixtureModelParams, gmm_params
  from dataeval._internal.models.tensorflow.trainer import trainer

@@ -66,13 +68,13 @@ class OODBase(ABC):
          self._validate(X)

      @abstractmethod
-     def score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScore:
+     def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
          """
          Compute instance and (optionally) feature level outlier scores.

          Parameters
          ----------
-         X : np.ndarray
+         X : ArrayLike
              Batch of instances.
          batch_size : int, default int(1e10)
              Batch size used when making predictions with the autoencoder.
@@ -87,7 +89,7 @@ class OODBase(ABC):

      def fit(
          self,
-         x_ref: np.ndarray,
+         x_ref: ArrayLike,
          threshold_perc: float,
          loss_fn: Callable,
          optimizer: keras.optimizers.Optimizer,
@@ -100,7 +102,7 @@ class OODBase(ABC):

          Parameters
          ----------
-         x_ref : np.ndarray
+         x_ref : ArrayLike
              Training batch.
          threshold_perc : float
              Percentage of reference data that is normal.
@@ -119,7 +121,7 @@ class OODBase(ABC):
          trainer(
              model=self.model,
              loss_fn=loss_fn,
-             x_train=x_ref,
+             x_train=to_numpy(x_ref),
              optimizer=optimizer,
              epochs=epochs,
              batch_size=batch_size,
@@ -132,7 +134,7 @@ class OODBase(ABC):

      def predict(
          self,
-         X: np.ndarray,
+         X: ArrayLike,
          batch_size: int = int(1e10),
          ood_type: Literal["feature", "instance"] = "instance",
      ) -> Dict[str, np.ndarray]:
@@ -141,18 +143,18 @@ class OODBase(ABC):

          Parameters
          ----------
-         X
+         X : ArrayLike
              Batch of instances.
-         ood_type
-             Predict out-of-distribution at the 'feature' or 'instance' level.
-         batch_size
+         batch_size : int, default int(1e10)
              Batch size used when making predictions with the autoencoder.
+         ood_type : Literal["feature", "instance"], default "instance"
+             Predict out-of-distribution at the 'feature' or 'instance' level.

          Returns
          -------
          Dictionary containing the outlier predictions and both feature and instance level outlier scores.
          """
-         self._validate_state(X)
+         self._validate_state(X := to_numpy(X))
          # compute outlier scores
          score = self.score(X, batch_size=batch_size)
          ood_pred = (score.get(ood_type) > self._threshold_score(ood_type)).astype(int)
@@ -171,7 +173,7 @@ class OODGMMBase(OODBase):

      def fit(
          self,
-         x_ref: np.ndarray,
+         x_ref: ArrayLike,
          threshold_perc: float,
          loss_fn: Callable[[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor], tf.Tensor],
          optimizer: keras.optimizers.Optimizer,
@@ -183,7 +185,7 @@ class OODGMMBase(OODBase):
          trainer(
              model=self.model,
              loss_fn=loss_fn,
-             x_train=x_ref,
+             x_train=to_numpy(x_ref),
              optimizer=optimizer,
              epochs=epochs,
              batch_size=batch_size,
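
A pattern repeated throughout these hunks is `self._validate(X := to_numpy(X))`: the assignment expression (Python 3.8+) converts the input and rebinds the name in a single step, so every later use of `X` operates on the converted NumPy array. A standalone sketch of the idiom, with a placeholder `validate` rather than the library's actual checks:

import numpy as np
from numpy.typing import ArrayLike

def to_numpy(array: ArrayLike) -> np.ndarray:
    # simplified stand-in for the new dataeval._internal.interop.to_numpy
    return array if isinstance(array, np.ndarray) else np.asarray(array)

def validate(x: np.ndarray) -> None:
    # placeholder check; OODBase._validate performs the library's own validation
    if x.ndim < 2:
        raise ValueError("expected a batch of instances")

def score(X: ArrayLike) -> np.ndarray:
    # X is converted and rebound before validate() ever sees it, so the
    # remainder of the function works with a plain np.ndarray
    validate(X := to_numpy(X))
    return X.reshape(len(X), -1).mean(axis=1)  # illustrative per-instance score

print(score([[1.0, 2.0], [3.0, 4.0]]))  # [1.5 3.5]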
dataeval/_internal/detectors/ood/llr.py
@@ -14,8 +14,10 @@ import numpy as np
  import tensorflow as tf
  from keras.layers import Input
  from keras.models import Model
+ from numpy.typing import ArrayLike

  from dataeval._internal.detectors.ood.base import OODBase, OODScore
+ from dataeval._internal.interop import to_numpy
  from dataeval._internal.models.tensorflow.pixelcnn import PixelCNN
  from dataeval._internal.models.tensorflow.trainer import trainer
  from dataeval._internal.models.tensorflow.utils import predict_batch
@@ -125,7 +127,7 @@ class OOD_LLR(OODBase):

      def fit(
          self,
-         x_ref: np.ndarray,
+         x_ref: ArrayLike,
          threshold_perc: float = 100.0,
          loss_fn: Optional[Callable] = None,
          optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
@@ -141,7 +143,7 @@ class OOD_LLR(OODBase):

          Parameters
          ----------
-         x_ref : np.ndarray
+         x_ref : ArrayLike
              Training batch.
          threshold_perc : float, default 100.0
              Percentage of reference data that is normal.
@@ -163,6 +165,7 @@ class OOD_LLR(OODBase):
          mutate_batch_size: int, default int(1e10)
              Batch size used to generate the mutations for the background dataset.
          """
+         x_ref = to_numpy(x_ref)
          input_shape = x_ref.shape[1:]
          optimizer = optimizer() if isinstance(optimizer, type) else optimizer
          # Separate into two separate optimizers, one for semantic model and one for background model
@@ -178,7 +181,7 @@ class OOD_LLR(OODBase):

          # create background data
          mutate_fn = partial(mutate_fn, **mutate_fn_kwargs)
-         X_back = predict_batch(x_ref, mutate_fn, batch_size=mutate_batch_size, dtype=x_ref.dtype)
+         X_back = predict_batch(x_ref, mutate_fn, batch_size=mutate_batch_size, dtype=x_ref.dtype)  # type: ignore

          # prepare sequential data
          if self.sequential and not self.has_log_prob:
@@ -275,10 +278,10 @@ class OOD_LLR(OODBase):

      def score(
          self,
-         X: np.ndarray,
+         X: ArrayLike,
          batch_size: int = int(1e10),
      ) -> OODScore:
-         self._validate(X)
+         self._validate(X := to_numpy(X))
          fscore = -self._llr(X, True, batch_size=batch_size)
          iscore = -self._llr(X, False, batch_size=batch_size)
          return OODScore(iscore, fscore)
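
For context on the score computation above: OOD_LLR trains a semantic model on the reference data and a background model on mutated copies of it (the "create background data" step in the fit hunk), then scores instances by the negated difference of their log-likelihoods. A toy illustration of that arithmetic, using made-up log-probabilities rather than actual PixelCNN outputs, and assuming `_llr` returns the semantic-minus-background difference:

import numpy as np

def llr_ood_score(logp_semantic: np.ndarray, logp_background: np.ndarray) -> np.ndarray:
    # log-likelihood ratio per instance, negated so that larger values mean
    # more out-of-distribution, matching the `-self._llr(X, ...)` calls above
    return -(logp_semantic - logp_background)

logp_s = np.array([-2.1, -9.5])  # semantic model log-likelihoods
logp_b = np.array([-3.0, -3.2])  # background model log-likelihoods
print(llr_ood_score(logp_s, logp_b))  # [-0.9  6.3] -> second instance looks OOD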
dataeval/_internal/detectors/ood/vae.py
@@ -10,8 +10,10 @@ from typing import Callable

  import keras
  import numpy as np
+ from numpy.typing import ArrayLike

  from dataeval._internal.detectors.ood.base import OODBase, OODScore
+ from dataeval._internal.interop import to_numpy
  from dataeval._internal.models.tensorflow.autoencoder import VAE
  from dataeval._internal.models.tensorflow.losses import Elbo
  from dataeval._internal.models.tensorflow.utils import predict_batch
@@ -34,7 +36,7 @@ class OOD_VAE(OODBase):

      def fit(
          self,
-         x_ref: np.ndarray,
+         x_ref: ArrayLike,
          threshold_perc: float = 100.0,
          loss_fn: Callable = Elbo(0.05),
          optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
@@ -47,7 +49,7 @@ class OOD_VAE(OODBase):

          Parameters
          ----------
-         x_ref : np.ndarray
+         x_ref : ArrayLike
              Training batch.
          threshold_perc : float, default 100.0
              Percentage of reference data that is normal.
@@ -64,8 +66,8 @@ class OOD_VAE(OODBase):
          """
          super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)

-     def score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScore:
-         self._validate(X)
+     def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
+         self._validate(X := to_numpy(X))

          # sample reconstructed instances
          X_samples = np.repeat(X, self.samples, axis=0)
dataeval/_internal/detectors/ood/vaegmm.py
@@ -10,8 +10,10 @@ from typing import Callable

  import keras
  import numpy as np
+ from numpy.typing import ArrayLike

  from dataeval._internal.detectors.ood.base import OODGMMBase, OODScore
+ from dataeval._internal.interop import to_numpy
  from dataeval._internal.models.tensorflow.autoencoder import VAEGMM
  from dataeval._internal.models.tensorflow.gmm import gmm_energy
  from dataeval._internal.models.tensorflow.losses import Elbo, LossGMM
@@ -35,7 +37,7 @@ class OOD_VAEGMM(OODGMMBase):

      def fit(
          self,
-         x_ref: np.ndarray,
+         x_ref: ArrayLike,
          threshold_perc: float = 100.0,
          loss_fn: Callable = LossGMM(elbo=Elbo(0.05)),
          optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
@@ -48,7 +50,7 @@ class OOD_VAEGMM(OODGMMBase):

          Parameters
          ----------
-         X : np.ndarray
+         X : ArrayLike
              Training batch.
          threshold_perc : float, default 100.0
              Percentage of reference data that is normal.
@@ -65,8 +67,8 @@ class OOD_VAEGMM(OODGMMBase):
          """
          super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)

-     def score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScore:
-         self._validate(X)
+     def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
+         self._validate(X := to_numpy(X))

          # draw samples from latent space
          X_samples = np.repeat(X, self.samples, axis=0)
dataeval/_internal/interop.py (new file)
@@ -0,0 +1,43 @@
+ from importlib import import_module
+ from typing import Iterable, Optional
+
+ import numpy as np
+ from numpy.typing import ArrayLike
+
+ module_cache = {}
+
+
+ def try_import(module_name):
+     if module_name in module_cache:
+         return module_cache[module_name]
+
+     try:
+         module = import_module(module_name)
+     except ImportError:  # pragma: no cover - covered by test_mindeps.py
+         module = None
+
+     module_cache[module_name] = module
+     return module
+
+
+ def to_numpy(array: Optional[ArrayLike]) -> np.ndarray:
+     if array is None:
+         return np.ndarray([])
+
+     if isinstance(array, np.ndarray):
+         return array
+
+     tf = try_import("tensorflow")
+     if tf and tf.is_tensor(array):
+         return array.numpy()  # type: ignore
+
+     torch = try_import("torch")
+     if torch and isinstance(array, torch.Tensor):
+         return array.detach().cpu().numpy()  # type: ignore
+
+     return np.asarray(array)
+
+
+ def to_numpy_iter(iterable: Iterable[ArrayLike]):
+     for array in iterable:
+         yield to_numpy(array)
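
This new interop module is what lets the detectors and metrics accept framework-agnostic input. A short usage sketch of the behavior visible above (the torch lines are commented out since torch is an optional dependency):

import numpy as np
from dataeval._internal.interop import to_numpy, to_numpy_iter

print(to_numpy([[1, 2], [3, 4]]).shape)  # (2, 2), converted via np.asarray
x = np.ones((3, 3))
print(to_numpy(x) is x)                  # True: ndarrays pass through untouched

# With torch installed (optional dependency), a tensor with gradients would
# round-trip through .detach().cpu().numpy():
# import torch
# to_numpy(torch.ones(2, 3, requires_grad=True))  # ndarray of shape (2, 3)

for batch in to_numpy_iter([[1, 2], np.zeros(2)]):
    print(type(batch))  # every yielded element is an np.ndarray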
dataeval/_internal/metrics/balance.py (new file)
@@ -0,0 +1,180 @@
+ import warnings
+ from typing import Dict, List, NamedTuple, Sequence
+
+ import numpy as np
+ from numpy.typing import NDArray
+ from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
+
+ from dataeval._internal.metrics.utils import entropy, preprocess_metadata
+
+
+ class BalanceOutput(NamedTuple):
+     """
+     Attributes
+     ----------
+     mutual_information : NDArray[np.float64]
+         Estimate of mutual information between metadata factors and class label
+     """
+
+     mutual_information: NDArray[np.float64]
+
+
+ def validate_num_neighbors(num_neighbors: int) -> int:
+     if not isinstance(num_neighbors, (int, float)):
+         raise TypeError(
+             f"Variable {num_neighbors} is not a real-valued numeric type. "
+             "num_neighbors should be an int, greater than 0 and less than "
+             "the number of samples in the dataset"
+         )
+     if num_neighbors < 1:
+         raise ValueError(
+             f"Invalid value for {num_neighbors}. "
+             "Choose a value greater than 0 and less than the number of samples "
+             "in the dataset."
+         )
+     if isinstance(num_neighbors, float):
+         num_neighbors = int(num_neighbors)
+         warnings.warn(f"Variable {num_neighbors} is currently type float and will be truncated to type int.")
+
+     return num_neighbors
+
+
+ def balance(class_labels: Sequence[int], metadata: List[Dict], num_neighbors: int = 5) -> BalanceOutput:
+     """
+     Mutual information (MI) between factors (class label, metadata, label/image properties)
+
+     Parameters
+     ----------
+     class_labels : Sequence[int]
+         List of class labels for each image
+     metadata : List[Dict]
+         List of metadata factors for each image
+     num_neighbors : int, default 5
+         Number of nearest neighbors to use for computing MI between discrete
+         and continuous variables.
+
+     Returns
+     -------
+     BalanceOutput
+         (num_factors+1) x (num_factors+1) estimate of mutual information
+         between num_factors metadata factors and class label. Symmetry is enforced.
+
+     Notes
+     -----
+     We use `mutual_info_classif` from sklearn since class label is categorical.
+     `mutual_info_classif` outputs are consistent up to O(1e-4) and depend on a random
+     seed. MI is computed differently for categorical and continuous variables, and
+     we attempt to infer whether a variable is categorical by the fraction of unique
+     values in the dataset.
+
+     See Also
+     --------
+     sklearn.feature_selection.mutual_info_classif
+     sklearn.feature_selection.mutual_info_regression
+     sklearn.metrics.mutual_info_score
+     """
+     num_neighbors = validate_num_neighbors(num_neighbors)
+     data, names, is_categorical = preprocess_metadata(class_labels, metadata)
+     num_factors = len(names)
+     mi = np.empty((num_factors, num_factors))
+     mi[:] = np.nan
+
+     for idx in range(num_factors):
+         tgt = data[:, idx]
+
+         if is_categorical[idx]:
+             # categorical target
+             mi[idx, :] = mutual_info_classif(
+                 data,
+                 tgt,
+                 discrete_features=is_categorical,  # type: ignore
+                 n_neighbors=num_neighbors,
+             )
+         else:
+             # continuous variables
+             mi[idx, :] = mutual_info_regression(
+                 data,
+                 tgt,
+                 discrete_features=is_categorical,  # type: ignore
+                 n_neighbors=num_neighbors,
+             )
+
+     ent_all = entropy(data, names, is_categorical, normalized=False)
+     norm_factor = 0.5 * np.add.outer(ent_all, ent_all) + 1e-6
+     # in principle MI should be symmetric, but it is not in practice.
+     nmi = 0.5 * (mi + mi.T) / norm_factor
+
+     return BalanceOutput(nmi)
+
+
+ def balance_classwise(class_labels: Sequence[int], metadata: List[Dict], num_neighbors: int = 5) -> BalanceOutput:
+     """
+     Compute mutual information (analogous to correlation) between metadata factors
+     (class label, metadata, label/image properties) and individual class labels.
+
+     Parameters
+     ----------
+     class_labels : Sequence[int]
+         List of class labels for each image
+     metadata : List[Dict]
+         List of metadata factors for each image
+     num_neighbors : int, default 5
+         Number of nearest neighbors to use for computing MI between discrete
+         and continuous variables.
+
+     Notes
+     -----
+     We use `mutual_info_classif` from sklearn since class label is categorical.
+     `mutual_info_classif` outputs are consistent up to O(1e-4) and depend on a random
+     seed. MI is computed differently for categorical and continuous variables, so we
+     have to specify this with `is_categorical`.
+
+     Returns
+     -------
+     BalanceOutput
+         (num_classes x num_factors) estimate of mutual information between
+         num_factors metadata factors and individual class labels.
+
+     See Also
+     --------
+     sklearn.feature_selection.mutual_info_classif
+     sklearn.feature_selection.mutual_info_regression
+     sklearn.metrics.mutual_info_score
+     compute_mutual_information
+     """
+     num_neighbors = validate_num_neighbors(num_neighbors)
+     data, names, is_categorical = preprocess_metadata(class_labels, metadata)
+     num_factors = len(names)
+     # unique class labels
+     class_idx = names.index("class_label")
+     class_data = data[:, class_idx]
+     u_cls = np.unique(class_data)
+     num_classes = len(u_cls)
+
+     data_no_class = np.concatenate((data[:, :class_idx], data[:, (class_idx + 1) :]), axis=1)
+
+     # assume class is a factor
+     mi = np.empty((num_classes, num_factors - 1))
+     mi[:] = np.nan
+
+     # categorical variables, excluding class label
+     cat_mask = np.concatenate((is_categorical[:class_idx], is_categorical[(class_idx + 1) :]), axis=0).astype(int)
+
+     # classification MI for discrete/categorical features
+     for idx, cls in enumerate(u_cls):
+         tgt = class_data == cls
+         # units: nat
+         mi[idx, :] = mutual_info_classif(
+             data_no_class,
+             tgt,
+             discrete_features=cat_mask,  # type: ignore
+             n_neighbors=num_neighbors,
+         )
+
+     # let this recompute for all features including class label
+     ent_all = entropy(data, names, is_categorical)
+     ent_tgt = ent_all[class_idx]
+     ent_all = np.concatenate((ent_all[:class_idx], ent_all[(class_idx + 1) :]), axis=0)
+     norm_factor = 0.5 * np.add.outer(ent_tgt, ent_all) + 1e-6
+     nmi = mi / norm_factor
+     return BalanceOutput(nmi)
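
A usage sketch for the new balance metrics: the class labels and metadata factors below are invented for illustration, the encoding of string factors is handled by `preprocess_metadata` (whose details are outside this hunk), and real datasets need enough samples for the k-NN MI estimate to be meaningful.

from dataeval._internal.metrics.balance import balance, balance_classwise

class_labels = [0, 1, 0, 1, 0, 1, 0, 1]
metadata = [
    {"time_of_day": "day" if i % 2 == 0 else "night", "altitude": 100.0 + i}
    for i in range(8)
]

out = balance(class_labels, metadata, num_neighbors=3)
# square matrix over class label plus metadata factors, normalized by entropy
print(out.mutual_information.shape)

per_class = balance_classwise(class_labels, metadata, num_neighbors=3)
# one row per class, one column per metadata factor (class label excluded)
print(per_class.mutual_information.shape)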