dataeval 0.61.0__py3-none-any.whl → 0.63.0__py3-none-any.whl

This diff shows the publicly released contents of the two package versions as they appear in their respective public registries and is provided for informational purposes only.
Files changed (43)
  1. dataeval/__init__.py +1 -1
  2. dataeval/_internal/detectors/clusterer.py +44 -16
  3. dataeval/_internal/detectors/drift/base.py +14 -12
  4. dataeval/_internal/detectors/drift/cvm.py +11 -8
  5. dataeval/_internal/detectors/drift/ks.py +6 -3
  6. dataeval/_internal/detectors/drift/mmd.py +14 -12
  7. dataeval/_internal/detectors/drift/uncertainty.py +7 -5
  8. dataeval/_internal/detectors/duplicates.py +35 -12
  9. dataeval/_internal/detectors/linter.py +85 -16
  10. dataeval/_internal/detectors/ood/ae.py +6 -5
  11. dataeval/_internal/detectors/ood/aegmm.py +5 -5
  12. dataeval/_internal/detectors/ood/base.py +14 -13
  13. dataeval/_internal/detectors/ood/llr.py +6 -4
  14. dataeval/_internal/detectors/ood/vae.py +5 -4
  15. dataeval/_internal/detectors/ood/vaegmm.py +5 -4
  16. dataeval/_internal/functional/__init__.py +0 -0
  17. dataeval/_internal/functional/ber.py +63 -0
  18. dataeval/_internal/functional/coverage.py +75 -0
  19. dataeval/_internal/functional/divergence.py +16 -0
  20. dataeval/_internal/{metrics → functional}/hash.py +1 -1
  21. dataeval/_internal/functional/metadata.py +136 -0
  22. dataeval/_internal/functional/metadataparity.py +190 -0
  23. dataeval/_internal/functional/uap.py +6 -0
  24. dataeval/_internal/interop.py +52 -0
  25. dataeval/_internal/maite/__init__.py +0 -0
  26. dataeval/_internal/maite/utils.py +30 -0
  27. dataeval/_internal/metrics/base.py +2 -2
  28. dataeval/_internal/metrics/ber.py +16 -66
  29. dataeval/_internal/metrics/coverage.py +51 -35
  30. dataeval/_internal/metrics/divergence.py +50 -42
  31. dataeval/_internal/metrics/metadata.py +610 -0
  32. dataeval/_internal/metrics/metadataparity.py +67 -0
  33. dataeval/_internal/metrics/parity.py +40 -56
  34. dataeval/_internal/metrics/stats.py +46 -35
  35. dataeval/_internal/metrics/uap.py +14 -17
  36. dataeval/_internal/workflows/__init__.py +0 -0
  37. dataeval/metrics/__init__.py +2 -1
  38. {dataeval-0.61.0.dist-info → dataeval-0.63.0.dist-info}/METADATA +1 -2
  39. dataeval-0.63.0.dist-info/RECORD +68 -0
  40. dataeval-0.61.0.dist-info/RECORD +0 -55
  41. /dataeval/_internal/{metrics → functional}/utils.py +0 -0
  42. {dataeval-0.61.0.dist-info → dataeval-0.63.0.dist-info}/LICENSE.txt +0 -0
  43. {dataeval-0.61.0.dist-info → dataeval-0.63.0.dist-info}/WHEEL +0 -0
dataeval/_internal/detectors/linter.py

@@ -1,8 +1,9 @@
-from typing import Literal, Optional, Sequence, Union
+from typing import Iterable, Literal, Optional, Sequence, Union
 
 import numpy as np
 
 from dataeval._internal.flags import ImageProperty, ImageVisuals, LinterFlags
+from dataeval._internal.interop import ArrayLike
 from dataeval._internal.metrics.stats import ImageStats
 
 
@@ -30,25 +31,79 @@ def _get_outlier_mask(
 
 
 class Linter:
-    """
-    Calculates statistical outliers of a dataset using various statistical
-    tests applied to each image
+    r"""
+    Calculates statistical outliers of a dataset using various statistical tests applied to each image
+
+    Parameters
+    ----------
+    flags : [ImageProperty | ImageStatistics | ImageVisuals], default None
+        Metric(s) to calculate for each image - calculates all metrics if None
+    outlier_method : ["modzscore" | "zscore" | "iqr"], optional - default "modzscore"
+        Statistical method used to identify outliers
+    outlier_threshold : float, optional - default None
+        Threshold value for the given ``outlier_method``, above which data is considered an outlier.
+        Uses method specific default if `None`
+
+    Attributes
+    ----------
+    stats : ImageStats
+        Class to hold the value of each metric for each image
+
+    See Also
+    --------
+    Duplicates
+
+    Notes
+    -----
+    There are 3 different statistical methods:
+
+    - zscore
+    - modzscore
+    - iqr
+
+    | The z score method is based on the difference between the data point and the mean of the data.
+      The default threshold value for `zscore` is 3.
+    | Z score = :math:`|x_i - \mu| / \sigma`
+
+    | The modified z score method is based on the difference between the data point and the median of the data.
+      The default threshold value for `modzscore` is 3.5.
+    | Modified z score = :math:`0.6745 * |x_i - x̃| / MAD`, where :math:`MAD` is the median absolute deviation
+
+    | The interquartile range method is based on the difference between the data point and
+      the difference between the 75th and 25th quartile. The default threshold value for `iqr` is 1.5.
+    | Interquartile range = :math:`threshold * (Q_3 - Q_1)`
+
+    Examples
+    --------
+    Initialize the Linter class:
+
+    >>> lint = Linter()
+
+    Specifying specific metrics to analyze:
+
+    >>> lint = Linter(flags=[ImageProperty.SIZE, ImageVisuals.ALL])
+
+    Specifying an outlier method:
+
+    >>> lint = Linter(outlier_method="iqr")
+
+    Specifying an outlier method and threshold:
+
+    >>> lint = Linter(outlier_method="zscore", outlier_threshold=2.5)
     """
 
     def __init__(
         self,
-        images: np.ndarray,
         flags: Optional[Union[LinterFlags, Sequence[LinterFlags]]] = None,
+        outlier_method: Literal["zscore", "modzscore", "iqr"] = "modzscore",
+        outlier_threshold: Optional[float] = None,
     ):
         flags = flags if flags is not None else (ImageProperty.ALL, ImageVisuals.ALL)
         self.stats = ImageStats(flags)
-        self.images = images
+        self.outlier_method: Literal["zscore", "modzscore", "iqr"] = outlier_method
+        self.outlier_threshold = outlier_threshold
 
-    def _get_outliers(
-        self,
-        outlier_method: Literal["zscore", "modzscore", "iqr"] = "modzscore",
-        outlier_threshold: Optional[float] = None,
-    ) -> dict:
+    def _get_outliers(self) -> dict:
         flagged_images = {}
 
         for stat, values in self.results.items():
@@ -56,23 +111,37 @@ class Linter:
                 continue
 
             if values.ndim == 1 and np.std(values) != 0:
-                mask = _get_outlier_mask(values, outlier_method, outlier_threshold)
+                mask = _get_outlier_mask(values, self.outlier_method, self.outlier_threshold)
                 indices = np.flatnonzero(mask)
                 for i, value in zip(indices, values[mask]):
                     flagged_images.setdefault(i, {}).update({stat: np.round(value, 2)})
 
         return dict(sorted(flagged_images.items()))
 
-    def evaluate(self) -> dict:
+    def evaluate(self, images: Iterable[ArrayLike]) -> dict:
         """
-        Returns indices of outliers with and the issues identified for each
+        Returns indices of outliers with the issues identified for each
+
+        Parameters
+        ----------
+        images : Iterable[ArrayLike], shape - (N, C, H, W)
+            A dataset in an ArrayLike format.
+            Function expects the data to have 3 dimensions, CxHxW.
 
         Returns
        -------
        Dict[int, Dict[str, float]]
-            Dictionary containing the indices of outliers and a dictionary issues and calculated values
+            Dictionary containing the indices of outliers and a dictionary showing
+            the issues and calculated values for the given index.
+
+        Example
+        -------
+        Evaluate the dataset:
+
+        >>> lint.evaluate(images)
+        {18: {'brightness': 0.78}, 25: {'brightness': 0.98}}
         """
         self.stats.reset()
-        self.stats.update(self.images)
+        self.stats.update(images)
         self.results = self.stats.compute()
        return self._get_outliers()
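The three statistical methods described in the Notes section above can be expressed in a few lines. The following is an editor's sketch of those rules for a 1-D array of per-image statistics; it is not the package's `_get_outlier_mask`, whose implementation is not shown in this diff:

```python
from typing import Optional

import numpy as np


def outlier_mask_sketch(values: np.ndarray, method: str = "modzscore", threshold: Optional[float] = None) -> np.ndarray:
    """Illustrative restatement of the zscore / modzscore / iqr rules from the docstring above."""
    values = np.asarray(values, dtype=np.float64)
    if method == "zscore":
        threshold = 3.0 if threshold is None else threshold
        return np.abs(values - values.mean()) / values.std() > threshold   # |x_i - mean| / std
    if method == "modzscore":
        threshold = 3.5 if threshold is None else threshold
        median = np.median(values)
        mad = np.median(np.abs(values - median))                           # median absolute deviation
        return 0.6745 * np.abs(values - median) / mad > threshold
    if method == "iqr":
        threshold = 1.5 if threshold is None else threshold
        q1, q3 = np.percentile(values, [25, 75])
        spread = threshold * (q3 - q1)                                     # threshold * (Q3 - Q1)
        return (values < q1 - spread) | (values > q3 + spread)
    raise ValueError(f"Unknown outlier method: {method}")
```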
dataeval/_internal/detectors/ood/ae.py

@@ -12,6 +12,7 @@ import keras
 import numpy as np
 
 from dataeval._internal.detectors.ood.base import OODBase, OODScore
+from dataeval._internal.interop import ArrayLike, to_numpy
 from dataeval._internal.models.tensorflow.autoencoder import AE
 from dataeval._internal.models.tensorflow.utils import predict_batch
 
@@ -30,7 +31,7 @@ class OOD_AE(OODBase):
 
     def fit(
         self,
-        x_ref: np.ndarray,
+        x_ref: ArrayLike,
         threshold_perc: float = 100.0,
         loss_fn: Callable = keras.losses.MeanSquaredError(),
         optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
@@ -43,7 +44,7 @@ class OOD_AE(OODBase):
 
         Parameters
         ----------
-        x_ref : np.ndarray
+        x_ref : ArrayLike
             Training batch.
         threshold_perc : float, default 100.0
             Percentage of reference data that is normal.
@@ -58,10 +59,10 @@ class OOD_AE(OODBase):
         verbose : bool, default True
             Whether to print training progress.
         """
-        super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
+        super().fit(to_numpy(x_ref), threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
 
-    def score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScore:
-        self._validate(X)
+    def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
+        self._validate(X := to_numpy(X))
 
         # reconstruct instances
         X_recon = predict_batch(X, self.model, batch_size=batch_size)
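The new `dataeval._internal.interop` module (+52 lines, listed above but not shown in this diff) supplies the `ArrayLike` alias and the `to_numpy` converter used throughout these signatures. The sketch below is an assumption about what such a converter typically looks like, not the module's actual contents:

```python
from typing import Any

import numpy as np

# Assumed behavior only -- dataeval._internal.interop is not shown in this diff.
ArrayLike = Any  # placeholder; the real alias may come from maite or numpy.typing


def to_numpy(array: ArrayLike) -> np.ndarray:
    """Best-effort conversion of tensor-like inputs to an ndarray."""
    if isinstance(array, np.ndarray):
        return array
    # Framework tensors commonly expose .numpy(); GPU tensors may first need .detach().cpu()
    if hasattr(array, "numpy"):
        return array.numpy()
    return np.asarray(array)
```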
dataeval/_internal/detectors/ood/aegmm.py

@@ -9,9 +9,9 @@ Licensed under Apache Software License (Apache 2.0)
 from typing import Callable
 
 import keras
-import numpy as np
 
 from dataeval._internal.detectors.ood.base import OODGMMBase, OODScore
+from dataeval._internal.interop import ArrayLike, to_numpy
 from dataeval._internal.models.tensorflow.autoencoder import AEGMM
 from dataeval._internal.models.tensorflow.gmm import gmm_energy
 from dataeval._internal.models.tensorflow.losses import LossGMM
@@ -32,7 +32,7 @@ class OOD_AEGMM(OODGMMBase):
 
     def fit(
         self,
-        x_ref: np.ndarray,
+        x_ref: ArrayLike,
         threshold_perc: float = 100.0,
         loss_fn: Callable = LossGMM(),
         optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
@@ -45,7 +45,7 @@ class OOD_AEGMM(OODGMMBase):
 
         Parameters
         ----------
-        x_ref : np.ndarray
+        x_ref : ArrayLike
             Training batch.
         threshold_perc : float, default 100.0
             Percentage of reference data that is normal.
@@ -62,8 +62,8 @@ class OOD_AEGMM(OODGMMBase):
         """
         super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
 
-    def score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScore:
-        self._validate(X)
+    def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
+        self._validate(X := to_numpy(X))
         _, z, _ = predict_batch(X, self.model, batch_size=batch_size)
         energy, _ = gmm_energy(z, self.gmm_params, return_mean=False)
         return OODScore(energy.numpy())  # type: ignore
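The `score` methods above all use the same assignment-expression idiom, `self._validate(X := to_numpy(X))`, which converts the input and rebinds the local name in a single step. A standalone illustration of the idiom (not dataeval code):

```python
import numpy as np


def validate(arr: np.ndarray) -> None:
    # Stand-in for the detector's _validate step; only checks dimensionality here
    if arr.ndim < 2:
        raise ValueError("expected a batch of instances")


def score(X) -> np.ndarray:
    # "X := np.asarray(X)" converts the input and rebinds the name before validate() runs,
    # so everything after this call can assume X is an ndarray.
    validate(X := np.asarray(X))
    return X.mean(axis=tuple(range(1, X.ndim)))


print(score([[1.0, 2.0], [3.0, 4.0]]))  # -> [1.5 3.5]
```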
dataeval/_internal/detectors/ood/base.py

@@ -13,6 +13,7 @@ import keras
 import numpy as np
 import tensorflow as tf
 
+from dataeval._internal.interop import ArrayLike, to_numpy
 from dataeval._internal.models.tensorflow.gmm import GaussianMixtureModelParams, gmm_params
 from dataeval._internal.models.tensorflow.trainer import trainer
 
@@ -66,13 +67,13 @@ class OODBase(ABC):
         self._validate(X)
 
     @abstractmethod
-    def score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScore:
+    def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
         """
         Compute instance and (optionally) feature level outlier scores.
 
         Parameters
         ----------
-        X : np.ndarray
+        X : ArrayLike
             Batch of instances.
         batch_size : int, default int(1e10)
             Batch size used when making predictions with the autoencoder.
@@ -87,7 +88,7 @@ class OODBase(ABC):
 
     def fit(
         self,
-        x_ref: np.ndarray,
+        x_ref: ArrayLike,
         threshold_perc: float,
         loss_fn: Callable,
         optimizer: keras.optimizers.Optimizer,
@@ -100,7 +101,7 @@ class OODBase(ABC):
 
         Parameters
         ----------
-        x_ref : np.ndarray
+        x_ref : ArrayLike
             Training batch.
         threshold_perc : float
             Percentage of reference data that is normal.
@@ -119,7 +120,7 @@ class OODBase(ABC):
         trainer(
             model=self.model,
             loss_fn=loss_fn,
-            x_train=x_ref,
+            x_train=to_numpy(x_ref),
             optimizer=optimizer,
             epochs=epochs,
             batch_size=batch_size,
@@ -132,7 +133,7 @@ class OODBase(ABC):
 
     def predict(
         self,
-        X: np.ndarray,
+        X: ArrayLike,
         batch_size: int = int(1e10),
         ood_type: Literal["feature", "instance"] = "instance",
     ) -> Dict[str, np.ndarray]:
@@ -141,18 +142,18 @@ class OODBase(ABC):
 
         Parameters
         ----------
-        X
+        X : ArrayLike
             Batch of instances.
-        ood_type
-            Predict out-of-distribution at the 'feature' or 'instance' level.
-        batch_size
+        batch_size : int, default int(1e10)
             Batch size used when making predictions with the autoencoder.
+        ood_type : Literal["feature", "instance"], default "instance"
+            Predict out-of-distribution at the 'feature' or 'instance' level.
 
         Returns
         -------
         Dictionary containing the outlier predictions and both feature and instance level outlier scores.
         """
-        self._validate_state(X)
+        self._validate_state(X := to_numpy(X))
         # compute outlier scores
         score = self.score(X, batch_size=batch_size)
         ood_pred = (score.get(ood_type) > self._threshold_score(ood_type)).astype(int)
@@ -171,7 +172,7 @@ class OODGMMBase(OODBase):
 
     def fit(
         self,
-        x_ref: np.ndarray,
+        x_ref: ArrayLike,
         threshold_perc: float,
         loss_fn: Callable[[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor], tf.Tensor],
         optimizer: keras.optimizers.Optimizer,
@@ -183,7 +184,7 @@ class OODGMMBase(OODBase):
         trainer(
             model=self.model,
             loss_fn=loss_fn,
-            x_train=x_ref,
+            x_train=to_numpy(x_ref),
             optimizer=optimizer,
             epochs=epochs,
             batch_size=batch_size,
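The `threshold_perc` argument in these `fit` signatures sets the detection threshold as a percentile of the scores computed on the reference data ("percentage of reference data that is normal"), and `predict` then flags instances whose score exceeds that threshold, as in `score > self._threshold_score(...)`. A small illustration of the general idea, not dataeval's internal `_threshold_score`:

```python
import numpy as np

rng = np.random.default_rng(0)
ref_scores = rng.gamma(2.0, 1.0, size=1000)       # stand-in for instance scores on x_ref
threshold_perc = 99.0                              # "99% of the reference data is normal"
threshold = np.percentile(ref_scores, threshold_perc)

new_scores = rng.gamma(2.0, 1.0, size=10)          # scores for new instances
ood_pred = (new_scores > threshold).astype(int)    # mirrors: score > threshold -> flagged as OOD
print(ood_pred)
```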
dataeval/_internal/detectors/ood/llr.py

@@ -16,6 +16,7 @@ from keras.layers import Input
 from keras.models import Model
 
 from dataeval._internal.detectors.ood.base import OODBase, OODScore
+from dataeval._internal.interop import ArrayLike, to_numpy
 from dataeval._internal.models.tensorflow.pixelcnn import PixelCNN
 from dataeval._internal.models.tensorflow.trainer import trainer
 from dataeval._internal.models.tensorflow.utils import predict_batch
@@ -125,7 +126,7 @@ class OOD_LLR(OODBase):
 
     def fit(
         self,
-        x_ref: np.ndarray,
+        x_ref: ArrayLike,
         threshold_perc: float = 100.0,
         loss_fn: Optional[Callable] = None,
         optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
@@ -141,7 +142,7 @@ class OOD_LLR(OODBase):
 
         Parameters
         ----------
-        x_ref : np.ndarray
+        x_ref : ArrayLike
             Training batch.
         threshold_perc : float, default 100.0
             Percentage of reference data that is normal.
@@ -163,6 +164,7 @@ class OOD_LLR(OODBase):
         mutate_batch_size: int, default int(1e10)
             Batch size used to generate the mutations for the background dataset.
         """
+        x_ref = to_numpy(x_ref)
         input_shape = x_ref.shape[1:]
         optimizer = optimizer() if isinstance(optimizer, type) else optimizer
         # Separate into two separate optimizers, one for semantic model and one for background model
@@ -275,10 +277,10 @@ class OOD_LLR(OODBase):
 
     def score(
         self,
-        X: np.ndarray,
+        X: ArrayLike,
         batch_size: int = int(1e10),
     ) -> OODScore:
-        self._validate(X)
+        self._validate(X := to_numpy(X))
         fscore = -self._llr(X, True, batch_size=batch_size)
         iscore = -self._llr(X, False, batch_size=batch_size)
         return OODScore(iscore, fscore)
dataeval/_internal/detectors/ood/vae.py

@@ -12,6 +12,7 @@ import keras
 import numpy as np
 
 from dataeval._internal.detectors.ood.base import OODBase, OODScore
+from dataeval._internal.interop import ArrayLike, to_numpy
 from dataeval._internal.models.tensorflow.autoencoder import VAE
 from dataeval._internal.models.tensorflow.losses import Elbo
 from dataeval._internal.models.tensorflow.utils import predict_batch
@@ -34,7 +35,7 @@ class OOD_VAE(OODBase):
 
     def fit(
         self,
-        x_ref: np.ndarray,
+        x_ref: ArrayLike,
         threshold_perc: float = 100.0,
         loss_fn: Callable = Elbo(0.05),
         optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
@@ -47,7 +48,7 @@ class OOD_VAE(OODBase):
 
         Parameters
         ----------
-        x_ref : np.ndarray
+        x_ref : ArrayLike
             Training batch.
         threshold_perc : float, default 100.0
             Percentage of reference data that is normal.
@@ -64,8 +65,8 @@ class OOD_VAE(OODBase):
         """
         super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
 
-    def score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScore:
-        self._validate(X)
+    def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
+        self._validate(X := to_numpy(X))
 
         # sample reconstructed instances
         X_samples = np.repeat(X, self.samples, axis=0)
dataeval/_internal/detectors/ood/vaegmm.py

@@ -12,6 +12,7 @@ import keras
 import numpy as np
 
 from dataeval._internal.detectors.ood.base import OODGMMBase, OODScore
+from dataeval._internal.interop import ArrayLike, to_numpy
 from dataeval._internal.models.tensorflow.autoencoder import VAEGMM
 from dataeval._internal.models.tensorflow.gmm import gmm_energy
 from dataeval._internal.models.tensorflow.losses import Elbo, LossGMM
@@ -35,7 +36,7 @@ class OOD_VAEGMM(OODGMMBase):
 
     def fit(
         self,
-        x_ref: np.ndarray,
+        x_ref: ArrayLike,
         threshold_perc: float = 100.0,
         loss_fn: Callable = LossGMM(elbo=Elbo(0.05)),
         optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
@@ -48,7 +49,7 @@ class OOD_VAEGMM(OODGMMBase):
 
         Parameters
         ----------
-        X : np.ndarray
+        X : ArrayLike
             Training batch.
         threshold_perc : float, default 100.0
             Percentage of reference data that is normal.
@@ -65,8 +66,8 @@ class OOD_VAEGMM(OODGMMBase):
         """
         super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
 
-    def score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScore:
-        self._validate(X)
+    def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
+        self._validate(X := to_numpy(X))
 
         # draw samples from latent space
         X_samples = np.repeat(X, self.samples, axis=0)
dataeval/_internal/functional/ber.py (new file)

@@ -0,0 +1,63 @@
+from typing import Tuple
+
+import numpy as np
+from scipy.sparse import coo_matrix
+from scipy.stats import mode
+
+from dataeval._internal.functional.utils import compute_neighbors, get_classes_counts, minimum_spanning_tree
+
+
+def ber_mst(X: np.ndarray, y: np.ndarray, _: int) -> Tuple[float, float]:
+    """Calculates the Bayes Error Rate using a minimum spanning tree
+
+    Parameters
+    ----------
+    X : np.ndarray (N, :)
+        Data points with arbitrary dimensionality
+    y : np.ndarray (N, 1)
+        Labels for each data point
+    """
+
+    M, N = get_classes_counts(y)
+
+    tree = coo_matrix(minimum_spanning_tree(X))
+    matches = np.sum([y[tree.row[i]] != y[tree.col[i]] for i in range(N - 1)])
+    deltas = matches / (2 * N)
+    upper = 2 * deltas
+    lower = ((M - 1) / (M)) * (1 - max(1 - 2 * ((M) / (M - 1)) * deltas, 0) ** 0.5)
+    return upper, lower
+
+
+def ber_knn(X: np.ndarray, y: np.ndarray, k: int) -> Tuple[float, float]:
+    """Calculates the Bayes Error Rate using K-nearest neighbors"""
+
+    M, N = get_classes_counts(y)
+
+    # All features belong on second dimension
+    X = X.reshape((X.shape[0], -1))
+    nn_indices = compute_neighbors(X, X, k=k)
+    nn_indices = np.expand_dims(nn_indices, axis=1) if nn_indices.ndim == 1 else nn_indices
+    modal_class = mode(y[nn_indices], axis=1, keepdims=True).mode.squeeze()
+    upper = float(np.count_nonzero(modal_class - y) / N)
+    lower = _knn_lowerbound(upper, M, k)
+    return upper, lower
+
+
+def _knn_lowerbound(value: float, classes: int, k: int) -> float:
+    """Several cases for computing the BER lower bound"""
+    if value <= 1e-10:
+        return 0.0
+
+    if classes == 2 and k != 1:
+        if k > 5:
+            # Property 2 (Devroye, 1981) cited in Snoopy paper, not in snoopy repo
+            alpha = 0.3399
+            beta = 0.9749
+            a_k = alpha * np.sqrt(k) / (k - 3.25) * (1 + beta / (np.sqrt(k - 3)))
+            return value / (1 + a_k)
+        if k > 2:
+            return value / (1 + (1 / np.sqrt(k)))
+        # k == 2:
+        return value / 2
+
+    return ((classes - 1) / classes) * (1 - np.sqrt(max(0, 1 - ((classes / (classes - 1)) * value))))
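A brief usage sketch for the two estimators above on synthetic two-class data (illustrative only; the helpers they import from `functional/utils.py` are not shown here, and in the released package these functions are presumably reached through the public BER metric rather than called directly):

```python
import numpy as np

# Synthetic two-class data: two Gaussian blobs in 2-D
rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0.0, 1.0, (500, 2)), rng.normal(2.5, 1.0, (500, 2))])
y = np.concatenate([np.zeros(500, dtype=int), np.ones(500, dtype=int)])

upper, lower = ber_knn(X, y, k=5)   # KNN-based upper/lower BER bounds
# upper, lower = ber_mst(X, y, 0)   # MST-based bounds (third argument is unused)
print(f"BER bounds: {lower:.3f} - {upper:.3f}")
```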
dataeval/_internal/functional/coverage.py (new file)

@@ -0,0 +1,75 @@
+import math
+from typing import Literal, Tuple
+
+import numpy as np
+from scipy.spatial.distance import pdist, squareform
+
+
+def coverage(
+    embeddings: np.ndarray,
+    radius_type: Literal["adaptive", "naive"] = "adaptive",
+    k: int = 20,
+    percent: np.float64 = np.float64(0.01),
+) -> Tuple[np.ndarray, np.ndarray, float]:
+    """
+    Estimate coverage of the embedding space and identify uncovered observations
+    based on each observation's distance to its k-th nearest neighbor.
+
+    Parameters
+    ----------
+    embeddings : ArrayLike, shape - (N, P)
+        A dataset in an ArrayLike format.
+        Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
+    radius_type : Literal["adaptive", "naive"], default "adaptive"
+        The function used to determine radius.
+    k : int, default 20
+        Number of observations required in order to be covered.
+        [1] suggests that a minimum of 20-50 samples is necessary.
+    percent : np.float64, default np.float64(0.01)
+        Percent of observations to be considered uncovered. Only applies to adaptive radius.
+
+    Returns
+    -------
+    np.ndarray
+        Array of uncovered indices
+    np.ndarray
+        Array of critical value radii
+    float
+        Radius for coverage
+
+    Raises
+    ------
+    ValueError
+        If length of embeddings is less than or equal to k
+    ValueError
+        If radius_type is unknown
+
+    Note
+    ----
+    Embeddings should be on the unit interval.
+
+    Reference
+    ---------
+    This implementation is based on https://dl.acm.org/doi/abs/10.1145/3448016.3457315.
+    [1] Seymour Sudman. 1976. Applied sampling. Academic Press New York (1976).
+    """
+
+    # Calculate distance matrix, look at the (k+1)th farthest neighbor for each image.
+    n = len(embeddings)
+    if n <= k:
+        raise ValueError("Number of observations less than or equal to the specified number of neighbors.")
+    mat = squareform(pdist(embeddings))
+    sorted_dists = np.sort(mat, axis=1)
+    crit = sorted_dists[:, k + 1]
+
+    d = np.shape(embeddings)[1]
+    if radius_type == "naive":
+        rho = (1 / math.sqrt(math.pi)) * ((2 * k * math.gamma(d / 2 + 1)) / (n)) ** (1 / d)
+        pvals = np.where(crit > rho)[0]
+    elif radius_type == "adaptive":
+        # Use data adaptive cutoff as rho
+        rho = int(n * percent)
+        pvals = np.argsort(crit)[::-1][:rho]
+    else:
+        raise ValueError("Invalid radius type.")
+    return pvals, crit, rho
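A short usage sketch for the function above, on random embeddings scaled to the unit interval as the Note requires (illustrative only):

```python
import numpy as np

rng = np.random.default_rng(0)
embeddings = rng.uniform(0.0, 1.0, size=(200, 16))   # N=200 observations in a 16-dimensional space

# Adaptive radius: flags the 1% of observations with the largest k-NN distance
uncovered_idx, crit_radii, rho = coverage(embeddings, radius_type="adaptive", k=20, percent=np.float64(0.01))
print(uncovered_idx)       # indices of the least-covered observations
print(crit_radii.shape)    # one critical-value radius per observation -> (200,)
```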
dataeval/_internal/functional/divergence.py (new file)

@@ -0,0 +1,16 @@
+import numpy as np
+
+from .utils import compute_neighbors, minimum_spanning_tree
+
+
+def divergence_mst(data: np.ndarray, labels: np.ndarray) -> int:
+    mst = minimum_spanning_tree(data).toarray()
+    edgelist = np.transpose(np.nonzero(mst))
+    errors = np.sum(labels[edgelist[:, 0]] != labels[edgelist[:, 1]])
+    return errors
+
+
+def divergence_fnn(data: np.ndarray, labels: np.ndarray) -> int:
+    nn_indices = compute_neighbors(data, data)
+    errors = np.sum(np.abs(labels[nn_indices] - labels))
+    return errors
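Both helpers count label disagreements between neighboring points from the two pooled datasets. The sketch below shows how such an error count is typically turned into a divergence estimate; the actual wrapper lives in `dataeval/_internal/metrics/divergence.py`, which is not shown in this diff, so the normalization used here is an assumption:

```python
import numpy as np

rng = np.random.default_rng(0)
emb_a = rng.normal(0.0, 1.0, size=(500, 8))     # embeddings from dataset A
emb_b = rng.normal(0.5, 1.0, size=(500, 8))     # embeddings from dataset B

data = np.vstack([emb_a, emb_b])
labels = np.concatenate([np.zeros(500, dtype=int), np.ones(500, dtype=int)])

errors = divergence_fnn(data, labels)           # or divergence_mst(data, labels)
# Assumed HP-divergence-style normalization: 0 = indistinguishable, 1 = fully separable
dp = max(0.0, 1.0 - 2.0 * errors / len(labels))
print(dp)
```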
dataeval/_internal/functional/hash.py (renamed from dataeval/_internal/metrics/hash.py)

@@ -3,7 +3,7 @@ import xxhash as xxh
 from PIL import Image
 from scipy.fftpack import dct
 
-from dataeval._internal.metrics.utils import normalize_image_shape, rescale
+from dataeval._internal.functional.utils import normalize_image_shape, rescale
 
 HASH_SIZE = 8
 MAX_FACTOR = 4