dataeval 0.61.0__py3-none-any.whl → 0.63.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +1 -1
- dataeval/_internal/detectors/clusterer.py +44 -16
- dataeval/_internal/detectors/drift/base.py +14 -12
- dataeval/_internal/detectors/drift/cvm.py +11 -8
- dataeval/_internal/detectors/drift/ks.py +6 -3
- dataeval/_internal/detectors/drift/mmd.py +14 -12
- dataeval/_internal/detectors/drift/uncertainty.py +7 -5
- dataeval/_internal/detectors/duplicates.py +35 -12
- dataeval/_internal/detectors/linter.py +85 -16
- dataeval/_internal/detectors/ood/ae.py +6 -5
- dataeval/_internal/detectors/ood/aegmm.py +5 -5
- dataeval/_internal/detectors/ood/base.py +14 -13
- dataeval/_internal/detectors/ood/llr.py +6 -4
- dataeval/_internal/detectors/ood/vae.py +5 -4
- dataeval/_internal/detectors/ood/vaegmm.py +5 -4
- dataeval/_internal/functional/__init__.py +0 -0
- dataeval/_internal/functional/ber.py +63 -0
- dataeval/_internal/functional/coverage.py +75 -0
- dataeval/_internal/functional/divergence.py +16 -0
- dataeval/_internal/{metrics → functional}/hash.py +1 -1
- dataeval/_internal/functional/metadata.py +136 -0
- dataeval/_internal/functional/metadataparity.py +190 -0
- dataeval/_internal/functional/uap.py +6 -0
- dataeval/_internal/interop.py +52 -0
- dataeval/_internal/maite/__init__.py +0 -0
- dataeval/_internal/maite/utils.py +30 -0
- dataeval/_internal/metrics/base.py +2 -2
- dataeval/_internal/metrics/ber.py +16 -66
- dataeval/_internal/metrics/coverage.py +51 -35
- dataeval/_internal/metrics/divergence.py +50 -42
- dataeval/_internal/metrics/metadata.py +610 -0
- dataeval/_internal/metrics/metadataparity.py +67 -0
- dataeval/_internal/metrics/parity.py +40 -56
- dataeval/_internal/metrics/stats.py +46 -35
- dataeval/_internal/metrics/uap.py +14 -17
- dataeval/_internal/workflows/__init__.py +0 -0
- dataeval/metrics/__init__.py +2 -1
- {dataeval-0.61.0.dist-info → dataeval-0.63.0.dist-info}/METADATA +1 -2
- dataeval-0.63.0.dist-info/RECORD +68 -0
- dataeval-0.61.0.dist-info/RECORD +0 -55
- /dataeval/_internal/{metrics → functional}/utils.py +0 -0
- {dataeval-0.61.0.dist-info → dataeval-0.63.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.61.0.dist-info → dataeval-0.63.0.dist-info}/WHEEL +0 -0
@@ -1,8 +1,9 @@
|
|
1
|
-
from typing import Literal, Optional, Sequence, Union
|
1
|
+
from typing import Iterable, Literal, Optional, Sequence, Union
|
2
2
|
|
3
3
|
import numpy as np
|
4
4
|
|
5
5
|
from dataeval._internal.flags import ImageProperty, ImageVisuals, LinterFlags
|
6
|
+
from dataeval._internal.interop import ArrayLike
|
6
7
|
from dataeval._internal.metrics.stats import ImageStats
|
7
8
|
|
8
9
|
|
@@ -30,25 +31,79 @@ def _get_outlier_mask(
|
|
30
31
|
|
31
32
|
|
32
33
|
class Linter:
|
33
|
-
"""
|
34
|
-
Calculates statistical outliers of a dataset using various statistical
|
35
|
-
|
34
|
+
r"""
|
35
|
+
Calculates statistical outliers of a dataset using various statistical tests applied to each image
|
36
|
+
|
37
|
+
Parameters
|
38
|
+
----------
|
39
|
+
flags : [ImageProperty | ImageStatistics | ImageVisuals], default None
|
40
|
+
Metric(s) to calculate for each image - calculates all metrics if None
|
41
|
+
outlier_method : ["modzscore" | "zscore" | "iqr"], optional - default "modzscore"
|
42
|
+
Statistical method used to identify outliers
|
43
|
+
outlier_threshold : float, optional - default None
|
44
|
+
Threshold value for the given ``outlier_method``, above which data is considered an outlier.
|
45
|
+
Uses method specific default if `None`
|
46
|
+
|
47
|
+
Attributes
|
48
|
+
----------
|
49
|
+
stats : ImageStats
|
50
|
+
Class to hold the value of each metric for each image
|
51
|
+
|
52
|
+
See Also
|
53
|
+
--------
|
54
|
+
Duplicates
|
55
|
+
|
56
|
+
Notes
|
57
|
+
------
|
58
|
+
There are 3 different statistical methods:
|
59
|
+
|
60
|
+
- zscore
|
61
|
+
- modzscore
|
62
|
+
- iqr
|
63
|
+
|
64
|
+
| The z score method is based on the difference between the data point and the mean of the data.
|
65
|
+
The default threshold value for `zscore` is 3.
|
66
|
+
| Z score = :math:`|x_i - \mu| / \sigma`
|
67
|
+
|
68
|
+
| The modified z score method is based on the difference between the data point and the median of the data.
|
69
|
+
The default threshold value for `modzscore` is 3.5.
|
70
|
+
| Modified z score = :math:`0.6745 * |x_i - \tilde{x}| / MAD`, where :math:`MAD` is the median absolute deviation
|
71
|
+
|
72
|
+
| The interquartile range method is based on the difference between the data point and
|
73
|
+
the difference between the 75th and 25th percentiles. The default threshold value for `iqr` is 1.5.
|
74
|
+
| Interquartile range = :math:`threshold * (Q_3 - Q_1)`
|
75
|
+
|
76
|
+
Examples
|
77
|
+
--------
|
78
|
+
Initialize the Linter class:
|
79
|
+
|
80
|
+
>>> lint = Linter()
|
81
|
+
|
82
|
+
Specifying specific metrics to analyze:
|
83
|
+
|
84
|
+
>>> lint = Linter(flags=[ImageProperty.SIZE, ImageVisuals.ALL])
|
85
|
+
|
86
|
+
Specifying an outlier method:
|
87
|
+
|
88
|
+
>>> lint = Linter(outlier_method="iqr")
|
89
|
+
|
90
|
+
Specifying an outlier method and threshold:
|
91
|
+
|
92
|
+
>>> lint = Linter(outlier_method="zscore", outlier_threshold=2.5)
|
36
93
|
"""
|
37
94
|
|
38
95
|
def __init__(
|
39
96
|
self,
|
40
|
-
images: np.ndarray,
|
41
97
|
flags: Optional[Union[LinterFlags, Sequence[LinterFlags]]] = None,
|
98
|
+
outlier_method: Literal["zscore", "modzscore", "iqr"] = "modzscore",
|
99
|
+
outlier_threshold: Optional[float] = None,
|
42
100
|
):
|
43
101
|
flags = flags if flags is not None else (ImageProperty.ALL, ImageVisuals.ALL)
|
44
102
|
self.stats = ImageStats(flags)
|
45
|
-
self.
|
103
|
+
self.outlier_method: Literal["zscore", "modzscore", "iqr"] = outlier_method
|
104
|
+
self.outlier_threshold = outlier_threshold
|
46
105
|
|
47
|
-
def _get_outliers(
|
48
|
-
self,
|
49
|
-
outlier_method: Literal["zscore", "modzscore", "iqr"] = "modzscore",
|
50
|
-
outlier_threshold: Optional[float] = None,
|
51
|
-
) -> dict:
|
106
|
+
def _get_outliers(self) -> dict:
|
52
107
|
flagged_images = {}
|
53
108
|
|
54
109
|
for stat, values in self.results.items():
|
@@ -56,23 +111,37 @@ class Linter:
|
|
56
111
|
continue
|
57
112
|
|
58
113
|
if values.ndim == 1 and np.std(values) != 0:
|
59
|
-
mask = _get_outlier_mask(values, outlier_method, outlier_threshold)
|
114
|
+
mask = _get_outlier_mask(values, self.outlier_method, self.outlier_threshold)
|
60
115
|
indices = np.flatnonzero(mask)
|
61
116
|
for i, value in zip(indices, values[mask]):
|
62
117
|
flagged_images.setdefault(i, {}).update({stat: np.round(value, 2)})
|
63
118
|
|
64
119
|
return dict(sorted(flagged_images.items()))
|
65
120
|
|
66
|
-
def evaluate(self) -> dict:
|
121
|
+
def evaluate(self, images: Iterable[ArrayLike]) -> dict:
|
67
122
|
"""
|
68
|
-
Returns indices of outliers with
|
123
|
+
Returns indices of outliers with the issues identified for each
|
124
|
+
|
125
|
+
Parameters
|
126
|
+
----------
|
127
|
+
images : Iterable[ArrayLike], shape - (N, C, H, W)
|
128
|
+
A dataset in an ArrayLike format.
|
129
|
+
Function expects the data to have 3 dimensions, CxHxW.
|
69
130
|
|
70
131
|
Returns
|
71
132
|
-------
|
72
133
|
Dict[int, Dict[str, float]]
|
73
|
-
Dictionary containing the indices of outliers and a dictionary
|
134
|
+
Dictionary containing the indices of outliers and a dictionary showing
|
135
|
+
the issues and calculated values for the given index.
|
136
|
+
|
137
|
+
Example
|
138
|
+
-------
|
139
|
+
Evaluate the dataset:
|
140
|
+
|
141
|
+
>>> lint.evaluate(images)
|
142
|
+
{18: {'brightness': 0.78}, 25: {'brightness': 0.98}}
|
74
143
|
"""
|
75
144
|
self.stats.reset()
|
76
|
-
self.stats.update(
|
145
|
+
self.stats.update(images)
|
77
146
|
self.results = self.stats.compute()
|
78
147
|
return self._get_outliers()
|
@@ -12,6 +12,7 @@ import keras
|
|
12
12
|
import numpy as np
|
13
13
|
|
14
14
|
from dataeval._internal.detectors.ood.base import OODBase, OODScore
|
15
|
+
from dataeval._internal.interop import ArrayLike, to_numpy
|
15
16
|
from dataeval._internal.models.tensorflow.autoencoder import AE
|
16
17
|
from dataeval._internal.models.tensorflow.utils import predict_batch
|
17
18
|
|
@@ -30,7 +31,7 @@ class OOD_AE(OODBase):
|
|
30
31
|
|
31
32
|
def fit(
|
32
33
|
self,
|
33
|
-
x_ref:
|
34
|
+
x_ref: ArrayLike,
|
34
35
|
threshold_perc: float = 100.0,
|
35
36
|
loss_fn: Callable = keras.losses.MeanSquaredError(),
|
36
37
|
optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
|
@@ -43,7 +44,7 @@ class OOD_AE(OODBase):
|
|
43
44
|
|
44
45
|
Parameters
|
45
46
|
----------
|
46
|
-
x_ref :
|
47
|
+
x_ref : ArrayLike
|
47
48
|
Training batch.
|
48
49
|
threshold_perc : float, default 100.0
|
49
50
|
Percentage of reference data that is normal.
|
@@ -58,10 +59,10 @@ class OOD_AE(OODBase):
|
|
58
59
|
verbose : bool, default True
|
59
60
|
Whether to print training progress.
|
60
61
|
"""
|
61
|
-
super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
|
62
|
+
super().fit(to_numpy(x_ref), threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
|
62
63
|
|
63
|
-
def score(self, X:
|
64
|
-
self._validate(X)
|
64
|
+
def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
|
65
|
+
self._validate(X := to_numpy(X))
|
65
66
|
|
66
67
|
# reconstruct instances
|
67
68
|
X_recon = predict_batch(X, self.model, batch_size=batch_size)
|
@@ -9,9 +9,9 @@ Licensed under Apache Software License (Apache 2.0)
|
|
9
9
|
from typing import Callable
|
10
10
|
|
11
11
|
import keras
|
12
|
-
import numpy as np
|
13
12
|
|
14
13
|
from dataeval._internal.detectors.ood.base import OODGMMBase, OODScore
|
14
|
+
from dataeval._internal.interop import ArrayLike, to_numpy
|
15
15
|
from dataeval._internal.models.tensorflow.autoencoder import AEGMM
|
16
16
|
from dataeval._internal.models.tensorflow.gmm import gmm_energy
|
17
17
|
from dataeval._internal.models.tensorflow.losses import LossGMM
|
@@ -32,7 +32,7 @@ class OOD_AEGMM(OODGMMBase):
|
|
32
32
|
|
33
33
|
def fit(
|
34
34
|
self,
|
35
|
-
x_ref:
|
35
|
+
x_ref: ArrayLike,
|
36
36
|
threshold_perc: float = 100.0,
|
37
37
|
loss_fn: Callable = LossGMM(),
|
38
38
|
optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
|
@@ -45,7 +45,7 @@ class OOD_AEGMM(OODGMMBase):
|
|
45
45
|
|
46
46
|
Parameters
|
47
47
|
----------
|
48
|
-
x_ref :
|
48
|
+
x_ref : ArrayLike
|
49
49
|
Training batch.
|
50
50
|
threshold_perc : float, default 100.0
|
51
51
|
Percentage of reference data that is normal.
|
@@ -62,8 +62,8 @@ class OOD_AEGMM(OODGMMBase):
|
|
62
62
|
"""
|
63
63
|
super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
|
64
64
|
|
65
|
-
def score(self, X:
|
66
|
-
self._validate(X)
|
65
|
+
def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
|
66
|
+
self._validate(X := to_numpy(X))
|
67
67
|
_, z, _ = predict_batch(X, self.model, batch_size=batch_size)
|
68
68
|
energy, _ = gmm_energy(z, self.gmm_params, return_mean=False)
|
69
69
|
return OODScore(energy.numpy()) # type: ignore
|
@@ -13,6 +13,7 @@ import keras
|
|
13
13
|
import numpy as np
|
14
14
|
import tensorflow as tf
|
15
15
|
|
16
|
+
from dataeval._internal.interop import ArrayLike, to_numpy
|
16
17
|
from dataeval._internal.models.tensorflow.gmm import GaussianMixtureModelParams, gmm_params
|
17
18
|
from dataeval._internal.models.tensorflow.trainer import trainer
|
18
19
|
|
@@ -66,13 +67,13 @@ class OODBase(ABC):
|
|
66
67
|
self._validate(X)
|
67
68
|
|
68
69
|
@abstractmethod
|
69
|
-
def score(self, X:
|
70
|
+
def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
|
70
71
|
"""
|
71
72
|
Compute instance and (optionally) feature level outlier scores.
|
72
73
|
|
73
74
|
Parameters
|
74
75
|
----------
|
75
|
-
X :
|
76
|
+
X : ArrayLike
|
76
77
|
Batch of instances.
|
77
78
|
batch_size : int, default int(1e10)
|
78
79
|
Batch size used when making predictions with the autoencoder.
|
@@ -87,7 +88,7 @@ class OODBase(ABC):
|
|
87
88
|
|
88
89
|
def fit(
|
89
90
|
self,
|
90
|
-
x_ref:
|
91
|
+
x_ref: ArrayLike,
|
91
92
|
threshold_perc: float,
|
92
93
|
loss_fn: Callable,
|
93
94
|
optimizer: keras.optimizers.Optimizer,
|
@@ -100,7 +101,7 @@ class OODBase(ABC):
|
|
100
101
|
|
101
102
|
Parameters
|
102
103
|
----------
|
103
|
-
x_ref: :
|
104
|
+
x_ref : ArrayLike
|
104
105
|
Training batch.
|
105
106
|
threshold_perc : float
|
106
107
|
Percentage of reference data that is normal.
|
@@ -119,7 +120,7 @@ class OODBase(ABC):
|
|
119
120
|
trainer(
|
120
121
|
model=self.model,
|
121
122
|
loss_fn=loss_fn,
|
122
|
-
x_train=x_ref,
|
123
|
+
x_train=to_numpy(x_ref),
|
123
124
|
optimizer=optimizer,
|
124
125
|
epochs=epochs,
|
125
126
|
batch_size=batch_size,
|
@@ -132,7 +133,7 @@ class OODBase(ABC):
|
|
132
133
|
|
133
134
|
def predict(
|
134
135
|
self,
|
135
|
-
X:
|
136
|
+
X: ArrayLike,
|
136
137
|
batch_size: int = int(1e10),
|
137
138
|
ood_type: Literal["feature", "instance"] = "instance",
|
138
139
|
) -> Dict[str, np.ndarray]:
|
@@ -141,18 +142,18 @@ class OODBase(ABC):
|
|
141
142
|
|
142
143
|
Parameters
|
143
144
|
----------
|
144
|
-
X
|
145
|
+
X : ArrayLike
|
145
146
|
Batch of instances.
|
146
|
-
|
147
|
-
Predict out-of-distribution at the 'feature' or 'instance' level.
|
148
|
-
batch_size
|
147
|
+
batch_size : int, default int(1e10)
|
149
148
|
Batch size used when making predictions with the autoencoder.
|
149
|
+
ood_type : Literal["feature", "instance"], default "instance"
|
150
|
+
Predict out-of-distribution at the 'feature' or 'instance' level.
|
150
151
|
|
151
152
|
Returns
|
152
153
|
-------
|
153
154
|
Dictionary containing the outlier predictions and both feature and instance level outlier scores.
|
154
155
|
"""
|
155
|
-
self._validate_state(X)
|
156
|
+
self._validate_state(X := to_numpy(X))
|
156
157
|
# compute outlier scores
|
157
158
|
score = self.score(X, batch_size=batch_size)
|
158
159
|
ood_pred = (score.get(ood_type) > self._threshold_score(ood_type)).astype(int)
|
@@ -171,7 +172,7 @@ class OODGMMBase(OODBase):
|
|
171
172
|
|
172
173
|
def fit(
|
173
174
|
self,
|
174
|
-
x_ref:
|
175
|
+
x_ref: ArrayLike,
|
175
176
|
threshold_perc: float,
|
176
177
|
loss_fn: Callable[[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor], tf.Tensor],
|
177
178
|
optimizer: keras.optimizers.Optimizer,
|
@@ -183,7 +184,7 @@ class OODGMMBase(OODBase):
|
|
183
184
|
trainer(
|
184
185
|
model=self.model,
|
185
186
|
loss_fn=loss_fn,
|
186
|
-
x_train=x_ref,
|
187
|
+
x_train=to_numpy(x_ref),
|
187
188
|
optimizer=optimizer,
|
188
189
|
epochs=epochs,
|
189
190
|
batch_size=batch_size,
|
@@ -16,6 +16,7 @@ from keras.layers import Input
|
|
16
16
|
from keras.models import Model
|
17
17
|
|
18
18
|
from dataeval._internal.detectors.ood.base import OODBase, OODScore
|
19
|
+
from dataeval._internal.interop import ArrayLike, to_numpy
|
19
20
|
from dataeval._internal.models.tensorflow.pixelcnn import PixelCNN
|
20
21
|
from dataeval._internal.models.tensorflow.trainer import trainer
|
21
22
|
from dataeval._internal.models.tensorflow.utils import predict_batch
|
@@ -125,7 +126,7 @@ class OOD_LLR(OODBase):
|
|
125
126
|
|
126
127
|
def fit(
|
127
128
|
self,
|
128
|
-
x_ref:
|
129
|
+
x_ref: ArrayLike,
|
129
130
|
threshold_perc: float = 100.0,
|
130
131
|
loss_fn: Optional[Callable] = None,
|
131
132
|
optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
|
@@ -141,7 +142,7 @@ class OOD_LLR(OODBase):
|
|
141
142
|
|
142
143
|
Parameters
|
143
144
|
----------
|
144
|
-
x_ref :
|
145
|
+
x_ref : ArrayLike
|
145
146
|
Training batch.
|
146
147
|
threshold_perc : float, default 100.0
|
147
148
|
Percentage of reference data that is normal.
|
@@ -163,6 +164,7 @@ class OOD_LLR(OODBase):
|
|
163
164
|
mutate_batch_size: int, default int(1e10)
|
164
165
|
Batch size used to generate the mutations for the background dataset.
|
165
166
|
"""
|
167
|
+
x_ref = to_numpy(x_ref)
|
166
168
|
input_shape = x_ref.shape[1:]
|
167
169
|
optimizer = optimizer() if isinstance(optimizer, type) else optimizer
|
168
170
|
# Separate into two separate optimizers, one for semantic model and one for background model
|
@@ -275,10 +277,10 @@ class OOD_LLR(OODBase):
|
|
275
277
|
|
276
278
|
def score(
|
277
279
|
self,
|
278
|
-
X:
|
280
|
+
X: ArrayLike,
|
279
281
|
batch_size: int = int(1e10),
|
280
282
|
) -> OODScore:
|
281
|
-
self._validate(X)
|
283
|
+
self._validate(X := to_numpy(X))
|
282
284
|
fscore = -self._llr(X, True, batch_size=batch_size)
|
283
285
|
iscore = -self._llr(X, False, batch_size=batch_size)
|
284
286
|
return OODScore(iscore, fscore)
|
@@ -12,6 +12,7 @@ import keras
|
|
12
12
|
import numpy as np
|
13
13
|
|
14
14
|
from dataeval._internal.detectors.ood.base import OODBase, OODScore
|
15
|
+
from dataeval._internal.interop import ArrayLike, to_numpy
|
15
16
|
from dataeval._internal.models.tensorflow.autoencoder import VAE
|
16
17
|
from dataeval._internal.models.tensorflow.losses import Elbo
|
17
18
|
from dataeval._internal.models.tensorflow.utils import predict_batch
|
@@ -34,7 +35,7 @@ class OOD_VAE(OODBase):
|
|
34
35
|
|
35
36
|
def fit(
|
36
37
|
self,
|
37
|
-
x_ref:
|
38
|
+
x_ref: ArrayLike,
|
38
39
|
threshold_perc: float = 100.0,
|
39
40
|
loss_fn: Callable = Elbo(0.05),
|
40
41
|
optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
|
@@ -47,7 +48,7 @@ class OOD_VAE(OODBase):
|
|
47
48
|
|
48
49
|
Parameters
|
49
50
|
----------
|
50
|
-
x_ref :
|
51
|
+
x_ref : ArrayLike
|
51
52
|
Training batch.
|
52
53
|
threshold_perc : float, default 100.0
|
53
54
|
Percentage of reference data that is normal.
|
@@ -64,8 +65,8 @@ class OOD_VAE(OODBase):
|
|
64
65
|
"""
|
65
66
|
super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
|
66
67
|
|
67
|
-
def score(self, X:
|
68
|
-
self._validate(X)
|
68
|
+
def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
|
69
|
+
self._validate(X := to_numpy(X))
|
69
70
|
|
70
71
|
# sample reconstructed instances
|
71
72
|
X_samples = np.repeat(X, self.samples, axis=0)
|
@@ -12,6 +12,7 @@ import keras
|
|
12
12
|
import numpy as np
|
13
13
|
|
14
14
|
from dataeval._internal.detectors.ood.base import OODGMMBase, OODScore
|
15
|
+
from dataeval._internal.interop import ArrayLike, to_numpy
|
15
16
|
from dataeval._internal.models.tensorflow.autoencoder import VAEGMM
|
16
17
|
from dataeval._internal.models.tensorflow.gmm import gmm_energy
|
17
18
|
from dataeval._internal.models.tensorflow.losses import Elbo, LossGMM
|
@@ -35,7 +36,7 @@ class OOD_VAEGMM(OODGMMBase):
|
|
35
36
|
|
36
37
|
def fit(
|
37
38
|
self,
|
38
|
-
x_ref:
|
39
|
+
x_ref: ArrayLike,
|
39
40
|
threshold_perc: float = 100.0,
|
40
41
|
loss_fn: Callable = LossGMM(elbo=Elbo(0.05)),
|
41
42
|
optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
|
@@ -48,7 +49,7 @@ class OOD_VAEGMM(OODGMMBase):
|
|
48
49
|
|
49
50
|
Parameters
|
50
51
|
----------
|
51
|
-
X :
|
52
|
+
x_ref : ArrayLike
|
52
53
|
Training batch.
|
53
54
|
threshold_perc : float, default 100.0
|
54
55
|
Percentage of reference data that is normal.
|
@@ -65,8 +66,8 @@ class OOD_VAEGMM(OODGMMBase):
|
|
65
66
|
"""
|
66
67
|
super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
|
67
68
|
|
68
|
-
def score(self, X:
|
69
|
-
self._validate(X)
|
69
|
+
def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
|
70
|
+
self._validate(X := to_numpy(X))
|
70
71
|
|
71
72
|
# draw samples from latent space
|
72
73
|
X_samples = np.repeat(X, self.samples, axis=0)
|
File without changes
|
@@ -0,0 +1,63 @@
|
|
1
|
+
from typing import Tuple
|
2
|
+
|
3
|
+
import numpy as np
|
4
|
+
from scipy.sparse import coo_matrix
|
5
|
+
from scipy.stats import mode
|
6
|
+
|
7
|
+
from dataeval._internal.functional.utils import compute_neighbors, get_classes_counts, minimum_spanning_tree
|
8
|
+
|
9
|
+
|
10
|
+
def ber_mst(X: np.ndarray, y: np.ndarray, _: int) -> Tuple[float, float]:
    """Calculates the Bayes Error Rate using a minimum spanning tree

    Parameters
    ----------
    X : np.ndarray (N, :)
        Data points with arbitrary dimensionality
    y : np.ndarray (N, 1)
        Labels for each data point
    _ : int
        Unused by this estimator (presumably accepted so the signature
        parallels ``ber_knn`` - confirm against callers)

    Returns
    -------
    Tuple[float, float]
        The upper and lower bounds of the Bayes Error Rate, in that order
    """

    # NOTE(review): M and N are presumably the number of classes and the number
    # of samples respectively - confirm against get_classes_counts
    M, N = get_classes_counts(y)

    tree = coo_matrix(minimum_spanning_tree(X))
    # Count the tree edges whose two endpoints carry different labels. Comparing
    # the row/col index arrays in a single vectorized step avoids a Python-level
    # loop and does not assume the tree stores exactly N - 1 edges.
    matches = np.sum(y[tree.row] != y[tree.col])
    deltas = matches / (2 * N)
    upper = 2 * deltas
    # max(..., 0) keeps the square root real when the mismatch rate is high
    lower = ((M - 1) / (M)) * (1 - max(1 - 2 * ((M) / (M - 1)) * deltas, 0) ** 0.5)
    return upper, lower
|
29
|
+
|
30
|
+
|
31
|
+
def ber_knn(X: np.ndarray, y: np.ndarray, k: int) -> Tuple[float, float]:
    """Calculates the Bayes Error Rate using K-nearest neighbors

    Parameters
    ----------
    X : np.ndarray
        Data points; everything after the first axis is flattened into features
    y : np.ndarray
        Labels for each data point
    k : int
        Number of neighbors requested from ``compute_neighbors``

    Returns
    -------
    Tuple[float, float]
        The upper and lower bounds of the Bayes Error Rate, in that order
    """
    n_classes, n_samples = get_classes_counts(y)

    # Collapse every trailing axis so each row is one flat feature vector
    flattened = X.reshape((X.shape[0], -1))
    neighbors = compute_neighbors(flattened, flattened, k=k)
    if neighbors.ndim == 1:
        # A 1-D index array is promoted to a single column so the mode is
        # always taken along axis 1
        neighbors = np.expand_dims(neighbors, axis=1)

    neighbor_labels = y[neighbors]
    modal_class = mode(neighbor_labels, axis=1, keepdims=True).mode.squeeze()

    # Upper bound: fraction of points whose modal neighbor label differs from their own
    misclassified = np.count_nonzero(modal_class - y)
    upper = float(misclassified / n_samples)
    lower = _knn_lowerbound(upper, n_classes, k)
    return upper, lower
|
44
|
+
|
45
|
+
|
46
|
+
def _knn_lowerbound(value: float, classes: int, k: int) -> float:
    """Compute the BER lower bound from the KNN upper bound

    Parameters
    ----------
    value : float
        Upper bound of the Bayes Error Rate
    classes : int
        Number of classes
    k : int
        Number of neighbors used to produce ``value``

    Returns
    -------
    float
        Lower bound of the Bayes Error Rate
    """
    # An upper bound that is effectively zero pins the lower bound at zero too
    if value <= 1e-10:
        return 0.0

    uses_binary_rules = classes == 2 and k != 1
    if not uses_binary_rules:
        # General bound for any other combination of classes and k
        ratio = classes / (classes - 1)
        radicand = max(0, 1 - (ratio * value))
        return ((classes - 1) / classes) * (1 - np.sqrt(radicand))

    if k > 5:
        # Property 2 (Devroye, 1981) cited in Snoopy paper, not in snoopy repo
        alpha, beta = 0.3399, 0.9749
        correction = 1 + beta / (np.sqrt(k - 3))
        a_k = alpha * np.sqrt(k) / (k - 3.25) * correction
        return value / (1 + a_k)

    if k > 2:
        return value / (1 + (1 / np.sqrt(k)))

    # Remaining binary case (k == 2)
    return value / 2
|
@@ -0,0 +1,75 @@
|
|
1
|
+
import math
|
2
|
+
from typing import Literal, Tuple
|
3
|
+
|
4
|
+
import numpy as np
|
5
|
+
from scipy.spatial.distance import pdist, squareform
|
6
|
+
|
7
|
+
|
8
|
+
def coverage(
    embeddings: np.ndarray,
    radius_type: Literal["adaptive", "naive"] = "adaptive",
    k: int = 20,
    percent: np.float64 = np.float64(0.01),
) -> Tuple[np.ndarray, np.ndarray, float]:
    """
    Identify observations that are poorly covered by the rest of the dataset, based on
    the distance from each observation to its neighbors in embedding space.

    Parameters
    ----------
    embeddings : ArrayLike, shape - (N, P)
        A dataset in an ArrayLike format.
        Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
    radius_type : Literal["adaptive", "naive"], default "adaptive"
        The function used to determine radius.
    k: int, default 20
        Number of observations required in order to be covered.
        [1] suggests that a minimum of 20-50 samples is necessary.
    percent: np.float64, default np.float64(0.01)
        Percent of observations to be considered uncovered. Only applies to adaptive radius.

    Returns
    -------
    np.ndarray
        Array of uncovered indices
    np.ndarray
        Array of critical value radii
    float
        Radius for coverage. For the "adaptive" radius type this is instead the
        integer number of observations flagged as uncovered, ``int(N * percent)``.

    Raises
    ------
    ValueError
        If length of embeddings is less than or equal to k
    ValueError
        If length of embeddings equals k + 1, since the critical value needs k + 2 observations
    ValueError
        If radius_type is unknown

    Note
    ----
    Embeddings should be on the unit interval.

    Reference
    ---------
    This implementation is based on https://dl.acm.org/doi/abs/10.1145/3448016.3457315.
    [1] Seymour Sudman. 1976. Applied sampling. Academic Press New York (1976).
    """

    n = len(embeddings)
    if n <= k:
        raise ValueError("Number of observations less than or equal to the specified number of neighbors.")
    if n <= k + 1:
        # Column k + 1 of the sorted distance matrix is read below; with only
        # k + 1 observations that column does not exist, so fail with a clear
        # error instead of an IndexError.
        raise ValueError("Number of observations must be at least k + 2 to compute the critical value radii.")

    # Full pairwise distance matrix. After sorting each row ascending, column 0
    # is the observation's zero distance to itself, so column k + 1 is the
    # distance to its (k+1)th nearest neighbor.
    mat = squareform(pdist(embeddings))
    sorted_dists = np.sort(mat, axis=1)
    crit = sorted_dists[:, k + 1]

    d = np.shape(embeddings)[1]
    if radius_type == "naive":
        # Fixed radius computed from k, n and the embedding dimension d
        rho = (1 / math.sqrt(math.pi)) * ((2 * k * math.gamma(d / 2 + 1)) / (n)) ** (1 / d)
        pvals = np.where(crit > rho)[0]
    elif radius_type == "adaptive":
        # Use data adaptive cutoff as rho: flag the int(n * percent) observations
        # with the largest critical values
        rho = int(n * percent)
        pvals = np.argsort(crit)[::-1][:rho]
    else:
        raise ValueError("Invalid radius type.")
    return pvals, crit, rho
|
@@ -0,0 +1,16 @@
|
|
1
|
+
import numpy as np
|
2
|
+
|
3
|
+
from .utils import compute_neighbors, minimum_spanning_tree
|
4
|
+
|
5
|
+
|
6
|
+
def divergence_mst(data: np.ndarray, labels: np.ndarray) -> int:
    """Count minimum spanning tree edges that join differently labeled points

    Parameters
    ----------
    data : np.ndarray
        Data points the minimum spanning tree is built from
    labels : np.ndarray
        Label for each data point, indexed by the tree's node indices

    Returns
    -------
    int
        Number of tree edges whose two endpoints have different labels
    """
    # Read the edge endpoints straight from the sparse tree rather than
    # densifying it to a full N x N array just to locate its nonzero entries
    rows, cols = minimum_spanning_tree(data).nonzero()
    errors = np.sum(labels[rows] != labels[cols])
    return errors
|
11
|
+
|
12
|
+
|
13
|
+
def divergence_fnn(data: np.ndarray, labels: np.ndarray) -> int:
    """Count points whose label differs from the label of their neighbor

    Parameters
    ----------
    data : np.ndarray
        Data points passed to ``compute_neighbors`` as both arguments
    labels : np.ndarray
        Label for each data point

    Returns
    -------
    int
        Number of points whose neighbor carries a different label
    """
    # NOTE(review): compute_neighbors is presumably returning one neighbor index
    # per point here (no k is passed) - confirm against its default
    nn_indices = compute_neighbors(data, data)
    # Compare labels for inequality instead of summing |label difference|: the
    # result is the same for signed 0/1 labels, but it stays a mismatch count
    # for multi-valued labels, does not wrap around for unsigned dtypes, and
    # works for boolean labels. This also matches divergence_mst.
    errors = np.sum(labels[nn_indices] != labels)
    return errors
|
@@ -3,7 +3,7 @@ import xxhash as xxh
|
|
3
3
|
from PIL import Image
|
4
4
|
from scipy.fftpack import dct
|
5
5
|
|
6
|
-
from dataeval._internal.
|
6
|
+
from dataeval._internal.functional.utils import normalize_image_shape, rescale
|
7
7
|
|
8
8
|
HASH_SIZE = 8
|
9
9
|
MAX_FACTOR = 4
|