dataeval-0.61.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +18 -0
- dataeval/_internal/detectors/__init__.py +0 -0
- dataeval/_internal/detectors/clusterer.py +469 -0
- dataeval/_internal/detectors/drift/__init__.py +0 -0
- dataeval/_internal/detectors/drift/base.py +265 -0
- dataeval/_internal/detectors/drift/cvm.py +97 -0
- dataeval/_internal/detectors/drift/ks.py +100 -0
- dataeval/_internal/detectors/drift/mmd.py +166 -0
- dataeval/_internal/detectors/drift/torch.py +310 -0
- dataeval/_internal/detectors/drift/uncertainty.py +149 -0
- dataeval/_internal/detectors/duplicates.py +49 -0
- dataeval/_internal/detectors/linter.py +78 -0
- dataeval/_internal/detectors/ood/__init__.py +0 -0
- dataeval/_internal/detectors/ood/ae.py +77 -0
- dataeval/_internal/detectors/ood/aegmm.py +69 -0
- dataeval/_internal/detectors/ood/base.py +199 -0
- dataeval/_internal/detectors/ood/llr.py +284 -0
- dataeval/_internal/detectors/ood/vae.py +86 -0
- dataeval/_internal/detectors/ood/vaegmm.py +79 -0
- dataeval/_internal/flags.py +47 -0
- dataeval/_internal/metrics/__init__.py +0 -0
- dataeval/_internal/metrics/base.py +92 -0
- dataeval/_internal/metrics/ber.py +124 -0
- dataeval/_internal/metrics/coverage.py +80 -0
- dataeval/_internal/metrics/divergence.py +94 -0
- dataeval/_internal/metrics/hash.py +79 -0
- dataeval/_internal/metrics/parity.py +180 -0
- dataeval/_internal/metrics/stats.py +332 -0
- dataeval/_internal/metrics/uap.py +45 -0
- dataeval/_internal/metrics/utils.py +158 -0
- dataeval/_internal/models/__init__.py +0 -0
- dataeval/_internal/models/pytorch/__init__.py +0 -0
- dataeval/_internal/models/pytorch/autoencoder.py +202 -0
- dataeval/_internal/models/pytorch/blocks.py +46 -0
- dataeval/_internal/models/pytorch/utils.py +67 -0
- dataeval/_internal/models/tensorflow/__init__.py +0 -0
- dataeval/_internal/models/tensorflow/autoencoder.py +317 -0
- dataeval/_internal/models/tensorflow/gmm.py +115 -0
- dataeval/_internal/models/tensorflow/losses.py +107 -0
- dataeval/_internal/models/tensorflow/pixelcnn.py +1106 -0
- dataeval/_internal/models/tensorflow/trainer.py +102 -0
- dataeval/_internal/models/tensorflow/utils.py +254 -0
- dataeval/_internal/workflows/sufficiency.py +555 -0
- dataeval/detectors/__init__.py +29 -0
- dataeval/flags/__init__.py +3 -0
- dataeval/metrics/__init__.py +7 -0
- dataeval/models/__init__.py +15 -0
- dataeval/models/tensorflow/__init__.py +6 -0
- dataeval/models/torch/__init__.py +8 -0
- dataeval/py.typed +0 -0
- dataeval/workflows/__init__.py +8 -0
- dataeval-0.61.0.dist-info/LICENSE.txt +21 -0
- dataeval-0.61.0.dist-info/METADATA +114 -0
- dataeval-0.61.0.dist-info/RECORD +55 -0
- dataeval-0.61.0.dist-info/WHEEL +4 -0
dataeval/_internal/detectors/ood/aegmm.py
@@ -0,0 +1,69 @@
+"""
+Source code derived from Alibi-Detect 0.11.4
+https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
+
+Original code Copyright (c) 2023 Seldon Technologies Ltd
+Licensed under Apache Software License (Apache 2.0)
+"""
+
+from typing import Callable
+
+import keras
+import numpy as np
+
+from dataeval._internal.detectors.ood.base import OODGMMBase, OODScore
+from dataeval._internal.models.tensorflow.autoencoder import AEGMM
+from dataeval._internal.models.tensorflow.gmm import gmm_energy
+from dataeval._internal.models.tensorflow.losses import LossGMM
+from dataeval._internal.models.tensorflow.utils import predict_batch
+
+
+class OOD_AEGMM(OODGMMBase):
+    def __init__(self, model: AEGMM) -> None:
+        """
+        AE with Gaussian Mixture Model based outlier detector.
+
+        Parameters
+        ----------
+        model : AEGMM
+            An AEGMM model.
+        """
+        super().__init__(model)
+
+    def fit(
+        self,
+        x_ref: np.ndarray,
+        threshold_perc: float = 100.0,
+        loss_fn: Callable = LossGMM(),
+        optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
+        epochs: int = 20,
+        batch_size: int = 64,
+        verbose: bool = True,
+    ) -> None:
+        """
+        Train the AEGMM model with recommended loss function and optimizer.
+
+        Parameters
+        ----------
+        x_ref : np.ndarray
+            Training batch.
+        threshold_perc : float, default 100.0
+            Percentage of reference data that is normal.
+        loss_fn : Callable, default LossGMM()
+            Loss function used for training.
+        optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
+            Optimizer used for training.
+        epochs : int, default 20
+            Number of training epochs.
+        batch_size : int, default 64
+            Batch size used for training.
+        verbose : bool, default True
+            Whether to print training progress.
+        """
+        super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
+
+    def score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScore:
+        self._validate(X)
+        _, z, _ = predict_batch(X, self.model, batch_size=batch_size)
+        energy, _ = gmm_energy(z, self.gmm_params, return_mean=False)
+        return OODScore(energy.numpy())  # type: ignore
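The `score` method above does no thresholding itself; it reports the GMM energy of each latent code, and the `fit`/`predict` plumbing inherited from `OODGMMBase` (next hunk) turns that into outlier flags. The exact energy formula lives in dataeval/_internal/models/tensorflow/gmm.py, which is not part of this diff; the sketch below assumes the standard DAGMM-style definition, where energy is the negative log-likelihood of the latent code under the fitted mixture, so higher energy means more outlying.

import numpy as np

def mixture_energy(z: np.ndarray, phi: np.ndarray, mu: np.ndarray, var: float) -> np.ndarray:
    """Negative log-likelihood of latent codes z under a spherical Gaussian mixture (assumed form)."""
    d = z.shape[1]
    diff = z[:, None, :] - mu[None, :, :]                          # (n, k, d)
    log_comp = -0.5 * (np.sum(diff**2, axis=2) / var + d * np.log(2 * np.pi * var))
    return -np.logaddexp.reduce(np.log(phi) + log_comp, axis=1)    # (n,)

z = np.array([[0.1, -0.2], [3.1, 2.9], [8.0, -8.0]])  # latent codes
phi = np.array([0.5, 0.5])                            # mixture weights
mu = np.array([[0.0, 0.0], [3.0, 3.0]])               # component means
print(mixture_energy(z, phi, mu, var=1.0))            # far point gets the largest energy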
dataeval/_internal/detectors/ood/base.py
@@ -0,0 +1,199 @@
+"""
+Source code derived from Alibi-Detect 0.11.4
+https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
+
+Original code Copyright (c) 2023 Seldon Technologies Ltd
+Licensed under Apache Software License (Apache 2.0)
+"""
+
+from abc import ABC, abstractmethod
+from typing import Callable, Dict, List, Literal, NamedTuple, Optional, Tuple, cast
+
+import keras
+import numpy as np
+import tensorflow as tf
+
+from dataeval._internal.models.tensorflow.gmm import GaussianMixtureModelParams, gmm_params
+from dataeval._internal.models.tensorflow.trainer import trainer
+
+
+class OODScore(NamedTuple):
+    """
+    NamedTuple containing the instance and (optionally) feature score.
+
+    Parameters
+    ----------
+    instance_score : np.ndarray
+        Instance score of the evaluated dataset.
+    feature_score : Optional[np.ndarray], default None
+        Feature score, if available, of the evaluated dataset.
+    """
+
+    instance_score: np.ndarray
+    feature_score: Optional[np.ndarray] = None
+
+    def get(self, ood_type: Literal["instance", "feature"]) -> np.ndarray:
+        return self.instance_score if ood_type == "instance" or self.feature_score is None else self.feature_score
+
+
+class OODBase(ABC):
+    def __init__(self, model: keras.Model) -> None:
+        self.model = model
+
+        self._ref_score: OODScore
+        self._threshold_perc: float
+        self._data_info: Optional[Tuple[tuple, type]] = None
+
+        if not isinstance(model, keras.Model):
+            raise TypeError("Model should be of type 'keras.Model'.")
+
+    def _get_data_info(self, X: np.ndarray) -> Tuple[tuple, type]:
+        if not isinstance(X, np.ndarray):
+            raise TypeError("Dataset should be of type: `np.ndarray`.")
+        return X.shape[1:], X.dtype.type
+
+    def _validate(self, X: np.ndarray) -> None:
+        check_data_info = self._get_data_info(X)
+        if self._data_info is not None and check_data_info != self._data_info:
+            raise RuntimeError(f"Expect data of type: {self._data_info[1]} and shape: {self._data_info[0]}. "
+                               f"Provided data is type: {check_data_info[1]} and shape: {check_data_info[0]}.")
+
+    def _validate_state(self, X: np.ndarray, additional_attrs: Optional[List[str]] = None) -> None:
+        attrs = ["_data_info", "_threshold_perc", "_ref_score"]
+        attrs = attrs if additional_attrs is None else attrs + additional_attrs
+        if not all(hasattr(self, attr) for attr in attrs) or any(getattr(self, attr) is None for attr in attrs):
+            raise RuntimeError("Metric needs to be `fit` before method call.")
+        self._validate(X)
+
+    @abstractmethod
+    def score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScore:
+        """
+        Compute instance and (optionally) feature level outlier scores.
+
+        Parameters
+        ----------
+        X : np.ndarray
+            Batch of instances.
+        batch_size : int, default int(1e10)
+            Batch size used when making predictions with the autoencoder.
+
+        Returns
+        -------
+        Instance and feature level outlier scores.
+        """
+
+    def _threshold_score(self, ood_type: Literal["feature", "instance"] = "instance") -> np.floating:
+        return np.percentile(self._ref_score.get(ood_type), self._threshold_perc)
+
+    def fit(
+        self,
+        x_ref: np.ndarray,
+        threshold_perc: float,
+        loss_fn: Callable,
+        optimizer: keras.optimizers.Optimizer,
+        epochs: int,
+        batch_size: int,
+        verbose: bool,
+    ) -> None:
+        """
+        Train the model and infer the threshold value.
+
+        Parameters
+        ----------
+        x_ref : np.ndarray
+            Training batch.
+        threshold_perc : float
+            Percentage of reference data that is normal.
+        loss_fn : Callable
+            Loss function used for training.
+        optimizer : keras.optimizers.Optimizer
+            Optimizer used for training.
+        epochs : int
+            Number of training epochs.
+        batch_size : int
+            Batch size used for training.
+        verbose : bool
+            Whether to print training progress.
+        """
+        # Train the model
+        trainer(
+            model=self.model,
+            loss_fn=loss_fn,
+            x_train=x_ref,
+            optimizer=optimizer,
+            epochs=epochs,
+            batch_size=batch_size,
+            verbose=verbose,
+        )
+
+        # Infer the threshold values
+        self._ref_score = self.score(x_ref, batch_size)
+        self._threshold_perc = threshold_perc
+
+    def predict(
+        self,
+        X: np.ndarray,
+        batch_size: int = int(1e10),
+        ood_type: Literal["feature", "instance"] = "instance",
+    ) -> Dict[str, np.ndarray]:
+        """
+        Predict whether instances are out-of-distribution or not.
+
+        Parameters
+        ----------
+        X : np.ndarray
+            Batch of instances.
+        batch_size : int, default int(1e10)
+            Batch size used when making predictions with the autoencoder.
+        ood_type : Literal["feature", "instance"], default "instance"
+            Predict out-of-distribution at the 'feature' or 'instance' level.
+
+        Returns
+        -------
+        Dictionary containing the outlier predictions and both feature and instance level outlier scores.
+        """
+        self._validate_state(X)
+        # compute outlier scores
+        score = self.score(X, batch_size=batch_size)
+        ood_pred = (score.get(ood_type) > self._threshold_score(ood_type)).astype(int)
+        return {**{"is_ood": ood_pred}, **score._asdict()}
+
+
+class OODGMMBase(OODBase):
+    def __init__(self, model: keras.Model) -> None:
+        super().__init__(model)
+        self.gmm_params: GaussianMixtureModelParams
+
+    def _validate_state(self, X: np.ndarray, additional_attrs: Optional[List[str]] = None) -> None:
+        if additional_attrs is None:
+            additional_attrs = ["gmm_params"]
+        super()._validate_state(X, additional_attrs)
+
+    def fit(
+        self,
+        x_ref: np.ndarray,
+        threshold_perc: float,
+        loss_fn: Callable[[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor], tf.Tensor],
+        optimizer: keras.optimizers.Optimizer,
+        epochs: int,
+        batch_size: int,
+        verbose: bool,
+    ) -> None:
+        # Train the model
+        trainer(
+            model=self.model,
+            loss_fn=loss_fn,
+            x_train=x_ref,
+            optimizer=optimizer,
+            epochs=epochs,
+            batch_size=batch_size,
+            verbose=verbose,
+        )
+
+        # Calculate the GMM parameters
+        _, z, gamma = cast(Tuple[tf.Tensor, tf.Tensor, tf.Tensor], self.model(x_ref))
+        self.gmm_params = gmm_params(z, gamma)
+
+        # Infer the threshold values
+        self._ref_score = self.score(x_ref, batch_size)
+        self._threshold_perc = threshold_perc
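The threshold mechanics above are simple once isolated: `fit` stores the reference scores and `threshold_perc`, `_threshold_score` turns them into a percentile cutoff, and `predict` flags anything above it. A minimal numpy sketch of those mechanics, with stand-in arrays in place of real model scores:

import numpy as np

rng = np.random.default_rng(0)
ref_score = rng.normal(size=1000)                      # stand-in for _ref_score.get(ood_type)
threshold_perc = 95.0                                  # "95% of reference data is normal"
threshold = np.percentile(ref_score, threshold_perc)   # what _threshold_score returns

new_score = np.array([-0.5, 0.0, 2.5, 4.0])            # stand-in for score(X)
is_ood = (new_score > threshold).astype(int)           # what predict puts under "is_ood"
print(round(float(threshold), 2), is_ood)              # ~1.6, [0 0 1 1]

Note that with the default `threshold_perc=100.0` used by the detectors in this release, the cutoff is the maximum reference score, so only instances scoring above everything seen during `fit` are flagged.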
dataeval/_internal/detectors/ood/llr.py
@@ -0,0 +1,284 @@
+"""
+Source code derived from Alibi-Detect 0.11.4
+https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
+
+Original code Copyright (c) 2023 Seldon Technologies Ltd
+Licensed under Apache Software License (Apache 2.0)
+"""
+
+from functools import partial
+from typing import Callable, Optional, Tuple
+
+import keras
+import numpy as np
+import tensorflow as tf
+from keras.layers import Input
+from keras.models import Model
+
+from dataeval._internal.detectors.ood.base import OODBase, OODScore
+from dataeval._internal.models.tensorflow.pixelcnn import PixelCNN
+from dataeval._internal.models.tensorflow.trainer import trainer
+from dataeval._internal.models.tensorflow.utils import predict_batch
+
+
+def build_model(
+    dist: PixelCNN, input_shape: Optional[tuple] = None, filepath: Optional[str] = None
+) -> Tuple[keras.Model, PixelCNN]:
+    """
+    Create keras.Model from TF distribution.
+
+    Parameters
+    ----------
+    dist : PixelCNN
+        TensorFlow distribution.
+    input_shape : Optional[tuple], default None
+        Input shape of the model.
+    filepath : Optional[str], default None
+        File to load model weights from.
+
+    Returns
+    -------
+    Tuple of the TensorFlow model and the distribution.
+    """
+    x_in = Input(shape=input_shape)
+    log_prob = dist.log_prob(x_in)
+    model = Model(inputs=x_in, outputs=log_prob)
+    model.add_loss(-tf.reduce_mean(log_prob))
+    if isinstance(filepath, str):
+        model.load_weights(filepath)
+    return model, dist
+
+
+def mutate_categorical(
+    X: np.ndarray,
+    rate: float,
+    seed: int = 0,
+    feature_range: tuple = (0, 255),
+) -> tf.Tensor:
+    """
+    Randomly change integer feature values to values within a set range
+    with a specified permutation rate.
+
+    Parameters
+    ----------
+    X : np.ndarray
+        Batch of data to be perturbed.
+    rate : float
+        Permutation rate (between 0 and 1).
+    seed : int, default 0
+        Random seed.
+    feature_range : tuple, default (0, 255)
+        Min and max range for perturbed features.
+
+    Returns
+    -------
+    Tensor with perturbed data.
+    """
+    frange = (feature_range[0] + 1, feature_range[1] + 1)
+    shape = X.shape
+    n_samples = np.prod(shape)
+    mask = tf.random.categorical(tf.math.log([[1.0 - rate, rate]]), n_samples, seed=seed, dtype=tf.int32)
+    mask = tf.reshape(mask, shape)
+    possible_mutations = tf.random.uniform(shape, minval=frange[0], maxval=frange[1], dtype=tf.int32, seed=seed + 1)
+    X = tf.math.floormod(tf.cast(X, tf.int32) + mask * possible_mutations, frange[1])  # type: ignore py38
+    return tf.cast(X, tf.float32)  # type: ignore
+
+
+class OOD_LLR(OODBase):
+    def __init__(
+        self,
+        model: PixelCNN,
+        model_background: Optional[PixelCNN] = None,
+        log_prob: Optional[Callable] = None,
+        sequential: bool = False,
+    ) -> None:
+        """
+        Likelihood Ratios based outlier detector.
+
+        Parameters
+        ----------
+        model : PixelCNN
+            Generative distribution model.
+        model_background : Optional[PixelCNN], default None
+            Optional model for the background. Only needed if it is different from `model`.
+        log_prob : Optional[Callable], default None
+            Function used to evaluate log probabilities under the model
+            if the model does not have a `log_prob` function.
+        sequential : bool, default False
+            Whether the data is sequential. Used to create targets during training.
+        """
+        self.dist_s = model
+        self.dist_b = (
+            model_background
+            if model_background is not None
+            else model.copy()
+            if hasattr(model, "copy")
+            else keras.models.clone_model(model)
+        )
+        self.has_log_prob = hasattr(model, "log_prob")
+        self.sequential = sequential
+        self.log_prob = log_prob
+
+        self._ref_score: OODScore
+        self._threshold_perc: float
+        self._data_info: Optional[Tuple[tuple, type]] = None
+
+    def fit(
+        self,
+        x_ref: np.ndarray,
+        threshold_perc: float = 100.0,
+        loss_fn: Optional[Callable] = None,
+        optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
+        epochs: int = 20,
+        batch_size: int = 64,
+        verbose: bool = True,
+        mutate_fn: Callable = mutate_categorical,
+        mutate_fn_kwargs: dict = {"rate": 0.2, "seed": 0, "feature_range": (0, 255)},
+        mutate_batch_size: int = int(1e10),
+    ) -> None:
+        """
+        Train semantic and background generative models.
+
+        Parameters
+        ----------
+        x_ref : np.ndarray
+            Training batch.
+        threshold_perc : float, default 100.0
+            Percentage of reference data that is normal.
+        loss_fn : Optional[Callable], default None
+            Loss function used for training.
+        optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
+            Optimizer used for training.
+        epochs : int, default 20
+            Number of training epochs.
+        batch_size : int, default 64
+            Batch size used for training.
+        verbose : bool, default True
+            Whether to print training progress.
+        mutate_fn : Callable, default mutate_categorical
+            Mutation function used to generate the background dataset.
+        mutate_fn_kwargs : dict, default {"rate": 0.2, "seed": 0, "feature_range": (0, 255)}
+            Kwargs for the mutation function used to generate the background dataset.
+            Default values set for an image dataset.
+        mutate_batch_size : int, default int(1e10)
+            Batch size used to generate the mutations for the background dataset.
+        """
+        input_shape = x_ref.shape[1:]
+        optimizer = optimizer() if isinstance(optimizer, type) else optimizer
+        # Create two separate optimizers: one for the semantic model, one for the background model
+        optimizer_s = optimizer
+        optimizer_b = optimizer.__class__.from_config(optimizer.get_config())
+
+        # training arguments
+        kwargs = {
+            "epochs": epochs,
+            "batch_size": batch_size,
+            "verbose": verbose,
+        }
+
+        # create background data
+        mutate_fn = partial(mutate_fn, **mutate_fn_kwargs)
+        X_back = predict_batch(x_ref, mutate_fn, batch_size=mutate_batch_size, dtype=x_ref.dtype)
+
+        # prepare sequential data
+        if self.sequential and not self.has_log_prob:
+            y, y_back = x_ref[:, 1:], X_back[:, 1:]  # type: ignore
+            X, X_back = x_ref[:, :-1], X_back[:, :-1]  # type: ignore
+        else:
+            X = x_ref
+            y, y_back = None, None
+
+        # check if model needs to be built
+        use_build = self.has_log_prob and not isinstance(self.dist_s, keras.Model)
+
+        if use_build:
+            # build and train semantic model
+            self.model_s = build_model(self.dist_s, input_shape)[0]
+            self.model_s.compile(optimizer=optimizer_s)
+            self.model_s.fit(X, **kwargs)
+            # build and train background model
+            self.model_b = build_model(self.dist_b, input_shape)[0]
+            self.model_b.compile(optimizer=optimizer_b)
+            self.model_b.fit(X_back, **kwargs)
+        else:
+            # train semantic model
+            args = [self.dist_s, X]
+            kwargs.update({"y_train": y, "loss_fn": loss_fn, "optimizer": optimizer_s})
+            trainer(*args, **kwargs)
+
+            # train background model
+            args = [self.dist_b, X_back]
+            kwargs.update({"y_train": y_back, "loss_fn": loss_fn, "optimizer": optimizer_b})
+            trainer(*args, **kwargs)
+
+        self._data_info = self._get_data_info(x_ref)
+        self._ref_score = self.score(x_ref, batch_size=batch_size)
+        self._threshold_perc = threshold_perc
+
+    def _logp(
+        self,
+        dist,
+        X: np.ndarray,
+        return_per_feature: bool = False,
+        batch_size: int = int(1e10),
+    ) -> np.ndarray:
+        """
+        Compute log probability of a batch of instances under the generative model.
+        """
+        logp_fn = partial(dist.log_prob, return_per_feature=return_per_feature)
+        # TODO: TBD: can this be any of the other types from predict_batch? i.e. tf.Tensor or tuple
+        return predict_batch(X, logp_fn, batch_size=batch_size)  # type: ignore[return-value]
+
+    def _logp_alt(
+        self,
+        model: keras.Model,
+        X: np.ndarray,
+        return_per_feature: bool = False,
+        batch_size: int = int(1e10),
+    ) -> np.ndarray:
+        """
+        Compute log probability of a batch of instances with the user-defined log_prob function.
+        """
+        if self.sequential:
+            y, X = X[:, 1:], X[:, :-1]
+        else:
+            y = X.copy()
+        y_preds = predict_batch(X, model, batch_size=batch_size)
+        logp = self.log_prob(y, y_preds).numpy()  # type: ignore
+        if return_per_feature:
+            return logp
+        else:
+            axis = tuple(np.arange(len(logp.shape))[1:])
+            return np.mean(logp, axis=axis)
+
+    def _llr(self, X: np.ndarray, return_per_feature: bool, batch_size: int = int(1e10)) -> np.ndarray:
+        """
+        Compute likelihood ratios.
+
+        Parameters
+        ----------
+        X : np.ndarray
+            Batch of instances.
+        return_per_feature : bool
+            Return likelihood ratio per feature.
+        batch_size : int, default int(1e10)
+            Batch size for the generative model evaluations.
+
+        Returns
+        -------
+        Likelihood ratios.
+        """
+        logp_fn = self._logp if not isinstance(self.log_prob, Callable) else self._logp_alt  # type: ignore
+        logp_s = logp_fn(self.dist_s, X, return_per_feature=return_per_feature, batch_size=batch_size)
+        logp_b = logp_fn(self.dist_b, X, return_per_feature=return_per_feature, batch_size=batch_size)
+        return logp_s - logp_b
+
+    def score(
+        self,
+        X: np.ndarray,
+        batch_size: int = int(1e10),
+    ) -> OODScore:
+        self._validate(X)
+        fscore = -self._llr(X, True, batch_size=batch_size)
+        iscore = -self._llr(X, False, batch_size=batch_size)
+        return OODScore(iscore, fscore)
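The sign convention in `score` is easy to misread: the detector trains the semantic model on clean data and the background model on mutated data, then negates the log likelihood ratio, so instances that the background model explains about as well as (or better than) the semantic model receive the highest outlier scores. A minimal numpy sketch of that arithmetic, with made-up log probabilities:

import numpy as np

logp_s = np.array([-90.0, -95.0, -140.0])    # log p(x) under the semantic model
logp_b = np.array([-120.0, -118.0, -138.0])  # log p(x) under the background model

iscore = -(logp_s - logp_b)                  # instance score, as in score()
print(iscore)                                # [-30. -23.   2.] -> third instance is most OOD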
dataeval/_internal/detectors/ood/vae.py
@@ -0,0 +1,86 @@
+"""
+Source code derived from Alibi-Detect 0.11.4
+https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
+
+Original code Copyright (c) 2023 Seldon Technologies Ltd
+Licensed under Apache Software License (Apache 2.0)
+"""
+
+from typing import Callable
+
+import keras
+import numpy as np
+
+from dataeval._internal.detectors.ood.base import OODBase, OODScore
+from dataeval._internal.models.tensorflow.autoencoder import VAE
+from dataeval._internal.models.tensorflow.losses import Elbo
+from dataeval._internal.models.tensorflow.utils import predict_batch
+
+
+class OOD_VAE(OODBase):
+    def __init__(self, model: VAE, samples: int = 10) -> None:
+        """
+        VAE based outlier detector.
+
+        Parameters
+        ----------
+        model : VAE
+            A VAE model.
+        samples : int, default 10
+            Number of samples drawn to evaluate each instance.
+        """
+        super().__init__(model)
+        self.samples = samples
+
+    def fit(
+        self,
+        x_ref: np.ndarray,
+        threshold_perc: float = 100.0,
+        loss_fn: Callable = Elbo(0.05),
+        optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
+        epochs: int = 20,
+        batch_size: int = 64,
+        verbose: bool = True,
+    ) -> None:
+        """
+        Train the VAE model.
+
+        Parameters
+        ----------
+        x_ref : np.ndarray
+            Training batch.
+        threshold_perc : float, default 100.0
+            Percentage of reference data that is normal.
+        loss_fn : Callable, default Elbo(0.05)
+            Loss function used for training.
+        optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
+            Optimizer used for training.
+        epochs : int, default 20
+            Number of training epochs.
+        batch_size : int, default 64
+            Batch size used for training.
+        verbose : bool, default True
+            Whether to print training progress.
+        """
+        super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
+
+    def score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScore:
+        self._validate(X)
+
+        # sample reconstructed instances
+        X_samples = np.repeat(X, self.samples, axis=0)
+        X_recon = predict_batch(X_samples, model=self.model, batch_size=batch_size)
+
+        # compute feature scores
+        fscore = np.power(X_samples - X_recon, 2)
+        fscore = fscore.reshape((-1, self.samples) + X_samples.shape[1:])
+        fscore = np.mean(fscore, axis=1)
+
+        # compute instance scores
+        fscore_flat = fscore.reshape(fscore.shape[0], -1).copy()
+        n_score_features = int(np.ceil(fscore_flat.shape[1]))
+        sorted_fscore = np.sort(fscore_flat, axis=1)
+        sorted_fscore_perc = sorted_fscore[:, -n_score_features:]
+        iscore = np.mean(sorted_fscore_perc, axis=1)
+
+        return OODScore(iscore, fscore)
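`score` above draws `samples` stochastic reconstructions per instance, averages the squared reconstruction error into a per-feature score, then collapses that to an instance score; since `n_score_features` spans all features in this release, the instance score reduces to a plain mean. A minimal numpy sketch of the shape mechanics, with an identity-plus-noise stand-in for the VAE:

import numpy as np

rng = np.random.default_rng(0)
samples, shape = 10, (8, 8, 1)
X = rng.random((4, *shape)).astype(np.float32)          # 4 instances

# stand-in for predict_batch(X_samples, model=...): identity plus noise
X_samples = np.repeat(X, samples, axis=0)               # (40, 8, 8, 1)
X_recon = X_samples + 0.1 * rng.standard_normal(X_samples.shape).astype(np.float32)

fscore = np.power(X_samples - X_recon, 2)               # per-sample squared error
fscore = fscore.reshape((-1, samples) + X_samples.shape[1:]).mean(axis=1)
iscore = fscore.reshape(fscore.shape[0], -1).mean(axis=1)
print(fscore.shape, iscore.shape)                       # (4, 8, 8, 1) (4,)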