dataeval-0.61.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. dataeval/__init__.py +18 -0
  2. dataeval/_internal/detectors/__init__.py +0 -0
  3. dataeval/_internal/detectors/clusterer.py +469 -0
  4. dataeval/_internal/detectors/drift/__init__.py +0 -0
  5. dataeval/_internal/detectors/drift/base.py +265 -0
  6. dataeval/_internal/detectors/drift/cvm.py +97 -0
  7. dataeval/_internal/detectors/drift/ks.py +100 -0
  8. dataeval/_internal/detectors/drift/mmd.py +166 -0
  9. dataeval/_internal/detectors/drift/torch.py +310 -0
  10. dataeval/_internal/detectors/drift/uncertainty.py +149 -0
  11. dataeval/_internal/detectors/duplicates.py +49 -0
  12. dataeval/_internal/detectors/linter.py +78 -0
  13. dataeval/_internal/detectors/ood/__init__.py +0 -0
  14. dataeval/_internal/detectors/ood/ae.py +77 -0
  15. dataeval/_internal/detectors/ood/aegmm.py +69 -0
  16. dataeval/_internal/detectors/ood/base.py +199 -0
  17. dataeval/_internal/detectors/ood/llr.py +284 -0
  18. dataeval/_internal/detectors/ood/vae.py +86 -0
  19. dataeval/_internal/detectors/ood/vaegmm.py +79 -0
  20. dataeval/_internal/flags.py +47 -0
  21. dataeval/_internal/metrics/__init__.py +0 -0
  22. dataeval/_internal/metrics/base.py +92 -0
  23. dataeval/_internal/metrics/ber.py +124 -0
  24. dataeval/_internal/metrics/coverage.py +80 -0
  25. dataeval/_internal/metrics/divergence.py +94 -0
  26. dataeval/_internal/metrics/hash.py +79 -0
  27. dataeval/_internal/metrics/parity.py +180 -0
  28. dataeval/_internal/metrics/stats.py +332 -0
  29. dataeval/_internal/metrics/uap.py +45 -0
  30. dataeval/_internal/metrics/utils.py +158 -0
  31. dataeval/_internal/models/__init__.py +0 -0
  32. dataeval/_internal/models/pytorch/__init__.py +0 -0
  33. dataeval/_internal/models/pytorch/autoencoder.py +202 -0
  34. dataeval/_internal/models/pytorch/blocks.py +46 -0
  35. dataeval/_internal/models/pytorch/utils.py +67 -0
  36. dataeval/_internal/models/tensorflow/__init__.py +0 -0
  37. dataeval/_internal/models/tensorflow/autoencoder.py +317 -0
  38. dataeval/_internal/models/tensorflow/gmm.py +115 -0
  39. dataeval/_internal/models/tensorflow/losses.py +107 -0
  40. dataeval/_internal/models/tensorflow/pixelcnn.py +1106 -0
  41. dataeval/_internal/models/tensorflow/trainer.py +102 -0
  42. dataeval/_internal/models/tensorflow/utils.py +254 -0
  43. dataeval/_internal/workflows/sufficiency.py +555 -0
  44. dataeval/detectors/__init__.py +29 -0
  45. dataeval/flags/__init__.py +3 -0
  46. dataeval/metrics/__init__.py +7 -0
  47. dataeval/models/__init__.py +15 -0
  48. dataeval/models/tensorflow/__init__.py +6 -0
  49. dataeval/models/torch/__init__.py +8 -0
  50. dataeval/py.typed +0 -0
  51. dataeval/workflows/__init__.py +8 -0
  52. dataeval-0.61.0.dist-info/LICENSE.txt +21 -0
  53. dataeval-0.61.0.dist-info/METADATA +114 -0
  54. dataeval-0.61.0.dist-info/RECORD +55 -0
  55. dataeval-0.61.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,69 @@
+ """
+ Source code derived from Alibi-Detect 0.11.4
+ https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
+
+ Original code Copyright (c) 2023 Seldon Technologies Ltd
+ Licensed under Apache Software License (Apache 2.0)
+ """
+
+ from typing import Callable
+
+ import keras
+ import numpy as np
+
+ from dataeval._internal.detectors.ood.base import OODGMMBase, OODScore
+ from dataeval._internal.models.tensorflow.autoencoder import AEGMM
+ from dataeval._internal.models.tensorflow.gmm import gmm_energy
+ from dataeval._internal.models.tensorflow.losses import LossGMM
+ from dataeval._internal.models.tensorflow.utils import predict_batch
+
+
+ class OOD_AEGMM(OODGMMBase):
+     def __init__(self, model: AEGMM) -> None:
+         """
+         AE with Gaussian Mixture Model based outlier detector.
+
+         Parameters
+         ----------
+         model : AEGMM
+             An AEGMM model.
+         """
+         super().__init__(model)
+
+     def fit(
+         self,
+         x_ref: np.ndarray,
+         threshold_perc: float = 100.0,
+         loss_fn: Callable = LossGMM(),
+         optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
+         epochs: int = 20,
+         batch_size: int = 64,
+         verbose: bool = True,
+     ) -> None:
+         """
+         Train the AEGMM model with the recommended loss function and optimizer.
+
+         Parameters
+         ----------
+         x_ref : np.ndarray
+             Training batch.
+         threshold_perc : float, default 100.0
+             Percentage of reference data that is normal.
+         loss_fn : Callable, default LossGMM()
+             Loss function used for training.
+         optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
+             Optimizer used for training.
+         epochs : int, default 20
+             Number of training epochs.
+         batch_size : int, default 64
+             Batch size used for training.
+         verbose : bool, default True
+             Whether to print training progress.
+         """
+         super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
+
+     def score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScore:
+         self._validate(X)
+         _, z, _ = predict_batch(X, self.model, batch_size=batch_size)
+         energy, _ = gmm_energy(z, self.gmm_params, return_mean=False)
+         return OODScore(energy.numpy())  # type: ignore
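For context, a minimal usage sketch of the detector above, using the internal import paths shown in this diff. The `make_aegmm` helper is hypothetical; it stands in for however the AEGMM encoder, decoder, and GMM density nets are assembled in a real application.

    import numpy as np
    from dataeval._internal.detectors.ood.aegmm import OOD_AEGMM
    from dataeval._internal.models.tensorflow.autoencoder import AEGMM

    # Hypothetical builder for the AEGMM networks (not part of this package diff)
    model: AEGMM = make_aegmm(input_shape=(32, 32, 3))

    detector = OOD_AEGMM(model)
    x_ref = np.random.rand(128, 32, 32, 3).astype(np.float32)
    # Treat the lowest-energy 95% of the reference data as normal
    detector.fit(x_ref, threshold_perc=95.0, epochs=5)
    preds = detector.predict(x_ref[:16])
    # preds -> {"is_ood": ..., "instance_score": ..., "feature_score": ...}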
@@ -0,0 +1,199 @@
+ """
+ Source code derived from Alibi-Detect 0.11.4
+ https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
+
+ Original code Copyright (c) 2023 Seldon Technologies Ltd
+ Licensed under Apache Software License (Apache 2.0)
+ """
+
+ from abc import ABC, abstractmethod
+ from typing import Callable, Dict, List, Literal, NamedTuple, Optional, Tuple, cast
+
+ import keras
+ import numpy as np
+ import tensorflow as tf
+
+ from dataeval._internal.models.tensorflow.gmm import GaussianMixtureModelParams, gmm_params
+ from dataeval._internal.models.tensorflow.trainer import trainer
+
+
+ class OODScore(NamedTuple):
+     """
+     NamedTuple containing the instance and (optionally) feature score.
+
+     Parameters
+     ----------
+     instance_score : np.ndarray
+         Instance score of the evaluated dataset.
+     feature_score : Optional[np.ndarray], default None
+         Feature score, if available, of the evaluated dataset.
+     """
+
+     instance_score: np.ndarray
+     feature_score: Optional[np.ndarray] = None
+
+     def get(self, ood_type: Literal["instance", "feature"]) -> np.ndarray:
+         return self.instance_score if ood_type == "instance" or self.feature_score is None else self.feature_score
+
+
+ class OODBase(ABC):
+     def __init__(self, model: keras.Model) -> None:
+         self.model = model
+
+         self._ref_score: OODScore
+         self._threshold_perc: float
+         self._data_info: Optional[Tuple[tuple, type]] = None
+
+         if not isinstance(model, keras.Model):
+             raise TypeError("Model should be of type 'keras.Model'.")
+
+     def _get_data_info(self, X: np.ndarray) -> Tuple[tuple, type]:
+         if not isinstance(X, np.ndarray):
+             raise TypeError("Dataset should be of type: `np.ndarray`.")
+         return X.shape[1:], X.dtype.type
+
+     def _validate(self, X: np.ndarray) -> None:
+         check_data_info = self._get_data_info(X)
+         if self._data_info is not None and check_data_info != self._data_info:
+             raise RuntimeError(
+                 f"Expect data of type: {self._data_info[1]} and shape: {self._data_info[0]}. "
+                 f"Provided data is type: {check_data_info[1]} and shape: {check_data_info[0]}."
+             )
+
+     def _validate_state(self, X: np.ndarray, additional_attrs: Optional[List[str]] = None) -> None:
+         attrs = ["_data_info", "_threshold_perc", "_ref_score"]
+         attrs = attrs if additional_attrs is None else attrs + additional_attrs
+         if not all(hasattr(self, attr) for attr in attrs) or any(getattr(self, attr) is None for attr in attrs):
+             raise RuntimeError("Metric needs to be `fit` before method call.")
+         self._validate(X)
+
+     @abstractmethod
+     def score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScore:
+         """
+         Compute instance and (optionally) feature level outlier scores.
+
+         Parameters
+         ----------
+         X : np.ndarray
+             Batch of instances.
+         batch_size : int, default int(1e10)
+             Batch size used when making predictions with the autoencoder.
+
+         Returns
+         -------
+         Instance and feature level outlier scores.
+         """
+
+     def _threshold_score(self, ood_type: Literal["feature", "instance"] = "instance") -> np.floating:
+         return np.percentile(self._ref_score.get(ood_type), self._threshold_perc)
+
+     def fit(
+         self,
+         x_ref: np.ndarray,
+         threshold_perc: float,
+         loss_fn: Callable,
+         optimizer: keras.optimizers.Optimizer,
+         epochs: int,
+         batch_size: int,
+         verbose: bool,
+     ) -> None:
+         """
+         Train the model and infer the threshold value.
+
+         Parameters
+         ----------
+         x_ref : np.ndarray
+             Training batch.
+         threshold_perc : float
+             Percentage of reference data that is normal.
+         loss_fn : Callable
+             Loss function used for training.
+         optimizer : keras.optimizers.Optimizer
+             Optimizer used for training.
+         epochs : int
+             Number of training epochs.
+         batch_size : int
+             Batch size used for training.
+         verbose : bool
+             Whether to print training progress.
+         """
+         # Train the model
+         trainer(
+             model=self.model,
+             loss_fn=loss_fn,
+             x_train=x_ref,
+             optimizer=optimizer,
+             epochs=epochs,
+             batch_size=batch_size,
+             verbose=verbose,
+         )
+
+         # Infer the threshold values
+         self._ref_score = self.score(x_ref, batch_size)
+         self._threshold_perc = threshold_perc
+
+     def predict(
+         self,
+         X: np.ndarray,
+         batch_size: int = int(1e10),
+         ood_type: Literal["feature", "instance"] = "instance",
+     ) -> Dict[str, np.ndarray]:
+         """
+         Predict whether instances are out-of-distribution or not.
+
+         Parameters
+         ----------
+         X : np.ndarray
+             Batch of instances.
+         batch_size : int, default int(1e10)
+             Batch size used when making predictions with the autoencoder.
+         ood_type : Literal["feature", "instance"], default "instance"
+             Predict out-of-distribution at the 'feature' or 'instance' level.
+
+         Returns
+         -------
+         Dictionary containing the outlier predictions and both feature and instance level outlier scores.
+         """
+         self._validate_state(X)
+         # compute outlier scores
+         score = self.score(X, batch_size=batch_size)
+         ood_pred = (score.get(ood_type) > self._threshold_score(ood_type)).astype(int)
+         return {"is_ood": ood_pred, **score._asdict()}
+
+
+ class OODGMMBase(OODBase):
+     def __init__(self, model: keras.Model) -> None:
+         super().__init__(model)
+         self.gmm_params: GaussianMixtureModelParams
+
+     def _validate_state(self, X: np.ndarray, additional_attrs: Optional[List[str]] = None) -> None:
+         if additional_attrs is None:
+             additional_attrs = ["gmm_params"]
+         super()._validate_state(X, additional_attrs)
+
+     def fit(
+         self,
+         x_ref: np.ndarray,
+         threshold_perc: float,
+         loss_fn: Callable[[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor], tf.Tensor],
+         optimizer: keras.optimizers.Optimizer,
+         epochs: int,
+         batch_size: int,
+         verbose: bool,
+     ) -> None:
+         # Train the model
+         trainer(
+             model=self.model,
+             loss_fn=loss_fn,
+             x_train=x_ref,
+             optimizer=optimizer,
+             epochs=epochs,
+             batch_size=batch_size,
+             verbose=verbose,
+         )
+
+         # Calculate the GMM parameters
+         _, z, gamma = cast(Tuple[tf.Tensor, tf.Tensor, tf.Tensor], self.model(x_ref))
+         self.gmm_params = gmm_params(z, gamma)
+
+         # Infer the threshold values
+         self._ref_score = self.score(x_ref, batch_size)
+         self._threshold_perc = threshold_perc
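For orientation, a minimal sketch (illustrative only, not part of the package) of how a concrete detector plugs into OODBase: a subclass implements only `score`, while `fit`, `predict`, and the percentile thresholding are inherited from the base class.

    import numpy as np

    from dataeval._internal.detectors.ood.base import OODBase, OODScore


    class OOD_Reconstruction(OODBase):
        """Illustrative detector scoring instances by autoencoder reconstruction error."""

        def score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScore:
            self._validate(X)
            # squared reconstruction error per feature
            X_recon = self.model.predict(X, batch_size=batch_size)
            fscore = np.power(X - X_recon, 2)
            # mean error per instance
            iscore = fscore.reshape(fscore.shape[0], -1).mean(axis=1)
            return OODScore(iscore, fscore)


    # detector = OOD_Reconstruction(autoencoder)  # any keras.Model mapping X -> X
    # detector.fit(x_ref, 95.0, loss_fn, optimizer, epochs=10, batch_size=64, verbose=True)
    # detector.predict(x_new)  # {"is_ood": ..., "instance_score": ..., "feature_score": ...}

Because `fit` stores the reference scores and `_threshold_score` takes a percentile over them, `threshold_perc=95.0` flags any instance scoring above the 95th percentile of the reference data.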
@@ -0,0 +1,284 @@
+ """
+ Source code derived from Alibi-Detect 0.11.4
+ https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
+
+ Original code Copyright (c) 2023 Seldon Technologies Ltd
+ Licensed under Apache Software License (Apache 2.0)
+ """
+
+ from functools import partial
+ from typing import Callable, Optional, Tuple
+
+ import keras
+ import numpy as np
+ import tensorflow as tf
+ from keras.layers import Input
+ from keras.models import Model
+
+ from dataeval._internal.detectors.ood.base import OODBase, OODScore
+ from dataeval._internal.models.tensorflow.pixelcnn import PixelCNN
+ from dataeval._internal.models.tensorflow.trainer import trainer
+ from dataeval._internal.models.tensorflow.utils import predict_batch
+
+
+ def build_model(
+     dist: PixelCNN, input_shape: Optional[tuple] = None, filepath: Optional[str] = None
+ ) -> Tuple[keras.Model, PixelCNN]:
+     """
+     Create a keras.Model from a TF distribution.
+
+     Parameters
+     ----------
+     dist : PixelCNN
+         TensorFlow distribution.
+     input_shape : Optional[tuple], default None
+         Input shape of the model.
+     filepath : Optional[str], default None
+         File to load model weights from.
+
+     Returns
+     -------
+     The TensorFlow model and its distribution.
+     """
+     x_in = Input(shape=input_shape)
+     log_prob = dist.log_prob(x_in)
+     model = Model(inputs=x_in, outputs=log_prob)
+     model.add_loss(-tf.reduce_mean(log_prob))
+     if isinstance(filepath, str):
+         model.load_weights(filepath)
+     return model, dist
+
+
+ def mutate_categorical(
+     X: np.ndarray,
+     rate: float,
+     seed: int = 0,
+     feature_range: tuple = (0, 255),
+ ) -> tf.Tensor:
+     """
+     Randomly change integer feature values to values within a set range
+     with a specified permutation rate.
+
+     Parameters
+     ----------
+     X : np.ndarray
+         Batch of data to be perturbed.
+     rate : float
+         Permutation rate (between 0 and 1).
+     seed : int, default 0
+         Random seed.
+     feature_range : tuple, default (0, 255)
+         Min and max range for perturbed features.
+
+     Returns
+     -------
+     Tensor with perturbed data.
+     """
+     frange = (feature_range[0] + 1, feature_range[1] + 1)
+     shape = X.shape
+     n_samples = np.prod(shape)
+     mask = tf.random.categorical(tf.math.log([[1.0 - rate, rate]]), n_samples, seed=seed, dtype=tf.int32)
+     mask = tf.reshape(mask, shape)
+     possible_mutations = tf.random.uniform(shape, minval=frange[0], maxval=frange[1], dtype=tf.int32, seed=seed + 1)
+     X = tf.math.floormod(tf.cast(X, tf.int32) + mask * possible_mutations, frange[1])  # type: ignore py38
+     return tf.cast(X, tf.float32)  # type: ignore
+
+
+ class OOD_LLR(OODBase):
+     def __init__(
+         self,
+         model: PixelCNN,
+         model_background: Optional[PixelCNN] = None,
+         log_prob: Optional[Callable] = None,
+         sequential: bool = False,
+     ) -> None:
+         """
+         Likelihood Ratios based outlier detector.
+
+         Parameters
+         ----------
+         model : PixelCNN
+             Generative distribution model.
+         model_background : Optional[PixelCNN], default None
+             Optional model for the background. Only needed if it is different from `model`.
+         log_prob : Optional[Callable], default None
+             Function used to evaluate log probabilities under the model
+             if the model does not have a `log_prob` function.
+         sequential : bool, default False
+             Whether the data is sequential. Used to create targets during training.
+         """
+         self.dist_s = model
+         # Use the supplied background model, else a copy of the semantic model
+         self.dist_b = (
+             model_background
+             if model_background is not None
+             else model.copy()
+             if hasattr(model, "copy")
+             else keras.models.clone_model(model)
+         )
+         self.has_log_prob = hasattr(model, "log_prob")
+         self.sequential = sequential
+         self.log_prob = log_prob
+
+         self._ref_score: OODScore
+         self._threshold_perc: float
+         self._data_info: Optional[Tuple[tuple, type]] = None
+
+     def fit(
+         self,
+         x_ref: np.ndarray,
+         threshold_perc: float = 100.0,
+         loss_fn: Optional[Callable] = None,
+         optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
+         epochs: int = 20,
+         batch_size: int = 64,
+         verbose: bool = True,
+         mutate_fn: Callable = mutate_categorical,
+         mutate_fn_kwargs: dict = {"rate": 0.2, "seed": 0, "feature_range": (0, 255)},
+         mutate_batch_size: int = int(1e10),
+     ) -> None:
+         """
+         Train the semantic and background generative models.
+
+         Parameters
+         ----------
+         x_ref : np.ndarray
+             Training batch.
+         threshold_perc : float, default 100.0
+             Percentage of reference data that is normal.
+         loss_fn : Optional[Callable], default None
+             Loss function used for training.
+         optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
+             Optimizer used for training.
+         epochs : int, default 20
+             Number of training epochs.
+         batch_size : int, default 64
+             Batch size used for training.
+         verbose : bool, default True
+             Whether to print training progress.
+         mutate_fn : Callable, default mutate_categorical
+             Mutation function used to generate the background dataset.
+         mutate_fn_kwargs : dict, default {"rate": 0.2, "seed": 0, "feature_range": (0, 255)}
+             Kwargs for the mutation function used to generate the background dataset.
+             Default values set for an image dataset.
+         mutate_batch_size : int, default int(1e10)
+             Batch size used to generate the mutations for the background dataset.
+         """
+         input_shape = x_ref.shape[1:]
+         optimizer = optimizer() if isinstance(optimizer, type) else optimizer
+         # Use two separate optimizers, one for the semantic model and one for the background model
+         optimizer_s = optimizer
+         optimizer_b = optimizer.__class__.from_config(optimizer.get_config())
+
+         # training arguments
+         kwargs = {
+             "epochs": epochs,
+             "batch_size": batch_size,
+             "verbose": verbose,
+         }
+
+         # create background data by mutating the reference data
+         mutate_fn = partial(mutate_fn, **mutate_fn_kwargs)
+         X_back = predict_batch(x_ref, mutate_fn, batch_size=mutate_batch_size, dtype=x_ref.dtype)
+
+         # prepare sequential data
+         if self.sequential and not self.has_log_prob:
+             y, y_back = x_ref[:, 1:], X_back[:, 1:]  # type: ignore
+             X, X_back = x_ref[:, :-1], X_back[:, :-1]  # type: ignore
+         else:
+             X = x_ref
+             y, y_back = None, None
+
+         # check if model needs to be built
+         use_build = self.has_log_prob and not isinstance(self.dist_s, keras.Model)
+
+         if use_build:
+             # build and train semantic model
+             self.model_s = build_model(self.dist_s, input_shape)[0]
+             self.model_s.compile(optimizer=optimizer_s)
+             self.model_s.fit(X, **kwargs)
+             # build and train background model
+             self.model_b = build_model(self.dist_b, input_shape)[0]
+             self.model_b.compile(optimizer=optimizer_b)
+             self.model_b.fit(X_back, **kwargs)
+         else:
+             # train semantic model
+             args = [self.dist_s, X]
+             kwargs.update({"y_train": y, "loss_fn": loss_fn, "optimizer": optimizer_s})
+             trainer(*args, **kwargs)
+
+             # train background model
+             args = [self.dist_b, X_back]
+             kwargs.update({"y_train": y_back, "loss_fn": loss_fn, "optimizer": optimizer_b})
+             trainer(*args, **kwargs)
+
+         self._data_info = self._get_data_info(x_ref)
+         self._ref_score = self.score(x_ref, batch_size=batch_size)
+         self._threshold_perc = threshold_perc
+
+     def _logp(
+         self,
+         dist,
+         X: np.ndarray,
+         return_per_feature: bool = False,
+         batch_size: int = int(1e10),
+     ) -> np.ndarray:
+         """
+         Compute log probability of a batch of instances under the generative model.
+         """
+         logp_fn = partial(dist.log_prob, return_per_feature=return_per_feature)
+         # TODO: TBD: can this be any of the other types from predict_batch? i.e. tf.Tensor or tuple
+         return predict_batch(X, logp_fn, batch_size=batch_size)  # type: ignore[return-value]
+
+     def _logp_alt(
+         self,
+         model: keras.Model,
+         X: np.ndarray,
+         return_per_feature: bool = False,
+         batch_size: int = int(1e10),
+     ) -> np.ndarray:
+         """
+         Compute log probability of a batch of instances with the user-defined log_prob function.
+         """
+         if self.sequential:
+             y, X = X[:, 1:], X[:, :-1]
+         else:
+             y = X.copy()
+         y_preds = predict_batch(X, model, batch_size=batch_size)
+         logp = self.log_prob(y, y_preds).numpy()  # type: ignore
+         if return_per_feature:
+             return logp
+         else:
+             axis = tuple(np.arange(len(logp.shape))[1:])
+             return np.mean(logp, axis=axis)
+
+     def _llr(self, X: np.ndarray, return_per_feature: bool, batch_size: int = int(1e10)) -> np.ndarray:
+         """
+         Compute likelihood ratios.
+
+         Parameters
+         ----------
+         X : np.ndarray
+             Batch of instances.
+         return_per_feature : bool
+             Return likelihood ratio per feature.
+         batch_size : int, default int(1e10)
+             Batch size for the generative model evaluations.
+
+         Returns
+         -------
+         Likelihood ratios.
+         """
+         logp_fn = self._logp if not isinstance(self.log_prob, Callable) else self._logp_alt  # type: ignore
+         logp_s = logp_fn(self.dist_s, X, return_per_feature=return_per_feature, batch_size=batch_size)
+         logp_b = logp_fn(self.dist_b, X, return_per_feature=return_per_feature, batch_size=batch_size)
+         return logp_s - logp_b
+
+     def score(
+         self,
+         X: np.ndarray,
+         batch_size: int = int(1e10),
+     ) -> OODScore:
+         self._validate(X)
+         fscore = -self._llr(X, True, batch_size=batch_size)
+         iscore = -self._llr(X, False, batch_size=batch_size)
+         return OODScore(iscore, fscore)
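To make the sign convention above concrete, a toy worked example (plain numbers, not package code): `score` returns the negative log-likelihood ratio, so an instance that the background model explains better than the semantic model receives a higher outlier score.

    import numpy as np

    # log p(x) under the semantic model (trained on x_ref) for two instances
    logp_semantic = np.array([-4.2, -9.1])
    # log p(x) under the background model (trained on mutated x_ref)
    logp_background = np.array([-5.0, -8.7])

    # instance score = -LLR = -(log p_s - log p_b)
    instance_score = -(logp_semantic - logp_background)
    print(instance_score)  # approximately [-0.8  0.4]; the second instance scores as more OOD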
@@ -0,0 +1,86 @@
+ """
+ Source code derived from Alibi-Detect 0.11.4
+ https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
+
+ Original code Copyright (c) 2023 Seldon Technologies Ltd
+ Licensed under Apache Software License (Apache 2.0)
+ """
+
+ from typing import Callable
+
+ import keras
+ import numpy as np
+
+ from dataeval._internal.detectors.ood.base import OODBase, OODScore
+ from dataeval._internal.models.tensorflow.autoencoder import VAE
+ from dataeval._internal.models.tensorflow.losses import Elbo
+ from dataeval._internal.models.tensorflow.utils import predict_batch
+
+
+ class OOD_VAE(OODBase):
+     def __init__(self, model: VAE, samples: int = 10) -> None:
+         """
+         VAE based outlier detector.
+
+         Parameters
+         ----------
+         model : VAE
+             A VAE model.
+         samples : int, default 10
+             Number of samples drawn to evaluate each instance.
+         """
+         super().__init__(model)
+         self.samples = samples
+
+     def fit(
+         self,
+         x_ref: np.ndarray,
+         threshold_perc: float = 100.0,
+         loss_fn: Callable = Elbo(0.05),
+         optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
+         epochs: int = 20,
+         batch_size: int = 64,
+         verbose: bool = True,
+     ) -> None:
+         """
+         Train the VAE model.
+
+         Parameters
+         ----------
+         x_ref : np.ndarray
+             Training batch.
+         threshold_perc : float, default 100.0
+             Percentage of reference data that is normal.
+         loss_fn : Callable, default Elbo(0.05)
+             Loss function used for training.
+         optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
+             Optimizer used for training.
+         epochs : int, default 20
+             Number of training epochs.
+         batch_size : int, default 64
+             Batch size used for training.
+         verbose : bool, default True
+             Whether to print training progress.
+         """
+         super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
+
+     def score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScore:
+         self._validate(X)
+
+         # sample reconstructed instances
+         X_samples = np.repeat(X, self.samples, axis=0)
+         X_recon = predict_batch(X_samples, model=self.model, batch_size=batch_size)
+
+         # compute feature scores
+         fscore = np.power(X_samples - X_recon, 2)
+         fscore = fscore.reshape((-1, self.samples) + X_samples.shape[1:])
+         fscore = np.mean(fscore, axis=1)
+
+         # compute instance scores (all features contribute to the instance score)
+         fscore_flat = fscore.reshape(fscore.shape[0], -1).copy()
+         n_score_features = int(np.ceil(fscore_flat.shape[1]))
+         sorted_fscore = np.sort(fscore_flat, axis=1)
+         sorted_fscore_perc = sorted_fscore[:, -n_score_features:]
+         iscore = np.mean(sorted_fscore_perc, axis=1)
+
+         return OODScore(iscore, fscore)
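And a closing usage sketch for the VAE detector, again using the internal import paths shown in this diff. The `make_vae` builder is hypothetical; it stands in for however the VAE encoder and decoder nets are assembled.

    import numpy as np
    from dataeval._internal.detectors.ood.vae import OOD_VAE
    from dataeval._internal.models.tensorflow.autoencoder import VAE

    # Hypothetical builder for the VAE networks (not part of this package diff)
    vae: VAE = make_vae(input_shape=(32, 32, 3), latent_dim=64)

    detector = OOD_VAE(vae, samples=10)
    x_ref = np.random.rand(256, 32, 32, 3).astype(np.float32)
    detector.fit(x_ref, threshold_perc=97.5, epochs=5)
    result = detector.predict(x_ref[:8], ood_type="instance")
    print(result["is_ood"], result["instance_score"].shape)  # flags plus (8,) instance scores

Because the detector repeats each instance `samples` times before reconstruction, memory use scales with `samples`; lowering it trades score stability for throughput.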