dataeval 0.61.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +18 -0
- dataeval/_internal/detectors/__init__.py +0 -0
- dataeval/_internal/detectors/clusterer.py +469 -0
- dataeval/_internal/detectors/drift/__init__.py +0 -0
- dataeval/_internal/detectors/drift/base.py +265 -0
- dataeval/_internal/detectors/drift/cvm.py +97 -0
- dataeval/_internal/detectors/drift/ks.py +100 -0
- dataeval/_internal/detectors/drift/mmd.py +166 -0
- dataeval/_internal/detectors/drift/torch.py +310 -0
- dataeval/_internal/detectors/drift/uncertainty.py +149 -0
- dataeval/_internal/detectors/duplicates.py +49 -0
- dataeval/_internal/detectors/linter.py +78 -0
- dataeval/_internal/detectors/ood/__init__.py +0 -0
- dataeval/_internal/detectors/ood/ae.py +77 -0
- dataeval/_internal/detectors/ood/aegmm.py +69 -0
- dataeval/_internal/detectors/ood/base.py +199 -0
- dataeval/_internal/detectors/ood/llr.py +284 -0
- dataeval/_internal/detectors/ood/vae.py +86 -0
- dataeval/_internal/detectors/ood/vaegmm.py +79 -0
- dataeval/_internal/flags.py +47 -0
- dataeval/_internal/metrics/__init__.py +0 -0
- dataeval/_internal/metrics/base.py +92 -0
- dataeval/_internal/metrics/ber.py +124 -0
- dataeval/_internal/metrics/coverage.py +80 -0
- dataeval/_internal/metrics/divergence.py +94 -0
- dataeval/_internal/metrics/hash.py +79 -0
- dataeval/_internal/metrics/parity.py +180 -0
- dataeval/_internal/metrics/stats.py +332 -0
- dataeval/_internal/metrics/uap.py +45 -0
- dataeval/_internal/metrics/utils.py +158 -0
- dataeval/_internal/models/__init__.py +0 -0
- dataeval/_internal/models/pytorch/__init__.py +0 -0
- dataeval/_internal/models/pytorch/autoencoder.py +202 -0
- dataeval/_internal/models/pytorch/blocks.py +46 -0
- dataeval/_internal/models/pytorch/utils.py +67 -0
- dataeval/_internal/models/tensorflow/__init__.py +0 -0
- dataeval/_internal/models/tensorflow/autoencoder.py +317 -0
- dataeval/_internal/models/tensorflow/gmm.py +115 -0
- dataeval/_internal/models/tensorflow/losses.py +107 -0
- dataeval/_internal/models/tensorflow/pixelcnn.py +1106 -0
- dataeval/_internal/models/tensorflow/trainer.py +102 -0
- dataeval/_internal/models/tensorflow/utils.py +254 -0
- dataeval/_internal/workflows/sufficiency.py +555 -0
- dataeval/detectors/__init__.py +29 -0
- dataeval/flags/__init__.py +3 -0
- dataeval/metrics/__init__.py +7 -0
- dataeval/models/__init__.py +15 -0
- dataeval/models/tensorflow/__init__.py +6 -0
- dataeval/models/torch/__init__.py +8 -0
- dataeval/py.typed +0 -0
- dataeval/workflows/__init__.py +8 -0
- dataeval-0.61.0.dist-info/LICENSE.txt +21 -0
- dataeval-0.61.0.dist-info/METADATA +114 -0
- dataeval-0.61.0.dist-info/RECORD +55 -0
- dataeval-0.61.0.dist-info/WHEEL +4 -0
dataeval/_internal/models/tensorflow/trainer.py
@@ -0,0 +1,102 @@
"""
Source code derived from Alibi-Detect 0.11.4
https://github.com/SeldonIO/alibi-detect/tree/v0.11.4

Original code Copyright (c) 2023 Seldon Technologies Ltd
Licensed under Apache Software License (Apache 2.0)
"""

from typing import Callable, Iterable, Optional, Tuple, cast

import keras
import numpy as np
import tensorflow as tf


def trainer(
    model: keras.Model,
    x_train: np.ndarray,
    y_train: Optional[np.ndarray] = None,
    loss_fn: Optional[Callable[..., tf.Tensor]] = None,
    optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
    preprocess_fn: Optional[Callable[[tf.Tensor], tf.Tensor]] = None,
    epochs: int = 20,
    reg_loss_fn: Callable[[keras.Model], tf.Tensor] = (lambda _: cast(tf.Tensor, tf.Variable(0, dtype=tf.float32))),
    batch_size: int = 64,
    buffer_size: int = 1024,
    verbose: bool = True,
) -> None:
    """
    Train TensorFlow model.

    Parameters
    ----------
    model
        Model to train.
    loss_fn
        Loss function used for training.
    x_train
        Training data.
    y_train
        Training labels.
    optimizer
        Optimizer used for training.
    preprocess_fn
        Preprocessing function applied to each training batch.
    epochs
        Number of training epochs.
    reg_loss_fn
        Allows an additional regularisation term to be defined as reg_loss_fn(model)
    batch_size
        Batch size used for training.
    buffer_size
        Maximum number of elements that will be buffered when prefetching.
    verbose
        Whether to print training progress.
    """
    loss_fn = loss_fn() if isinstance(loss_fn, type) else loss_fn
    optimizer = optimizer() if isinstance(optimizer, type) else optimizer

    train_data = x_train if y_train is None else (x_train, y_train)
    dataset = tf.data.Dataset.from_tensor_slices(train_data)
    dataset = dataset.shuffle(buffer_size=buffer_size).batch(batch_size)
    n_minibatch = len(dataset)

    # iterate over epochs
    for epoch in range(epochs):
        pbar = keras.utils.Progbar(n_minibatch, 1) if verbose else None
        if hasattr(dataset, "on_epoch_end"):
            dataset.on_epoch_end()  # type: ignore py39
        loss_val_ma = 0.0
        for step, data in enumerate(dataset):
            x, y = cast(Tuple[tf.Tensor, Optional[tf.Tensor]], data if isinstance(data, tuple) else (data, None))
            if isinstance(preprocess_fn, Callable):
                x = preprocess_fn(x)
            with tf.GradientTape() as tape:
                y_hat = model(x)
                y = x if y is None else y
                if isinstance(loss_fn, Callable):
                    args = [y] + list(y_hat) if isinstance(y_hat, Tuple) else [y, y_hat]
                    loss = loss_fn(*args)
                else:
                    loss = cast(tf.Tensor, tf.constant(0.0, dtype=tf.float32))
                if model.losses:  # additional model losses
                    loss = cast(tf.Tensor, tf.add(sum(model.losses), loss))
                loss = cast(tf.Tensor, tf.add(reg_loss_fn(model), loss))  # alternative way they might be specified

            grads = cast(Iterable, tape.gradient(loss, model.trainable_weights))
            optimizer.apply_gradients(zip(grads, model.trainable_weights))
            if pbar is not None:
                loss_val = getattr(loss, "numpy")() if hasattr(loss, "numpy") else np.float32(0.0)
                if loss_val.shape and loss_val.shape[0] != batch_size:
                    if len(loss_val.shape) == 1:
                        shape = (batch_size - loss_val.shape[0],)
                    elif len(loss_val.shape) == 2:
                        shape = (batch_size - loss_val.shape[0], loss_val.shape[1])
                    else:
                        continue
                    add_mean = np.ones(shape) * loss_val.mean()
                    loss_val = np.r_[loss_val, add_mean]
                loss_val_ma = loss_val_ma + (loss_val - loss_val_ma) / (step + 1)
                pbar_values = [("loss_ma", loss_val_ma)]
                pbar.add(1, values=pbar_values)
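For orientation, here is a minimal usage sketch of `trainer` (not part of the package diff): it builds the default `AE` via `create_model` from the `utils.py` hunk below and trains it on random image data with a plain MSE reconstruction loss. The shapes and hyperparameters are illustrative assumptions, not values from the package.

# Hypothetical usage sketch, not from the package: train a default
# autoencoder with the trainer() shown above.
import keras
import numpy as np

from dataeval._internal.models.tensorflow.autoencoder import AE
from dataeval._internal.models.tensorflow.trainer import trainer
from dataeval._internal.models.tensorflow.utils import create_model

x_train = np.random.rand(128, 32, 32, 3).astype(np.float32)  # N, H, W, C in [0, 1]
model = create_model(AE, input_shape=(32, 32, 3))

# With y_train=None the trainer uses x itself as the target (y = x), so a
# pointwise reconstruction loss such as MSE is a reasonable choice here.
trainer(
    model,
    x_train,
    loss_fn=keras.losses.MeanSquaredError(),
    epochs=2,
    batch_size=32,
    verbose=True,
)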
dataeval/_internal/models/tensorflow/utils.py
@@ -0,0 +1,254 @@
"""
Source code derived from Alibi-Detect 0.11.4
https://github.com/SeldonIO/alibi-detect/tree/v0.11.4

Original code Copyright (c) 2023 Seldon Technologies Ltd
Licensed under Apache Software License (Apache 2.0)
"""

import math
from typing import Callable, Optional, Tuple, Type, Union, cast

import keras as keras
import numpy as np
import tensorflow as tf
from keras import Sequential
from keras.layers import (
    Conv2D,
    Conv2DTranspose,
    Dense,
    Flatten,
    InputLayer,
    Reshape,
)
from tensorflow._api.v2.nn import relu, softmax, tanh

from dataeval._internal.models.tensorflow.autoencoder import AE, AEGMM, VAE, VAEGMM
from dataeval._internal.models.tensorflow.pixelcnn import PixelCNN


def predict_batch(
    x: Union[list, np.ndarray, tf.Tensor],
    model: Union[Callable, keras.Model],
    batch_size: int = int(1e10),
    preprocess_fn: Optional[Callable] = None,
    dtype: Union[Type[np.generic], tf.DType] = np.float32,
) -> Union[np.ndarray, tf.Tensor, tuple, list]:
    """
    Make batch predictions on a model.

    Parameters
    ----------
    x
        Batch of instances.
    model
        tf.keras model or one of the other permitted types defined in Data.
    batch_size
        Batch size used during prediction.
    preprocess_fn
        Optional preprocessing function for each batch.
    dtype
        Model output type, e.g. np.float32 or tf.float32.

    Returns
    -------
    Numpy array, tensorflow tensor or tuples of those with model outputs.
    """
    n = len(x)
    n_minibatch = int(np.ceil(n / batch_size))
    return_np = not isinstance(dtype, tf.DType)
    return_list = False
    preds: Union[list, tuple] = []
    for i in range(n_minibatch):
        istart, istop = i * batch_size, min((i + 1) * batch_size, n)
        x_batch = x[istart:istop]  # type: ignore
        if isinstance(preprocess_fn, Callable):  # type: ignore
            x_batch = preprocess_fn(x_batch)
        preds_tmp = model(x_batch)
        if isinstance(preds_tmp, (list, tuple)):
            if len(preds) == 0:  # init tuple with lists to store predictions
                preds = tuple([] for _ in range(len(preds_tmp)))
                return_list = isinstance(preds_tmp, list)
            for j, p in enumerate(preds_tmp):
                preds[j].append(p if not return_np or isinstance(p, np.ndarray) else p.numpy())
        elif isinstance(preds_tmp, (np.ndarray, tf.Tensor)):
            preds.append(  # type: ignore
                preds_tmp
                if not return_np or isinstance(preds_tmp, np.ndarray)  # type: ignore
                else preds_tmp.numpy()  # type: ignore
            )
        else:
            raise TypeError(
                f"Model output type {type(preds_tmp)} not supported. The model output "
                f"type needs to be one of list, tuple, np.ndarray or tf.Tensor."
            )
    concat = np.concatenate if return_np else tf.concat
    out = cast(
        Union[tuple, tf.Tensor, np.ndarray],
        tuple(concat(p, axis=0) for p in preds) if isinstance(preds, tuple) else concat(preds, axis=0),
    )
    if return_list:
        out = list(out)
    return out


def _get_default_encoder_net(input_shape: Tuple[int, int, int], encoding_dim: int):
    return Sequential(
        [
            InputLayer(input_shape=input_shape),
            Conv2D(64, 4, strides=2, padding="same", activation=relu),
            Conv2D(128, 4, strides=2, padding="same", activation=relu),
            Conv2D(512, 4, strides=2, padding="same", activation=relu),
            Flatten(),
            Dense(encoding_dim),
        ]
    )


def _get_default_decoder_net(input_shape: Tuple[int, int, int], encoding_dim: int):
    return Sequential(
        [
            InputLayer(input_shape=(encoding_dim,)),
            Dense(4 * 4 * 128),
            Reshape(target_shape=(4, 4, 128)),
            Conv2DTranspose(256, 4, strides=2, padding="same", activation=relu),
            Conv2DTranspose(64, 4, strides=2, padding="same", activation=relu),
            Flatten(),
            Dense(math.prod(input_shape)),
            Reshape(target_shape=input_shape),
        ]
    )


def create_model(
    model_type: Union[AE, AEGMM, PixelCNN, VAE, VAEGMM],
    input_shape: Tuple[int, int, int],
    encoding_dim: Optional[int] = None,
    n_gmm: Optional[int] = None,
    gmm_latent_dim: Optional[int] = None,
):
    """
    Create a default model for the specified model type.

    Parameters
    ----------
    model_type
        The model type to create.
    input_shape
        The input shape of the data used.
    encoding_dim
        The target encoding dimensionality.
    n_gmm
        Number of components used in the GMM layer.
    gmm_latent_dim
        Latent dimensionality of the GMM layer.
    """
    input_dim = math.prod(input_shape)
    encoding_dim = int(math.pow(2, int(input_dim.bit_length() * 0.8)) if encoding_dim is None else encoding_dim)
    if model_type == AE:
        return AE(
            _get_default_encoder_net(input_shape, encoding_dim),
            _get_default_decoder_net(input_shape, encoding_dim),
        )

    if model_type == VAE:
        return VAE(
            _get_default_encoder_net(input_shape, encoding_dim),
            _get_default_decoder_net(input_shape, encoding_dim),
            encoding_dim,
        )

    if model_type == AEGMM:
        n_gmm = 2 if n_gmm is None else n_gmm
        gmm_latent_dim = 1 if gmm_latent_dim is None else gmm_latent_dim
        # The outlier detector is an encoder/decoder architecture
        encoder_net = Sequential(
            [
                Flatten(),
                InputLayer(input_shape=(input_dim,)),
                Dense(60, activation=tanh),
                Dense(30, activation=tanh),
                Dense(10, activation=tanh),
                Dense(gmm_latent_dim, activation=None),
            ]
        )
        # Here we define the decoder
        decoder_net = Sequential(
            [
                InputLayer(input_shape=(gmm_latent_dim,)),
                Dense(10, activation=tanh),
                Dense(30, activation=tanh),
                Dense(60, activation=tanh),
                Dense(input_dim, activation=None),
                Reshape(target_shape=input_shape),
            ]
        )
        # GMM autoencoders have a density network too
        gmm_density_net = Sequential(
            [
                InputLayer(input_shape=(gmm_latent_dim + 2,)),
                Dense(10, activation=tanh),
                Dense(n_gmm, activation=softmax),
            ]
        )
        return AEGMM(
            encoder_net=encoder_net,
            decoder_net=decoder_net,
            gmm_density_net=gmm_density_net,
            n_gmm=n_gmm,
        )

    if model_type == VAEGMM:
        n_gmm = 2 if n_gmm is None else n_gmm
        gmm_latent_dim = 2 if gmm_latent_dim is None else gmm_latent_dim
        # The outlier detector is an encoder/decoder architecture
        # Here we define the encoder
        encoder_net = Sequential(
            [
                Flatten(),
                InputLayer(input_shape=(input_dim,)),
                Dense(20, activation=relu),
                Dense(15, activation=relu),
                Dense(7, activation=relu),
            ]
        )
        # Here we define the decoder
        decoder_net = Sequential(
            [
                InputLayer(input_shape=(gmm_latent_dim,)),
                Dense(7, activation=relu),
                Dense(15, activation=relu),
                Dense(20, activation=relu),
                Dense(input_dim, activation=None),
                Reshape(target_shape=input_shape),
            ]
        )
        # GMM autoencoders have a density network too
        gmm_density_net = Sequential(
            [
                InputLayer(input_shape=(gmm_latent_dim + 2,)),
                Dense(10, activation=relu),
                Dense(n_gmm, activation=softmax),
            ]
        )
        return VAEGMM(
            encoder_net=encoder_net,
            decoder_net=decoder_net,
            gmm_density_net=gmm_density_net,
            n_gmm=n_gmm,
            latent_dim=gmm_latent_dim,
        )

    if model_type == PixelCNN:
        return PixelCNN(
            image_shape=input_shape,
            num_resnet=5,
            num_hierarchies=2,
            num_filters=32,
            num_logistic_mix=1,
            receptive_field_dims=(3, 3),
            dropout_p=0.3,
            l2_weight=0.0,
        )

    raise TypeError(f"Unknown model specified: {model_type}.")
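Taken together, `create_model` and `predict_batch` give a build-then-score loop. Below is a hedged end-to-end sketch (not part of the diff): the data, `encoding_dim`, and batch size are illustrative assumptions, and in practice the model would first be fit with `trainer` from the trainer.py hunk above.

# Hypothetical sketch, not from the package: build a default VAE with
# create_model(), then run it over data in batches with predict_batch().
import numpy as np

from dataeval._internal.models.tensorflow.autoencoder import VAE
from dataeval._internal.models.tensorflow.utils import create_model, predict_batch

x = np.random.rand(256, 32, 32, 3).astype(np.float32)

# When encoding_dim is omitted, create_model derives a default from the
# flattened input size (2 ** int(0.8 * input_dim.bit_length())); passing it
# explicitly controls the bottleneck width.
vae = create_model(VAE, input_shape=(32, 32, 3), encoding_dim=64)

# predict_batch slices x into minibatches, forwards each through the model,
# and concatenates the per-batch outputs. A non-tf dtype (here np.float32)
# requests NumPy outputs; pass a tf.DType to keep TensorFlow tensors.
recon = predict_batch(x, vae, batch_size=64, dtype=np.float32)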