dataeval-0.61.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. dataeval/__init__.py +18 -0
  2. dataeval/_internal/detectors/__init__.py +0 -0
  3. dataeval/_internal/detectors/clusterer.py +469 -0
  4. dataeval/_internal/detectors/drift/__init__.py +0 -0
  5. dataeval/_internal/detectors/drift/base.py +265 -0
  6. dataeval/_internal/detectors/drift/cvm.py +97 -0
  7. dataeval/_internal/detectors/drift/ks.py +100 -0
  8. dataeval/_internal/detectors/drift/mmd.py +166 -0
  9. dataeval/_internal/detectors/drift/torch.py +310 -0
  10. dataeval/_internal/detectors/drift/uncertainty.py +149 -0
  11. dataeval/_internal/detectors/duplicates.py +49 -0
  12. dataeval/_internal/detectors/linter.py +78 -0
  13. dataeval/_internal/detectors/ood/__init__.py +0 -0
  14. dataeval/_internal/detectors/ood/ae.py +77 -0
  15. dataeval/_internal/detectors/ood/aegmm.py +69 -0
  16. dataeval/_internal/detectors/ood/base.py +199 -0
  17. dataeval/_internal/detectors/ood/llr.py +284 -0
  18. dataeval/_internal/detectors/ood/vae.py +86 -0
  19. dataeval/_internal/detectors/ood/vaegmm.py +79 -0
  20. dataeval/_internal/flags.py +47 -0
  21. dataeval/_internal/metrics/__init__.py +0 -0
  22. dataeval/_internal/metrics/base.py +92 -0
  23. dataeval/_internal/metrics/ber.py +124 -0
  24. dataeval/_internal/metrics/coverage.py +80 -0
  25. dataeval/_internal/metrics/divergence.py +94 -0
  26. dataeval/_internal/metrics/hash.py +79 -0
  27. dataeval/_internal/metrics/parity.py +180 -0
  28. dataeval/_internal/metrics/stats.py +332 -0
  29. dataeval/_internal/metrics/uap.py +45 -0
  30. dataeval/_internal/metrics/utils.py +158 -0
  31. dataeval/_internal/models/__init__.py +0 -0
  32. dataeval/_internal/models/pytorch/__init__.py +0 -0
  33. dataeval/_internal/models/pytorch/autoencoder.py +202 -0
  34. dataeval/_internal/models/pytorch/blocks.py +46 -0
  35. dataeval/_internal/models/pytorch/utils.py +67 -0
  36. dataeval/_internal/models/tensorflow/__init__.py +0 -0
  37. dataeval/_internal/models/tensorflow/autoencoder.py +317 -0
  38. dataeval/_internal/models/tensorflow/gmm.py +115 -0
  39. dataeval/_internal/models/tensorflow/losses.py +107 -0
  40. dataeval/_internal/models/tensorflow/pixelcnn.py +1106 -0
  41. dataeval/_internal/models/tensorflow/trainer.py +102 -0
  42. dataeval/_internal/models/tensorflow/utils.py +254 -0
  43. dataeval/_internal/workflows/sufficiency.py +555 -0
  44. dataeval/detectors/__init__.py +29 -0
  45. dataeval/flags/__init__.py +3 -0
  46. dataeval/metrics/__init__.py +7 -0
  47. dataeval/models/__init__.py +15 -0
  48. dataeval/models/tensorflow/__init__.py +6 -0
  49. dataeval/models/torch/__init__.py +8 -0
  50. dataeval/py.typed +0 -0
  51. dataeval/workflows/__init__.py +8 -0
  52. dataeval-0.61.0.dist-info/LICENSE.txt +21 -0
  53. dataeval-0.61.0.dist-info/METADATA +114 -0
  54. dataeval-0.61.0.dist-info/RECORD +55 -0
  55. dataeval-0.61.0.dist-info/WHEEL +4 -0
dataeval/_internal/models/tensorflow/trainer.py
@@ -0,0 +1,102 @@
+ """
+ Source code derived from Alibi-Detect 0.11.4
+ https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
+
+ Original code Copyright (c) 2023 Seldon Technologies Ltd
+ Licensed under Apache Software License (Apache 2.0)
+ """
+
+ from typing import Callable, Iterable, Optional, Tuple, Type, Union, cast
+
+ import keras
+ import numpy as np
+ import tensorflow as tf
+
+
+ def trainer(
+     model: keras.Model,
+     x_train: np.ndarray,
+     y_train: Optional[np.ndarray] = None,
+     loss_fn: Optional[Callable[..., tf.Tensor]] = None,
+     optimizer: Union[keras.optimizers.Optimizer, Type[keras.optimizers.Optimizer]] = keras.optimizers.Adam,
+     preprocess_fn: Optional[Callable[[tf.Tensor], tf.Tensor]] = None,
+     epochs: int = 20,
+     reg_loss_fn: Callable[[keras.Model], tf.Tensor] = (lambda _: cast(tf.Tensor, tf.Variable(0, dtype=tf.float32))),
+     batch_size: int = 64,
+     buffer_size: int = 1024,
+     verbose: bool = True,
+ ) -> None:
+     """
+     Train a TensorFlow model.
+
+     Parameters
+     ----------
+     model
+         Model to train.
+     x_train
+         Training data.
+     y_train
+         Training labels.
+     loss_fn
+         Loss function used for training.
+     optimizer
+         Optimizer used for training.
+     preprocess_fn
+         Preprocessing function applied to each training batch.
+     epochs
+         Number of training epochs.
+     reg_loss_fn
+         Allows an additional regularization term to be defined as reg_loss_fn(model).
+     batch_size
+         Batch size used for training.
+     buffer_size
+         Maximum number of elements that will be buffered when prefetching.
+     verbose
+         Whether to print training progress.
+     """
+     loss_fn = loss_fn() if isinstance(loss_fn, type) else loss_fn  # instantiate if a class was passed
+     optimizer = optimizer() if isinstance(optimizer, type) else optimizer
+
+     train_data = x_train if y_train is None else (x_train, y_train)
+     dataset = tf.data.Dataset.from_tensor_slices(train_data)
+     dataset = dataset.shuffle(buffer_size=buffer_size).batch(batch_size)
+     n_minibatch = len(dataset)
+
+     # iterate over epochs
+     for epoch in range(epochs):
+         pbar = keras.utils.Progbar(n_minibatch, 1) if verbose else None
+         if hasattr(dataset, "on_epoch_end"):
+             dataset.on_epoch_end()  # type: ignore py39
+         loss_val_ma = 0.0
+         for step, data in enumerate(dataset):
+             x, y = cast(Tuple[tf.Tensor, Optional[tf.Tensor]], data if isinstance(data, tuple) else (data, None))
+             if callable(preprocess_fn):
+                 x = preprocess_fn(x)
+             with tf.GradientTape() as tape:
+                 y_hat = model(x)
+                 y = x if y is None else y  # autoencoder setup: reconstruct x when no labels are given
+                 if callable(loss_fn):
+                     args = [y, *y_hat] if isinstance(y_hat, tuple) else [y, y_hat]
+                     loss = loss_fn(*args)
+                 else:
+                     loss = cast(tf.Tensor, tf.constant(0.0, dtype=tf.float32))
+                 if model.losses:  # additional model losses
+                     loss = cast(tf.Tensor, tf.add(sum(model.losses), loss))
+                 loss = cast(tf.Tensor, tf.add(reg_loss_fn(model), loss))  # regularization may also be given via reg_loss_fn
+
+             grads = cast(Iterable, tape.gradient(loss, model.trainable_weights))
+             optimizer.apply_gradients(zip(grads, model.trainable_weights))
+             if pbar is not None:
+                 loss_val = loss.numpy() if hasattr(loss, "numpy") else np.float32(0.0)
+                 if loss_val.shape and loss_val.shape[0] != batch_size:  # pad a partial batch with its mean
+                     if len(loss_val.shape) == 1:
+                         shape = (batch_size - loss_val.shape[0],)
+                     elif len(loss_val.shape) == 2:
+                         shape = (batch_size - loss_val.shape[0], loss_val.shape[1])
+                     else:
+                         continue
+                     add_mean = np.ones(shape) * loss_val.mean()
+                     loss_val = np.r_[loss_val, add_mean]
+                 loss_val_ma = loss_val_ma + (loss_val - loss_val_ma) / (step + 1)  # running mean for the progress bar
+                 pbar_values = [("loss_ma", loss_val_ma)]
+                 pbar.add(1, values=pbar_values)
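
For orientation, a minimal usage sketch of trainer() as an autoencoder fit on random data; the toy model, shapes, and data below are illustrative assumptions, not part of the package:

    import keras
    import numpy as np

    from dataeval._internal.models.tensorflow.trainer import trainer

    # toy dense autoencoder over 8x8 single-channel inputs (hypothetical)
    model = keras.Sequential(
        [
            keras.layers.InputLayer(input_shape=(8, 8, 1)),
            keras.layers.Flatten(),
            keras.layers.Dense(16, activation="relu"),
            keras.layers.Dense(64),
            keras.layers.Reshape((8, 8, 1)),
        ]
    )
    x_train = np.random.rand(256, 8, 8, 1).astype(np.float32)

    # with y_train omitted, trainer() reconstructs x_train, so an elementwise
    # loss such as MSE fits; the optimizer defaults to keras.optimizers.Adam
    trainer(model, x_train, loss_fn=keras.losses.MeanSquaredError(), epochs=2, batch_size=32)
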
dataeval/_internal/models/tensorflow/utils.py
@@ -0,0 +1,254 @@
+ """
+ Source code derived from Alibi-Detect 0.11.4
+ https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
+
+ Original code Copyright (c) 2023 Seldon Technologies Ltd
+ Licensed under Apache Software License (Apache 2.0)
+ """
+
+ import math
+ from typing import Callable, Optional, Tuple, Type, Union, cast
+
+ import keras
+ import numpy as np
+ import tensorflow as tf
+ from keras import Sequential
+ from keras.layers import (
+     Conv2D,
+     Conv2DTranspose,
+     Dense,
+     Flatten,
+     InputLayer,
+     Reshape,
+ )
+ from tensorflow._api.v2.nn import relu, softmax, tanh
+
+ from dataeval._internal.models.tensorflow.autoencoder import AE, AEGMM, VAE, VAEGMM
+ from dataeval._internal.models.tensorflow.pixelcnn import PixelCNN
+
+
+ def predict_batch(
+     x: Union[list, np.ndarray, tf.Tensor],
+     model: Union[Callable, keras.Model],
+     batch_size: int = int(1e10),
+     preprocess_fn: Optional[Callable] = None,
+     dtype: Union[Type[np.generic], tf.DType] = np.float32,
+ ) -> Union[np.ndarray, tf.Tensor, tuple, list]:
+     """
+     Make batch predictions on a model.
+
+     Parameters
+     ----------
+     x
+         Batch of instances.
+     model
+         tf.keras model or any other callable that can be applied to a batch.
+     batch_size
+         Batch size used during prediction.
+     preprocess_fn
+         Optional preprocessing function for each batch.
+     dtype
+         Model output type, e.g. np.float32 or tf.float32.
+
+     Returns
+     -------
+     Numpy array, tensorflow tensor or tuple of those with the model outputs.
+     """
+     n = len(x)
+     n_minibatch = int(np.ceil(n / batch_size))
+     return_np = not isinstance(dtype, tf.DType)
+     return_list = False
+     preds: Union[list, tuple] = []
+     for i in range(n_minibatch):
+         istart, istop = i * batch_size, min((i + 1) * batch_size, n)
+         x_batch = x[istart:istop]  # type: ignore
+         if callable(preprocess_fn):
+             x_batch = preprocess_fn(x_batch)
+         preds_tmp = model(x_batch)
+         if isinstance(preds_tmp, (list, tuple)):
+             if len(preds) == 0:  # init tuple with lists to store predictions
+                 preds = tuple([] for _ in range(len(preds_tmp)))
+                 return_list = isinstance(preds_tmp, list)
+             for j, p in enumerate(preds_tmp):
+                 preds[j].append(p if not return_np or isinstance(p, np.ndarray) else p.numpy())
+         elif isinstance(preds_tmp, (np.ndarray, tf.Tensor)):
+             preds.append(  # type: ignore
+                 preds_tmp
+                 if not return_np or isinstance(preds_tmp, np.ndarray)  # type: ignore
+                 else preds_tmp.numpy()  # type: ignore
+             )
+         else:
+             raise TypeError(
+                 f"Model output type {type(preds_tmp)} not supported. The model output "
+                 "type needs to be one of list, tuple, np.ndarray or tf.Tensor."
+             )
+     concat = np.concatenate if return_np else tf.concat
+     out = cast(
+         Union[tuple, tf.Tensor, np.ndarray],
+         tuple(concat(p, axis=0) for p in preds) if isinstance(preds, tuple) else concat(preds, axis=0),
+     )
+     if return_list:
+         out = list(out)
+     return out
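
As a quick illustration of the batching contract (the doubler callable and shapes here are made up for the sketch), predict_batch() slices the input, applies the model per minibatch, and concatenates the results; a non-tf.DType dtype such as the default np.float32 requests numpy output:

    import numpy as np
    import tensorflow as tf

    from dataeval._internal.models.tensorflow.utils import predict_batch

    x = np.random.rand(1000, 4).astype(np.float32)

    def doubler(batch):
        # stand-in "model": any callable that maps a batch to a tensor works
        return tf.convert_to_tensor(batch) * 2.0

    out = predict_batch(x, doubler, batch_size=256)
    assert out.shape == (1000, 4)  # minibatches of 256/256/256/232, concatenated
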
+
+
+ def _get_default_encoder_net(input_shape: Tuple[int, int, int], encoding_dim: int):
+     return Sequential(
+         [
+             InputLayer(input_shape=input_shape),
+             Conv2D(64, 4, strides=2, padding="same", activation=relu),
+             Conv2D(128, 4, strides=2, padding="same", activation=relu),
+             Conv2D(512, 4, strides=2, padding="same", activation=relu),
+             Flatten(),
+             Dense(encoding_dim),
+         ]
+     )
+
+
+ def _get_default_decoder_net(input_shape: Tuple[int, int, int], encoding_dim: int):
+     return Sequential(
+         [
+             InputLayer(input_shape=(encoding_dim,)),
+             Dense(4 * 4 * 128),
+             Reshape(target_shape=(4, 4, 128)),
+             Conv2DTranspose(256, 4, strides=2, padding="same", activation=relu),
+             Conv2DTranspose(64, 4, strides=2, padding="same", activation=relu),
+             Flatten(),
+             Dense(math.prod(input_shape)),
+             Reshape(target_shape=input_shape),
+         ]
+     )
+
+
+ def create_model(
+     model_type: Type[Union[AE, AEGMM, PixelCNN, VAE, VAEGMM]],
+     input_shape: Tuple[int, int, int],
+     encoding_dim: Optional[int] = None,
+     n_gmm: Optional[int] = None,
+     gmm_latent_dim: Optional[int] = None,
+ ):
+     """
+     Create a default model for the specified model type.
+
+     Parameters
+     ----------
+     model_type
+         The model type to create.
+     input_shape
+         The input shape of the data used.
+     encoding_dim
+         The target encoding dimensionality.
+     n_gmm
+         Number of components used in the GMM layer.
+     gmm_latent_dim
+         Latent dimensionality of the GMM layer.
+     """
+     input_dim = math.prod(input_shape)
+     encoding_dim = int(math.pow(2, int(input_dim.bit_length() * 0.8)) if encoding_dim is None else encoding_dim)
+     if model_type == AE:
+         return AE(
+             _get_default_encoder_net(input_shape, encoding_dim),
+             _get_default_decoder_net(input_shape, encoding_dim),
+         )
+
+     if model_type == VAE:
+         return VAE(
+             _get_default_encoder_net(input_shape, encoding_dim),
+             _get_default_decoder_net(input_shape, encoding_dim),
+             encoding_dim,
+         )
+
+     if model_type == AEGMM:
+         n_gmm = 2 if n_gmm is None else n_gmm
+         gmm_latent_dim = 1 if gmm_latent_dim is None else gmm_latent_dim
+         # The outlier detector is an encoder/decoder architecture
+         encoder_net = Sequential(
+             [
+                 InputLayer(input_shape=input_shape),
+                 Flatten(),
+                 Dense(60, activation=tanh),
+                 Dense(30, activation=tanh),
+                 Dense(10, activation=tanh),
+                 Dense(gmm_latent_dim, activation=None),
+             ]
+         )
+         # Here we define the decoder
+         decoder_net = Sequential(
+             [
+                 InputLayer(input_shape=(gmm_latent_dim,)),
+                 Dense(10, activation=tanh),
+                 Dense(30, activation=tanh),
+                 Dense(60, activation=tanh),
+                 Dense(input_dim, activation=None),
+                 Reshape(target_shape=input_shape),
+             ]
+         )
+         # GMM autoencoders have a density network too
+         gmm_density_net = Sequential(
+             [
+                 InputLayer(input_shape=(gmm_latent_dim + 2,)),
+                 Dense(10, activation=tanh),
+                 Dense(n_gmm, activation=softmax),
+             ]
+         )
+         return AEGMM(
+             encoder_net=encoder_net,
+             decoder_net=decoder_net,
+             gmm_density_net=gmm_density_net,
+             n_gmm=n_gmm,
+         )
+
+     if model_type == VAEGMM:
+         n_gmm = 2 if n_gmm is None else n_gmm
+         gmm_latent_dim = 2 if gmm_latent_dim is None else gmm_latent_dim
+         # The outlier detector is an encoder/decoder architecture
+         # Here we define the encoder
+         encoder_net = Sequential(
+             [
+                 InputLayer(input_shape=input_shape),
+                 Flatten(),
+                 Dense(20, activation=relu),
+                 Dense(15, activation=relu),
+                 Dense(7, activation=relu),
+             ]
+         )
+         # Here we define the decoder
+         decoder_net = Sequential(
+             [
+                 InputLayer(input_shape=(gmm_latent_dim,)),
+                 Dense(7, activation=relu),
+                 Dense(15, activation=relu),
+                 Dense(20, activation=relu),
+                 Dense(input_dim, activation=None),
+                 Reshape(target_shape=input_shape),
+             ]
+         )
+         # GMM autoencoders have a density network too
+         gmm_density_net = Sequential(
+             [
+                 InputLayer(input_shape=(gmm_latent_dim + 2,)),
+                 Dense(10, activation=relu),
+                 Dense(n_gmm, activation=softmax),
+             ]
+         )
+         return VAEGMM(
+             encoder_net=encoder_net,
+             decoder_net=decoder_net,
+             gmm_density_net=gmm_density_net,
+             n_gmm=n_gmm,
+             latent_dim=gmm_latent_dim,
+         )
+
+     if model_type == PixelCNN:
+         return PixelCNN(
+             image_shape=input_shape,
+             num_resnet=5,
+             num_hierarchies=2,
+             num_filters=32,
+             num_logistic_mix=1,
+             receptive_field_dims=(3, 3),
+             dropout_p=0.3,
+             l2_weight=0.0,
+         )
+
+     raise TypeError(f"Unknown model specified: {model_type}.")
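
Putting the pieces together, a hedged end-to-end sketch (the 32x32x3 shape and random data are illustrative; AE, create_model, and predict_batch come from the modules shown above):

    import numpy as np

    from dataeval._internal.models.tensorflow.autoencoder import AE
    from dataeval._internal.models.tensorflow.utils import create_model, predict_batch

    # default AE for 32x32x3 inputs; input_dim = 32 * 32 * 3 = 3072 has
    # bit_length() == 12, so the default encoding_dim is 2 ** int(12 * 0.8) = 512
    model = create_model(AE, input_shape=(32, 32, 3))

    x = np.random.rand(64, 32, 32, 3).astype(np.float32)
    recon = predict_batch(x, model, batch_size=32)  # reconstructions, shape (64, 32, 32, 3)
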