dataeval 0.61.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. dataeval/__init__.py +18 -0
  2. dataeval/_internal/detectors/__init__.py +0 -0
  3. dataeval/_internal/detectors/clusterer.py +469 -0
  4. dataeval/_internal/detectors/drift/__init__.py +0 -0
  5. dataeval/_internal/detectors/drift/base.py +265 -0
  6. dataeval/_internal/detectors/drift/cvm.py +97 -0
  7. dataeval/_internal/detectors/drift/ks.py +100 -0
  8. dataeval/_internal/detectors/drift/mmd.py +166 -0
  9. dataeval/_internal/detectors/drift/torch.py +310 -0
  10. dataeval/_internal/detectors/drift/uncertainty.py +149 -0
  11. dataeval/_internal/detectors/duplicates.py +49 -0
  12. dataeval/_internal/detectors/linter.py +78 -0
  13. dataeval/_internal/detectors/ood/__init__.py +0 -0
  14. dataeval/_internal/detectors/ood/ae.py +77 -0
  15. dataeval/_internal/detectors/ood/aegmm.py +69 -0
  16. dataeval/_internal/detectors/ood/base.py +199 -0
  17. dataeval/_internal/detectors/ood/llr.py +284 -0
  18. dataeval/_internal/detectors/ood/vae.py +86 -0
  19. dataeval/_internal/detectors/ood/vaegmm.py +79 -0
  20. dataeval/_internal/flags.py +47 -0
  21. dataeval/_internal/metrics/__init__.py +0 -0
  22. dataeval/_internal/metrics/base.py +92 -0
  23. dataeval/_internal/metrics/ber.py +124 -0
  24. dataeval/_internal/metrics/coverage.py +80 -0
  25. dataeval/_internal/metrics/divergence.py +94 -0
  26. dataeval/_internal/metrics/hash.py +79 -0
  27. dataeval/_internal/metrics/parity.py +180 -0
  28. dataeval/_internal/metrics/stats.py +332 -0
  29. dataeval/_internal/metrics/uap.py +45 -0
  30. dataeval/_internal/metrics/utils.py +158 -0
  31. dataeval/_internal/models/__init__.py +0 -0
  32. dataeval/_internal/models/pytorch/__init__.py +0 -0
  33. dataeval/_internal/models/pytorch/autoencoder.py +202 -0
  34. dataeval/_internal/models/pytorch/blocks.py +46 -0
  35. dataeval/_internal/models/pytorch/utils.py +67 -0
  36. dataeval/_internal/models/tensorflow/__init__.py +0 -0
  37. dataeval/_internal/models/tensorflow/autoencoder.py +317 -0
  38. dataeval/_internal/models/tensorflow/gmm.py +115 -0
  39. dataeval/_internal/models/tensorflow/losses.py +107 -0
  40. dataeval/_internal/models/tensorflow/pixelcnn.py +1106 -0
  41. dataeval/_internal/models/tensorflow/trainer.py +102 -0
  42. dataeval/_internal/models/tensorflow/utils.py +254 -0
  43. dataeval/_internal/workflows/sufficiency.py +555 -0
  44. dataeval/detectors/__init__.py +29 -0
  45. dataeval/flags/__init__.py +3 -0
  46. dataeval/metrics/__init__.py +7 -0
  47. dataeval/models/__init__.py +15 -0
  48. dataeval/models/tensorflow/__init__.py +6 -0
  49. dataeval/models/torch/__init__.py +8 -0
  50. dataeval/py.typed +0 -0
  51. dataeval/workflows/__init__.py +8 -0
  52. dataeval-0.61.0.dist-info/LICENSE.txt +21 -0
  53. dataeval-0.61.0.dist-info/METADATA +114 -0
  54. dataeval-0.61.0.dist-info/RECORD +55 -0
  55. dataeval-0.61.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,310 @@
1
+ """
2
+ Source code derived from Alibi-Detect 0.11.4
3
+ https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
4
+
5
+ Original code Copyright (c) 2023 Seldon Technologies Ltd
6
+ Licensed under Apache Software License (Apache 2.0)
7
+ """
8
+
9
+ from functools import partial
10
+ from typing import Callable, Optional, Type, Union
11
+
12
+ import numpy as np
13
+ import torch
14
+ import torch.nn as nn
15
+
16
+
17
def get_device(device: Optional[Union[str, torch.device]] = None) -> torch.device:
    """
    Resolve a device specification into a PyTorch device object.

    Parameters
    ----------
    device
        `None`, a string naming the device, or an already instantiated
        ``torch.device``. An existing device object is returned unchanged.
        `None`, 'gpu' and 'cuda' (case-insensitive) select CUDA when it is
        available and fall back to the CPU otherwise. Any other string
        selects the CPU.

    Returns
    -------
    The resolved device object.
    """
    # Nothing to resolve when the caller already holds a device object.
    if isinstance(device, torch.device):
        return device

    wants_gpu = device is None or device.lower() in ["gpu", "cuda"]
    # CUDA availability is only probed when a GPU was actually requested.
    if wants_gpu and torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")
40
+
41
+
42
def mmd2_from_kernel_matrix(
    kernel_mat: torch.Tensor, m: int, permute: bool = False, zero_diag: bool = True
) -> torch.Tensor:
    """
    Compute the squared maximum mean discrepancy (MMD^2) between two samples
    x and y from the full kernel matrix over both samples.

    Parameters
    ----------
    kernel_mat
        Square kernel matrix over the stacked samples; the last `m` rows and
        columns belong to y, the remaining ones to x.
    m
        Number of instances in y.
    permute
        Whether to apply a random permutation to rows and columns before
        splitting into blocks. Used for permutation tests.
    zero_diag
        Whether to zero out the diagonal of the kernel matrix.

    Returns
    -------
    MMD^2 between the samples from the kernel matrix.
    """
    total = kernel_mat.shape[0]
    n = total - m  # number of instances in x

    if zero_diag:
        kernel_mat = kernel_mat - torch.diag(kernel_mat.diag())
    if permute:
        # The same permutation is applied to rows and columns.
        shuffled = torch.randperm(total)
        kernel_mat = kernel_mat[shuffled][:, shuffled]

    k_xx = kernel_mat[:-m, :-m]
    k_yy = kernel_mat[-m:, -m:]
    k_xy = kernel_mat[-m:, :-m]

    # Within-sample sums are normalised by the number of off-diagonal pairs.
    scale_xx = 1 / (n * (n - 1))
    scale_yy = 1 / (m * (m - 1))
    return scale_xx * k_xx.sum() + scale_yy * k_yy.sum() - 2.0 * k_xy.mean()
74
+
75
+
76
def predict_batch(
    x: Union[np.ndarray, torch.Tensor],
    model: Union[Callable, nn.Module, nn.Sequential],
    device: Optional[torch.device] = None,
    batch_size: int = int(1e10),
    preprocess_fn: Optional[Callable] = None,
    dtype: Union[Type[np.generic], torch.dtype] = np.float32,
) -> Union[np.ndarray, torch.Tensor, tuple]:
    """
    Make batch predictions on a model.

    Parameters
    ----------
    x
        Batch of instances. A numpy array is converted to a torch tensor
        before batching.
    model
        PyTorch model (or any callable) evaluated on each mini-batch.
    device
        Device the mini-batches are moved to. Resolved through `get_device`,
        so `None` tries to use the GPU and falls back on CPU.
    batch_size
        Batch size used during prediction.
    preprocess_fn
        Optional preprocessing function applied to each mini-batch before it
        is moved to the device. Ignored if it is not callable.
    dtype
        Model output type, e.g. np.float32 or torch.float32. Only its kind is
        inspected: a torch dtype keeps outputs as tensors, anything else
        converts them to numpy arrays.

    Returns
    -------
    Numpy array, torch tensor or tuples of those with model outputs,
    concatenated over mini-batches along axis 0.

    Raises
    ------
    TypeError
        If the model returns something other than a list, tuple, np.ndarray
        or torch.Tensor.
    """
    device = get_device(device)
    if isinstance(x, np.ndarray):
        x = torch.from_numpy(x)
    n = len(x)
    n_minibatch = int(np.ceil(n / batch_size))
    # Any non-torch dtype (e.g. np.float32) requests numpy outputs.
    return_np = not isinstance(dtype, torch.dtype)

    def _to_host(pred):
        # Bring tensors back to the CPU; convert to numpy only when requested.
        if isinstance(pred, torch.Tensor):
            pred = pred.cpu()
        if return_np and not isinstance(pred, np.ndarray):
            return pred.numpy()
        return pred

    preds = []
    with torch.no_grad():
        for i in range(n_minibatch):
            istart, istop = i * batch_size, min((i + 1) * batch_size, n)
            x_batch = x[istart:istop]
            if callable(preprocess_fn):
                x_batch = preprocess_fn(x_batch)
            preds_tmp = model(x_batch.to(device))
            if isinstance(preds_tmp, (list, tuple)):
                if len(preds) == 0:  # init tuple with lists to store predictions
                    preds = tuple([] for _ in range(len(preds_tmp)))
                for j, p in enumerate(preds_tmp):
                    preds[j].append(_to_host(p))
            elif isinstance(preds_tmp, (np.ndarray, torch.Tensor)):
                if isinstance(preds, tuple):
                    preds = list(preds)
                preds.append(_to_host(preds_tmp))
            else:
                # Adjacent literals are used instead of backslash continuations
                # inside the string, which would embed the source indentation
                # in the message.
                raise TypeError(
                    f"Model output type {type(preds_tmp)} not supported. The model "
                    "output type needs to be one of list, tuple, np.ndarray or "
                    "torch.Tensor."
                )

    concat = partial(np.concatenate, axis=0) if return_np else partial(torch.cat, dim=0)
    if isinstance(preds, tuple):
        return tuple(concat(p) for p in preds)
    return concat(preds)  # type: ignore
149
+
150
+
151
def preprocess_drift(
    x: np.ndarray,
    model: nn.Module,
    device: Optional[torch.device] = None,
    preprocess_batch_fn: Optional[Callable] = None,
    batch_size: int = int(1e10),
    dtype: Union[Type[np.generic], torch.dtype] = np.float32,
) -> Union[np.ndarray, torch.Tensor, tuple]:
    """
    Run the preprocessing model of a drift detector over a batch of data.

    This is a thin wrapper that forwards all arguments to `predict_batch`,
    passing `preprocess_batch_fn` as its `preprocess_fn`.

    Parameters
    ----------
    x
        Batch of instances.
    model
        Model used for preprocessing.
    device
        Device type used. The default None tries to use the GPU and falls back on CPU.
        Can be specified by passing either torch.device('cuda') or torch.device('cpu').
    preprocess_batch_fn
        Optional batch preprocessing function. For example to convert a list of objects
        to a batch which can be processed by the PyTorch model.
    batch_size
        Batch size used during prediction.
    dtype
        Model output type, e.g. np.float32 or torch.float32.

    Returns
    -------
    Whatever `predict_batch` returns for these arguments: a numpy array,
    a torch tensor, or a tuple of those.
    """
    forwarded = dict(
        device=device,
        batch_size=batch_size,
        preprocess_fn=preprocess_batch_fn,
        dtype=dtype,
    )
    return predict_batch(x, model, **forwarded)
191
+
192
+
193
@torch.jit.script
def squared_pairwise_distance(
    x: torch.Tensor, y: torch.Tensor, a_min: float = 1e-30
) -> torch.Tensor:  # pragma: no cover - torch.jit.script code is compiled and copied
    """
    Pairwise squared Euclidean distance between the rows of x and y,
    computed as ||x||^2 - 2 * x @ y^T + ||y||^2 and clipped from below.

    Parameters
    ----------
    x
        Batch of instances of shape [Nx, features].
    y
        Batch of instances of shape [Ny, features].
    a_min
        Lower bound to clip distance values.

    Returns
    -------
    Pairwise squared Euclidean distance [Nx, Ny].
    """
    x_sq = x.pow(2).sum(dim=-1, keepdim=True)  # [Nx, 1]
    y_sq = y.pow(2).sum(dim=-1, keepdim=True)  # [Ny, 1]
    y_t = y.transpose(-2, -1)
    # ||y||^2 (as a row) - 2 * x @ y^T, then add ||x||^2 (as a column) in place.
    out = torch.addmm(y_sq.transpose(-2, -1), x, y_t, alpha=-2)
    out.add_(x_sq)
    out.clamp_min_(a_min)
    return out
216
+
217
+
218
def sigma_median(x: torch.Tensor, y: torch.Tensor, dist: torch.Tensor) -> torch.Tensor:
    """
    Bandwidth estimation using the median heuristic :cite:t:`Gretton2012`.

    Parameters
    ----------
    x
        Tensor of instances with dimension [Nx, features].
    y
        Tensor of instances with dimension [Ny, features].
    dist
        Tensor with dimensions [Nx, Ny], containing the pairwise distances
        between `x` and `y`.

    Returns
    -------
    The computed bandwidth, `sigma`, as a tensor with one element.
    """
    shared = min(x.shape[0], y.shape[0])
    same_sample = (x[:shared] == y[:shared]).all() and x.shape == y.shape
    # When x and y are the same sample, the lookup index below is shifted past
    # the `shared` smallest entries (presumably the zero self-distances).
    n_skipped = shared if same_sample else 0
    median_idx = n_skipped + (np.prod(dist.shape) - n_skipped) // 2 - 1
    ordered = dist.flatten().sort().values
    picked = ordered[int(median_idx)].unsqueeze(dim=-1)
    return (0.5 * picked) ** 0.5
241
+
242
+
243
class GaussianRBF(nn.Module):
    """
    Gaussian RBF kernel: k(x,y) = exp(-(1/(2*sigma^2)||x-y||^2). A forward pass
    takes a batch of instances x [Nx, features] and y [Ny, features] and returns
    the kernel matrix [Nx, Ny].

    Parameters
    ----------
    sigma : Optional[torch.Tensor], default None
        Bandwidth used for the kernel. Needn't be specified if being inferred or
        trained. Can pass multiple values to eval kernel with and then average.
    init_sigma_fn : Optional[Callable], default None
        Function used to compute the bandwidth `sigma`. Used when `sigma` is to be
        inferred. The function's signature should take in the tensors `x`, `y` and
        `dist` and return `sigma`. If `None`, it is set to
        :func:`~dataeval._internal.detectors.drift.torch.sigma_median`.
    trainable : bool, default False
        Whether or not to track gradients w.r.t. `sigma` to allow it to be trained.
    """

    def __init__(
        self,
        sigma: Optional[torch.Tensor] = None,
        init_sigma_fn: Optional[Callable] = None,
        trainable: bool = False,
    ) -> None:
        super().__init__()
        if init_sigma_fn is None:
            init_sigma_fn = sigma_median

        # Snapshot of the constructor arguments (sigma as passed, before reshaping).
        self.config = {
            "sigma": sigma,
            "trainable": trainable,
            "init_sigma_fn": init_sigma_fn,
        }

        # The bandwidth is stored in log-space; `sigma` exponentiates it on access.
        self.init_required = sigma is None
        if self.init_required:
            # Placeholder that is overwritten on the first forward pass.
            log_sigma = torch.empty(1)
        else:
            log_sigma = sigma.reshape(-1).log()  # [Ns,]
        self.log_sigma = nn.Parameter(log_sigma, requires_grad=trainable)

        self.init_sigma_fn = init_sigma_fn
        self.trainable = trainable

    @property
    def sigma(self) -> torch.Tensor:
        """Current bandwidth(s), i.e. the exponential of the stored log-bandwidth."""
        return self.log_sigma.exp()

    def forward(
        self,
        x: Union[np.ndarray, torch.Tensor],
        y: Union[np.ndarray, torch.Tensor],
        infer_sigma: bool = False,
    ) -> torch.Tensor:
        """
        Evaluate the kernel matrix between `x` and `y`.

        Parameters
        ----------
        x
            Batch of instances; flattened to [Nx, features].
        y
            Batch of instances; flattened to [Ny, features].
        infer_sigma
            Whether to (re)compute the bandwidth from this batch using
            `init_sigma_fn`. The bandwidth is also computed this way on the
            first call when no `sigma` was given at construction.

        Returns
        -------
        Kernel matrix [Nx, Ny], averaged over all bandwidths.

        Raises
        ------
        ValueError
            If `infer_sigma` is set while the kernel is trainable.
        """
        x, y = torch.as_tensor(x), torch.as_tensor(y)
        dist = squared_pairwise_distance(x.flatten(1), y.flatten(1))  # [Nx, Ny]

        if infer_sigma or self.init_required:
            if self.trainable and infer_sigma:
                raise ValueError("Gradients cannot be computed w.r.t. an inferred sigma value")
            inferred = self.init_sigma_fn(x, y, dist)
            # Overwrite the parameter in place without recording the copy in autograd.
            with torch.no_grad():
                self.log_sigma.copy_(inferred.log().clone())
            self.init_required = False

        gamma = 1.0 / (2.0 * self.sigma**2)  # [Ns,]
        # TODO: do matrix multiplication after all?
        # Broadcast each bandwidth against the distance matrix: [Ns, Nx, Ny].
        scaled = gamma[:, None, None] * dist[None, :, :]
        return torch.exp(-scaled).mean(dim=0)  # [Nx, Ny]
@@ -0,0 +1,149 @@
1
+ """
2
+ Source code derived from Alibi-Detect 0.11.4
3
+ https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
4
+
5
+ Original code Copyright (c) 2023 Seldon Technologies Ltd
6
+ Licensed under Apache Software License (Apache 2.0)
7
+ """
8
+
9
+ from functools import partial
10
+ from typing import Callable, Dict, Literal, Optional, Union
11
+
12
+ import numpy as np
13
+ from scipy.special import softmax
14
+ from scipy.stats import entropy
15
+
16
+ from .base import UpdateStrategy
17
+ from .ks import DriftKS
18
+ from .torch import get_device, preprocess_drift
19
+
20
+
21
def classifier_uncertainty(
    x: np.ndarray,
    model_fn: Callable,
    preds_type: Literal["probs", "logits"] = "probs",
) -> np.ndarray:
    """
    Evaluate model_fn on x and convert the predictions into per-instance
    uncertainties, measured as the entropy of the predicted class distribution.

    Parameters
    ----------
    x
        Batch of instances.
    model_fn
        Function that evaluates a classification model on x in a single call (contains
        batching logic if necessary).
    preds_type
        Type of prediction output by the model. Options are 'probs' (in [0,1]) or
        'logits' (in [-inf,inf]). Logits are passed through a softmax first.

    Returns
    -------
    Array of shape [N, 1] holding the prediction entropy of each instance in x.

    Raises
    ------
    ValueError
        If `preds_type` is 'probs' and the predictions do not sum to 1 across
        labels (mean absolute deviation above 1e-6).
    NotImplementedError
        If `preds_type` is neither 'probs' nor 'logits'.
    """
    preds = model_fn(x)

    if preds_type == "logits":
        probs = softmax(preds, axis=-1)
    elif preds_type == "probs":
        mean_deviation = np.abs(1 - np.sum(preds, axis=-1)).mean()
        if mean_deviation > 1e-6:
            raise ValueError("Probabilities across labels should sum to 1")
        probs = preds
    else:
        raise NotImplementedError("Only prediction types 'probs' and 'logits' supported.")

    # Detectors expect N x d, so add a trailing feature axis.
    return entropy(probs, axis=-1)[:, None]  # type: ignore
58
+
59
+
60
class DriftUncertainty:
    """
    Test for a change in the number of instances falling into regions on which the
    model is uncertain. Performs a K-S test on prediction entropies.

    Parameters
    ----------
    x_ref : np.ndarray
        Data used as reference distribution. Should be disjoint from the data the
        model was trained on for accurate p-values.
    model : Callable
        Classification model outputting class probabilities (or logits)
    p_val : float, default 0.05
        p-value used for the significance of the test.
    x_ref_preprocessed : bool, default False
        Whether the given reference data `x_ref` has been preprocessed yet. If
        `x_ref_preprocessed=True`, only the test data `x` will be preprocessed at
        prediction time. If `x_ref_preprocessed=False`, the reference data will
        also be preprocessed.
    update_x_ref : Optional[UpdateStrategy], default None
        Reference data can optionally be updated using an UpdateStrategy class. Update
        using the last n instances seen by the detector with
        :py:class:`dataeval.detectors.LastSeenUpdateStrategy`
        or via reservoir sampling with
        :py:class:`dataeval.detectors.ReservoirSamplingUpdateStrategy`.
    preds_type : Literal["probs", "logits"], default "probs"
        Type of prediction output by the model. Options are 'probs' (in [0,1]) or
        'logits' (in [-inf,inf]).
    batch_size : int, default 32
        Batch size used when evaluating the model on the data.
    preprocess_batch_fn : Optional[Callable], default None
        Optional batch preprocessing function. For example to convert a list of
        objects to a batch which can be processed by the model.
    device : Optional[str], default None
        Device type used. The default None tries to use the GPU and falls back on
        CPU if needed. Can be specified by passing either 'cuda', 'gpu' or 'cpu'.
    """

    def __init__(
        self,
        x_ref: np.ndarray,
        model: Callable,
        p_val: float = 0.05,
        x_ref_preprocessed: bool = False,
        update_x_ref: Optional[UpdateStrategy] = None,
        preds_type: Literal["probs", "logits"] = "probs",
        batch_size: int = 32,
        preprocess_batch_fn: Optional[Callable] = None,
        device: Optional[str] = None,
    ) -> None:
        # Resolve the device once instead of on every model evaluation.
        torch_device = get_device(device)

        def model_fn(x: np.ndarray) -> np.ndarray:
            # Batched evaluation of the classifier on x.
            return preprocess_drift(
                x,
                model,  # type: ignore
                batch_size=batch_size,
                preprocess_batch_fn=preprocess_batch_fn,
                device=torch_device,
            )

        # The K-S detector sees prediction entropies rather than raw inputs.
        preprocess_fn = partial(
            classifier_uncertainty,
            model_fn=model_fn,
            preds_type=preds_type,
        )

        self._detector = DriftKS(
            x_ref=x_ref,
            p_val=p_val,
            x_ref_preprocessed=x_ref_preprocessed,
            update_x_ref=update_x_ref,
            preprocess_fn=preprocess_fn,
        )

    def predict(self, x: np.ndarray) -> Dict[str, Union[int, float, np.ndarray]]:
        """
        Predict whether a batch of data has drifted from the reference data.

        Parameters
        ----------
        x
            Batch of instances.

        Returns
        -------
        Dictionary containing the drift prediction, p-value, and threshold statistics.
        """
        return self._detector.predict(x)
@@ -0,0 +1,49 @@
1
+ from typing import Dict, List, Literal
2
+
3
+ import numpy as np
4
+
5
+ from dataeval._internal.flags import ImageHash
6
+ from dataeval._internal.metrics.stats import ImageStats
7
+
8
+
9
class Duplicates:
    """
    Finds the duplicate images in a dataset using xxhash for exact duplicates
    and pchash for near duplicates
    """

    def __init__(
        self,
        images: np.ndarray,
    ):
        self.stats = ImageStats(ImageHash.ALL)
        self.images = images

    def _get_duplicates(self) -> Dict[str, List[List[int]]]:
        """
        Group image indices by hash value using the hashes in `self.results`.

        Returns
        -------
        Dict[str, List[List[int]]]
            Sorted groups of indices sharing an "xxhash" value under "exact",
            and sorted groups sharing a "pchash" value under "near". A near
            group is dropped when it is fully contained in an exact group.
        """
        by_exact_hash: Dict = {}
        by_near_hash: Dict = {}
        for i, value in enumerate(self.results["xxhash"]):
            by_exact_hash.setdefault(value, []).append(i)
        for i, value in enumerate(self.results["pchash"]):
            by_near_hash.setdefault(value, []).append(i)

        # Only hashes shared by more than one image describe duplicates.
        exact = [group for group in by_exact_hash.values() if len(group) > 1]
        # Build the exact groups as sets once instead of once per near candidate.
        exact_sets = [set(group) for group in exact]
        near = [
            group
            for group in by_near_hash.values()
            if len(group) > 1 and not any(set(group) <= known for known in exact_sets)
        ]

        return {
            "exact": sorted(exact),
            "near": sorted(near),
        }

    def evaluate(self) -> Dict[Literal["exact", "near"], List[List[int]]]:
        """
        Returns duplicate image indices for both exact matches and near matches

        The statistics are reset and recomputed over `self.images` on every
        call; the computed values are stored on `self.results`.

        Returns
        -------
        Dict[Literal["exact", "near"], List[List[int]]]
            Dictionary of exact and near match index groups; each group lists
            the indices of images that share a hash value
        """
        self.stats.reset()
        self.stats.update(self.images)
        self.results = self.stats.compute()
        return self._get_duplicates()  # type: ignore
@@ -0,0 +1,78 @@
1
+ from typing import Literal, Optional, Sequence, Union
2
+
3
+ import numpy as np
4
+
5
+ from dataeval._internal.flags import ImageProperty, ImageVisuals, LinterFlags
6
+ from dataeval._internal.metrics.stats import ImageStats
7
+
8
+
9
+ def _get_outlier_mask(
10
+ values: np.ndarray, method: Literal["zscore", "modzscore", "iqr"], threshold: Optional[float]
11
+ ) -> np.ndarray:
12
+ if method == "zscore":
13
+ threshold = threshold if threshold else 3.0
14
+ std = np.std(values)
15
+ abs_diff = np.abs(values - np.mean(values))
16
+ return (abs_diff / std) > threshold
17
+ elif method == "modzscore":
18
+ threshold = threshold if threshold else 3.5
19
+ abs_diff = np.abs(values - np.median(values))
20
+ med_abs_diff = np.median(abs_diff)
21
+ mod_z_score = 0.6745 * abs_diff / med_abs_diff
22
+ return mod_z_score > threshold
23
+ elif method == "iqr":
24
+ threshold = threshold if threshold else 1.5
25
+ qrt = np.percentile(values, q=(25, 75), method="midpoint")
26
+ iqr = (qrt[1] - qrt[0]) * threshold
27
+ return (values < (qrt[0] - iqr)) | (values > (qrt[1] + iqr))
28
+ else:
29
+ raise ValueError("Outlier method must be 'zscore' 'modzscore' or 'iqr'.")
30
+
31
+
32
class Linter:
    """
    Calculates statistical outliers of a dataset using various statistical
    tests applied to each image
    """

    def __init__(
        self,
        images: np.ndarray,
        flags: Optional[Union[LinterFlags, Sequence[LinterFlags]]] = None,
    ):
        # Without explicit flags, every image property and visual statistic is used.
        if flags is None:
            flags = (ImageProperty.ALL, ImageVisuals.ALL)
        self.stats = ImageStats(flags)
        self.images = images

    def _get_outliers(
        self,
        outlier_method: Literal["zscore", "modzscore", "iqr"] = "modzscore",
        outlier_threshold: Optional[float] = None,
    ) -> dict:
        """
        Collect, per image index, the statistics in `self.results` on which
        that image is an outlier.

        Only one-dimensional array statistics with non-zero standard deviation
        are tested. The result is ordered by image index and maps each flagged
        index to a dictionary of statistic name to value rounded to 2 decimals.
        """
        flagged: dict = {}

        for stat, values in self.results.items():
            if not isinstance(values, np.ndarray):
                continue
            # Skip multi-dimensional statistics and constant ones.
            if values.ndim != 1 or np.std(values) == 0:
                continue

            mask = _get_outlier_mask(values, outlier_method, outlier_threshold)
            for idx in np.flatnonzero(mask):
                flagged.setdefault(idx, {})[stat] = np.round(values[idx], 2)

        return dict(sorted(flagged.items()))

    def evaluate(self) -> dict:
        """
        Returns the indices of outliers and the issues identified for each

        The statistics are reset and recomputed over `self.images` on every
        call; the computed values are stored on `self.results`.

        Returns
        -------
        Dict[int, Dict[str, float]]
            Dictionary containing the indices of outliers and a dictionary of
            issues and calculated values
        """
        self.stats.reset()
        self.stats.update(self.images)
        self.results = self.stats.compute()
        return self._get_outliers()
File without changes
@@ -0,0 +1,77 @@
1
+ """
2
+ Source code derived from Alibi-Detect 0.11.4
3
+ https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
4
+
5
+ Original code Copyright (c) 2023 Seldon Technologies Ltd
6
+ Licensed under Apache Software License (Apache 2.0)
7
+ """
8
+
9
+ from typing import Callable
10
+
11
+ import keras
12
+ import numpy as np
13
+
14
+ from dataeval._internal.detectors.ood.base import OODBase, OODScore
15
+ from dataeval._internal.models.tensorflow.autoencoder import AE
16
+ from dataeval._internal.models.tensorflow.utils import predict_batch
17
+
18
+
19
class OOD_AE(OODBase):
    def __init__(self, model: AE) -> None:
        """
        Autoencoder based out-of-distribution detector.

        Parameters
        ----------
        model : AE
            An Autoencoder model.
        """
        super().__init__(model)

    def fit(
        self,
        x_ref: np.ndarray,
        threshold_perc: float = 100.0,
        loss_fn: Callable = keras.losses.MeanSquaredError(),
        optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
        epochs: int = 20,
        batch_size: int = 64,
        verbose: bool = True,
    ) -> None:
        """
        Train the AE model with recommended loss function and optimizer.

        All arguments are handed to the base class `fit` unchanged.

        Parameters
        ----------
        x_ref : np.ndarray
            Training batch.
        threshold_perc : float, default 100.0
            Percentage of reference data that is normal.
        loss_fn : Callable, default keras.losses.MeanSquaredError()
            Loss function used for training.
        optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
            Optimizer used for training.
        epochs : int, default 20
            Number of training epochs.
        batch_size : int, default 64
            Batch size used for training.
        verbose : bool, default True
            Whether to print training progress.
        """
        super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)

    def score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScore:
        """
        Score instances by their squared reconstruction error.

        Parameters
        ----------
        X : np.ndarray
            Batch of instances to score.
        batch_size : int, default int(1e10)
            Batch size used when reconstructing `X` with the model.

        Returns
        -------
        OODScore
            Built from the instance-level score (mean squared error over all
            features of an instance) and the feature-level score (element-wise
            squared error, same shape as `X`).
        """
        self._validate(X)

        # Reconstruct the inputs with the autoencoder.
        reconstruction = predict_batch(X, self.model, batch_size=batch_size)

        # Feature-level score: element-wise squared reconstruction error.
        fscore = np.power(X - reconstruction, 2)

        # Instance-level score: mean over the per-instance errors, sorted ascending.
        per_instance = fscore.reshape(fscore.shape[0], -1).copy()
        per_instance.sort(axis=1)
        # The number of features kept equals the feature count, so all are used.
        n_kept = int(np.ceil(per_instance.shape[1]))
        iscore = np.mean(per_instance[:, -n_kept:], axis=1)

        return OODScore(iscore, fscore)