dataeval 0.73.0__py3-none-any.whl → 0.74.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. dataeval/__init__.py +3 -3
  2. dataeval/detectors/__init__.py +1 -1
  3. dataeval/detectors/drift/__init__.py +1 -1
  4. dataeval/detectors/drift/base.py +2 -2
  5. dataeval/detectors/drift/torch.py +1 -101
  6. dataeval/detectors/linters/clusterer.py +1 -1
  7. dataeval/detectors/ood/__init__.py +11 -4
  8. dataeval/detectors/ood/ae.py +2 -1
  9. dataeval/detectors/ood/ae_torch.py +70 -0
  10. dataeval/detectors/ood/aegmm.py +4 -3
  11. dataeval/detectors/ood/base.py +58 -108
  12. dataeval/detectors/ood/base_tf.py +109 -0
  13. dataeval/detectors/ood/base_torch.py +109 -0
  14. dataeval/detectors/ood/llr.py +2 -2
  15. dataeval/detectors/ood/metadata_ks_compare.py +53 -14
  16. dataeval/detectors/ood/vae.py +3 -2
  17. dataeval/detectors/ood/vaegmm.py +5 -4
  18. dataeval/metrics/bias/__init__.py +3 -0
  19. dataeval/metrics/bias/balance.py +77 -64
  20. dataeval/metrics/bias/coverage.py +12 -12
  21. dataeval/metrics/bias/diversity.py +74 -114
  22. dataeval/metrics/bias/metadata_preprocessing.py +285 -0
  23. dataeval/metrics/bias/metadata_utils.py +229 -0
  24. dataeval/metrics/bias/parity.py +54 -158
  25. dataeval/utils/__init__.py +2 -2
  26. dataeval/utils/gmm.py +26 -0
  27. dataeval/utils/metadata.py +29 -9
  28. dataeval/utils/shared.py +1 -1
  29. dataeval/utils/split_dataset.py +12 -6
  30. dataeval/utils/tensorflow/_internal/gmm.py +4 -24
  31. dataeval/utils/torch/datasets.py +2 -2
  32. dataeval/utils/torch/gmm.py +98 -0
  33. dataeval/utils/torch/models.py +192 -0
  34. dataeval/utils/torch/trainer.py +84 -5
  35. dataeval/utils/torch/utils.py +107 -1
  36. dataeval/workflows/__init__.py +1 -1
  37. {dataeval-0.73.0.dist-info → dataeval-0.74.0.dist-info}/METADATA +1 -2
  38. {dataeval-0.73.0.dist-info → dataeval-0.74.0.dist-info}/RECORD +40 -34
  39. dataeval/metrics/bias/metadata.py +0 -358
  40. {dataeval-0.73.0.dist-info → dataeval-0.74.0.dist-info}/LICENSE.txt +0 -0
  41. {dataeval-0.73.0.dist-info → dataeval-0.74.0.dist-info}/WHEEL +0 -0
dataeval/utils/split_dataset.py

@@ -144,7 +144,7 @@ def check_groups(group_ids: NDArray[np.int_], num_partitions: int) -> bool:
     ----------
     group_ids : np.ndarray
         Identifies the group to which a sample at the same index belongs.
-    num_partitions: int
+    num_partitions : int
         How many total (train, val) folds will be generated (+1 if also specifying a test fold).
 
     Warns
@@ -242,12 +242,12 @@ def get_group_ids(metadata: dict[str, Any], group_names: list[str], num_samples:
 
     Returns
     -------
-    group_ids: np.ndarray
+    group_ids : np.ndarray
         group identifiers from metadata
     """
     features2group = {k: np.array(v) for k, v in metadata.items() if k in group_names}
     if not features2group:
-        return np.zeros(num_samples, dtype=int)
+        return np.zeros(num_samples, dtype=np.int_)
     for name, feature in features2group.items():
         if len(feature) != num_samples:
             raise IndexError(f"""Feature length does not match number of labels.
@@ -300,7 +300,13 @@ def make_splits(
     splits = splitter.split(index, labels)
     for train_idx, eval_idx in splits:
         test_ratio = len(eval_idx) / index.shape[0]
-        split_defs.append({"train": train_idx.astype(int), "eval": eval_idx.astype(int), "eval_frac": test_ratio})
+        split_defs.append(
+            {
+                "train": train_idx.astype(np.int_),
+                "eval": eval_idx.astype(np.int_),
+                "eval_frac": test_ratio,
+            }
+        )
     return split_defs
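
Note: the int -> np.int_ changes in this file pin the returned arrays to NumPy's native integer scalar type, matching the NDArray[np.int_] annotations in signatures such as check_groups above. An illustrative snippet (not part of the diff):

    import numpy as np

    # np.int_ is NumPy's default signed integer type
    # (int64 on most 64-bit platforms; int32 on 64-bit Windows under NumPy 1.x)
    a = np.zeros(4, dtype=int)      # Python int coerces to the same default dtype...
    b = np.zeros(4, dtype=np.int_)  # ...but the explicit spelling matches NDArray[np.int_]
    assert a.dtype == b.dtype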
@@ -318,9 +324,9 @@ def find_best_split(
     split_defs : list[dict]
         List of dictionaries specifying the train index, validation index, and the ratio of
         validation to all data.
-    stratified: bool
+    stratified : bool
         If True, maintain dataset class balance within each train/val split
-    eval_frac: float
+    eval_frac : float
         Desired fraction of the dataset sequestered for evaluation
 
     Returns
dataeval/utils/tensorflow/_internal/gmm.py

@@ -8,10 +8,11 @@ Licensed under Apache Software License (Apache 2.0)
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, NamedTuple
+from typing import TYPE_CHECKING
 
 import numpy as np
 
+from dataeval.utils.gmm import GaussianMixtureModelParams
 from dataeval.utils.lazy import lazyload
 
 if TYPE_CHECKING:
@@ -20,28 +21,7 @@ else:
     tf = lazyload("tensorflow")
 
 
-class GaussianMixtureModelParams(NamedTuple):
-    """
-    phi : tf.Tensor
-        Mixture component distribution weights.
-    mu : tf.Tensor
-        Mixture means.
-    cov : tf.Tensor
-        Mixture covariance.
-    L : tf.Tensor
-        Cholesky decomposition of `cov`.
-    log_det_cov : tf.Tensor
-        Log of the determinant of `cov`.
-    """
-
-    phi: tf.Tensor
-    mu: tf.Tensor
-    cov: tf.Tensor
-    L: tf.Tensor
-    log_det_cov: tf.Tensor
-
-
-def gmm_params(z: tf.Tensor, gamma: tf.Tensor) -> GaussianMixtureModelParams:
+def gmm_params(z: tf.Tensor, gamma: tf.Tensor) -> GaussianMixtureModelParams[tf.Tensor]:
     """
     Compute parameters of Gaussian Mixture Model.
 
@@ -81,7 +61,7 @@ def gmm_params(z: tf.Tensor, gamma: tf.Tensor) -> GaussianMixtureModelParams:
 
 def gmm_energy(
     z: tf.Tensor,
-    params: GaussianMixtureModelParams,
+    params: GaussianMixtureModelParams[tf.Tensor],
     return_mean: bool = True,
 ) -> tuple[tf.Tensor, tf.Tensor]:
     """
dataeval/utils/torch/datasets.py

@@ -206,7 +206,7 @@ class MNIST(Dataset[tuple[NDArray[np.float64], int]]):
         Option to select specific classes from dataset.
     balance : bool, default True
         If True, returns equal number of samples for each class.
-    randomize : bool, default False
+    randomize : bool, default True
         If True, shuffles the data prior to selection - uses a set seed for reproducibility.
     slice_back : bool, default False
         If True and size has a value greater than 0, then grabs selection starting at the last image.
@@ -251,7 +251,7 @@ class MNIST(Dataset[tuple[NDArray[np.float64], int]]):
         corruption: CorruptionStringMap | None = None,
         classes: TClassMap | None = None,
         balance: bool = True,
-        randomize: bool = False,
+        randomize: bool = True,
         slice_back: bool = False,
         verbose: bool = True,
     ) -> None:
dataeval/utils/torch/gmm.py (new file)

@@ -0,0 +1,98 @@
+"""
+Adapted for Pytorch from:
+
+Source code derived from Alibi-Detect 0.11.4
+https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
+
+Original code Copyright (c) 2023 Seldon Technologies Ltd
+Licensed under Apache Software License (Apache 2.0)
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import torch
+
+from dataeval.utils.gmm import GaussianMixtureModelParams
+
+
+def gmm_params(z: torch.Tensor, gamma: torch.Tensor) -> GaussianMixtureModelParams[torch.Tensor]:
+    """
+    Compute parameters of Gaussian Mixture Model.
+
+    Parameters
+    ----------
+    z : torch.Tensor
+        Observations.
+    gamma : torch.Tensor
+        Mixture probabilities to derive mixture distribution weights from.
+
+    Returns
+    -------
+    GaussianMixtureModelParams(phi, mu, cov, L, log_det_cov)
+        The parameters used to calculate energy.
+    """
+
+    # compute gmm parameters phi, mu and cov
+    N = gamma.shape[0]  # nb of samples in batch
+    sum_gamma = torch.sum(gamma, 0)  # K
+    phi = sum_gamma / N  # K
+    # K x D (D = latent_dim)
+    mu = torch.sum(torch.unsqueeze(gamma, -1) * torch.unsqueeze(z, 1), 0) / torch.unsqueeze(sum_gamma, -1)
+    z_mu = torch.unsqueeze(z, 1) - torch.unsqueeze(mu, 0)  # N x K x D
+    z_mu_outer = torch.unsqueeze(z_mu, -1) * torch.unsqueeze(z_mu, -2)  # N x K x D x D
+
+    # K x D x D
+    cov = torch.sum(torch.unsqueeze(torch.unsqueeze(gamma, -1), -1) * z_mu_outer, 0) / torch.unsqueeze(
+        torch.unsqueeze(sum_gamma, -1), -1
+    )
+
+    # cholesky decomposition of covariance and determinant derivation
+    D = cov.shape[1]
+    eps = 1e-6
+    L = torch.linalg.cholesky(cov + torch.eye(D) * eps)  # K x D x D
+    log_det_cov = 2.0 * torch.sum(torch.log(torch.diagonal(L, dim1=-2, dim2=-1)), 1)  # K
+
+    return GaussianMixtureModelParams(phi, mu, cov, L, log_det_cov)
+
+
+def gmm_energy(
+    z: torch.Tensor,
+    params: GaussianMixtureModelParams[torch.Tensor],
+    return_mean: bool = True,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Compute sample energy from Gaussian Mixture Model.
+
+    Parameters
+    ----------
+    params : GaussianMixtureModelParams
+        The gaussian mixture model parameters.
+    return_mean : bool, default True
+        Take mean across all sample energies in a batch.
+
+    Returns
+    -------
+    sample_energy
+        The sample energy of the GMM.
+    cov_diag
+        The inverse sum of the diagonal components of the covariance matrix.
+    """
+    D = params.cov.shape[1]
+    z_mu = torch.unsqueeze(z, 1) - torch.unsqueeze(params.mu, 0)  # N x K x D
+    z_mu_T = torch.permute(z_mu, dims=[1, 2, 0])  # K x D x N
+    v = torch.linalg.solve_triangular(params.L, z_mu_T, upper=False)  # K x D x N
+
+    # rewrite sample energy in logsumexp format for numerical stability
+    logits = torch.log(torch.unsqueeze(params.phi, -1)) - 0.5 * (
+        torch.sum(torch.square(v), 1) + float(D) * np.log(2.0 * np.pi) + torch.unsqueeze(params.log_det_cov, -1)
+    )  # K x N
+    sample_energy = -torch.logsumexp(logits, 0)  # N
+
+    if return_mean:
+        sample_energy = torch.mean(sample_energy)
+
+    # inverse sum of variances
+    cov_diag = torch.sum(torch.divide(torch.tensor(1), torch.diagonal(params.cov, dim1=-2, dim2=-1)))
+
+    return sample_energy, cov_diag
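
Note: a usage sketch for the two functions above, following the N/K/D shape comments in the code (synthetic data, not from the package):

    import torch

    from dataeval.utils.torch.gmm import gmm_energy, gmm_params

    N, K, D = 32, 4, 8                               # samples, mixture components, latent dim
    z = torch.randn(N, D)                            # latent observations
    gamma = torch.softmax(torch.randn(N, K), dim=1)  # soft mixture assignments; rows sum to 1

    params = gmm_params(z, gamma)                    # phi: K, mu: K x D, cov: K x D x D
    energy, cov_diag = gmm_energy(z, params, return_mean=True)
    print(float(energy), float(cov_diag))

gmm_energy is the mixture negative log-likelihood rewritten as a logsumexp over components; lower energy means a sample is better explained by the fitted mixture, which is what an energy-based OOD detector thresholds on.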
dataeval/utils/torch/models.py

@@ -2,8 +2,10 @@ from __future__ import annotations
 
 __all__ = ["AriaAutoencoder", "Encoder", "Decoder"]
 
+import math
 from typing import Any
 
+import torch
 import torch.nn as nn
 
@@ -136,3 +138,193 @@ class Decoder(nn.Module):
         The reconstructed output tensor.
         """
         return self.decoder(x)
+
+
+class AE(nn.Module):
+    """
+    An autoencoder model with a separate encoder and decoder. Meant to replace the TensorFlow model called AE, which we
+    used as the core of an autoencoder-based OOD detector, i.e. as an argument to OOD_AE().
+
+    Parameters
+    ----------
+    input_shape : tuple[int, int, int]
+        Number of input channels, number of rows, number of columns. (Number of examples per batch will be inferred
+        at runtime.)
+    """
+
+    def __init__(self, input_shape: tuple[int, int, int]) -> None:
+        super().__init__()
+
+        input_dim = math.prod(input_shape)
+
+        # following is lifted from src/dataeval/utils/tensorflow/_internal/utils.py. It makes an odd staircase that is
+        # basically proportional to the number of numbers in the image to the 0.8 power.
+        encoding_dim = int(math.pow(2, int(input_dim.bit_length() * 0.8)))
+
+        self.encoder: Encoder_AE = Encoder_AE(input_shape, encoding_dim)
+
+        self.decoder: Decoder_AE = Decoder_AE(input_shape, encoding_dim, self.encoder.post_op_shape)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Perform a forward pass through the encoder and decoder.
+
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input tensor
+
+        Returns
+        -------
+        torch.Tensor
+            The reconstructed output tensor.
+        """
+        x = self.encoder(x)
+        x = self.decoder(x)
+        return x
+
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Encode the input tensor using the encoder.
+
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input tensor
+
+        Returns
+        -------
+        torch.Tensor
+            The encoded representation of the input tensor.
+        """
+        return self.encoder(x)
+
+
+class Encoder_AE(nn.Module):
+    """
+    A simple encoder to be used in an autoencoder model.
+
+    This is the encoder used to replicate AE, which was a TF function. It consists of a CNN followed by a fully
+    connected layer.
+
+    Parameters
+    ----------
+    input_shape : tuple[int, int, int]
+        Number of channels, number of rows, number of columns in input images.
+    encoding_dim : int
+        The size of the 1D array that emerges from the fully connected layer.
+    """
+
+    def __init__(
+        self,
+        input_shape: tuple[int, int, int],
+        encoding_dim: int,
+    ) -> None:
+        super().__init__()
+
+        channels = input_shape[0]
+        nc_in, nc_mid, nc_done = 256, 128, 64
+
+        conv_in = nn.Conv2d(channels, nc_in, 2, stride=1, padding=1)
+        conv_mid = nn.Conv2d(nc_in, nc_mid, 2, stride=1, padding=1)
+        conv_done = nn.Conv2d(nc_mid, nc_done, 2, stride=1)
+
+        self.encoding_ops: nn.Sequential = nn.Sequential(
+            conv_in,
+            nn.LeakyReLU(),
+            nn.MaxPool2d(2),
+            conv_mid,
+            nn.LeakyReLU(),
+            nn.MaxPool2d(2),
+            conv_done,
+        )
+
+        ny, nx = input_shape[1:]
+        self.post_op_shape: tuple[int, int, int] = (nc_done, ny // 4 - 1, nx // 4 - 1)
+        self.flatcon: int = math.prod(self.post_op_shape)
+        self.flatten: nn.Sequential = nn.Sequential(
+            nn.Flatten(),
+            nn.Linear(
+                self.flatcon,
+                encoding_dim,
+            ),
+        )
+
+    def forward(self, x: Any) -> Any:
+        """
+        Perform a forward pass through the AE_torch encoder.
+
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input tensor
+
+        Returns
+        -------
+        torch.Tensor
+            The encoded representation of the input tensor.
+        """
+        x = self.encoding_ops(x)
+
+        x = self.flatten(x)
+
+        return x
+
+
+class Decoder_AE(nn.Module):
+    """
+    A simple decoder to be used in an autoencoder model.
+
+    This is the decoder counterpart to Encoder_AE, used by the AE model.
+
+    Parameters
+    ----------
+    input_shape : tuple[int, int, int]
+        Number of channels, number of rows, number of columns of the reconstructed output.
+    encoding_dim : int
+        Size of the 1D encoding accepted by the decoder's input layer.
+    post_op_shape : tuple[int, int, int]
+        Shape of the encoder's convolutional output, used to un-flatten before the transposed convolutions.
+    """
+
+    def __init__(
+        self,
+        input_shape: tuple[int, int, int],
+        encoding_dim: int,
+        post_op_shape: tuple[int, int, int],
+    ) -> None:
+        super().__init__()
+
+        self.post_op_shape = post_op_shape
+        self.input_shape = input_shape  # need to store this for use in forward().
+        channels = input_shape[0]
+
+        self.input: nn.Linear = nn.Linear(encoding_dim, math.prod(post_op_shape))
+
+        self.decoder: nn.Sequential = nn.Sequential(
+            nn.ConvTranspose2d(64, 128, 2, stride=1),
+            nn.LeakyReLU(),
+            nn.ConvTranspose2d(128, 256, 2, stride=2),
+            nn.LeakyReLU(),
+            nn.ConvTranspose2d(256, channels, 2, stride=2),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Perform a forward pass through the decoder.
+
+        Parameters
+        ----------
+        x : torch.Tensor
+            The encoded tensor.
+
+        Returns
+        -------
+        torch.Tensor
+            The reconstructed output tensor.
+        """
+        x = self.input(x)
+        x = x.reshape((-1, *self.post_op_shape))
+        x = self.decoder(x)
+        x = x.reshape((-1, *self.input_shape))
+        return x
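
Note: a quick shape check of the new AE class (synthetic MNIST-sized input, not from the diff). With input_shape=(1, 28, 28), input_dim is 784 and input_dim.bit_length() is 10, so encoding_dim = 2**int(10 * 0.8) = 256:

    import torch

    from dataeval.utils.torch.models import AE

    model = AE(input_shape=(1, 28, 28))
    x = torch.randn(16, 1, 28, 28)  # batch of 16 single-channel 28x28 images

    z = model.encode(x)   # conv stack, flatten, linear projection
    x_hat = model(x)      # full encode/decode round trip
    print(z.shape)        # torch.Size([16, 256])
    print(x_hat.shape)    # torch.Size([16, 1, 28, 28])

For 28x28 inputs the encoder's post_op_shape works out to (64, 6, 6); the decoder un-flattens back to that shape before its transposed convolutions, ending at the original input_shape.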
dataeval/utils/torch/trainer.py

@@ -1,15 +1,15 @@
 from __future__ import annotations
 
-__all__ = ["AETrainer"]
-
-from typing import Any
+from typing import Any, Callable
 
 import torch
 import torch.nn as nn
+from numpy.typing import NDArray
 from torch.optim import Adam
-from torch.utils.data import DataLoader, Dataset
+from torch.utils.data import DataLoader, Dataset, TensorDataset
+from tqdm import tqdm
 
-torch.manual_seed(0)
+__all__ = ["AETrainer", "trainer"]
 
 
 def get_images_from_batch(batch: Any) -> Any:
@@ -176,3 +176,82 @@ class AETrainer:
             encodings = torch.vstack((encodings, embeddings)) if len(encodings) else embeddings
 
         return encodings
+
+
+def trainer(
+    model: torch.nn.Module,
+    x_train: NDArray[Any],
+    y_train: NDArray[Any] | None,
+    loss_fn: Callable[..., torch.Tensor | torch.nn.Module] | None,
+    optimizer: torch.optim.Optimizer | None,
+    preprocess_fn: Callable[[torch.Tensor], torch.Tensor] | None,
+    epochs: int,
+    batch_size: int,
+    device: torch.device,
+    verbose: bool,
+) -> None:
+    """
+    Train Pytorch model.
+
+    Parameters
+    ----------
+    model
+        Model to train.
+    x_train
+        Training data.
+    y_train
+        Training labels.
+    loss_fn
+        Loss function used for training.
+    optimizer
+        Optimizer used for training.
+    preprocess_fn
+        Preprocessing function applied to each training batch.
+    epochs
+        Number of training epochs.
+    batch_size
+        Batch size used for training.
+    device
+        Device to train on.
+    verbose
+        Whether to print training progress.
+    """
+    if optimizer is None:
+        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
+
+    if y_train is None:
+        dataset = TensorDataset(torch.from_numpy(x_train).to(torch.float32))
+    else:
+        dataset = TensorDataset(
+            torch.from_numpy(x_train).to(torch.float32), torch.from_numpy(y_train).to(torch.float32)
+        )
+
+    loader = DataLoader(dataset=dataset, batch_size=batch_size)
+
+    model = model.to(device)
+
+    # iterate over epochs
+    loss = torch.nan
+    disable_tqdm = not verbose
+    for epoch in (pbar := tqdm(range(epochs), disable=disable_tqdm)):
+        epoch_loss = loss
+        for step, data in enumerate(loader):
+            if step % 250 == 0:
+                pbar.set_description(f"Epoch: {epoch} ({epoch_loss:.3f}), loss: {loss:.3f}")
+
+            x, y = [d.to(device) for d in data] if len(data) > 1 else (data[0].to(device), None)
+
+            if isinstance(preprocess_fn, Callable):
+                x = preprocess_fn(x)
+
+            y_hat = model(x)
+            y = x if y is None else y
+
+            loss = loss_fn(y, y_hat)  # type: ignore
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
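
Note: an end-to-end sketch of the new module-level trainer helper, wired to the torch AE above (synthetic data; argument values are illustrative). Every parameter is required by the signature; None is accepted for y_train, optimizer, and preprocess_fn:

    import numpy as np
    import torch

    from dataeval.utils.torch.models import AE
    from dataeval.utils.torch.trainer import trainer
    from dataeval.utils.torch.utils import get_device

    x_train = np.random.rand(128, 1, 28, 28).astype(np.float32)
    model = AE(input_shape=(1, 28, 28))

    trainer(
        model=model,
        x_train=x_train,
        y_train=None,                # None: train to reconstruct the input
        loss_fn=torch.nn.MSELoss(),  # invoked as loss_fn(y, y_hat)
        optimizer=None,              # None: falls back to Adam(lr=0.001)
        preprocess_fn=None,
        epochs=2,
        batch_size=64,
        device=get_device("cpu"),
        verbose=False,
    )

The function trains in place and returns None; results are retrieved through the model itself.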
dataeval/utils/torch/utils.py

@@ -3,8 +3,12 @@ from __future__ import annotations
 
 __all__ = ["read_dataset"]
 
 from collections import defaultdict
-from typing import Any
+from functools import partial
+from typing import Any, Callable
 
+import numpy as np
+import torch
+from numpy.typing import NDArray
 from torch.utils.data import Dataset
 
@@ -61,3 +65,105 @@ def read_dataset(dataset: Dataset[Any]) -> list[list[Any]]:
         ddict[i].append(d)
 
     return list(ddict.values())
+
+
+def get_device(device: str | torch.device | None = None) -> torch.device:
+    """
+    Instantiates a PyTorch device object.
+
+    Parameters
+    ----------
+    device : str | torch.device | None, default None
+        Either ``None``, a str ('gpu' or 'cpu') indicating the device to choose, or an
+        already instantiated device object. If ``None``, the GPU is selected if it is
+        detected, otherwise the CPU is used as a fallback.
+
+    Returns
+    -------
+    The instantiated device object.
+    """
+    if isinstance(device, torch.device):  # Already a torch device
+        return device
+    else:  # Instantiate device
+        if device is None or device.lower() in ["gpu", "cuda"]:
+            torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        else:
+            torch_device = torch.device("cpu")
+    return torch_device
+
+
+def predict_batch(
+    x: NDArray[Any] | torch.Tensor,
+    model: Callable | torch.nn.Module | torch.nn.Sequential,
+    device: torch.device | None = None,
+    batch_size: int = int(1e10),
+    preprocess_fn: Callable | None = None,
+    dtype: type[np.generic] | torch.dtype = np.float32,
+) -> NDArray[Any] | torch.Tensor | tuple[Any, ...]:
+    """
+    Make batch predictions on a model.
+
+    Parameters
+    ----------
+    x : np.ndarray | torch.Tensor
+        Batch of instances.
+    model : Callable | nn.Module | nn.Sequential
+        PyTorch model.
+    device : torch.device | None, default None
+        Device type used. The default None tries to use the GPU and falls back on CPU.
+        Can be specified by passing either torch.device('cuda') or torch.device('cpu').
+    batch_size : int, default 1e10
+        Batch size used during prediction.
+    preprocess_fn : Callable | None, default None
+        Optional preprocessing function for each batch.
+    dtype : np.dtype | torch.dtype, default np.float32
+        Model output type, either a :term:`NumPy` or torch dtype, e.g. np.float32 or torch.float32.
+
+    Returns
+    -------
+    NDArray | torch.Tensor | tuple
+        Numpy array, torch tensor or tuples of those with model outputs.
+    """
+    device = get_device(device)
+    if isinstance(x, np.ndarray):
+        x = torch.from_numpy(x).to(device)
+    n = len(x)
+    n_minibatch = int(np.ceil(n / batch_size))
+    return_np = not isinstance(dtype, torch.dtype)
+    preds = []
+    with torch.no_grad():
+        for i in range(n_minibatch):
+            istart, istop = i * batch_size, min((i + 1) * batch_size, n)
+            x_batch = x[istart:istop]
+            if isinstance(preprocess_fn, Callable):
+                x_batch = preprocess_fn(x_batch)
+
+            preds_tmp = model(x_batch.to(torch.float32).to(device))
+            if isinstance(preds_tmp, (list, tuple)):
+                if len(preds) == 0:  # init tuple with lists to store predictions
+                    preds = tuple([] for _ in range(len(preds_tmp)))
+                for j, p in enumerate(preds_tmp):
+                    if isinstance(p, torch.Tensor):
+                        p = p.cpu()
+                    preds[j].append(p if not return_np or isinstance(p, np.ndarray) else p.numpy())
+            elif isinstance(preds_tmp, (np.ndarray, torch.Tensor)):
+                if isinstance(preds_tmp, torch.Tensor):
+                    preds_tmp = preds_tmp.cpu()
+                if isinstance(preds, tuple):
+                    preds = list(preds)
+                preds.append(
+                    preds_tmp
+                    if not return_np or isinstance(preds_tmp, np.ndarray)  # type: ignore
+                    else preds_tmp.numpy()
+                )
+            else:
+                raise TypeError(
+                    f"Model output type {type(preds_tmp)} not supported. The model "
+                    "output type needs to be one of list, tuple, NDArray or torch.Tensor."
+                )
+    concat = partial(np.concatenate, axis=0) if return_np else partial(torch.cat, dim=0)
+    out: tuple | np.ndarray | torch.Tensor = (
+        tuple(concat(p) for p in preds) if isinstance(preds, tuple) else concat(preds)  # type: ignore
+    )
+    return out
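
Note: get_device and predict_batch appear to be shared versions of helpers previously embedded in dataeval/detectors/drift/torch.py (note its -101 lines in the file list). A usage sketch with the AE model from models.py (synthetic data):

    import numpy as np

    from dataeval.utils.torch.models import AE
    from dataeval.utils.torch.utils import get_device, predict_batch

    model = AE(input_shape=(1, 28, 28))
    x = np.random.rand(100, 1, 28, 28).astype(np.float32)

    # The default dtype=np.float32 is a NumPy type, so the result comes back as
    # an np.ndarray; pass dtype=torch.float32 to get a torch.Tensor instead.
    recon = predict_batch(x, model, device=get_device("cpu"), batch_size=32)
    print(type(recon), recon.shape)  # <class 'numpy.ndarray'> (100, 1, 28, 28)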
dataeval/workflows/__init__.py

@@ -4,7 +4,7 @@ Workflows perform a sequence of actions to analyze the dataset and make predictions.
 
 from dataeval import _IS_TORCH_AVAILABLE
 
-if _IS_TORCH_AVAILABLE:  # pragma: no cover
+if _IS_TORCH_AVAILABLE:
     from dataeval.workflows.sufficiency import Sufficiency, SufficiencyOutput
 
 __all__ = ["Sufficiency", "SufficiencyOutput"]
{dataeval-0.73.0.dist-info → dataeval-0.74.0.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dataeval
-Version: 0.73.0
+Version: 0.74.0
 Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
 Home-page: https://dataeval.ai/
 License: MIT
@@ -23,7 +23,6 @@ Classifier: Topic :: Scientific/Engineering
 Provides-Extra: all
 Provides-Extra: tensorflow
 Provides-Extra: torch
-Requires-Dist: hdbscan (>=0.8.36)
 Requires-Dist: markupsafe (<3.0.2) ; extra == "tensorflow" or extra == "all"
 Requires-Dist: matplotlib ; extra == "torch" or extra == "all"
 Requires-Dist: numpy (>1.24.3)