dataeval 0.83.0__py3-none-any.whl → 0.84.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. dataeval/__init__.py +1 -1
  2. dataeval/config.py +3 -3
  3. dataeval/detectors/drift/__init__.py +2 -2
  4. dataeval/detectors/drift/_base.py +55 -203
  5. dataeval/detectors/drift/_cvm.py +19 -30
  6. dataeval/detectors/drift/_ks.py +18 -30
  7. dataeval/detectors/drift/_mmd.py +189 -53
  8. dataeval/detectors/drift/_uncertainty.py +52 -56
  9. dataeval/detectors/drift/updates.py +13 -12
  10. dataeval/detectors/linters/duplicates.py +5 -3
  11. dataeval/detectors/linters/outliers.py +2 -2
  12. dataeval/detectors/ood/ae.py +1 -1
  13. dataeval/metrics/bias/__init__.py +11 -1
  14. dataeval/metrics/bias/_completeness.py +130 -0
  15. dataeval/metrics/stats/_base.py +28 -32
  16. dataeval/metrics/stats/_dimensionstats.py +2 -2
  17. dataeval/metrics/stats/_hashstats.py +2 -2
  18. dataeval/metrics/stats/_imagestats.py +4 -4
  19. dataeval/metrics/stats/_labelstats.py +4 -45
  20. dataeval/metrics/stats/_pixelstats.py +2 -2
  21. dataeval/metrics/stats/_visualstats.py +2 -2
  22. dataeval/outputs/__init__.py +2 -1
  23. dataeval/outputs/_bias.py +31 -22
  24. dataeval/outputs/_stats.py +2 -3
  25. dataeval/typing.py +25 -22
  26. dataeval/utils/_array.py +43 -7
  27. dataeval/utils/data/_dataset.py +8 -4
  28. dataeval/utils/data/_embeddings.py +141 -24
  29. dataeval/utils/data/_images.py +38 -15
  30. dataeval/utils/data/_metadata.py +5 -4
  31. dataeval/utils/data/_selection.py +3 -15
  32. dataeval/utils/data/_split.py +76 -129
  33. dataeval/utils/data/datasets/_base.py +7 -4
  34. dataeval/utils/data/datasets/_cifar10.py +9 -9
  35. dataeval/utils/data/datasets/_milco.py +42 -14
  36. dataeval/utils/data/datasets/_mnist.py +9 -5
  37. dataeval/utils/data/datasets/_ships.py +8 -4
  38. dataeval/utils/data/datasets/_voc.py +40 -19
  39. dataeval/utils/data/selections/__init__.py +2 -0
  40. dataeval/utils/data/selections/_classbalance.py +38 -0
  41. dataeval/utils/data/selections/_classfilter.py +14 -29
  42. dataeval/utils/data/selections/_prioritize.py +1 -1
  43. dataeval/utils/data/selections/_shuffle.py +2 -2
  44. dataeval/utils/metadata.py +1 -1
  45. dataeval/utils/torch/_internal.py +12 -35
  46. {dataeval-0.83.0.dist-info → dataeval-0.84.1.dist-info}/METADATA +2 -3
  47. {dataeval-0.83.0.dist-info → dataeval-0.84.1.dist-info}/RECORD +49 -48
  48. dataeval/detectors/drift/_torch.py +0 -222
  49. {dataeval-0.83.0.dist-info → dataeval-0.84.1.dist-info}/LICENSE.txt +0 -0
  50. {dataeval-0.83.0.dist-info → dataeval-0.84.1.dist-info}/WHEEL +0 -0
@@ -1,222 +0,0 @@
1
- """
2
- Source code derived from Alibi-Detect 0.11.4
3
- https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
4
-
5
- Original code Copyright (c) 2023 Seldon Technologies Ltd
6
- Licensed under Apache Software License (Apache 2.0)
7
- """
8
-
9
- from __future__ import annotations
10
-
11
- __all__ = []
12
-
13
- from typing import Any, Callable
14
-
15
- import numpy as np
16
- import torch
17
- import torch.nn as nn
18
- from numpy.typing import NDArray
19
-
20
- from dataeval.config import DeviceLike, get_device
21
- from dataeval.utils.torch._internal import predict_batch
22
-
23
-
24
def mmd2_from_kernel_matrix(
    kernel_mat: torch.Tensor, m: int, permute: bool = False, zero_diag: bool = True
) -> torch.Tensor:
    """
    Estimate the squared maximum mean discrepancy (MMD^2) of two samples x and y
    from their joint kernel matrix.

    The first ``kernel_mat.shape[0] - m`` rows/columns are treated as sample x and
    the trailing ``m`` rows/columns as sample y.

    Parameters
    ----------
    kernel_mat : torch.Tensor
        Kernel matrix between samples x and y.
    m : int
        Number of instances in y.
    permute : bool, default False
        Whether to permute the row indices. Used for permutation tests.
    zero_diag : bool, default True
        Whether to zero out the diagonal of the kernel matrix.

    Returns
    -------
    torch.Tensor
        MMD^2 between the samples from the kernel matrix.
    """
    total = kernel_mat.shape[0]
    n = total - m

    if zero_diag:
        # Subtract the diagonal out-of-place so the caller's tensor is untouched.
        kernel_mat = kernel_mat - torch.diag(kernel_mat.diag())

    if permute:
        # Apply the same random ordering to rows and columns.
        order = torch.randperm(total)
        kernel_mat = kernel_mat[order][:, order]

    within_x = kernel_mat[:-m, :-m]
    within_y = kernel_mat[-m:, -m:]
    cross = kernel_mat[-m:, :-m]

    scale_x = 1 / (n * (n - 1))
    scale_y = 1 / (m * (m - 1))
    return scale_x * within_x.sum() + scale_y * within_y.sum() - 2.0 * cross.mean()
57
-
58
-
59
def preprocess_drift(
    x: NDArray[Any],
    model: nn.Module,
    device: DeviceLike | None = None,
    preprocess_batch_fn: Callable | None = None,
    batch_size: int = int(1e10),
    dtype: type[np.generic] | torch.dtype = np.float32,
) -> NDArray[Any] | torch.Tensor | tuple[Any, ...]:
    """
    Run ``model`` over ``x`` as the preprocessing step of a drift detector.

    This is a thin wrapper that resolves the device with ``get_device`` and then
    delegates to ``predict_batch``.

    Parameters
    ----------
    x : NDArray
        Batch of instances.
    model : nn.Module
        Model used for preprocessing.
    device : DeviceLike or None, default None
        The hardware device to use if specified, otherwise uses the DataEval
        default or torch default.
    preprocess_batch_fn : Callable or None, default None
        Optional batch preprocessing function. For example to convert a list of
        objects to a batch which can be processed by the PyTorch model.
    batch_size : int, default 1e10
        Batch size used during prediction.
    dtype : np.dtype or torch.dtype, default np.float32
        Model output type, either a :term:`NumPy` or torch dtype, e.g. np.float32
        or torch.float32.

    Returns
    -------
    NDArray | torch.Tensor | tuple
        Numpy array, torch tensor or tuples of those with model outputs.
    """
    target_device = get_device(device)
    outputs = predict_batch(
        x,
        model,
        device=target_device,
        batch_size=batch_size,
        preprocess_fn=preprocess_batch_fn,
        dtype=dtype,
    )
    return outputs
100
-
101
-
102
@torch.jit.script
def _squared_pairwise_distance(
    x: torch.Tensor, y: torch.Tensor, a_min: float = 1e-30
) -> torch.Tensor:  # pragma: no cover - torch.jit.script code is compiled and copied
    """
    Pairwise squared Euclidean distance between the rows of x and y (PyTorch).

    Parameters
    ----------
    x : torch.Tensor
        Batch of instances of shape [Nx, features].
    y : torch.Tensor
        Batch of instances of shape [Ny, features].
    a_min : float
        Lower bound to clip distance values.

    Returns
    -------
    torch.Tensor
        Pairwise squared Euclidean distance [Nx, Ny].
    """
    x_sq_norm = x.pow(2).sum(dim=-1, keepdim=True)
    y_sq_norm_t = y.pow(2).sum(dim=-1, keepdim=True).transpose(-2, -1)
    # Expansion ||x - y||^2 = ||y||^2 - 2 * x @ y^T + ||x||^2, built in that order.
    cross = torch.addmm(y_sq_norm_t, x, y.transpose(-2, -1), alpha=-2)
    return torch.clamp_min(cross + x_sq_norm, a_min)
127
-
128
-
129
def sigma_median(x: torch.Tensor, y: torch.Tensor, dist: torch.Tensor) -> torch.Tensor:
    """
    Bandwidth estimation using the median heuristic `Gretton2012`

    When ``x`` and ``y`` are the same sample (identical shape and values), the
    first ``n`` entries of the sorted distances are skipped before the median is
    taken, so the self-distances do not bias the estimate towards zero.

    Parameters
    ----------
    x : torch.Tensor
        Tensor of instances with dimension [Nx, features].
    y : torch.Tensor
        Tensor of instances with dimension [Ny, features].
    dist : torch.Tensor
        Tensor with dimensions [Nx, Ny], containing the pairwise distances
        between `x` and `y`.

    Returns
    -------
    torch.Tensor
        The computed bandwidth, `sigma`, as a tensor with a single element.
    """
    # Compare shapes first: this short-circuits the elementwise comparison when
    # the samples cannot be identical and avoids a broadcasting error when their
    # trailing dimensions are incompatible.
    same_sample = x.shape == y.shape and bool((x == y).all())
    n = x.shape[0] if same_sample else 0

    # Index of the median among the entries that remain after skipping ``n``.
    n_median = n + (dist.numel() - n) // 2 - 1
    median_dist = dist.flatten().sort().values[int(n_median)]
    sigma = (0.5 * median_dist.unsqueeze(dim=-1)) ** 0.5
    return sigma
153
-
154
-
155
class GaussianRBF(nn.Module):
    """
    Gaussian RBF kernel: k(x,y) = exp(-(1/(2*sigma^2)) * ||x-y||^2).

    A forward pass takes a batch of instances x [Nx, features] and
    y [Ny, features] and returns the kernel matrix [Nx, Ny].

    Parameters
    ----------
    sigma : torch.Tensor | None, default None
        Bandwidth used for the kernel. Needn't be specified if being inferred or
        trained. Can pass multiple values to eval kernel with and then average.
    init_sigma_fn : Callable | None, default None
        Function used to compute the bandwidth ``sigma``. Used when ``sigma`` is to be
        inferred. The function's signature should take in the tensors ``x``, ``y`` and
        ``dist`` and return ``sigma``. If ``None``, it is set to ``sigma_median``.
    trainable : bool, default False
        Whether or not to track gradients w.r.t. `sigma` to allow it to be trained.
    """

    def __init__(
        self,
        sigma: torch.Tensor | None = None,
        init_sigma_fn: Callable | None = None,
        trainable: bool = False,
    ) -> None:
        super().__init__()
        init_sigma_fn = sigma_median if init_sigma_fn is None else init_sigma_fn
        self.config: dict[str, Any] = {
            "sigma": sigma,
            "trainable": trainable,
            "init_sigma_fn": init_sigma_fn,
        }
        if sigma is None:
            # Placeholder value; it is overwritten on the first forward pass.
            initial_log_sigma = torch.empty(1)
        else:
            initial_log_sigma = sigma.reshape(-1).log()  # [Ns,]
        # The bandwidth is stored in log-space; ``sigma`` exponentiates it back.
        self.log_sigma: nn.Parameter = nn.Parameter(initial_log_sigma, requires_grad=trainable)
        self.init_required: bool = sigma is None
        self.init_sigma_fn = init_sigma_fn
        self.trainable = trainable

    @property
    def sigma(self) -> torch.Tensor:
        """Current bandwidth(s), recovered from the log-space parameter."""
        return self.log_sigma.exp()

    def forward(
        self,
        x: np.ndarray[Any, Any] | torch.Tensor,
        y: np.ndarray[Any, Any] | torch.Tensor,
        infer_sigma: bool = False,
    ) -> torch.Tensor:
        """
        Compute the kernel matrix between ``x`` and ``y``.

        Parameters
        ----------
        x : np.ndarray | torch.Tensor
            Batch of instances; flattened from dimension 1 onwards before use.
        y : np.ndarray | torch.Tensor
            Batch of instances; flattened from dimension 1 onwards before use.
        infer_sigma : bool, default False
            Whether to (re)compute the bandwidth with ``init_sigma_fn`` on this call.

        Returns
        -------
        torch.Tensor
            Kernel matrix [Nx, Ny], averaged over all bandwidths.

        Raises
        ------
        ValueError
            If ``infer_sigma`` is requested while the kernel is trainable.
        """
        x, y = torch.as_tensor(x), torch.as_tensor(y)
        dist = _squared_pairwise_distance(x.flatten(1), y.flatten(1))  # [Nx, Ny]

        if infer_sigma or self.init_required:
            if self.trainable and infer_sigma:
                raise ValueError("Gradients cannot be computed w.r.t. an inferred sigma value")
            sigma = self.init_sigma_fn(x, y, dist)
            # Write the inferred bandwidth without recording it in the autograd graph.
            with torch.no_grad():
                self.log_sigma.copy_(sigma.log().clone())
            self.init_required = False

        gamma = 1.0 / (2.0 * self.sigma**2)  # [Ns,]
        # Broadcast each bandwidth over the distance matrix in one multiply
        # instead of looping over gamma and concatenating the results.
        kernel_mat = torch.exp(-(gamma.view(-1, 1, 1) * dist))  # [Ns, Nx, Ny]
        return kernel_mat.mean(dim=0)  # [Nx, Ny]