dataeval 0.83.0__py3-none-any.whl → 0.84.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +1 -1
- dataeval/config.py +3 -3
- dataeval/detectors/drift/__init__.py +2 -2
- dataeval/detectors/drift/_base.py +55 -203
- dataeval/detectors/drift/_cvm.py +19 -30
- dataeval/detectors/drift/_ks.py +18 -30
- dataeval/detectors/drift/_mmd.py +189 -53
- dataeval/detectors/drift/_uncertainty.py +52 -56
- dataeval/detectors/drift/updates.py +13 -12
- dataeval/detectors/linters/duplicates.py +5 -3
- dataeval/detectors/linters/outliers.py +2 -2
- dataeval/detectors/ood/ae.py +1 -1
- dataeval/metrics/bias/__init__.py +11 -1
- dataeval/metrics/bias/_completeness.py +130 -0
- dataeval/metrics/stats/_base.py +28 -32
- dataeval/metrics/stats/_dimensionstats.py +2 -2
- dataeval/metrics/stats/_hashstats.py +2 -2
- dataeval/metrics/stats/_imagestats.py +4 -4
- dataeval/metrics/stats/_labelstats.py +4 -45
- dataeval/metrics/stats/_pixelstats.py +2 -2
- dataeval/metrics/stats/_visualstats.py +2 -2
- dataeval/outputs/__init__.py +2 -1
- dataeval/outputs/_bias.py +31 -22
- dataeval/outputs/_stats.py +2 -3
- dataeval/typing.py +25 -22
- dataeval/utils/_array.py +43 -7
- dataeval/utils/data/_dataset.py +8 -4
- dataeval/utils/data/_embeddings.py +141 -24
- dataeval/utils/data/_images.py +38 -15
- dataeval/utils/data/_metadata.py +5 -4
- dataeval/utils/data/_selection.py +3 -15
- dataeval/utils/data/_split.py +76 -129
- dataeval/utils/data/datasets/_base.py +7 -4
- dataeval/utils/data/datasets/_cifar10.py +9 -9
- dataeval/utils/data/datasets/_milco.py +42 -14
- dataeval/utils/data/datasets/_mnist.py +9 -5
- dataeval/utils/data/datasets/_ships.py +8 -4
- dataeval/utils/data/datasets/_voc.py +40 -19
- dataeval/utils/data/selections/__init__.py +2 -0
- dataeval/utils/data/selections/_classbalance.py +38 -0
- dataeval/utils/data/selections/_classfilter.py +14 -29
- dataeval/utils/data/selections/_prioritize.py +1 -1
- dataeval/utils/data/selections/_shuffle.py +2 -2
- dataeval/utils/metadata.py +1 -1
- dataeval/utils/torch/_internal.py +12 -35
- {dataeval-0.83.0.dist-info → dataeval-0.84.1.dist-info}/METADATA +2 -3
- {dataeval-0.83.0.dist-info → dataeval-0.84.1.dist-info}/RECORD +49 -48
- dataeval/detectors/drift/_torch.py +0 -222
- {dataeval-0.83.0.dist-info → dataeval-0.84.1.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.83.0.dist-info → dataeval-0.84.1.dist-info}/WHEEL +0 -0
@@ -1,222 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Source code derived from Alibi-Detect 0.11.4
|
3
|
-
https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
|
4
|
-
|
5
|
-
Original code Copyright (c) 2023 Seldon Technologies Ltd
|
6
|
-
Licensed under Apache Software License (Apache 2.0)
|
7
|
-
"""
|
8
|
-
|
9
|
-
from __future__ import annotations
|
10
|
-
|
11
|
-
__all__ = []
|
12
|
-
|
13
|
-
from typing import Any, Callable
|
14
|
-
|
15
|
-
import numpy as np
|
16
|
-
import torch
|
17
|
-
import torch.nn as nn
|
18
|
-
from numpy.typing import NDArray
|
19
|
-
|
20
|
-
from dataeval.config import DeviceLike, get_device
|
21
|
-
from dataeval.utils.torch._internal import predict_batch
|
22
|
-
|
23
|
-
|
24
|
-
def mmd2_from_kernel_matrix(
    kernel_mat: torch.Tensor, m: int, permute: bool = False, zero_diag: bool = True
) -> torch.Tensor:
    """
    Estimate the squared maximum mean discrepancy (MMD^2) between two samples
    x and y from the kernel matrix computed over both samples.

    The last ``m`` rows/columns of ``kernel_mat`` are treated as belonging to
    ``y``; the remaining leading rows/columns are treated as belonging to ``x``.

    Parameters
    ----------
    kernel_mat : torch.Tensor
        Kernel matrix between samples x and y.
    m : int
        Number of instances in y.
    permute : bool, default False
        Whether to apply one random permutation to both rows and columns.
        Used for permutation tests.
    zero_diag : bool, default True
        Whether to zero out the diagonal of the kernel matrix.

    Returns
    -------
    torch.Tensor
        MMD^2 between the samples from the kernel matrix.
    """
    total = kernel_mat.shape[0]
    n = total - m

    if zero_diag:
        # Subtract the diagonal out-of-place so the caller's matrix is untouched.
        kernel_mat = kernel_mat - torch.diag(kernel_mat.diag())

    if permute:
        # The same shuffle is applied to rows and columns.
        shuffled = torch.randperm(total)
        kernel_mat = kernel_mat[shuffled][:, shuffled]

    k_xx = kernel_mat[:-m, :-m]
    k_yy = kernel_mat[-m:, -m:]
    k_xy = kernel_mat[-m:, :-m]

    # Normalise the within-sample sums by the number of off-diagonal pairs.
    c_xx = 1 / (n * (n - 1))
    c_yy = 1 / (m * (m - 1))

    return c_xx * k_xx.sum() + c_yy * k_yy.sum() - 2.0 * k_xy.mean()
|
57
|
-
|
58
|
-
|
59
|
-
def preprocess_drift(
    x: NDArray[Any],
    model: nn.Module,
    device: DeviceLike | None = None,
    preprocess_batch_fn: Callable | None = None,
    batch_size: int = int(1e10),
    dtype: type[np.generic] | torch.dtype = np.float32,
) -> NDArray[Any] | torch.Tensor | tuple[Any, ...]:
    """
    Run a model over a batch of instances as the preprocessing step of a
    drift detector.

    Parameters
    ----------
    x : NDArray
        Batch of instances.
    model : nn.Module
        Model used for preprocessing.
    device : DeviceLike or None, default None
        The hardware device to use if specified, otherwise uses the DataEval
        default or torch default.
    preprocess_batch_fn : Callable or None, default None
        Optional batch preprocessing function. For example to convert a list of objects
        to a batch which can be processed by the PyTorch model.
    batch_size : int, default 1e10
        Batch size used during prediction.
    dtype : np.dtype or torch.dtype, default np.float32
        Model output type, either a :term:`NumPy` or torch dtype, e.g. np.float32 or torch.float32.

    Returns
    -------
    NDArray | torch.Tensor | tuple
        Numpy array, torch tensor or tuples of those with model outputs.
    """
    # Resolve the requested device once, then delegate to the batched predictor.
    resolved_device = get_device(device)
    return predict_batch(
        x,
        model,
        device=resolved_device,
        batch_size=batch_size,
        preprocess_fn=preprocess_batch_fn,
        dtype=dtype,
    )
|
100
|
-
|
101
|
-
|
102
|
-
@torch.jit.script
|
103
|
-
def _squared_pairwise_distance(
|
104
|
-
x: torch.Tensor, y: torch.Tensor, a_min: float = 1e-30
|
105
|
-
) -> torch.Tensor: # pragma: no cover - torch.jit.script code is compiled and copied
|
106
|
-
"""
|
107
|
-
PyTorch pairwise squared Euclidean distance between samples x and y.
|
108
|
-
|
109
|
-
Parameters
|
110
|
-
----------
|
111
|
-
x : torch.Tensor
|
112
|
-
Batch of instances of shape [Nx, features].
|
113
|
-
y : torch.Tensor
|
114
|
-
Batch of instances of shape [Ny, features].
|
115
|
-
a_min : float
|
116
|
-
Lower bound to clip distance values.
|
117
|
-
|
118
|
-
Returns
|
119
|
-
-------
|
120
|
-
torch.Tensor
|
121
|
-
Pairwise squared Euclidean distance [Nx, Ny].
|
122
|
-
"""
|
123
|
-
x2 = x.pow(2).sum(dim=-1, keepdim=True)
|
124
|
-
y2 = y.pow(2).sum(dim=-1, keepdim=True)
|
125
|
-
dist = torch.addmm(y2.transpose(-2, -1), x, y.transpose(-2, -1), alpha=-2).add_(x2)
|
126
|
-
return dist.clamp_min_(a_min)
|
127
|
-
|
128
|
-
|
129
|
-
def sigma_median(x: torch.Tensor, y: torch.Tensor, dist: torch.Tensor) -> torch.Tensor:
    """
    Bandwidth estimation using the median heuristic `Gretton2012`

    Parameters
    ----------
    x : torch.Tensor
        Tensor of instances with dimension [Nx, features].
    y : torch.Tensor
        Tensor of instances with dimension [Ny, features].
    dist : torch.Tensor
        Tensor with dimensions [Nx, Ny], containing the pairwise distances
        between `x` and `y`.

    Returns
    -------
    torch.Tensor
        The computed bandwidth, `sigma`, as a tensor with a single element.
    """
    n = min(x.shape[0], y.shape[0])
    # Compare shapes first: it is a cheap tuple comparison, and it lets us skip
    # the elementwise tensor comparison (and the bool() conversion of its
    # result) whenever the shapes already differ.
    same_sample = x.shape == y.shape and bool((x[:n] == y[:n]).all())
    # When x and y are the same sample, the position of the median in the sorted
    # distances is offset by n; otherwise no offset is applied.
    n = n if same_sample else 0
    n_median = n + (dist.numel() - n) // 2 - 1
    sorted_dist = dist.flatten().sort().values
    sigma = (0.5 * sorted_dist[n_median].unsqueeze(dim=-1)) ** 0.5
    return sigma
|
153
|
-
|
154
|
-
|
155
|
-
class GaussianRBF(nn.Module):
    """
    Gaussian RBF kernel: k(x,y) = exp(-(1/(2*sigma^2)) * ||x-y||^2).

    A forward pass takes a batch of instances x [Nx, features] and
    y [Ny, features] and returns the kernel matrix [Nx, Ny].

    Parameters
    ----------
    sigma : torch.Tensor | None, default None
        Bandwidth used for the kernel. Needn't be specified if being inferred or
        trained. Can pass multiple values to eval kernel with and then average.
    init_sigma_fn : Callable | None, default None
        Function used to compute the bandwidth ``sigma``. Used when ``sigma`` is to be
        inferred. The function's signature should take in the tensors ``x``, ``y`` and
        ``dist`` and return ``sigma``. If ``None``, it is set to ``sigma_median``.
    trainable : bool, default False
        Whether or not to track gradients w.r.t. `sigma` to allow it to be trained.
    """

    def __init__(
        self,
        sigma: torch.Tensor | None = None,
        init_sigma_fn: Callable | None = None,
        trainable: bool = False,
    ) -> None:
        super().__init__()
        init_sigma_fn = sigma_median if init_sigma_fn is None else init_sigma_fn
        # Keep the constructor arguments as given (sigma is stored before reshaping).
        self.config: dict[str, Any] = {
            "sigma": sigma,
            "trainable": trainable,
            "init_sigma_fn": init_sigma_fn,
        }
        if sigma is None:
            # Placeholder value; it is overwritten on the first forward pass.
            initial_log_sigma = torch.empty(1)
        else:
            initial_log_sigma = sigma.reshape(-1).log()  # [Ns,]
        # The bandwidth is stored in log-space; ``sigma`` exponentiates it back.
        self.log_sigma: nn.Parameter = nn.Parameter(initial_log_sigma, requires_grad=trainable)
        self.init_required: bool = sigma is None
        self.init_sigma_fn = init_sigma_fn
        self.trainable = trainable

    @property
    def sigma(self) -> torch.Tensor:
        """Bandwidth(s) of the kernel, recovered from the log-space parameter."""
        return self.log_sigma.exp()

    def forward(
        self,
        x: np.ndarray[Any, Any] | torch.Tensor,
        y: np.ndarray[Any, Any] | torch.Tensor,
        infer_sigma: bool = False,
    ) -> torch.Tensor:
        """
        Compute the kernel matrix between ``x`` and ``y``.

        Parameters
        ----------
        x : np.ndarray | torch.Tensor
            Batch of instances; everything after the first dimension is flattened.
        y : np.ndarray | torch.Tensor
            Batch of instances; everything after the first dimension is flattened.
        infer_sigma : bool, default False
            Whether to (re)compute the bandwidth from ``x`` and ``y`` using
            ``init_sigma_fn`` before evaluating the kernel.

        Returns
        -------
        torch.Tensor
            Kernel matrix [Nx, Ny], averaged over all bandwidths.

        Raises
        ------
        ValueError
            If ``infer_sigma`` is requested on a trainable kernel.
        """
        x, y = torch.as_tensor(x), torch.as_tensor(y)
        dist = _squared_pairwise_distance(x.flatten(1), y.flatten(1))  # [Nx, Ny]

        if infer_sigma or self.init_required:
            if self.trainable and infer_sigma:
                raise ValueError("Gradients cannot be computed w.r.t. an inferred sigma value")
            sigma = self.init_sigma_fn(x, y, dist)
            # copy_ already copies the data into the parameter, so no clone is
            # needed; no_grad keeps this write out of the autograd graph.
            with torch.no_grad():
                self.log_sigma.copy_(sigma.log())
            self.init_required = False

        gamma = 1.0 / (2.0 * self.sigma**2)  # [Ns,]
        # Broadcast every bandwidth against the distance matrix in one operation
        # rather than looping over gamma and concatenating the slices.
        kernel_mat = torch.exp(-(gamma[:, None, None] * dist[None, :, :]))  # [Ns, Nx, Ny]
        return kernel_mat.mean(dim=0)  # [Nx, Ny]
|
File without changes
|
File without changes
|