dataeval 0.61.0__py3-none-any.whl → 0.64.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +3 -3
- dataeval/_internal/detectors/clusterer.py +45 -16
- dataeval/_internal/detectors/drift/base.py +15 -12
- dataeval/_internal/detectors/drift/cvm.py +12 -8
- dataeval/_internal/detectors/drift/ks.py +7 -3
- dataeval/_internal/detectors/drift/mmd.py +15 -12
- dataeval/_internal/detectors/drift/uncertainty.py +6 -5
- dataeval/_internal/detectors/duplicates.py +35 -11
- dataeval/_internal/detectors/linter.py +85 -16
- dataeval/_internal/detectors/ood/ae.py +7 -5
- dataeval/_internal/detectors/ood/aegmm.py +6 -5
- dataeval/_internal/detectors/ood/base.py +15 -13
- dataeval/_internal/detectors/ood/llr.py +8 -5
- dataeval/_internal/detectors/ood/vae.py +6 -4
- dataeval/_internal/detectors/ood/vaegmm.py +6 -4
- dataeval/_internal/interop.py +43 -0
- dataeval/_internal/metrics/balance.py +180 -0
- dataeval/_internal/metrics/base.py +2 -84
- dataeval/_internal/metrics/ber.py +77 -53
- dataeval/_internal/metrics/coverage.py +80 -55
- dataeval/_internal/metrics/divergence.py +62 -54
- dataeval/_internal/metrics/diversity.py +206 -0
- dataeval/_internal/metrics/parity.py +292 -163
- dataeval/_internal/metrics/stats.py +48 -35
- dataeval/_internal/metrics/uap.py +31 -26
- dataeval/_internal/metrics/utils.py +237 -2
- dataeval/_internal/utils.py +64 -0
- dataeval/_internal/workflows/__init__.py +0 -0
- dataeval/metrics/__init__.py +25 -5
- dataeval/utils/__init__.py +9 -0
- {dataeval-0.61.0.dist-info → dataeval-0.64.0.dist-info}/METADATA +1 -2
- dataeval-0.64.0.dist-info/RECORD +60 -0
- dataeval/_internal/metrics/hash.py +0 -79
- dataeval-0.61.0.dist-info/RECORD +0 -55
- {dataeval-0.61.0.dist-info → dataeval-0.64.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.61.0.dist-info → dataeval-0.64.0.dist-info}/WHEEL +0 -0
dataeval/__init__.py
CHANGED
@@ -2,14 +2,14 @@ from importlib.util import find_spec
|
|
2
2
|
|
3
3
|
from . import detectors, flags, metrics
|
4
4
|
|
5
|
-
__version__ = "0.
|
5
|
+
__version__ = "0.64.0"
|
6
6
|
|
7
7
|
__all__ = ["detectors", "flags", "metrics"]
|
8
8
|
|
9
9
|
if find_spec("torch") is not None: # pragma: no cover
|
10
|
-
from . import models, workflows
|
10
|
+
from . import models, utils, workflows
|
11
11
|
|
12
|
-
__all__ += ["models", "workflows"]
|
12
|
+
__all__ += ["models", "utils", "workflows"]
|
13
13
|
elif find_spec("tensorflow") is not None: # pragma: no cover
|
14
14
|
from . import models
|
15
15
|
|
@@ -1,9 +1,12 @@
|
|
1
1
|
from typing import Dict, Iterable, List, NamedTuple, Tuple, Union, cast
|
2
2
|
|
3
3
|
import numpy as np
|
4
|
+
from numpy.typing import ArrayLike
|
4
5
|
from scipy.cluster.hierarchy import linkage
|
5
6
|
from scipy.spatial.distance import pdist, squareform
|
6
7
|
|
8
|
+
from dataeval._internal.interop import to_numpy
|
9
|
+
|
7
10
|
|
8
11
|
def extend_linkage(link_arr: np.ndarray) -> np.ndarray:
|
9
12
|
"""
|
@@ -102,20 +105,36 @@ class Clusterer:
|
|
102
105
|
|
103
106
|
Parameters
|
104
107
|
----------
|
105
|
-
dataset :
|
106
|
-
|
108
|
+
dataset : ArrayLike, shape - (N, P)
|
109
|
+
A dataset in an ArrayLike format.
|
110
|
+
Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
|
111
|
+
|
112
|
+
Warning
|
113
|
+
-------
|
114
|
+
The Clusterer class is heavily dependent on computational resources, and may fail due to insufficient memory.
|
115
|
+
|
116
|
+
Note
|
117
|
+
----
|
118
|
+
The Clusterer works best when the length of the feature dimension, P, is less than 500.
|
119
|
+
If flattening a CxHxW image results in a dimension larger than 500, then it is recommended to reduce the dimensions.
|
120
|
+
|
121
|
+
Example
|
122
|
+
-------
|
123
|
+
Initialize the Clusterer class:
|
124
|
+
|
125
|
+
>>> cluster = Clusterer(dataset)
|
107
126
|
"""
|
108
127
|
|
109
|
-
def __init__(self, dataset:
|
128
|
+
def __init__(self, dataset: ArrayLike):
|
110
129
|
# Allows an update to dataset to reset the state rather than instantiate a new class
|
111
130
|
self._on_init(dataset)
|
112
131
|
|
113
|
-
def _on_init(self, dataset:
|
114
|
-
self.
|
115
|
-
self._data
|
116
|
-
self._num_samples = len(
|
132
|
+
def _on_init(self, dataset: ArrayLike):
|
133
|
+
self._data: np.ndarray = to_numpy(dataset)
|
134
|
+
self._validate_data(self._data)
|
135
|
+
self._num_samples = len(self._data)
|
117
136
|
|
118
|
-
self._darr: np.ndarray = pdist(
|
137
|
+
self._darr: np.ndarray = pdist(self._data, metric="euclidean")
|
119
138
|
self._sqdmat: np.ndarray = squareform(self._darr)
|
120
139
|
self._larr: np.ndarray = extend_linkage(linkage(self._darr))
|
121
140
|
self._max_clusters: int = np.count_nonzero(self._larr[:, 3] == 2)
|
@@ -131,7 +150,7 @@ class Clusterer:
|
|
131
150
|
return self._data
|
132
151
|
|
133
152
|
@data.setter
|
134
|
-
def data(self, x:
|
153
|
+
def data(self, x: ArrayLike):
|
135
154
|
self._on_init(x)
|
136
155
|
|
137
156
|
@property
|
@@ -450,20 +469,30 @@ class Clusterer:
|
|
450
469
|
|
451
470
|
Returns
|
452
471
|
-------
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
472
|
+
Dict[str, List[int]]
|
473
|
+
outliers :
|
474
|
+
List of indices that do not fall within a cluster
|
475
|
+
potential_outliers :
|
476
|
+
List of indices which are near the border between belonging in the cluster and being an outlier
|
477
|
+
duplicates :
|
478
|
+
List of groups of indices that are exact duplicates
|
479
|
+
potential_duplicates :
|
480
|
+
List of groups of indices which are not exact but closely related data points
|
481
|
+
|
482
|
+
Example
|
483
|
+
-------
|
484
|
+
>>> cluster.evaluate()
|
485
|
+
{'outliers': [18, 21, 34, 35, 45], 'potential_outliers': [13, 15, 42], 'duplicates': [[9, 24], [23, 48]], 'potential_duplicates': [[1, 11]]}
|
486
|
+
""" # noqa: E501
|
458
487
|
|
459
488
|
outliers, potential_outliers = self.find_outliers(self.last_good_merge_levels)
|
460
|
-
duplicates,
|
489
|
+
duplicates, potential_duplicates = self.find_duplicates(self.last_good_merge_levels)
|
461
490
|
|
462
491
|
ret = {
|
463
492
|
"outliers": outliers,
|
464
493
|
"potential_outliers": potential_outliers,
|
465
494
|
"duplicates": duplicates,
|
466
|
-
"
|
495
|
+
"potential_duplicates": potential_duplicates,
|
467
496
|
}
|
468
497
|
|
469
498
|
return ret
|
@@ -8,10 +8,12 @@ Licensed under Apache Software License (Apache 2.0)
|
|
8
8
|
|
9
9
|
from abc import ABC, abstractmethod
|
10
10
|
from functools import wraps
|
11
|
-
from random import random
|
12
11
|
from typing import Callable, Dict, Literal, Optional, Tuple, Union
|
13
12
|
|
14
13
|
import numpy as np
|
14
|
+
from numpy.typing import ArrayLike
|
15
|
+
|
16
|
+
from dataeval._internal.interop import to_numpy
|
15
17
|
|
16
18
|
|
17
19
|
def update_x_ref(fn):
|
@@ -94,7 +96,7 @@ class ReservoirSamplingUpdate(UpdateStrategy):
|
|
94
96
|
x_reservoir[n_ref, :] = item
|
95
97
|
n_ref += 1
|
96
98
|
else:
|
97
|
-
r =
|
99
|
+
r = np.random.randint(0, count)
|
98
100
|
if r < self.n:
|
99
101
|
x_reservoir[r, :] = item
|
100
102
|
return x_reservoir
|
@@ -105,11 +107,11 @@ class BaseDrift:
|
|
105
107
|
|
106
108
|
def __init__(
|
107
109
|
self,
|
108
|
-
x_ref:
|
110
|
+
x_ref: ArrayLike,
|
109
111
|
p_val: float = 0.05,
|
110
112
|
x_ref_preprocessed: bool = False,
|
111
113
|
update_x_ref: Optional[UpdateStrategy] = None,
|
112
|
-
preprocess_fn: Optional[Callable[[
|
114
|
+
preprocess_fn: Optional[Callable[[ArrayLike], ArrayLike]] = None,
|
113
115
|
correction: Literal["bonferroni", "fdr"] = "bonferroni",
|
114
116
|
) -> None:
|
115
117
|
# Type checking
|
@@ -128,7 +130,7 @@ class BaseDrift:
|
|
128
130
|
self.update_x_ref = update_x_ref
|
129
131
|
self.preprocess_fn = preprocess_fn
|
130
132
|
self.correction = correction
|
131
|
-
self.n = len(
|
133
|
+
self.n = len(self._x_ref) # type: ignore
|
132
134
|
|
133
135
|
# Ref counter for preprocessed x
|
134
136
|
self._x_refcount = 0
|
@@ -140,9 +142,10 @@ class BaseDrift:
|
|
140
142
|
if self.preprocess_fn is not None:
|
141
143
|
self._x_ref = self.preprocess_fn(self._x_ref)
|
142
144
|
|
145
|
+
self._x_ref = to_numpy(self._x_ref)
|
143
146
|
return self._x_ref
|
144
147
|
|
145
|
-
def _preprocess(self, x:
|
148
|
+
def _preprocess(self, x: ArrayLike) -> ArrayLike:
|
146
149
|
"""Data preprocessing before computing the drift scores."""
|
147
150
|
if self.preprocess_fn is not None:
|
148
151
|
x = self.preprocess_fn(x)
|
@@ -159,11 +162,11 @@ class BaseUnivariateDrift(BaseDrift):
|
|
159
162
|
|
160
163
|
def __init__(
|
161
164
|
self,
|
162
|
-
x_ref:
|
165
|
+
x_ref: ArrayLike,
|
163
166
|
p_val: float = 0.05,
|
164
167
|
x_ref_preprocessed: bool = False,
|
165
168
|
update_x_ref: Optional[UpdateStrategy] = None,
|
166
|
-
preprocess_fn: Optional[Callable[[
|
169
|
+
preprocess_fn: Optional[Callable[[ArrayLike], ArrayLike]] = None,
|
167
170
|
correction: Literal["bonferroni", "fdr"] = "bonferroni",
|
168
171
|
n_features: Optional[int] = None,
|
169
172
|
) -> None:
|
@@ -188,14 +191,14 @@ class BaseUnivariateDrift(BaseDrift):
|
|
188
191
|
self._n_features = self.x_ref.reshape(self.x_ref.shape[0], -1).shape[-1]
|
189
192
|
else:
|
190
193
|
# infer number of features after applying preprocessing step
|
191
|
-
x = self.preprocess_fn(self.
|
194
|
+
x = to_numpy(self.preprocess_fn(self._x_ref[0:1])) # type: ignore
|
192
195
|
self._n_features = x.reshape(x.shape[0], -1).shape[-1]
|
193
196
|
|
194
197
|
return self._n_features
|
195
198
|
|
196
199
|
@preprocess_x
|
197
200
|
@abstractmethod
|
198
|
-
def score(self, x:
|
201
|
+
def score(self, x: ArrayLike) -> Tuple[np.ndarray, np.ndarray]:
|
199
202
|
"""Abstract method to calculate feature score after preprocessing"""
|
200
203
|
|
201
204
|
def _apply_correction(self, p_vals: np.ndarray) -> Tuple[int, float]:
|
@@ -221,7 +224,7 @@ class BaseUnivariateDrift(BaseDrift):
|
|
221
224
|
@update_x_ref
|
222
225
|
def predict(
|
223
226
|
self,
|
224
|
-
x:
|
227
|
+
x: ArrayLike,
|
225
228
|
drift_type: Literal["batch", "feature"] = "batch",
|
226
229
|
) -> Dict[str, Union[int, float, np.ndarray]]:
|
227
230
|
"""
|
@@ -230,7 +233,7 @@ class BaseUnivariateDrift(BaseDrift):
|
|
230
233
|
|
231
234
|
Parameters
|
232
235
|
----------
|
233
|
-
x :
|
236
|
+
x : ArrayLike
|
234
237
|
Batch of instances.
|
235
238
|
drift_type : Literal["batch", "feature"], default "batch"
|
236
239
|
Predict drift at the 'feature' or 'batch' level. For 'batch', the test
|
@@ -9,8 +9,11 @@ Licensed under Apache Software License (Apache 2.0)
|
|
9
9
|
from typing import Callable, Literal, Optional, Tuple
|
10
10
|
|
11
11
|
import numpy as np
|
12
|
+
from numpy.typing import ArrayLike
|
12
13
|
from scipy.stats import cramervonmises_2samp
|
13
14
|
|
15
|
+
from dataeval._internal.interop import to_numpy
|
16
|
+
|
14
17
|
from .base import BaseUnivariateDrift, UpdateStrategy, preprocess_x
|
15
18
|
|
16
19
|
|
@@ -23,7 +26,7 @@ class DriftCVM(BaseUnivariateDrift):
|
|
23
26
|
|
24
27
|
Parameters
|
25
28
|
----------
|
26
|
-
x_ref :
|
29
|
+
x_ref : ArrayLike
|
27
30
|
Data used as reference distribution.
|
28
31
|
p_val : float, default 0.05
|
29
32
|
p-value used for significance of the statistical test for each feature.
|
@@ -40,7 +43,7 @@ class DriftCVM(BaseUnivariateDrift):
|
|
40
43
|
:py:class:`dataeval.detectors.LastSeenUpdateStrategy`
|
41
44
|
or via reservoir sampling with
|
42
45
|
:py:class:`dataeval.detectors.ReservoirSamplingUpdateStrategy`.
|
43
|
-
preprocess_fn : Optional[Callable[[
|
46
|
+
preprocess_fn : Optional[Callable[[ArrayLike], ArrayLike]], default None
|
44
47
|
Function to preprocess the data before computing the data drift metrics.
|
45
48
|
Typically a dimensionality reduction technique.
|
46
49
|
correction : Literal["bonferroni", "fdr"], default "bonferroni"
|
@@ -54,11 +57,11 @@ class DriftCVM(BaseUnivariateDrift):
|
|
54
57
|
|
55
58
|
def __init__(
|
56
59
|
self,
|
57
|
-
x_ref:
|
60
|
+
x_ref: ArrayLike,
|
58
61
|
p_val: float = 0.05,
|
59
62
|
x_ref_preprocessed: bool = False,
|
60
63
|
update_x_ref: Optional[UpdateStrategy] = None,
|
61
|
-
preprocess_fn: Optional[Callable[[
|
64
|
+
preprocess_fn: Optional[Callable[[ArrayLike], ArrayLike]] = None,
|
62
65
|
correction: Literal["bonferroni", "fdr"] = "bonferroni",
|
63
66
|
n_features: Optional[int] = None,
|
64
67
|
) -> None:
|
@@ -73,25 +76,26 @@ class DriftCVM(BaseUnivariateDrift):
|
|
73
76
|
)
|
74
77
|
|
75
78
|
@preprocess_x
|
76
|
-
def score(self, x:
|
79
|
+
def score(self, x: ArrayLike) -> Tuple[np.ndarray, np.ndarray]:
|
77
80
|
"""
|
78
81
|
Performs the two-sample Cramér-von Mises test(s), computing the p-value and
|
79
82
|
test statistic per feature.
|
80
83
|
|
81
84
|
Parameters
|
82
85
|
----------
|
83
|
-
x
|
86
|
+
x : ArrayLike
|
84
87
|
Batch of instances.
|
85
88
|
|
86
89
|
Returns
|
87
90
|
-------
|
88
91
|
Feature level p-values and CVM statistics.
|
89
92
|
"""
|
90
|
-
|
93
|
+
x_np = to_numpy(x)
|
94
|
+
x_np = x_np.reshape(x_np.shape[0], -1)
|
91
95
|
x_ref = self.x_ref.reshape(self.x_ref.shape[0], -1)
|
92
96
|
p_val = np.zeros(self.n_features, dtype=np.float32)
|
93
97
|
dist = np.zeros_like(p_val)
|
94
98
|
for f in range(self.n_features):
|
95
|
-
result = cramervonmises_2samp(x_ref[:, f],
|
99
|
+
result = cramervonmises_2samp(x_ref[:, f], x_np[:, f], method="auto")
|
96
100
|
p_val[f], dist[f] = result.pvalue, result.statistic
|
97
101
|
return p_val, dist
|
@@ -9,8 +9,11 @@ Licensed under Apache Software License (Apache 2.0)
|
|
9
9
|
from typing import Callable, Literal, Optional, Tuple
|
10
10
|
|
11
11
|
import numpy as np
|
12
|
+
from numpy.typing import ArrayLike
|
12
13
|
from scipy.stats import ks_2samp
|
13
14
|
|
15
|
+
from dataeval._internal.interop import to_numpy
|
16
|
+
|
14
17
|
from .base import BaseUnivariateDrift, UpdateStrategy, preprocess_x
|
15
18
|
|
16
19
|
|
@@ -55,11 +58,11 @@ class DriftKS(BaseUnivariateDrift):
|
|
55
58
|
|
56
59
|
def __init__(
|
57
60
|
self,
|
58
|
-
x_ref:
|
61
|
+
x_ref: ArrayLike,
|
59
62
|
p_val: float = 0.05,
|
60
63
|
x_ref_preprocessed: bool = False,
|
61
64
|
update_x_ref: Optional[UpdateStrategy] = None,
|
62
|
-
preprocess_fn: Optional[Callable[[
|
65
|
+
preprocess_fn: Optional[Callable[[ArrayLike], ArrayLike]] = None,
|
63
66
|
correction: Literal["bonferroni", "fdr"] = "bonferroni",
|
64
67
|
alternative: Literal["two-sided", "less", "greater"] = "two-sided",
|
65
68
|
n_features: Optional[int] = None,
|
@@ -78,7 +81,7 @@ class DriftKS(BaseUnivariateDrift):
|
|
78
81
|
self.alternative = alternative
|
79
82
|
|
80
83
|
@preprocess_x
|
81
|
-
def score(self, x:
|
84
|
+
def score(self, x: ArrayLike) -> Tuple[np.ndarray, np.ndarray]:
|
82
85
|
"""
|
83
86
|
Compute K-S scores and statistics per feature.
|
84
87
|
|
@@ -91,6 +94,7 @@ class DriftKS(BaseUnivariateDrift):
|
|
91
94
|
-------
|
92
95
|
Feature level p-values and K-S statistics.
|
93
96
|
"""
|
97
|
+
x = to_numpy(x)
|
94
98
|
x = x.reshape(x.shape[0], -1)
|
95
99
|
x_ref = self.x_ref.reshape(self.x_ref.shape[0], -1)
|
96
100
|
p_val = np.zeros(self.n_features, dtype=np.float32)
|
@@ -8,8 +8,10 @@ Licensed under Apache Software License (Apache 2.0)
|
|
8
8
|
|
9
9
|
from typing import Callable, Dict, Optional, Tuple, Union
|
10
10
|
|
11
|
-
import numpy as np
|
12
11
|
import torch
|
12
|
+
from numpy.typing import ArrayLike
|
13
|
+
|
14
|
+
from dataeval._internal.interop import to_numpy
|
13
15
|
|
14
16
|
from .base import BaseDrift, UpdateStrategy, preprocess_x, update_x_ref
|
15
17
|
from .torch import GaussianRBF, get_device, mmd2_from_kernel_matrix
|
@@ -21,7 +23,7 @@ class DriftMMD(BaseDrift):
|
|
21
23
|
|
22
24
|
Parameters
|
23
25
|
----------
|
24
|
-
x_ref :
|
26
|
+
x_ref : ArrayLike
|
25
27
|
Data used as reference distribution.
|
26
28
|
p_val : float, default 0.05
|
27
29
|
p-value used for the significance of the permutation test.
|
@@ -44,7 +46,7 @@ class DriftMMD(BaseDrift):
|
|
44
46
|
Function to preprocess the data before computing the data drift metrics.
|
45
47
|
kernel : Callable, default :py:class:`dataeval.detectors.GaussianRBF`
|
46
48
|
Kernel used for the MMD computation, defaults to Gaussian RBF kernel.
|
47
|
-
sigma : Optional[
|
49
|
+
sigma : Optional[ArrayLike], default None
|
48
50
|
Optionally set the GaussianRBF kernel bandwidth. Can also pass multiple
|
49
51
|
bandwidth values as an array. The kernel evaluation is then averaged over
|
50
52
|
those bandwidths.
|
@@ -59,13 +61,13 @@ class DriftMMD(BaseDrift):
|
|
59
61
|
|
60
62
|
def __init__(
|
61
63
|
self,
|
62
|
-
x_ref:
|
64
|
+
x_ref: ArrayLike,
|
63
65
|
p_val: float = 0.05,
|
64
66
|
x_ref_preprocessed: bool = False,
|
65
67
|
update_x_ref: Optional[UpdateStrategy] = None,
|
66
|
-
preprocess_fn: Optional[Callable[[
|
68
|
+
preprocess_fn: Optional[Callable[[ArrayLike], ArrayLike]] = None,
|
67
69
|
kernel: Callable = GaussianRBF,
|
68
|
-
sigma: Optional[
|
70
|
+
sigma: Optional[ArrayLike] = None,
|
69
71
|
configure_kernel_from_x_ref: bool = True,
|
70
72
|
n_permutations: int = 100,
|
71
73
|
device: Optional[str] = None,
|
@@ -73,7 +75,7 @@ class DriftMMD(BaseDrift):
|
|
73
75
|
super().__init__(x_ref, p_val, x_ref_preprocessed, update_x_ref, preprocess_fn)
|
74
76
|
|
75
77
|
self.infer_sigma = configure_kernel_from_x_ref
|
76
|
-
if configure_kernel_from_x_ref and
|
78
|
+
if configure_kernel_from_x_ref and sigma is not None:
|
77
79
|
self.infer_sigma = False
|
78
80
|
|
79
81
|
self.n_permutations = n_permutations # nb of iterations through permutation test
|
@@ -82,7 +84,7 @@ class DriftMMD(BaseDrift):
|
|
82
84
|
self.device = get_device(device)
|
83
85
|
|
84
86
|
# initialize kernel
|
85
|
-
sigma_tensor = torch.from_numpy(sigma).to(self.device) if
|
87
|
+
sigma_tensor = torch.from_numpy(to_numpy(sigma)).to(self.device) if sigma is not None else None
|
86
88
|
self.kernel = kernel(sigma_tensor).to(self.device) if kernel == GaussianRBF else kernel
|
87
89
|
|
88
90
|
# compute kernel matrix for the reference data
|
@@ -102,7 +104,7 @@ class DriftMMD(BaseDrift):
|
|
102
104
|
return kernel_mat
|
103
105
|
|
104
106
|
@preprocess_x
|
105
|
-
def score(self, x:
|
107
|
+
def score(self, x: ArrayLike) -> Tuple[float, float, float]:
|
106
108
|
"""
|
107
109
|
Compute the p-value resulting from a permutation test using the maximum mean
|
108
110
|
discrepancy as a distance measure between the reference data and the data to
|
@@ -110,7 +112,7 @@ class DriftMMD(BaseDrift):
|
|
110
112
|
|
111
113
|
Parameters
|
112
114
|
----------
|
113
|
-
x
|
115
|
+
x : ArrayLike
|
114
116
|
Batch of instances.
|
115
117
|
|
116
118
|
Returns
|
@@ -118,6 +120,7 @@ class DriftMMD(BaseDrift):
|
|
118
120
|
p-value obtained from the permutation test, the MMD^2 between the reference and
|
119
121
|
test set, and the MMD^2 threshold above which drift is flagged.
|
120
122
|
"""
|
123
|
+
x = to_numpy(x)
|
121
124
|
x_ref = torch.from_numpy(self.x_ref).to(self.device)
|
122
125
|
n = x.shape[0]
|
123
126
|
kernel_mat = self._kernel_matrix(x_ref, torch.from_numpy(x).to(self.device))
|
@@ -137,7 +140,7 @@ class DriftMMD(BaseDrift):
|
|
137
140
|
@update_x_ref
|
138
141
|
def predict(
|
139
142
|
self,
|
140
|
-
x:
|
143
|
+
x: ArrayLike,
|
141
144
|
) -> Dict[str, Union[int, float]]:
|
142
145
|
"""
|
143
146
|
Predict whether a batch of data has drifted from the reference data and then
|
@@ -145,7 +148,7 @@ class DriftMMD(BaseDrift):
|
|
145
148
|
|
146
149
|
Parameters
|
147
150
|
----------
|
148
|
-
x
|
151
|
+
x : ArrayLike
|
149
152
|
Batch of instances.
|
150
153
|
|
151
154
|
Returns
|
@@ -10,6 +10,7 @@ from functools import partial
|
|
10
10
|
from typing import Callable, Dict, Literal, Optional, Union
|
11
11
|
|
12
12
|
import numpy as np
|
13
|
+
from numpy.typing import ArrayLike
|
13
14
|
from scipy.special import softmax
|
14
15
|
from scipy.stats import entropy
|
15
16
|
|
@@ -64,7 +65,7 @@ class DriftUncertainty:
|
|
64
65
|
|
65
66
|
Parameters
|
66
67
|
----------
|
67
|
-
x_ref :
|
68
|
+
x_ref : ArrayLike
|
68
69
|
Data used as reference distribution. Should be disjoint from the data the
|
69
70
|
model was trained on for accurate p-values.
|
70
71
|
model : Callable
|
@@ -100,7 +101,7 @@ class DriftUncertainty:
|
|
100
101
|
|
101
102
|
def __init__(
|
102
103
|
self,
|
103
|
-
x_ref:
|
104
|
+
x_ref: ArrayLike,
|
104
105
|
model: Callable,
|
105
106
|
p_val: float = 0.05,
|
106
107
|
x_ref_preprocessed: bool = False,
|
@@ -130,16 +131,16 @@ class DriftUncertainty:
|
|
130
131
|
p_val=p_val,
|
131
132
|
x_ref_preprocessed=x_ref_preprocessed,
|
132
133
|
update_x_ref=update_x_ref,
|
133
|
-
preprocess_fn=preprocess_fn,
|
134
|
+
preprocess_fn=preprocess_fn, # type: ignore
|
134
135
|
)
|
135
136
|
|
136
|
-
def predict(self, x:
|
137
|
+
def predict(self, x: ArrayLike) -> Dict[str, Union[int, float, np.ndarray]]:
|
137
138
|
"""
|
138
139
|
Predict whether a batch of data has drifted from the reference data.
|
139
140
|
|
140
141
|
Parameters
|
141
142
|
----------
|
142
|
-
x
|
143
|
+
x : ArrayLike
|
143
144
|
Batch of instances.
|
144
145
|
|
145
146
|
Returns
|
@@ -1,6 +1,6 @@
|
|
1
|
-
from typing import Dict, List, Literal
|
1
|
+
from typing import Dict, Iterable, List, Literal
|
2
2
|
|
3
|
-
|
3
|
+
from numpy.typing import ArrayLike
|
4
4
|
|
5
5
|
from dataeval._internal.flags import ImageHash
|
6
6
|
from dataeval._internal.metrics.stats import ImageStats
|
@@ -10,14 +10,21 @@ class Duplicates:
|
|
10
10
|
"""
|
11
11
|
Finds the duplicate images in a dataset using xxhash for exact duplicates
|
12
12
|
and pchash for near duplicates
|
13
|
+
|
14
|
+
Attributes
|
15
|
+
----------
|
16
|
+
stats : ImageStats(flags=ImageHash.ALL)
|
17
|
+
Base stats class with the flags for checking duplicates
|
18
|
+
|
19
|
+
Example
|
20
|
+
-------
|
21
|
+
Initialize the Duplicates class:
|
22
|
+
|
23
|
+
>>> dups = Duplicates()
|
13
24
|
"""
|
14
25
|
|
15
|
-
def __init__(
|
16
|
-
self,
|
17
|
-
images: np.ndarray,
|
18
|
-
):
|
26
|
+
def __init__(self):
|
19
27
|
self.stats = ImageStats(ImageHash.ALL)
|
20
|
-
self.images = images
|
21
28
|
|
22
29
|
def _get_duplicates(self) -> dict:
|
23
30
|
exact = {}
|
@@ -34,16 +41,33 @@ class Duplicates:
|
|
34
41
|
"near": sorted(near),
|
35
42
|
}
|
36
43
|
|
37
|
-
def evaluate(self) -> Dict[Literal["exact", "near"], List[int]]:
|
44
|
+
def evaluate(self, images: Iterable[ArrayLike]) -> Dict[Literal["exact", "near"], List[int]]:
|
38
45
|
"""
|
39
46
|
Returns duplicate image indices for both exact matches and near matches
|
40
47
|
|
48
|
+
Parameters
|
49
|
+
----------
|
50
|
+
images : Iterable[ArrayLike], shape - (N, C, H, W)
|
51
|
+
A set of images in an ArrayLike format
|
52
|
+
|
41
53
|
Returns
|
42
54
|
-------
|
43
|
-
Dict[
|
44
|
-
|
55
|
+
Dict[str, List[int]]
|
56
|
+
exact :
|
57
|
+
List of groups of indices that are exact matches
|
58
|
+
near :
|
59
|
+
List of groups of indices that are near matches
|
60
|
+
|
61
|
+
See Also
|
62
|
+
--------
|
63
|
+
ImageStats
|
64
|
+
|
65
|
+
Example
|
66
|
+
-------
|
67
|
+
>>> dups.evaluate(images)
|
68
|
+
{'exact': [[3, 20], [16, 37]], 'near': [[3, 20, 22], [12, 18], [13, 36], [14, 31], [17, 27], [19, 38, 47]]}
|
45
69
|
"""
|
46
70
|
self.stats.reset()
|
47
|
-
self.stats.update(
|
71
|
+
self.stats.update(images)
|
48
72
|
self.results = self.stats.compute()
|
49
73
|
return self._get_duplicates()
|