dataeval 0.61.0__py3-none-any.whl → 0.63.0__py3-none-any.whl
This diff shows the contents of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in that registry.
- dataeval/__init__.py +1 -1
- dataeval/_internal/detectors/clusterer.py +44 -16
- dataeval/_internal/detectors/drift/base.py +14 -12
- dataeval/_internal/detectors/drift/cvm.py +11 -8
- dataeval/_internal/detectors/drift/ks.py +6 -3
- dataeval/_internal/detectors/drift/mmd.py +14 -12
- dataeval/_internal/detectors/drift/uncertainty.py +7 -5
- dataeval/_internal/detectors/duplicates.py +35 -12
- dataeval/_internal/detectors/linter.py +85 -16
- dataeval/_internal/detectors/ood/ae.py +6 -5
- dataeval/_internal/detectors/ood/aegmm.py +5 -5
- dataeval/_internal/detectors/ood/base.py +14 -13
- dataeval/_internal/detectors/ood/llr.py +6 -4
- dataeval/_internal/detectors/ood/vae.py +5 -4
- dataeval/_internal/detectors/ood/vaegmm.py +5 -4
- dataeval/_internal/functional/__init__.py +0 -0
- dataeval/_internal/functional/ber.py +63 -0
- dataeval/_internal/functional/coverage.py +75 -0
- dataeval/_internal/functional/divergence.py +16 -0
- dataeval/_internal/{metrics → functional}/hash.py +1 -1
- dataeval/_internal/functional/metadata.py +136 -0
- dataeval/_internal/functional/metadataparity.py +190 -0
- dataeval/_internal/functional/uap.py +6 -0
- dataeval/_internal/interop.py +52 -0
- dataeval/_internal/maite/__init__.py +0 -0
- dataeval/_internal/maite/utils.py +30 -0
- dataeval/_internal/metrics/base.py +2 -2
- dataeval/_internal/metrics/ber.py +16 -66
- dataeval/_internal/metrics/coverage.py +51 -35
- dataeval/_internal/metrics/divergence.py +50 -42
- dataeval/_internal/metrics/metadata.py +610 -0
- dataeval/_internal/metrics/metadataparity.py +67 -0
- dataeval/_internal/metrics/parity.py +40 -56
- dataeval/_internal/metrics/stats.py +46 -35
- dataeval/_internal/metrics/uap.py +14 -17
- dataeval/_internal/workflows/__init__.py +0 -0
- dataeval/metrics/__init__.py +2 -1
- {dataeval-0.61.0.dist-info → dataeval-0.63.0.dist-info}/METADATA +1 -2
- dataeval-0.63.0.dist-info/RECORD +68 -0
- dataeval-0.61.0.dist-info/RECORD +0 -55
- /dataeval/_internal/{metrics → functional}/utils.py +0 -0
- {dataeval-0.61.0.dist-info → dataeval-0.63.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.61.0.dist-info → dataeval-0.63.0.dist-info}/WHEEL +0 -0
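
Many of the changes below import `ArrayLike` and `to_numpy` from the new `dataeval/_internal/interop.py` module listed above, whose contents are not shown in this diff. As a rough, hypothetical sketch of the behavior the call sites imply (the names and structure here are assumptions, not the module's actual code), a conversion helper of this kind might look like:

```python
# Hypothetical sketch only -- the real dataeval._internal.interop module is not
# included in this diff; names and logic here are inferred from its call sites.
from typing import Any, Iterable, Union

import numpy as np

# Assumption: ArrayLike stands in for "anything convertible to an ndarray"
# (numpy arrays, sequences, or framework tensors exposing .numpy()).
ArrayLike = Union[np.ndarray, Iterable[Any]]


def to_numpy(array: Any) -> np.ndarray:
    """Best-effort conversion of an array-like object to a numpy.ndarray."""
    if isinstance(array, np.ndarray):
        return array
    if hasattr(array, "numpy"):  # e.g. a CPU torch.Tensor or tf.Tensor
        return array.numpy()
    return np.asarray(array)
```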
dataeval/__init__.py
CHANGED

dataeval/_internal/detectors/clusterer.py
CHANGED

```diff
@@ -4,6 +4,8 @@ import numpy as np
 from scipy.cluster.hierarchy import linkage
 from scipy.spatial.distance import pdist, squareform
 
+from dataeval._internal.interop import ArrayLike, to_numpy
+
 
 def extend_linkage(link_arr: np.ndarray) -> np.ndarray:
     """
@@ -102,20 +104,36 @@ class Clusterer:
 
     Parameters
     ----------
-    dataset :
-
+    dataset : ArrayLike, shape - (N, P)
+        A dataset in an ArrayLike format.
+        Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
+
+    Warning
+    -------
+    The Clusterer class is heavily dependent on computational resources, and may fail due to insufficient memory.
+
+    Note
+    ----
+    The Clusterer works best when the length of the feature dimension, P, is less than 500.
+    If flattening a CxHxW image results in a dimension larger than 500, then it is recommended to reduce the dimensions.
+
+    Example
+    -------
+    Initialize the Clusterer class:
+
+    >>> cluster = Clusterer(dataset)
    """
 
-    def __init__(self, dataset:
+    def __init__(self, dataset: ArrayLike):
        # Allows an update to dataset to reset the state rather than instantiate a new class
        self._on_init(dataset)
 
-    def _on_init(self, dataset:
-        self.
-        self._data
-        self._num_samples = len(
+    def _on_init(self, dataset: ArrayLike):
+        self._data: np.ndarray = to_numpy(dataset)
+        self._validate_data(self._data)
+        self._num_samples = len(self._data)
 
-        self._darr: np.ndarray = pdist(
+        self._darr: np.ndarray = pdist(self._data, metric="euclidean")
        self._sqdmat: np.ndarray = squareform(self._darr)
        self._larr: np.ndarray = extend_linkage(linkage(self._darr))
        self._max_clusters: int = np.count_nonzero(self._larr[:, 3] == 2)
@@ -131,7 +149,7 @@ class Clusterer:
        return self._data
 
    @data.setter
-    def data(self, x:
+    def data(self, x: ArrayLike):
        self._on_init(x)
 
    @property
@@ -450,20 +468,30 @@ class Clusterer:
 
        Returns
        -------
-
-
-
-
-
+        Dict[str, List[int]]
+            outliers :
+                List of indices that do not fall within a cluster
+            potential_outliers :
+                List of indices which are near the border between belonging in the cluster and being an outlier
+            duplicates :
+                List of groups of indices that are exact duplicates
+            potential_duplicates :
+                List of groups of indices which are not exact but closely related data points
+
+        Example
+        -------
+        >>> cluster.evaluate()
+        {'outliers': [18, 21, 34, 35, 45], 'potential_outliers': [13, 15, 42], 'duplicates': [[9, 24], [23, 48]], 'potential_duplicates': [[1, 11]]}
+        """  # noqa: E501
 
        outliers, potential_outliers = self.find_outliers(self.last_good_merge_levels)
-        duplicates,
+        duplicates, potential_duplicates = self.find_duplicates(self.last_good_merge_levels)
 
        ret = {
            "outliers": outliers,
            "potential_outliers": potential_outliers,
            "duplicates": duplicates,
-            "
+            "potential_duplicates": potential_duplicates,
        }
 
        return ret
```
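
The docstring added above shows the intended usage. A minimal end-to-end sketch follows, assuming `Clusterer` is exported from `dataeval.detectors` (as the other detectors in this diff are) and using an arbitrary random dataset as a stand-in:

```python
# Sketch of the usage shown in the new Clusterer docstring; the dataset here is
# a synthetic stand-in, not data from the dataeval test suite.
import numpy as np
from dataeval.detectors import Clusterer

rng = np.random.default_rng(0)
dataset = rng.normal(size=(50, 10))  # N=50 observations in a P=10 dimensional space

cluster = Clusterer(dataset)   # dataset may now be any ArrayLike, not only np.ndarray
results = cluster.evaluate()   # {'outliers': [...], 'potential_outliers': [...],
                               #  'duplicates': [...], 'potential_duplicates': [...]}
```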
dataeval/_internal/detectors/drift/base.py
CHANGED

```diff
@@ -8,11 +8,12 @@ Licensed under Apache Software License (Apache 2.0)
 
 from abc import ABC, abstractmethod
 from functools import wraps
-from random import random
 from typing import Callable, Dict, Literal, Optional, Tuple, Union
 
 import numpy as np
 
+from dataeval._internal.interop import ArrayLike, to_numpy
+
 
 def update_x_ref(fn):
     @wraps(fn)
@@ -94,7 +95,7 @@ class ReservoirSamplingUpdate(UpdateStrategy):
                x_reservoir[n_ref, :] = item
                n_ref += 1
            else:
-                r =
+                r = np.random.randint(0, count)
                if r < self.n:
                    x_reservoir[r, :] = item
        return x_reservoir
@@ -105,11 +106,11 @@ class BaseDrift:
 
    def __init__(
        self,
-        x_ref:
+        x_ref: ArrayLike,
        p_val: float = 0.05,
        x_ref_preprocessed: bool = False,
        update_x_ref: Optional[UpdateStrategy] = None,
-        preprocess_fn: Optional[Callable[[
+        preprocess_fn: Optional[Callable[[ArrayLike], ArrayLike]] = None,
        correction: Literal["bonferroni", "fdr"] = "bonferroni",
    ) -> None:
        # Type checking
@@ -128,7 +129,7 @@ class BaseDrift:
        self.update_x_ref = update_x_ref
        self.preprocess_fn = preprocess_fn
        self.correction = correction
-        self.n = len(
+        self.n = len(self._x_ref)  # type: ignore
 
        # Ref counter for preprocessed x
        self._x_refcount = 0
@@ -140,9 +141,10 @@ class BaseDrift:
        if self.preprocess_fn is not None:
            self._x_ref = self.preprocess_fn(self._x_ref)
 
+        self._x_ref = to_numpy(self._x_ref)
        return self._x_ref
 
-    def _preprocess(self, x:
+    def _preprocess(self, x: ArrayLike) -> ArrayLike:
        """Data preprocessing before computing the drift scores."""
        if self.preprocess_fn is not None:
            x = self.preprocess_fn(x)
@@ -159,11 +161,11 @@ class BaseUnivariateDrift(BaseDrift):
 
    def __init__(
        self,
-        x_ref:
+        x_ref: ArrayLike,
        p_val: float = 0.05,
        x_ref_preprocessed: bool = False,
        update_x_ref: Optional[UpdateStrategy] = None,
-        preprocess_fn: Optional[Callable[[
+        preprocess_fn: Optional[Callable[[ArrayLike], ArrayLike]] = None,
        correction: Literal["bonferroni", "fdr"] = "bonferroni",
        n_features: Optional[int] = None,
    ) -> None:
@@ -188,14 +190,14 @@ class BaseUnivariateDrift(BaseDrift):
            self._n_features = self.x_ref.reshape(self.x_ref.shape[0], -1).shape[-1]
        else:
            # infer number of features after applying preprocessing step
-            x = self.preprocess_fn(self.
+            x = to_numpy(self.preprocess_fn(self._x_ref[0:1]))  # type: ignore
            self._n_features = x.reshape(x.shape[0], -1).shape[-1]
 
        return self._n_features
 
    @preprocess_x
    @abstractmethod
-    def score(self, x:
+    def score(self, x: ArrayLike) -> Tuple[np.ndarray, np.ndarray]:
        """Abstract method to calculate feature score after preprocessing"""
 
    def _apply_correction(self, p_vals: np.ndarray) -> Tuple[int, float]:
@@ -221,7 +223,7 @@ class BaseUnivariateDrift(BaseDrift):
    @update_x_ref
    def predict(
        self,
-        x:
+        x: ArrayLike,
        drift_type: Literal["batch", "feature"] = "batch",
    ) -> Dict[str, Union[int, float, np.ndarray]]:
        """
@@ -230,7 +232,7 @@ class BaseUnivariateDrift(BaseDrift):
 
        Parameters
        ----------
-        x :
+        x : ArrayLike
            Batch of instances.
        drift_type : Literal["batch", "feature"], default "batch"
            Predict drift at the 'feature' or 'batch' level. For 'batch', the test
```
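
The `ReservoirSamplingUpdate` change above swaps `random.random` for `np.random.randint(0, count)`, which is the standard index draw in reservoir sampling (Algorithm R). A standalone illustration of that technique, closely mirroring the hunk shown but written as a free function rather than dataeval's class, looks like:

```python
# Standalone illustration of reservoir sampling with the np.random.randint draw
# used in the updated ReservoirSamplingUpdate; not dataeval's implementation.
import numpy as np


def reservoir_sample(x_ref: np.ndarray, x: np.ndarray, n: int, count: int) -> np.ndarray:
    """Keep a uniform random sample of at most n rows from a stream of rows."""
    if x.shape[0] + count <= n:
        return np.concatenate([x_ref, x], axis=0)

    n_ref = x_ref.shape[0]
    output_size = min(n, n_ref + x.shape[0])
    x_reservoir = np.zeros((output_size,) + x.shape[1:], dtype=x_ref.dtype)
    x_reservoir[:n_ref] = x_ref

    for item in x:
        count += 1
        if n_ref < n:
            x_reservoir[n_ref, :] = item
            n_ref += 1
        else:
            r = np.random.randint(0, count)  # uniform index over everything seen so far
            if r < n:
                x_reservoir[r, :] = item
    return x_reservoir
```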
dataeval/_internal/detectors/drift/cvm.py
CHANGED

```diff
@@ -11,6 +11,8 @@ from typing import Callable, Literal, Optional, Tuple
 import numpy as np
 from scipy.stats import cramervonmises_2samp
 
+from dataeval._internal.interop import ArrayLike, to_numpy
+
 from .base import BaseUnivariateDrift, UpdateStrategy, preprocess_x
 
 
@@ -23,7 +25,7 @@ class DriftCVM(BaseUnivariateDrift):
 
    Parameters
    ----------
-    x_ref :
+    x_ref : ArrayLike
        Data used as reference distribution.
    p_val : float, default 0.05
        p-value used for significance of the statistical test for each feature.
@@ -40,7 +42,7 @@ class DriftCVM(BaseUnivariateDrift):
        :py:class:`dataeval.detectors.LastSeenUpdateStrategy`
        or via reservoir sampling with
        :py:class:`dataeval.detectors.ReservoirSamplingUpdateStrategy`.
-    preprocess_fn : Optional[Callable[[
+    preprocess_fn : Optional[Callable[[ArrayLike], ArrayLike]], default None
        Function to preprocess the data before computing the data drift metrics.
        Typically a dimensionality reduction technique.
    correction : Literal["bonferroni", "fdr"], default "bonferroni"
@@ -54,11 +56,11 @@ class DriftCVM(BaseUnivariateDrift):
 
    def __init__(
        self,
-        x_ref:
+        x_ref: ArrayLike,
        p_val: float = 0.05,
        x_ref_preprocessed: bool = False,
        update_x_ref: Optional[UpdateStrategy] = None,
-        preprocess_fn: Optional[Callable[[
+        preprocess_fn: Optional[Callable[[ArrayLike], ArrayLike]] = None,
        correction: Literal["bonferroni", "fdr"] = "bonferroni",
        n_features: Optional[int] = None,
    ) -> None:
@@ -73,25 +75,26 @@ class DriftCVM(BaseUnivariateDrift):
        )
 
    @preprocess_x
-    def score(self, x:
+    def score(self, x: ArrayLike) -> Tuple[np.ndarray, np.ndarray]:
        """
        Performs the two-sample Cramér-von Mises test(s), computing the p-value and
        test statistic per feature.
 
        Parameters
        ----------
-        x
+        x : ArrayLike
            Batch of instances.
 
        Returns
        -------
        Feature level p-values and CVM statistics.
        """
-
+        x_np = to_numpy(x)
+        x_np = x_np.reshape(x_np.shape[0], -1)
        x_ref = self.x_ref.reshape(self.x_ref.shape[0], -1)
        p_val = np.zeros(self.n_features, dtype=np.float32)
        dist = np.zeros_like(p_val)
        for f in range(self.n_features):
-            result = cramervonmises_2samp(x_ref[:, f],
+            result = cramervonmises_2samp(x_ref[:, f], x_np[:, f], method="auto")
            p_val[f], dist[f] = result.pvalue, result.statistic
        return p_val, dist
```
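
`DriftCVM.score` now converts the incoming batch with `to_numpy` and flattens it before running the per-feature Cramér-von Mises test. The loop it performs is equivalent to the following standalone snippet, which calls scipy directly on synthetic data rather than going through the detector:

```python
# Per-feature two-sample Cramér-von Mises test, as DriftCVM.score does after
# converting the batch to numpy; standalone example with synthetic data.
import numpy as np
from scipy.stats import cramervonmises_2samp

rng = np.random.default_rng(0)
x_ref = rng.normal(size=(200, 3)).reshape(200, -1)        # reference batch
x = rng.normal(loc=0.5, size=(100, 3)).reshape(100, -1)   # shifted batch

n_features = x_ref.shape[1]
p_val = np.zeros(n_features, dtype=np.float32)
dist = np.zeros_like(p_val)
for f in range(n_features):
    result = cramervonmises_2samp(x_ref[:, f], x[:, f], method="auto")
    p_val[f], dist[f] = result.pvalue, result.statistic
```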
dataeval/_internal/detectors/drift/ks.py
CHANGED

```diff
@@ -11,6 +11,8 @@ from typing import Callable, Literal, Optional, Tuple
 import numpy as np
 from scipy.stats import ks_2samp
 
+from dataeval._internal.interop import ArrayLike, to_numpy
+
 from .base import BaseUnivariateDrift, UpdateStrategy, preprocess_x
 
 
@@ -55,11 +57,11 @@ class DriftKS(BaseUnivariateDrift):
 
    def __init__(
        self,
-        x_ref:
+        x_ref: ArrayLike,
        p_val: float = 0.05,
        x_ref_preprocessed: bool = False,
        update_x_ref: Optional[UpdateStrategy] = None,
-        preprocess_fn: Optional[Callable[[
+        preprocess_fn: Optional[Callable[[ArrayLike], ArrayLike]] = None,
        correction: Literal["bonferroni", "fdr"] = "bonferroni",
        alternative: Literal["two-sided", "less", "greater"] = "two-sided",
        n_features: Optional[int] = None,
@@ -78,7 +80,7 @@ class DriftKS(BaseUnivariateDrift):
        self.alternative = alternative
 
    @preprocess_x
-    def score(self, x:
+    def score(self, x: ArrayLike) -> Tuple[np.ndarray, np.ndarray]:
        """
        Compute K-S scores and statistics per feature.
 
@@ -91,6 +93,7 @@ class DriftKS(BaseUnivariateDrift):
        -------
        Feature level p-values and K-S statistics.
        """
+        x = to_numpy(x)
        x = x.reshape(x.shape[0], -1)
        x_ref = self.x_ref.reshape(self.x_ref.shape[0], -1)
        p_val = np.zeros(self.n_features, dtype=np.float32)
```
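
With `x_ref` and `x` now typed as `ArrayLike` and converted via `to_numpy` inside `score`, the KS detector can be driven with framework tensors directly. A hedged usage sketch, assuming `DriftKS` is exported from `dataeval.detectors` (the namespace the docstrings above reference) and using synthetic data:

```python
# Hedged usage sketch: torch tensors passed straight to DriftKS, relying on the
# to_numpy conversion added in this release. Data is synthetic.
import torch
from dataeval.detectors import DriftKS

x_ref = torch.randn(500, 8)      # reference batch as a torch.Tensor
x = torch.randn(200, 8) + 0.75   # candidate batch with a mean shift

detector = DriftKS(x_ref, p_val=0.05, correction="bonferroni")
result = detector.predict(x)     # dict of drift flag and per-feature statistics
```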
dataeval/_internal/detectors/drift/mmd.py
CHANGED

```diff
@@ -8,9 +8,10 @@ Licensed under Apache Software License (Apache 2.0)
 
 from typing import Callable, Dict, Optional, Tuple, Union
 
-import numpy as np
 import torch
 
+from dataeval._internal.interop import ArrayLike, to_numpy
+
 from .base import BaseDrift, UpdateStrategy, preprocess_x, update_x_ref
 from .torch import GaussianRBF, get_device, mmd2_from_kernel_matrix
 
@@ -21,7 +22,7 @@ class DriftMMD(BaseDrift):
 
    Parameters
    ----------
-    x_ref :
+    x_ref : ArrayLike
        Data used as reference distribution.
    p_val : float, default 0.05
        p-value used for the significance of the permutation test.
@@ -44,7 +45,7 @@ class DriftMMD(BaseDrift):
        Function to preprocess the data before computing the data drift metrics.
    kernel : Callable, default :py:class:`dataeval.detectors.GaussianRBF`
        Kernel used for the MMD computation, defaults to Gaussian RBF kernel.
-    sigma : Optional[
+    sigma : Optional[ArrayLike], default None
        Optionally set the GaussianRBF kernel bandwidth. Can also pass multiple
        bandwidth values as an array. The kernel evaluation is then averaged over
        those bandwidths.
@@ -59,13 +60,13 @@ class DriftMMD(BaseDrift):
 
    def __init__(
        self,
-        x_ref:
+        x_ref: ArrayLike,
        p_val: float = 0.05,
        x_ref_preprocessed: bool = False,
        update_x_ref: Optional[UpdateStrategy] = None,
-        preprocess_fn: Optional[Callable[[
+        preprocess_fn: Optional[Callable[[ArrayLike], ArrayLike]] = None,
        kernel: Callable = GaussianRBF,
-        sigma: Optional[
+        sigma: Optional[ArrayLike] = None,
        configure_kernel_from_x_ref: bool = True,
        n_permutations: int = 100,
        device: Optional[str] = None,
@@ -73,7 +74,7 @@ class DriftMMD(BaseDrift):
        super().__init__(x_ref, p_val, x_ref_preprocessed, update_x_ref, preprocess_fn)
 
        self.infer_sigma = configure_kernel_from_x_ref
-        if configure_kernel_from_x_ref and isinstance(sigma,
+        if configure_kernel_from_x_ref and isinstance(sigma, ArrayLike):
            self.infer_sigma = False
 
        self.n_permutations = n_permutations  # nb of iterations through permutation test
@@ -82,7 +83,7 @@ class DriftMMD(BaseDrift):
        self.device = get_device(device)
 
        # initialize kernel
-        sigma_tensor = torch.from_numpy(sigma).to(self.device) if isinstance(sigma,
+        sigma_tensor = torch.from_numpy(to_numpy(sigma)).to(self.device) if isinstance(sigma, ArrayLike) else None
        self.kernel = kernel(sigma_tensor).to(self.device) if kernel == GaussianRBF else kernel
 
        # compute kernel matrix for the reference data
@@ -102,7 +103,7 @@ class DriftMMD(BaseDrift):
        return kernel_mat
 
    @preprocess_x
-    def score(self, x:
+    def score(self, x: ArrayLike) -> Tuple[float, float, float]:
        """
        Compute the p-value resulting from a permutation test using the maximum mean
        discrepancy as a distance measure between the reference data and the data to
@@ -110,7 +111,7 @@ class DriftMMD(BaseDrift):
 
        Parameters
        ----------
-        x
+        x : ArrayLike
            Batch of instances.
 
        Returns
@@ -118,6 +119,7 @@ class DriftMMD(BaseDrift):
        p-value obtained from the permutation test, the MMD^2 between the reference and
        test set, and the MMD^2 threshold above which drift is flagged.
        """
+        x = to_numpy(x)
        x_ref = torch.from_numpy(self.x_ref).to(self.device)
        n = x.shape[0]
        kernel_mat = self._kernel_matrix(x_ref, torch.from_numpy(x).to(self.device))
@@ -137,7 +139,7 @@ class DriftMMD(BaseDrift):
    @update_x_ref
    def predict(
        self,
-        x:
+        x: ArrayLike,
    ) -> Dict[str, Union[int, float]]:
        """
        Predict whether a batch of data has drifted from the reference data and then
@@ -145,7 +147,7 @@ class DriftMMD(BaseDrift):
 
        Parameters
        ----------
-        x
+        x : ArrayLike
            Batch of instances.
 
        Returns
```
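
`DriftMMD.score` now converts `x` with `to_numpy` before moving both batches onto the torch device and computing MMD² from the joint kernel matrix. For intuition, a compact standalone illustration of that statistic (a biased MMD² estimate with a single-bandwidth Gaussian RBF kernel, not dataeval's `mmd2_from_kernel_matrix` or `GaussianRBF` helpers):

```python
# Standalone biased MMD^2 estimate with a Gaussian RBF kernel; illustrates the
# statistic DriftMMD thresholds via a permutation test. Not dataeval's helper code.
import torch


def rbf_kernel(a: torch.Tensor, b: torch.Tensor, sigma: float) -> torch.Tensor:
    d2 = torch.cdist(a, b) ** 2
    return torch.exp(-d2 / (2 * sigma**2))


def mmd2_biased(x_ref: torch.Tensor, x: torch.Tensor, sigma: float = 1.0) -> torch.Tensor:
    k_xx = rbf_kernel(x_ref, x_ref, sigma).mean()
    k_yy = rbf_kernel(x, x, sigma).mean()
    k_xy = rbf_kernel(x_ref, x, sigma).mean()
    return k_xx + k_yy - 2 * k_xy


x_ref = torch.randn(128, 16)
x = torch.randn(64, 16) + 0.5
print(mmd2_biased(x_ref, x))  # larger values indicate a bigger distributional gap
```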
dataeval/_internal/detectors/drift/uncertainty.py
CHANGED

```diff
@@ -13,6 +13,8 @@ import numpy as np
 from scipy.special import softmax
 from scipy.stats import entropy
 
+from dataeval._internal.interop import ArrayLike
+
 from .base import UpdateStrategy
 from .ks import DriftKS
 from .torch import get_device, preprocess_drift
@@ -64,7 +66,7 @@ class DriftUncertainty:
 
    Parameters
    ----------
-    x_ref :
+    x_ref : ArrayLike
        Data used as reference distribution. Should be disjoint from the data the
        model was trained on for accurate p-values.
    model : Callable
@@ -100,7 +102,7 @@ class DriftUncertainty:
 
    def __init__(
        self,
-        x_ref:
+        x_ref: ArrayLike,
        model: Callable,
        p_val: float = 0.05,
        x_ref_preprocessed: bool = False,
@@ -130,16 +132,16 @@ class DriftUncertainty:
            p_val=p_val,
            x_ref_preprocessed=x_ref_preprocessed,
            update_x_ref=update_x_ref,
-            preprocess_fn=preprocess_fn,
+            preprocess_fn=preprocess_fn,  # type: ignore
        )
 
-    def predict(self, x:
+    def predict(self, x: ArrayLike) -> Dict[str, Union[int, float, np.ndarray]]:
        """
        Predict whether a batch of data has drifted from the reference data.
 
        Parameters
        ----------
-        x
+        x : ArrayLike
            Batch of instances.
 
        Returns
```
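
`DriftUncertainty` wires a model's prediction uncertainty into the KS detector above: model outputs are softmaxed, reduced to a per-instance entropy, and that one-dimensional signal is what `DriftKS` tests. A rough sketch of that preprocessing idea (not dataeval's actual `preprocess_drift` pipeline; the function name and fake logits are illustrative only):

```python
# Rough sketch of uncertainty-based preprocessing: per-instance entropy of the
# model's softmax output. DriftUncertainty feeds a signal like this into DriftKS.
import numpy as np
from scipy.special import softmax
from scipy.stats import entropy


def prediction_entropy(logits: np.ndarray) -> np.ndarray:
    """Entropy of softmaxed class scores, one value per instance."""
    probs = softmax(logits, axis=-1)
    return entropy(probs, axis=-1)


logits = np.random.default_rng(0).normal(size=(32, 10))  # fake model outputs (N, classes)
uncertainty = prediction_entropy(logits)                 # shape (32,)
```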
dataeval/_internal/detectors/duplicates.py
CHANGED

```diff
@@ -1,8 +1,7 @@
-from typing import Dict, List, Literal
-
-import numpy as np
+from typing import Dict, Iterable, List, Literal
 
 from dataeval._internal.flags import ImageHash
+from dataeval._internal.interop import ArrayLike
 from dataeval._internal.metrics.stats import ImageStats
 
 
@@ -10,14 +9,21 @@ class Duplicates:
    """
    Finds the duplicate images in a dataset using xxhash for exact duplicates
    and pchash for near duplicates
+
+    Attributes
+    ----------
+    stats : ImageStats(flags=ImageHash.ALL)
+        Base stats class with the flags for checking duplicates
+
+    Example
+    -------
+    Initialize the Duplicates class:
+
+    >>> dups = Duplicates()
    """
 
-    def __init__(
-        self,
-        images: np.ndarray,
-    ):
+    def __init__(self):
        self.stats = ImageStats(ImageHash.ALL)
-        self.images = images
 
    def _get_duplicates(self) -> dict:
        exact = {}
@@ -34,16 +40,33 @@ class Duplicates:
            "near": sorted(near),
        }
 
-    def evaluate(self) -> Dict[Literal["exact", "near"], List[int]]:
+    def evaluate(self, images: Iterable[ArrayLike]) -> Dict[Literal["exact", "near"], List[int]]:
        """
        Returns duplicate image indices for both exact matches and near matches
 
+        Parameters
+        ----------
+        images : Iterable[ArrayLike], shape - (N, C, H, W)
+            A set of images in an ArrayLike format
+
        Returns
        -------
-        Dict[
-
+        Dict[str, List[int]]
+            exact :
+                List of groups of indices that are exact matches
+            near :
+                List of groups of indices that are near matches
+
+        See Also
+        --------
+        ImageStats
+
+        Example
+        -------
+        >>> dups.evaluate(images)
+        {'exact': [[3, 20], [16, 37]], 'near': [[3, 20, 22], [12, 18], [13, 36], [14, 31], [17, 27], [19, 38, 47]]}
        """
        self.stats.reset()
-        self.stats.update(
+        self.stats.update(images)
        self.results = self.stats.compute()
        return self._get_duplicates()
```
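
The API change above moves the image set from the constructor to `evaluate`, so a single `Duplicates` instance can be reused across datasets. A hedged before/after sketch, assuming `Duplicates` is exported from `dataeval.detectors` and using a synthetic stand-in image array:

```python
# Hedged sketch of the Duplicates API change in this release; the images array
# is a synthetic stand-in with shape (N, C, H, W).
import numpy as np
from dataeval.detectors import Duplicates

images = np.random.default_rng(0).integers(0, 255, size=(8, 3, 16, 16), dtype=np.uint8)

# 0.61.0: Duplicates(images).evaluate()
# 0.63.0: construct once, pass images at evaluation time
dups = Duplicates()
results = dups.evaluate(images)  # {'exact': [...], 'near': [...]}
```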