dataeval 0.61.0__py3-none-any.whl → 0.64.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. dataeval/__init__.py +3 -3
  2. dataeval/_internal/detectors/clusterer.py +45 -16
  3. dataeval/_internal/detectors/drift/base.py +15 -12
  4. dataeval/_internal/detectors/drift/cvm.py +12 -8
  5. dataeval/_internal/detectors/drift/ks.py +7 -3
  6. dataeval/_internal/detectors/drift/mmd.py +15 -12
  7. dataeval/_internal/detectors/drift/uncertainty.py +6 -5
  8. dataeval/_internal/detectors/duplicates.py +35 -11
  9. dataeval/_internal/detectors/linter.py +85 -16
  10. dataeval/_internal/detectors/ood/ae.py +7 -5
  11. dataeval/_internal/detectors/ood/aegmm.py +6 -5
  12. dataeval/_internal/detectors/ood/base.py +15 -13
  13. dataeval/_internal/detectors/ood/llr.py +8 -5
  14. dataeval/_internal/detectors/ood/vae.py +6 -4
  15. dataeval/_internal/detectors/ood/vaegmm.py +6 -4
  16. dataeval/_internal/interop.py +43 -0
  17. dataeval/_internal/metrics/balance.py +180 -0
  18. dataeval/_internal/metrics/base.py +2 -84
  19. dataeval/_internal/metrics/ber.py +77 -53
  20. dataeval/_internal/metrics/coverage.py +80 -55
  21. dataeval/_internal/metrics/divergence.py +62 -54
  22. dataeval/_internal/metrics/diversity.py +206 -0
  23. dataeval/_internal/metrics/parity.py +292 -163
  24. dataeval/_internal/metrics/stats.py +48 -35
  25. dataeval/_internal/metrics/uap.py +31 -26
  26. dataeval/_internal/metrics/utils.py +237 -2
  27. dataeval/_internal/utils.py +64 -0
  28. dataeval/_internal/workflows/__init__.py +0 -0
  29. dataeval/metrics/__init__.py +25 -5
  30. dataeval/utils/__init__.py +9 -0
  31. {dataeval-0.61.0.dist-info → dataeval-0.64.0.dist-info}/METADATA +1 -2
  32. dataeval-0.64.0.dist-info/RECORD +60 -0
  33. dataeval/_internal/metrics/hash.py +0 -79
  34. dataeval-0.61.0.dist-info/RECORD +0 -55
  35. {dataeval-0.61.0.dist-info → dataeval-0.64.0.dist-info}/LICENSE.txt +0 -0
  36. {dataeval-0.61.0.dist-info → dataeval-0.64.0.dist-info}/WHEEL +0 -0
dataeval/__init__.py CHANGED
@@ -2,14 +2,14 @@ from importlib.util import find_spec
 
 from . import detectors, flags, metrics
 
-__version__ = "0.61.0"
+__version__ = "0.64.0"
 
 __all__ = ["detectors", "flags", "metrics"]
 
 if find_spec("torch") is not None:  # pragma: no cover
-    from . import models, workflows
+    from . import models, utils, workflows
 
-    __all__ += ["models", "workflows"]
+    __all__ += ["models", "utils", "workflows"]
 
 elif find_spec("tensorflow") is not None:  # pragma: no cover
     from . import models
 
dataeval/_internal/detectors/clusterer.py CHANGED
@@ -1,9 +1,12 @@
 from typing import Dict, Iterable, List, NamedTuple, Tuple, Union, cast
 
 import numpy as np
+from numpy.typing import ArrayLike
 from scipy.cluster.hierarchy import linkage
 from scipy.spatial.distance import pdist, squareform
 
+from dataeval._internal.interop import to_numpy
+
 
 def extend_linkage(link_arr: np.ndarray) -> np.ndarray:
     """
@@ -102,20 +105,36 @@ class Clusterer:
 
     Parameters
     ----------
-    dataset : np.ndarray
-        An array of images or image embeddings to perform clustering
+    dataset : ArrayLike, shape - (N, P)
+        A dataset in an ArrayLike format.
+        Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
+
+    Warning
+    -------
+    The Clusterer class is heavily dependent on computational resources, and may fail due to insufficient memory.
+
+    Note
+    ----
+    The Clusterer works best when the length of the feature dimension, P, is less than 500.
+    If flattening a CxHxW image results in a dimension larger than 500, then it is recommended to reduce the dimensions.
+
+    Example
+    -------
+    Initialize the Clusterer class:
+
+    >>> cluster = Clusterer(dataset)
     """
 
-    def __init__(self, dataset: np.ndarray):
+    def __init__(self, dataset: ArrayLike):
         # Allows an update to dataset to reset the state rather than instantiate a new class
         self._on_init(dataset)
 
-    def _on_init(self, dataset: np.ndarray):
-        self._validate_data(dataset)
-        self._data: np.ndarray = dataset
-        self._num_samples = len(dataset)
+    def _on_init(self, dataset: ArrayLike):
+        self._data: np.ndarray = to_numpy(dataset)
+        self._validate_data(self._data)
+        self._num_samples = len(self._data)
 
-        self._darr: np.ndarray = pdist(dataset, metric="euclidean")
+        self._darr: np.ndarray = pdist(self._data, metric="euclidean")
         self._sqdmat: np.ndarray = squareform(self._darr)
         self._larr: np.ndarray = extend_linkage(linkage(self._darr))
         self._max_clusters: int = np.count_nonzero(self._larr[:, 3] == 2)
@@ -131,7 +150,7 @@ class Clusterer:
         return self._data
 
     @data.setter
-    def data(self, x: np.ndarray):
+    def data(self, x: ArrayLike):
         self._on_init(x)
 
     @property
@@ -450,20 +469,30 @@ class Clusterer:
 
         Returns
        -------
-
-        Dict[str, Union[List[int]], List[List[int]]]
-            Dictionary containing list of outliers, potential outliers, duplicates, and near duplicates in keys
-            "outliers", "potential_outliers", "duplicates", "near_duplicates" respectively
-        """
+        Dict[str, List[int]]
+            outliers :
+                List of indices that do not fall within a cluster
+            potential_outliers :
+                List of indices which are near the border between belonging in the cluster and being an outlier
+            duplicates :
+                List of groups of indices that are exact duplicates
+            potential_duplicates :
+                List of groups of indices which are not exact but closely related data points
+
+        Example
+        -------
+        >>> cluster.evaluate()
+        {'outliers': [18, 21, 34, 35, 45], 'potential_outliers': [13, 15, 42], 'duplicates': [[9, 24], [23, 48]], 'potential_duplicates': [[1, 11]]}
+        """  # noqa: E501
 
         outliers, potential_outliers = self.find_outliers(self.last_good_merge_levels)
-        duplicates, near_duplicates = self.find_duplicates(self.last_good_merge_levels)
+        duplicates, potential_duplicates = self.find_duplicates(self.last_good_merge_levels)
 
         ret = {
             "outliers": outliers,
             "potential_outliers": potential_outliers,
             "duplicates": duplicates,
-            "near_duplicates": near_duplicates,
+            "potential_duplicates": potential_duplicates,
         }
 
         return ret
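A hedged usage sketch of the updated Clusterer API, assembled from the docstring examples above. The import path `dataeval.detectors` and the random embedding data are illustrative assumptions; the result keys come straight from the new docstring:

```python
import numpy as np
from dataeval.detectors import Clusterer  # export path assumed

# Illustrative embeddings: N=50 observations in a P=16 dimensional space,
# comfortably under the P < 500 guidance in the Note above.
dataset = np.random.random((50, 16))

cluster = Clusterer(dataset)
results = cluster.evaluate()
print(results["outliers"], results["potential_outliers"])
print(results["duplicates"], results["potential_duplicates"])

# Assigning through the `data` setter re-runs _on_init, resetting
# the clusterer state instead of requiring a new instance.
cluster.data = np.random.random((40, 16))
```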
dataeval/_internal/detectors/drift/base.py CHANGED
@@ -8,10 +8,12 @@ Licensed under Apache Software License (Apache 2.0)
 
 from abc import ABC, abstractmethod
 from functools import wraps
-from random import random
 from typing import Callable, Dict, Literal, Optional, Tuple, Union
 
 import numpy as np
+from numpy.typing import ArrayLike
+
+from dataeval._internal.interop import to_numpy
 
 
 def update_x_ref(fn):
@@ -94,7 +96,7 @@ class ReservoirSamplingUpdate(UpdateStrategy):
                 x_reservoir[n_ref, :] = item
                 n_ref += 1
             else:
-                r = int(random() * count)
+                r = np.random.randint(0, count)
                 if r < self.n:
                     x_reservoir[r, :] = item
         return x_reservoir
@@ -105,11 +107,11 @@ class BaseDrift:
 
     def __init__(
         self,
-        x_ref: np.ndarray,
+        x_ref: ArrayLike,
         p_val: float = 0.05,
         x_ref_preprocessed: bool = False,
         update_x_ref: Optional[UpdateStrategy] = None,
-        preprocess_fn: Optional[Callable[[np.ndarray], np.ndarray]] = None,
+        preprocess_fn: Optional[Callable[[ArrayLike], ArrayLike]] = None,
         correction: Literal["bonferroni", "fdr"] = "bonferroni",
     ) -> None:
         # Type checking
@@ -128,7 +130,7 @@ class BaseDrift:
         self.update_x_ref = update_x_ref
         self.preprocess_fn = preprocess_fn
         self.correction = correction
-        self.n = len(x_ref)
+        self.n = len(self._x_ref)  # type: ignore
 
         # Ref counter for preprocessed x
         self._x_refcount = 0
@@ -140,9 +142,10 @@
         if self.preprocess_fn is not None:
             self._x_ref = self.preprocess_fn(self._x_ref)
 
+        self._x_ref = to_numpy(self._x_ref)
         return self._x_ref
 
-    def _preprocess(self, x: np.ndarray) -> np.ndarray:
+    def _preprocess(self, x: ArrayLike) -> ArrayLike:
         """Data preprocessing before computing the drift scores."""
         if self.preprocess_fn is not None:
             x = self.preprocess_fn(x)
@@ -159,11 +162,11 @@ class BaseUnivariateDrift(BaseDrift):
 
     def __init__(
         self,
-        x_ref: np.ndarray,
+        x_ref: ArrayLike,
         p_val: float = 0.05,
         x_ref_preprocessed: bool = False,
         update_x_ref: Optional[UpdateStrategy] = None,
-        preprocess_fn: Optional[Callable[[np.ndarray], np.ndarray]] = None,
+        preprocess_fn: Optional[Callable[[ArrayLike], ArrayLike]] = None,
         correction: Literal["bonferroni", "fdr"] = "bonferroni",
         n_features: Optional[int] = None,
     ) -> None:
@@ -188,14 +191,14 @@
             self._n_features = self.x_ref.reshape(self.x_ref.shape[0], -1).shape[-1]
         else:
             # infer number of features after applying preprocessing step
-            x = self.preprocess_fn(self.x_ref[0:1])
+            x = to_numpy(self.preprocess_fn(self._x_ref[0:1]))  # type: ignore
             self._n_features = x.reshape(x.shape[0], -1).shape[-1]
 
         return self._n_features
 
     @preprocess_x
     @abstractmethod
-    def score(self, x: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+    def score(self, x: ArrayLike) -> Tuple[np.ndarray, np.ndarray]:
         """Abstract method to calculate feature score after preprocessing"""
 
     def _apply_correction(self, p_vals: np.ndarray) -> Tuple[int, float]:
@@ -221,7 +224,7 @@
     @update_x_ref
     def predict(
         self,
-        x: np.ndarray,
+        x: ArrayLike,
         drift_type: Literal["batch", "feature"] = "batch",
     ) -> Dict[str, Union[int, float, np.ndarray]]:
         """
@@ -230,7 +233,7 @@
 
         Parameters
         ----------
-        x : np.ndarray
+        x : ArrayLike
             Batch of instances.
         drift_type : Literal["batch", "feature"], default "batch"
             Predict drift at the 'feature' or 'batch' level. For 'batch', the test
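The `ReservoirSamplingUpdate` change replaces `int(random() * count)` with `np.random.randint(0, count)`; both draw a uniform index over all items seen so far, which is what keeps the reservoir a uniform sample of the stream. A standalone sketch of the same algorithm (not the library's internal code), for intuition:

```python
import numpy as np

def reservoir_sample(stream, n: int, seed: int = 0) -> np.ndarray:
    """Keep a uniform random sample of size n from a stream of rows."""
    rng = np.random.default_rng(seed)
    reservoir = []
    count = 0
    for item in stream:
        count += 1
        if len(reservoir) < n:
            reservoir.append(item)
        else:
            # Replace an existing entry with probability n / count,
            # mirroring the detector's update strategy above.
            r = rng.integers(0, count)
            if r < n:
                reservoir[r] = item
    return np.asarray(reservoir)

sample = reservoir_sample((np.array([i, i]) for i in range(1000)), n=10)
print(sample.shape)  # (10, 2)
```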
dataeval/_internal/detectors/drift/cvm.py CHANGED
@@ -9,8 +9,11 @@ Licensed under Apache Software License (Apache 2.0)
 from typing import Callable, Literal, Optional, Tuple
 
 import numpy as np
+from numpy.typing import ArrayLike
 from scipy.stats import cramervonmises_2samp
 
+from dataeval._internal.interop import to_numpy
+
 from .base import BaseUnivariateDrift, UpdateStrategy, preprocess_x
 
 
@@ -23,7 +26,7 @@ class DriftCVM(BaseUnivariateDrift):
 
     Parameters
     ----------
-    x_ref : np.ndarray
+    x_ref : ArrayLike
         Data used as reference distribution.
     p_val : float, default 0.05
         p-value used for significance of the statistical test for each feature.
@@ -40,7 +43,7 @@
         :py:class:`dataeval.detectors.LastSeenUpdateStrategy`
         or via reservoir sampling with
         :py:class:`dataeval.detectors.ReservoirSamplingUpdateStrategy`.
-    preprocess_fn : Optional[Callable[[np.ndarray], np.ndarray]], default None
+    preprocess_fn : Optional[Callable[[ArrayLike], ArrayLike]], default None
         Function to preprocess the data before computing the data drift metrics.
         Typically a dimensionality reduction technique.
     correction : Literal["bonferroni", "fdr"], default "bonferroni"
@@ -54,11 +57,11 @@
 
     def __init__(
         self,
-        x_ref: np.ndarray,
+        x_ref: ArrayLike,
         p_val: float = 0.05,
         x_ref_preprocessed: bool = False,
         update_x_ref: Optional[UpdateStrategy] = None,
-        preprocess_fn: Optional[Callable[[np.ndarray], np.ndarray]] = None,
+        preprocess_fn: Optional[Callable[[ArrayLike], ArrayLike]] = None,
         correction: Literal["bonferroni", "fdr"] = "bonferroni",
         n_features: Optional[int] = None,
     ) -> None:
@@ -73,25 +76,26 @@
         )
 
     @preprocess_x
-    def score(self, x: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+    def score(self, x: ArrayLike) -> Tuple[np.ndarray, np.ndarray]:
         """
         Performs the two-sample Cramér-von Mises test(s), computing the p-value and
         test statistic per feature.
 
         Parameters
         ----------
-        x
+        x : ArrayLike
             Batch of instances.
 
         Returns
         -------
         Feature level p-values and CVM statistics.
         """
-        x = x.reshape(x.shape[0], -1)
+        x_np = to_numpy(x)
+        x_np = x_np.reshape(x_np.shape[0], -1)
         x_ref = self.x_ref.reshape(self.x_ref.shape[0], -1)
         p_val = np.zeros(self.n_features, dtype=np.float32)
         dist = np.zeros_like(p_val)
         for f in range(self.n_features):
-            result = cramervonmises_2samp(x_ref[:, f], x[:, f], method="auto")
+            result = cramervonmises_2samp(x_ref[:, f], x_np[:, f], method="auto")
             p_val[f], dist[f] = result.pvalue, result.statistic
         return p_val, dist
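With the ArrayLike changes, `score` and `predict` now accept any array-convertible input and normalize it through `to_numpy`. A minimal usage sketch, assuming `DriftCVM` is exported from `dataeval.detectors` like the update strategies referenced in its docstring; the random data is illustrative:

```python
import numpy as np
from dataeval.detectors import DriftCVM  # export path assumed

rng = np.random.default_rng(0)
x_ref = rng.normal(0.0, 1.0, (500, 3)).astype(np.float32)  # reference data
x_new = rng.normal(0.5, 1.0, (200, 3)).astype(np.float32)  # shifted batch

drift = DriftCVM(x_ref, p_val=0.05, correction="bonferroni")
print(drift.predict(x_new, drift_type="batch"))  # batch-level drift decision
print(drift.score(x_new))  # per-feature p-values and CVM statistics
```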
dataeval/_internal/detectors/drift/ks.py CHANGED
@@ -9,8 +9,11 @@ Licensed under Apache Software License (Apache 2.0)
 from typing import Callable, Literal, Optional, Tuple
 
 import numpy as np
+from numpy.typing import ArrayLike
 from scipy.stats import ks_2samp
 
+from dataeval._internal.interop import to_numpy
+
 from .base import BaseUnivariateDrift, UpdateStrategy, preprocess_x
 
 
@@ -55,11 +58,11 @@ class DriftKS(BaseUnivariateDrift):
 
     def __init__(
         self,
-        x_ref: np.ndarray,
+        x_ref: ArrayLike,
         p_val: float = 0.05,
         x_ref_preprocessed: bool = False,
         update_x_ref: Optional[UpdateStrategy] = None,
-        preprocess_fn: Optional[Callable[[np.ndarray], np.ndarray]] = None,
+        preprocess_fn: Optional[Callable[[ArrayLike], ArrayLike]] = None,
         correction: Literal["bonferroni", "fdr"] = "bonferroni",
         alternative: Literal["two-sided", "less", "greater"] = "two-sided",
         n_features: Optional[int] = None,
@@ -78,7 +81,7 @@
         self.alternative = alternative
 
     @preprocess_x
-    def score(self, x: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+    def score(self, x: ArrayLike) -> Tuple[np.ndarray, np.ndarray]:
         """
         Compute K-S scores and statistics per feature.
 
@@ -91,6 +94,7 @@
         -------
         Feature level p-values and K-S statistics.
         """
+        x = to_numpy(x)
         x = x.reshape(x.shape[0], -1)
         x_ref = self.x_ref.reshape(self.x_ref.shape[0], -1)
         p_val = np.zeros(self.n_features, dtype=np.float32)
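`DriftKS` follows the same univariate interface, adding the `alternative` hypothesis option visible in its constructor. A brief sketch under the same export-path assumption:

```python
import numpy as np
from dataeval.detectors import DriftKS  # export path assumed

x_ref = np.random.normal(size=(500, 3)).astype(np.float32)
detector = DriftKS(x_ref, alternative="greater")  # one-sided K-S test
print(detector.predict(np.random.normal(1.0, 1.0, (200, 3)).astype(np.float32)))
```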
dataeval/_internal/detectors/drift/mmd.py CHANGED
@@ -8,8 +8,10 @@ Licensed under Apache Software License (Apache 2.0)
 
 from typing import Callable, Dict, Optional, Tuple, Union
 
-import numpy as np
 import torch
+from numpy.typing import ArrayLike
+
+from dataeval._internal.interop import to_numpy
 
 from .base import BaseDrift, UpdateStrategy, preprocess_x, update_x_ref
 from .torch import GaussianRBF, get_device, mmd2_from_kernel_matrix
@@ -21,7 +23,7 @@ class DriftMMD(BaseDrift):
 
     Parameters
     ----------
-    x_ref : np.ndarray
+    x_ref : ArrayLike
         Data used as reference distribution.
     p_val : float, default 0.05
         p-value used for the significance of the permutation test.
@@ -44,7 +46,7 @@
         Function to preprocess the data before computing the data drift metrics.
     kernel : Callable, default :py:class:`dataeval.detectors.GaussianRBF`
         Kernel used for the MMD computation, defaults to Gaussian RBF kernel.
-    sigma : Optional[np.ndarray], default None
+    sigma : Optional[ArrayLike], default None
         Optionally set the GaussianRBF kernel bandwidth. Can also pass multiple
         bandwidth values as an array. The kernel evaluation is then averaged over
         those bandwidths.
@@ -59,13 +61,13 @@
 
     def __init__(
         self,
-        x_ref: np.ndarray,
+        x_ref: ArrayLike,
         p_val: float = 0.05,
         x_ref_preprocessed: bool = False,
         update_x_ref: Optional[UpdateStrategy] = None,
-        preprocess_fn: Optional[Callable[[np.ndarray], np.ndarray]] = None,
+        preprocess_fn: Optional[Callable[[ArrayLike], ArrayLike]] = None,
         kernel: Callable = GaussianRBF,
-        sigma: Optional[np.ndarray] = None,
+        sigma: Optional[ArrayLike] = None,
         configure_kernel_from_x_ref: bool = True,
         n_permutations: int = 100,
         device: Optional[str] = None,
@@ -73,7 +75,7 @@
         super().__init__(x_ref, p_val, x_ref_preprocessed, update_x_ref, preprocess_fn)
 
         self.infer_sigma = configure_kernel_from_x_ref
-        if configure_kernel_from_x_ref and isinstance(sigma, np.ndarray):
+        if configure_kernel_from_x_ref and sigma is not None:
             self.infer_sigma = False
 
         self.n_permutations = n_permutations  # nb of iterations through permutation test
@@ -82,7 +84,7 @@
         self.device = get_device(device)
 
         # initialize kernel
-        sigma_tensor = torch.from_numpy(sigma).to(self.device) if isinstance(sigma, np.ndarray) else None
+        sigma_tensor = torch.from_numpy(to_numpy(sigma)).to(self.device) if sigma is not None else None
         self.kernel = kernel(sigma_tensor).to(self.device) if kernel == GaussianRBF else kernel
 
         # compute kernel matrix for the reference data
@@ -102,7 +104,7 @@
         return kernel_mat
 
     @preprocess_x
-    def score(self, x: np.ndarray) -> Tuple[float, float, float]:
+    def score(self, x: ArrayLike) -> Tuple[float, float, float]:
         """
         Compute the p-value resulting from a permutation test using the maximum mean
         discrepancy as a distance measure between the reference data and the data to
@@ -110,7 +112,7 @@
 
         Parameters
         ----------
-        x
+        x : ArrayLike
             Batch of instances.
 
         Returns
@@ -118,6 +120,7 @@
         p-value obtained from the permutation test, the MMD^2 between the reference and
         test set, and the MMD^2 threshold above which drift is flagged.
         """
+        x = to_numpy(x)
         x_ref = torch.from_numpy(self.x_ref).to(self.device)
         n = x.shape[0]
         kernel_mat = self._kernel_matrix(x_ref, torch.from_numpy(x).to(self.device))
@@ -137,7 +140,7 @@
     @update_x_ref
     def predict(
         self,
-        x: np.ndarray,
+        x: ArrayLike,
     ) -> Dict[str, Union[int, float]]:
         """
         Predict whether a batch of data has drifted from the reference data and then
@@ -145,7 +148,7 @@
 
         Parameters
         ----------
-        x
+        x : ArrayLike
             Batch of instances.
 
         Returns
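Because `sigma` is now ArrayLike, a plain list of bandwidths can be passed and is converted with `to_numpy` before becoming a torch tensor; per the docstring, the kernel evaluation is then averaged over those bandwidths, and passing `sigma` disables bandwidth inference from `x_ref`. A hedged sketch (needs the torch extra; export path assumed):

```python
import numpy as np
from dataeval.detectors import DriftMMD  # export path assumed; requires torch

x_ref = np.random.normal(size=(200, 8)).astype(np.float32)

# A list of bandwidths is accepted directly now that sigma is ArrayLike.
drift = DriftMMD(x_ref, sigma=[0.5, 1.0, 2.0], n_permutations=100, device="cpu")
print(drift.predict(np.random.normal(0.3, 1.0, (100, 8)).astype(np.float32)))
```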
dataeval/_internal/detectors/drift/uncertainty.py CHANGED
@@ -10,6 +10,7 @@ from functools import partial
 from typing import Callable, Dict, Literal, Optional, Union
 
 import numpy as np
+from numpy.typing import ArrayLike
 from scipy.special import softmax
 from scipy.stats import entropy
 
@@ -64,7 +65,7 @@ class DriftUncertainty:
 
     Parameters
     ----------
-    x_ref : np.ndarray
+    x_ref : ArrayLike
         Data used as reference distribution. Should be disjoint from the data the
         model was trained on for accurate p-values.
     model : Callable
@@ -100,7 +101,7 @@
 
     def __init__(
         self,
-        x_ref: np.ndarray,
+        x_ref: ArrayLike,
         model: Callable,
         p_val: float = 0.05,
         x_ref_preprocessed: bool = False,
@@ -130,16 +131,16 @@
             p_val=p_val,
             x_ref_preprocessed=x_ref_preprocessed,
             update_x_ref=update_x_ref,
-            preprocess_fn=preprocess_fn,
+            preprocess_fn=preprocess_fn,  # type: ignore
         )
 
-    def predict(self, x: np.ndarray) -> Dict[str, Union[int, float, np.ndarray]]:
+    def predict(self, x: ArrayLike) -> Dict[str, Union[int, float, np.ndarray]]:
         """
         Predict whether a batch of data has drifted from the reference data.
 
         Parameters
         ----------
-        x
+        x : ArrayLike
             Batch of instances.
 
        Returns
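The imports above (`softmax`, `entropy`) point at how uncertainty drift works: model logits become class probabilities, probabilities become per-sample predictive entropy, and the entropy distributions of reference and test data are then compared by a univariate drift test. A standalone sketch of that transformation (not the library's internal code; the random logits are illustrative):

```python
import numpy as np
from scipy.special import softmax
from scipy.stats import entropy

def predictive_entropy(logits: np.ndarray) -> np.ndarray:
    """Per-sample entropy of the softmax distribution over classes."""
    probs = softmax(logits, axis=-1)
    return entropy(probs, axis=-1)

logits_ref = np.random.normal(size=(300, 3))        # reference model outputs
logits_new = np.random.normal(2.0, 1.0, (100, 3))   # shifted test outputs

# These two 1-D entropy arrays are what a univariate test then compares.
print(predictive_entropy(logits_ref).mean(), predictive_entropy(logits_new).mean())
```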
dataeval/_internal/detectors/duplicates.py CHANGED
@@ -1,6 +1,6 @@
-from typing import Dict, List, Literal
+from typing import Dict, Iterable, List, Literal
 
-import numpy as np
+from numpy.typing import ArrayLike
 
 from dataeval._internal.flags import ImageHash
 from dataeval._internal.metrics.stats import ImageStats
@@ -10,14 +10,21 @@ class Duplicates:
     """
     Finds the duplicate images in a dataset using xxhash for exact duplicates
     and pchash for near duplicates
+
+    Attributes
+    ----------
+    stats : ImageStats(flags=ImageHash.ALL)
+        Base stats class with the flags for checking duplicates
+
+    Example
+    -------
+    Initialize the Duplicates class:
+
+    >>> dups = Duplicates()
     """
 
-    def __init__(
-        self,
-        images: np.ndarray,
-    ):
+    def __init__(self):
         self.stats = ImageStats(ImageHash.ALL)
-        self.images = images
 
     def _get_duplicates(self) -> dict:
         exact = {}
@@ -34,16 +41,33 @@ class Duplicates:
             "near": sorted(near),
         }
 
-    def evaluate(self) -> Dict[Literal["exact", "near"], List[int]]:
+    def evaluate(self, images: Iterable[ArrayLike]) -> Dict[Literal["exact", "near"], List[int]]:
         """
         Returns duplicate image indices for both exact matches and near matches
 
+        Parameters
+        ----------
+        images : Iterable[ArrayLike], shape - (N, C, H, W)
+            A set of images in an ArrayLike format
+
         Returns
         -------
-        Dict[Literal["exact", "near"], List[int]]
-            Dictionary of exact and near match indices
+        Dict[str, List[int]]
+            exact :
+                List of groups of indices that are exact matches
+            near :
+                List of groups of indices that are near matches
+
+        See Also
+        --------
+        ImageStats
+
+        Example
+        -------
+        >>> dups.evaluate(images)
+        {'exact': [[3, 20], [16, 37]], 'near': [[3, 20, 22], [12, 18], [13, 36], [14, 31], [17, 27], [19, 38, 47]]}
         """
         self.stats.reset()
-        self.stats.update(self.images)
+        self.stats.update(images)
         self.results = self.stats.compute()
         return self._get_duplicates()
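A usage sketch of the reworked API: construction now takes no data, and `evaluate` receives the images, matching the docstring examples above. The import path and the synthetic batch are assumptions for illustration:

```python
import numpy as np
from dataeval.detectors import Duplicates  # export path assumed

# Illustrative batch: 10 random CHW images plus an exact repeat of image 0,
# so index 10 should hash identically (xxhash) to index 0.
images = np.random.randint(0, 255, (10, 1, 16, 16), dtype=np.uint8)
images = np.concatenate([images, images[:1]])

dups = Duplicates()
results = dups.evaluate(images)
print(results["exact"], results["near"])
```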