dataeval-0.61.0-py3-none-any.whl → dataeval-0.63.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. dataeval/__init__.py +1 -1
  2. dataeval/_internal/detectors/clusterer.py +44 -16
  3. dataeval/_internal/detectors/drift/base.py +14 -12
  4. dataeval/_internal/detectors/drift/cvm.py +11 -8
  5. dataeval/_internal/detectors/drift/ks.py +6 -3
  6. dataeval/_internal/detectors/drift/mmd.py +14 -12
  7. dataeval/_internal/detectors/drift/uncertainty.py +7 -5
  8. dataeval/_internal/detectors/duplicates.py +35 -12
  9. dataeval/_internal/detectors/linter.py +85 -16
  10. dataeval/_internal/detectors/ood/ae.py +6 -5
  11. dataeval/_internal/detectors/ood/aegmm.py +5 -5
  12. dataeval/_internal/detectors/ood/base.py +14 -13
  13. dataeval/_internal/detectors/ood/llr.py +6 -4
  14. dataeval/_internal/detectors/ood/vae.py +5 -4
  15. dataeval/_internal/detectors/ood/vaegmm.py +5 -4
  16. dataeval/_internal/functional/__init__.py +0 -0
  17. dataeval/_internal/functional/ber.py +63 -0
  18. dataeval/_internal/functional/coverage.py +75 -0
  19. dataeval/_internal/functional/divergence.py +16 -0
  20. dataeval/_internal/{metrics → functional}/hash.py +1 -1
  21. dataeval/_internal/functional/metadata.py +136 -0
  22. dataeval/_internal/functional/metadataparity.py +190 -0
  23. dataeval/_internal/functional/uap.py +6 -0
  24. dataeval/_internal/interop.py +52 -0
  25. dataeval/_internal/maite/__init__.py +0 -0
  26. dataeval/_internal/maite/utils.py +30 -0
  27. dataeval/_internal/metrics/base.py +2 -2
  28. dataeval/_internal/metrics/ber.py +16 -66
  29. dataeval/_internal/metrics/coverage.py +51 -35
  30. dataeval/_internal/metrics/divergence.py +50 -42
  31. dataeval/_internal/metrics/metadata.py +610 -0
  32. dataeval/_internal/metrics/metadataparity.py +67 -0
  33. dataeval/_internal/metrics/parity.py +40 -56
  34. dataeval/_internal/metrics/stats.py +46 -35
  35. dataeval/_internal/metrics/uap.py +14 -17
  36. dataeval/_internal/workflows/__init__.py +0 -0
  37. dataeval/metrics/__init__.py +2 -1
  38. {dataeval-0.61.0.dist-info → dataeval-0.63.0.dist-info}/METADATA +1 -2
  39. dataeval-0.63.0.dist-info/RECORD +68 -0
  40. dataeval-0.61.0.dist-info/RECORD +0 -55
  41. dataeval/_internal/{metrics → functional}/utils.py +0 -0
  42. {dataeval-0.61.0.dist-info → dataeval-0.63.0.dist-info}/LICENSE.txt +0 -0
  43. {dataeval-0.61.0.dist-info → dataeval-0.63.0.dist-info}/WHEEL +0 -0
dataeval/__init__.py CHANGED
@@ -2,7 +2,7 @@ from importlib.util import find_spec
2
2
 
3
3
  from . import detectors, flags, metrics
4
4
 
5
- __version__ = "0.61.0"
5
+ __version__ = "0.63.0"
6
6
 
7
7
  __all__ = ["detectors", "flags", "metrics"]
8
8
 
dataeval/_internal/detectors/clusterer.py CHANGED
@@ -4,6 +4,8 @@ import numpy as np
4
4
  from scipy.cluster.hierarchy import linkage
5
5
  from scipy.spatial.distance import pdist, squareform
6
6
 
7
+ from dataeval._internal.interop import ArrayLike, to_numpy
8
+
7
9
 
8
10
  def extend_linkage(link_arr: np.ndarray) -> np.ndarray:
9
11
  """
@@ -102,20 +104,36 @@ class Clusterer:
102
104
 
103
105
  Parameters
104
106
  ----------
105
- dataset : np.ndarray
106
- An array of images or image embeddings to perform clustering
107
+ dataset : ArrayLike, shape - (N, P)
108
+ A dataset in an ArrayLike format.
109
+ Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
110
+
111
+ Warning
112
+ -------
113
+ The Clusterer class is heavily dependent on computational resources, and may fail due to insufficient memory.
114
+
115
+ Note
116
+ ----
117
+ The Clusterer works best when the length of the feature dimension, P, is less than 500.
118
+ If flattening a CxHxW image results in a dimension larger than 500, then it is recommended to reduce the dimensions.
119
+
120
+ Example
121
+ -------
122
+ Initialize the Clusterer class:
123
+
124
+ >>> cluster = Clusterer(dataset)
107
125
  """
108
126
 
109
- def __init__(self, dataset: np.ndarray):
127
+ def __init__(self, dataset: ArrayLike):
110
128
  # Allows an update to dataset to reset the state rather than instantiate a new class
111
129
  self._on_init(dataset)
112
130
 
113
- def _on_init(self, dataset: np.ndarray):
114
- self._validate_data(dataset)
115
- self._data: np.ndarray = dataset
116
- self._num_samples = len(dataset)
131
+ def _on_init(self, dataset: ArrayLike):
132
+ self._data: np.ndarray = to_numpy(dataset)
133
+ self._validate_data(self._data)
134
+ self._num_samples = len(self._data)
117
135
 
118
- self._darr: np.ndarray = pdist(dataset, metric="euclidean")
136
+ self._darr: np.ndarray = pdist(self._data, metric="euclidean")
119
137
  self._sqdmat: np.ndarray = squareform(self._darr)
120
138
  self._larr: np.ndarray = extend_linkage(linkage(self._darr))
121
139
  self._max_clusters: int = np.count_nonzero(self._larr[:, 3] == 2)
@@ -131,7 +149,7 @@ class Clusterer:
131
149
  return self._data
132
150
 
133
151
  @data.setter
134
- def data(self, x: np.ndarray):
152
+ def data(self, x: ArrayLike):
135
153
  self._on_init(x)
136
154
 
137
155
  @property
@@ -450,20 +468,30 @@ class Clusterer:
450
468
 
451
469
  Returns
452
470
  -------
453
-
454
- Dict[str, Union[List[int]], List[List[int]]]
455
- Dictionary containing list of outliers, potential outliers, duplicates, and near duplicates in keys
456
- "outliers", "potential_outliers", "duplicates", "near_duplicates" respectively
457
- """
471
+ Dict[str, List[int]]
472
+ outliers :
473
+ List of indices that do not fall within a cluster
474
+ potential_outliers :
475
+ List of indices which are near the border between belonging in the cluster and being an outlier
476
+ duplicates :
477
+ List of groups of indices that are exact duplicates
478
+ potential_duplicates :
479
+ List of groups of indices which are not exact but closely related data points
480
+
481
+ Example
482
+ -------
483
+ >>> cluster.evaluate()
484
+ {'outliers': [18, 21, 34, 35, 45], 'potential_outliers': [13, 15, 42], 'duplicates': [[9, 24], [23, 48]], 'potential_duplicates': [[1, 11]]}
485
+ """ # noqa: E501
458
486
 
459
487
  outliers, potential_outliers = self.find_outliers(self.last_good_merge_levels)
460
- duplicates, near_duplicates = self.find_duplicates(self.last_good_merge_levels)
488
+ duplicates, potential_duplicates = self.find_duplicates(self.last_good_merge_levels)
461
489
 
462
490
  ret = {
463
491
  "outliers": outliers,
464
492
  "potential_outliers": potential_outliers,
465
493
  "duplicates": duplicates,
466
- "near_duplicates": near_duplicates,
494
+ "potential_duplicates": potential_duplicates,
467
495
  }
468
496
 
469
497
  return ret
dataeval/_internal/detectors/drift/base.py CHANGED
@@ -8,11 +8,12 @@ Licensed under Apache Software License (Apache 2.0)
8
8
 
9
9
  from abc import ABC, abstractmethod
10
10
  from functools import wraps
11
- from random import random
12
11
  from typing import Callable, Dict, Literal, Optional, Tuple, Union
13
12
 
14
13
  import numpy as np
15
14
 
15
+ from dataeval._internal.interop import ArrayLike, to_numpy
16
+
16
17
 
17
18
  def update_x_ref(fn):
18
19
  @wraps(fn)
@@ -94,7 +95,7 @@ class ReservoirSamplingUpdate(UpdateStrategy):
94
95
  x_reservoir[n_ref, :] = item
95
96
  n_ref += 1
96
97
  else:
97
- r = int(random() * count)
98
+ r = np.random.randint(0, count)
98
99
  if r < self.n:
99
100
  x_reservoir[r, :] = item
100
101
  return x_reservoir
@@ -105,11 +106,11 @@ class BaseDrift:
105
106
 
106
107
  def __init__(
107
108
  self,
108
- x_ref: np.ndarray,
109
+ x_ref: ArrayLike,
109
110
  p_val: float = 0.05,
110
111
  x_ref_preprocessed: bool = False,
111
112
  update_x_ref: Optional[UpdateStrategy] = None,
112
- preprocess_fn: Optional[Callable[[np.ndarray], np.ndarray]] = None,
113
+ preprocess_fn: Optional[Callable[[ArrayLike], ArrayLike]] = None,
113
114
  correction: Literal["bonferroni", "fdr"] = "bonferroni",
114
115
  ) -> None:
115
116
  # Type checking
@@ -128,7 +129,7 @@ class BaseDrift:
128
129
  self.update_x_ref = update_x_ref
129
130
  self.preprocess_fn = preprocess_fn
130
131
  self.correction = correction
131
- self.n = len(x_ref)
132
+ self.n = len(self._x_ref) # type: ignore
132
133
 
133
134
  # Ref counter for preprocessed x
134
135
  self._x_refcount = 0
@@ -140,9 +141,10 @@ class BaseDrift:
140
141
  if self.preprocess_fn is not None:
141
142
  self._x_ref = self.preprocess_fn(self._x_ref)
142
143
 
144
+ self._x_ref = to_numpy(self._x_ref)
143
145
  return self._x_ref
144
146
 
145
- def _preprocess(self, x: np.ndarray) -> np.ndarray:
147
+ def _preprocess(self, x: ArrayLike) -> ArrayLike:
146
148
  """Data preprocessing before computing the drift scores."""
147
149
  if self.preprocess_fn is not None:
148
150
  x = self.preprocess_fn(x)
@@ -159,11 +161,11 @@ class BaseUnivariateDrift(BaseDrift):
159
161
 
160
162
  def __init__(
161
163
  self,
162
- x_ref: np.ndarray,
164
+ x_ref: ArrayLike,
163
165
  p_val: float = 0.05,
164
166
  x_ref_preprocessed: bool = False,
165
167
  update_x_ref: Optional[UpdateStrategy] = None,
166
- preprocess_fn: Optional[Callable[[np.ndarray], np.ndarray]] = None,
168
+ preprocess_fn: Optional[Callable[[ArrayLike], ArrayLike]] = None,
167
169
  correction: Literal["bonferroni", "fdr"] = "bonferroni",
168
170
  n_features: Optional[int] = None,
169
171
  ) -> None:
@@ -188,14 +190,14 @@ class BaseUnivariateDrift(BaseDrift):
188
190
  self._n_features = self.x_ref.reshape(self.x_ref.shape[0], -1).shape[-1]
189
191
  else:
190
192
  # infer number of features after applying preprocessing step
191
- x = self.preprocess_fn(self.x_ref[0:1])
193
+ x = to_numpy(self.preprocess_fn(self._x_ref[0:1])) # type: ignore
192
194
  self._n_features = x.reshape(x.shape[0], -1).shape[-1]
193
195
 
194
196
  return self._n_features
195
197
 
196
198
  @preprocess_x
197
199
  @abstractmethod
198
- def score(self, x: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
200
+ def score(self, x: ArrayLike) -> Tuple[np.ndarray, np.ndarray]:
199
201
  """Abstract method to calculate feature score after preprocessing"""
200
202
 
201
203
  def _apply_correction(self, p_vals: np.ndarray) -> Tuple[int, float]:
@@ -221,7 +223,7 @@ class BaseUnivariateDrift(BaseDrift):
221
223
  @update_x_ref
222
224
  def predict(
223
225
  self,
224
- x: np.ndarray,
226
+ x: ArrayLike,
225
227
  drift_type: Literal["batch", "feature"] = "batch",
226
228
  ) -> Dict[str, Union[int, float, np.ndarray]]:
227
229
  """
@@ -230,7 +232,7 @@ class BaseUnivariateDrift(BaseDrift):
230
232
 
231
233
  Parameters
232
234
  ----------
233
- x : np.ndarray
235
+ x : ArrayLike
234
236
  Batch of instances.
235
237
  drift_type : Literal["batch", "feature"], default "batch"
236
238
  Predict drift at the 'feature' or 'batch' level. For 'batch', the test
dataeval/_internal/detectors/drift/cvm.py CHANGED
@@ -11,6 +11,8 @@ from typing import Callable, Literal, Optional, Tuple
11
11
  import numpy as np
12
12
  from scipy.stats import cramervonmises_2samp
13
13
 
14
+ from dataeval._internal.interop import ArrayLike, to_numpy
15
+
14
16
  from .base import BaseUnivariateDrift, UpdateStrategy, preprocess_x
15
17
 
16
18
 
@@ -23,7 +25,7 @@ class DriftCVM(BaseUnivariateDrift):
23
25
 
24
26
  Parameters
25
27
  ----------
26
- x_ref : np.ndarray
28
+ x_ref : ArrayLike
27
29
  Data used as reference distribution.
28
30
  p_val : float, default 0.05
29
31
  p-value used for significance of the statistical test for each feature.
@@ -40,7 +42,7 @@ class DriftCVM(BaseUnivariateDrift):
40
42
  :py:class:`dataeval.detectors.LastSeenUpdateStrategy`
41
43
  or via reservoir sampling with
42
44
  :py:class:`dataeval.detectors.ReservoirSamplingUpdateStrategy`.
43
- preprocess_fn : Optional[Callable[[np.ndarray], np.ndarray]], default None
45
+ preprocess_fn : Optional[Callable[[ArrayLike], ArrayLike]], default None
44
46
  Function to preprocess the data before computing the data drift metrics.
45
47
  Typically a dimensionality reduction technique.
46
48
  correction : Literal["bonferroni", "fdr"], default "bonferroni"
@@ -54,11 +56,11 @@ class DriftCVM(BaseUnivariateDrift):
54
56
 
55
57
  def __init__(
56
58
  self,
57
- x_ref: np.ndarray,
59
+ x_ref: ArrayLike,
58
60
  p_val: float = 0.05,
59
61
  x_ref_preprocessed: bool = False,
60
62
  update_x_ref: Optional[UpdateStrategy] = None,
61
- preprocess_fn: Optional[Callable[[np.ndarray], np.ndarray]] = None,
63
+ preprocess_fn: Optional[Callable[[ArrayLike], ArrayLike]] = None,
62
64
  correction: Literal["bonferroni", "fdr"] = "bonferroni",
63
65
  n_features: Optional[int] = None,
64
66
  ) -> None:
@@ -73,25 +75,26 @@ class DriftCVM(BaseUnivariateDrift):
73
75
  )
74
76
 
75
77
  @preprocess_x
76
- def score(self, x: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
78
+ def score(self, x: ArrayLike) -> Tuple[np.ndarray, np.ndarray]:
77
79
  """
78
80
  Performs the two-sample Cramér-von Mises test(s), computing the p-value and
79
81
  test statistic per feature.
80
82
 
81
83
  Parameters
82
84
  ----------
83
- x
85
+ x : ArrayLike
84
86
  Batch of instances.
85
87
 
86
88
  Returns
87
89
  -------
88
90
  Feature level p-values and CVM statistics.
89
91
  """
90
- x = x.reshape(x.shape[0], -1)
92
+ x_np = to_numpy(x)
93
+ x_np = x_np.reshape(x_np.shape[0], -1)
91
94
  x_ref = self.x_ref.reshape(self.x_ref.shape[0], -1)
92
95
  p_val = np.zeros(self.n_features, dtype=np.float32)
93
96
  dist = np.zeros_like(p_val)
94
97
  for f in range(self.n_features):
95
- result = cramervonmises_2samp(x_ref[:, f], x[:, f], method="auto")
98
+ result = cramervonmises_2samp(x_ref[:, f], x_np[:, f], method="auto")
96
99
  p_val[f], dist[f] = result.pvalue, result.statistic
97
100
  return p_val, dist
dataeval/_internal/detectors/drift/ks.py CHANGED
@@ -11,6 +11,8 @@ from typing import Callable, Literal, Optional, Tuple
11
11
  import numpy as np
12
12
  from scipy.stats import ks_2samp
13
13
 
14
+ from dataeval._internal.interop import ArrayLike, to_numpy
15
+
14
16
  from .base import BaseUnivariateDrift, UpdateStrategy, preprocess_x
15
17
 
16
18
 
@@ -55,11 +57,11 @@ class DriftKS(BaseUnivariateDrift):
55
57
 
56
58
  def __init__(
57
59
  self,
58
- x_ref: np.ndarray,
60
+ x_ref: ArrayLike,
59
61
  p_val: float = 0.05,
60
62
  x_ref_preprocessed: bool = False,
61
63
  update_x_ref: Optional[UpdateStrategy] = None,
62
- preprocess_fn: Optional[Callable[[np.ndarray], np.ndarray]] = None,
64
+ preprocess_fn: Optional[Callable[[ArrayLike], ArrayLike]] = None,
63
65
  correction: Literal["bonferroni", "fdr"] = "bonferroni",
64
66
  alternative: Literal["two-sided", "less", "greater"] = "two-sided",
65
67
  n_features: Optional[int] = None,
@@ -78,7 +80,7 @@ class DriftKS(BaseUnivariateDrift):
78
80
  self.alternative = alternative
79
81
 
80
82
  @preprocess_x
81
- def score(self, x: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
83
+ def score(self, x: ArrayLike) -> Tuple[np.ndarray, np.ndarray]:
82
84
  """
83
85
  Compute K-S scores and statistics per feature.
84
86
 
@@ -91,6 +93,7 @@ class DriftKS(BaseUnivariateDrift):
91
93
  -------
92
94
  Feature level p-values and K-S statistics.
93
95
  """
96
+ x = to_numpy(x)
94
97
  x = x.reshape(x.shape[0], -1)
95
98
  x_ref = self.x_ref.reshape(self.x_ref.shape[0], -1)
96
99
  p_val = np.zeros(self.n_features, dtype=np.float32)
dataeval/_internal/detectors/drift/mmd.py CHANGED
@@ -8,9 +8,10 @@ Licensed under Apache Software License (Apache 2.0)
8
8
 
9
9
  from typing import Callable, Dict, Optional, Tuple, Union
10
10
 
11
- import numpy as np
12
11
  import torch
13
12
 
13
+ from dataeval._internal.interop import ArrayLike, to_numpy
14
+
14
15
  from .base import BaseDrift, UpdateStrategy, preprocess_x, update_x_ref
15
16
  from .torch import GaussianRBF, get_device, mmd2_from_kernel_matrix
16
17
 
@@ -21,7 +22,7 @@ class DriftMMD(BaseDrift):
21
22
 
22
23
  Parameters
23
24
  ----------
24
- x_ref : np.ndarray
25
+ x_ref : ArrayLike
25
26
  Data used as reference distribution.
26
27
  p_val : float, default 0.05
27
28
  p-value used for the significance of the permutation test.
@@ -44,7 +45,7 @@ class DriftMMD(BaseDrift):
44
45
  Function to preprocess the data before computing the data drift metrics.
45
46
  kernel : Callable, default :py:class:`dataeval.detectors.GaussianRBF`
46
47
  Kernel used for the MMD computation, defaults to Gaussian RBF kernel.
47
- sigma : Optional[np.ndarray], default None
48
+ sigma : Optional[ArrayLike], default None
48
49
  Optionally set the GaussianRBF kernel bandwidth. Can also pass multiple
49
50
  bandwidth values as an array. The kernel evaluation is then averaged over
50
51
  those bandwidths.
@@ -59,13 +60,13 @@ class DriftMMD(BaseDrift):
59
60
 
60
61
  def __init__(
61
62
  self,
62
- x_ref: np.ndarray,
63
+ x_ref: ArrayLike,
63
64
  p_val: float = 0.05,
64
65
  x_ref_preprocessed: bool = False,
65
66
  update_x_ref: Optional[UpdateStrategy] = None,
66
- preprocess_fn: Optional[Callable[[np.ndarray], np.ndarray]] = None,
67
+ preprocess_fn: Optional[Callable[[ArrayLike], ArrayLike]] = None,
67
68
  kernel: Callable = GaussianRBF,
68
- sigma: Optional[np.ndarray] = None,
69
+ sigma: Optional[ArrayLike] = None,
69
70
  configure_kernel_from_x_ref: bool = True,
70
71
  n_permutations: int = 100,
71
72
  device: Optional[str] = None,
@@ -73,7 +74,7 @@ class DriftMMD(BaseDrift):
73
74
  super().__init__(x_ref, p_val, x_ref_preprocessed, update_x_ref, preprocess_fn)
74
75
 
75
76
  self.infer_sigma = configure_kernel_from_x_ref
76
- if configure_kernel_from_x_ref and isinstance(sigma, np.ndarray):
77
+ if configure_kernel_from_x_ref and isinstance(sigma, ArrayLike):
77
78
  self.infer_sigma = False
78
79
 
79
80
  self.n_permutations = n_permutations # nb of iterations through permutation test
@@ -82,7 +83,7 @@ class DriftMMD(BaseDrift):
82
83
  self.device = get_device(device)
83
84
 
84
85
  # initialize kernel
85
- sigma_tensor = torch.from_numpy(sigma).to(self.device) if isinstance(sigma, np.ndarray) else None
86
+ sigma_tensor = torch.from_numpy(to_numpy(sigma)).to(self.device) if isinstance(sigma, ArrayLike) else None
86
87
  self.kernel = kernel(sigma_tensor).to(self.device) if kernel == GaussianRBF else kernel
87
88
 
88
89
  # compute kernel matrix for the reference data
@@ -102,7 +103,7 @@ class DriftMMD(BaseDrift):
102
103
  return kernel_mat
103
104
 
104
105
  @preprocess_x
105
- def score(self, x: np.ndarray) -> Tuple[float, float, float]:
106
+ def score(self, x: ArrayLike) -> Tuple[float, float, float]:
106
107
  """
107
108
  Compute the p-value resulting from a permutation test using the maximum mean
108
109
  discrepancy as a distance measure between the reference data and the data to
@@ -110,7 +111,7 @@ class DriftMMD(BaseDrift):
110
111
 
111
112
  Parameters
112
113
  ----------
113
- x
114
+ x : ArrayLike
114
115
  Batch of instances.
115
116
 
116
117
  Returns
@@ -118,6 +119,7 @@ class DriftMMD(BaseDrift):
118
119
  p-value obtained from the permutation test, the MMD^2 between the reference and
119
120
  test set, and the MMD^2 threshold above which drift is flagged.
120
121
  """
122
+ x = to_numpy(x)
121
123
  x_ref = torch.from_numpy(self.x_ref).to(self.device)
122
124
  n = x.shape[0]
123
125
  kernel_mat = self._kernel_matrix(x_ref, torch.from_numpy(x).to(self.device))
@@ -137,7 +139,7 @@ class DriftMMD(BaseDrift):
137
139
  @update_x_ref
138
140
  def predict(
139
141
  self,
140
- x: np.ndarray,
142
+ x: ArrayLike,
141
143
  ) -> Dict[str, Union[int, float]]:
142
144
  """
143
145
  Predict whether a batch of data has drifted from the reference data and then
@@ -145,7 +147,7 @@ class DriftMMD(BaseDrift):
145
147
 
146
148
  Parameters
147
149
  ----------
148
- x
150
+ x : ArrayLike
149
151
  Batch of instances.
150
152
 
151
153
  Returns
dataeval/_internal/detectors/drift/uncertainty.py CHANGED
@@ -13,6 +13,8 @@ import numpy as np
13
13
  from scipy.special import softmax
14
14
  from scipy.stats import entropy
15
15
 
16
+ from dataeval._internal.interop import ArrayLike
17
+
16
18
  from .base import UpdateStrategy
17
19
  from .ks import DriftKS
18
20
  from .torch import get_device, preprocess_drift
@@ -64,7 +66,7 @@ class DriftUncertainty:
64
66
 
65
67
  Parameters
66
68
  ----------
67
- x_ref : np.ndarray
69
+ x_ref : ArrayLike
68
70
  Data used as reference distribution. Should be disjoint from the data the
69
71
  model was trained on for accurate p-values.
70
72
  model : Callable
@@ -100,7 +102,7 @@ class DriftUncertainty:
100
102
 
101
103
  def __init__(
102
104
  self,
103
- x_ref: np.ndarray,
105
+ x_ref: ArrayLike,
104
106
  model: Callable,
105
107
  p_val: float = 0.05,
106
108
  x_ref_preprocessed: bool = False,
@@ -130,16 +132,16 @@ class DriftUncertainty:
130
132
  p_val=p_val,
131
133
  x_ref_preprocessed=x_ref_preprocessed,
132
134
  update_x_ref=update_x_ref,
133
- preprocess_fn=preprocess_fn,
135
+ preprocess_fn=preprocess_fn, # type: ignore
134
136
  )
135
137
 
136
- def predict(self, x: np.ndarray) -> Dict[str, Union[int, float, np.ndarray]]:
138
+ def predict(self, x: ArrayLike) -> Dict[str, Union[int, float, np.ndarray]]:
137
139
  """
138
140
  Predict whether a batch of data has drifted from the reference data.
139
141
 
140
142
  Parameters
141
143
  ----------
142
- x
144
+ x : ArrayLike
143
145
  Batch of instances.
144
146
 
145
147
  Returns
dataeval/_internal/detectors/duplicates.py CHANGED
@@ -1,8 +1,7 @@
1
- from typing import Dict, List, Literal
2
-
3
- import numpy as np
1
+ from typing import Dict, Iterable, List, Literal
4
2
 
5
3
  from dataeval._internal.flags import ImageHash
4
+ from dataeval._internal.interop import ArrayLike
6
5
  from dataeval._internal.metrics.stats import ImageStats
7
6
 
8
7
 
@@ -10,14 +9,21 @@ class Duplicates:
10
9
  """
11
10
  Finds the duplicate images in a dataset using xxhash for exact duplicates
12
11
  and pchash for near duplicates
12
+
13
+ Attributes
14
+ ----------
15
+ stats : ImageStats(flags=ImageHash.ALL)
16
+ Base stats class with the flags for checking duplicates
17
+
18
+ Example
19
+ -------
20
+ Initialize the Duplicates class:
21
+
22
+ >>> dups = Duplicates()
13
23
  """
14
24
 
15
- def __init__(
16
- self,
17
- images: np.ndarray,
18
- ):
25
+ def __init__(self):
19
26
  self.stats = ImageStats(ImageHash.ALL)
20
- self.images = images
21
27
 
22
28
  def _get_duplicates(self) -> dict:
23
29
  exact = {}
@@ -34,16 +40,33 @@ class Duplicates:
34
40
  "near": sorted(near),
35
41
  }
36
42
 
37
- def evaluate(self) -> Dict[Literal["exact", "near"], List[int]]:
43
+ def evaluate(self, images: Iterable[ArrayLike]) -> Dict[Literal["exact", "near"], List[int]]:
38
44
  """
39
45
  Returns duplicate image indices for both exact matches and near matches
40
46
 
47
+ Parameters
48
+ ----------
49
+ images : Iterable[ArrayLike], shape - (N, C, H, W)
50
+ A set of images in an ArrayLike format
51
+
41
52
  Returns
42
53
  -------
43
- Dict[Literal["exact", "near"], List[int]]
44
- Dictionary of exact and near match indices
54
+ Dict[str, List[int]]
55
+ exact :
56
+ List of groups of indices that are exact matches
57
+ near :
58
+ List of groups of indices that are near matches
59
+
60
+ See Also
61
+ --------
62
+ ImageStats
63
+
64
+ Example
65
+ -------
66
+ >>> dups.evaluate(images)
67
+ {'exact': [[3, 20], [16, 37]], 'near': [[3, 20, 22], [12, 18], [13, 36], [14, 31], [17, 27], [19, 38, 47]]}
45
68
  """
46
69
  self.stats.reset()
47
- self.stats.update(self.images)
70
+ self.stats.update(images)
48
71
  self.results = self.stats.compute()
49
72
  return self._get_duplicates()