dataeval 0.88.0__py3-none-any.whl → 0.89.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dataeval/_version.py CHANGED
@@ -17,5 +17,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE

- __version__ = version = '0.88.0'
- __version_tuple__ = version_tuple = (0, 88, 0)
+ __version__ = version = '0.89.0'
+ __version_tuple__ = version_tuple = (0, 89, 0)
@@ -5,7 +5,7 @@ __all__ = []
  import logging
  import math
  import os
- from collections.abc import Iterator, Sequence
+ from collections.abc import Iterable, Iterator, Sequence
  from pathlib import Path
  from typing import Any, cast

@@ -80,7 +80,7 @@ class Embeddings:
          # Technically more permissive than ImageClassificationDataset or ObjectDetectionDataset
          dataset: Dataset[tuple[ArrayLike, Any, Any]] | Dataset[ArrayLike],
          batch_size: int,
-         transforms: Transform[torch.Tensor] | Sequence[Transform[torch.Tensor]] | None = None,
+         transforms: Transform[torch.Tensor] | Iterable[Transform[torch.Tensor]] | None = None,
          model: torch.nn.Module | None = None,
          device: DeviceLike | None = None,
          cache: Path | str | bool = False,
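The widened `transforms` annotation accepts any iterable of tensor transforms rather than only sequences. A minimal sketch of what this permits, following the doctest-style fixtures used in the docstrings below (`train_images` and `encoder` are placeholders for an image dataset and a `torch.nn.Module`):

```python
import torch

from dataeval.data import Embeddings

train_images = ...              # placeholder: any image dataset accepted by Embeddings
encoder: torch.nn.Module = ...  # placeholder: model that produces embeddings

# Plain callables taking and returning a torch.Tensor satisfy Transform[torch.Tensor].
def to_float(x: torch.Tensor) -> torch.Tensor:
    return x.to(torch.float32) / 255.0

def center_crop_32(x: torch.Tensor) -> torch.Tensor:
    h, w = x.shape[-2:]
    top, left = (h - 32) // 2, (w - 32) // 2
    return x[..., top : top + 32, left : left + 32]

# A dict's values() view is an Iterable but not a Sequence, so it only
# type-checks against the relaxed 0.89.0 annotation.
transform_registry = {"to_float": to_float, "center_crop": center_crop_32}

embeddings = Embeddings(
    train_images,
    batch_size=64,
    transforms=transform_registry.values(),
    model=encoder,
)
```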
@@ -15,6 +15,7 @@ from tqdm.auto import tqdm
  from dataeval.typing import (
      AnnotatedDataset,
      Array,
+     DatumMetadata,
      ObjectDetectionTarget,
  )
  from dataeval.utils._array import as_numpy
@@ -76,7 +77,7 @@ class Metadata:

      def __init__(
          self,
-         dataset: AnnotatedDataset[tuple[Any, Any, Mapping[str, Any]]],
+         dataset: AnnotatedDataset[tuple[Any, Any, DatumMetadata]],
          *,
          continuous_factor_bins: Mapping[str, int | Sequence[float]] | None = None,
          auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = "uniform_width",
@@ -55,6 +55,42 @@ def update_strategy(fn: Callable[..., R]) -> Callable[..., R]:


  class BaseDrift:
+     """Base class for drift detection algorithms.
+
+     Provides common functionality for drift detectors including reference data
+     management, encoding of input data, and statistical correction methods.
+     Subclasses implement specific drift detection algorithms.
+
+     Parameters
+     ----------
+     data : Embeddings or Array
+         Reference dataset used as baseline for drift detection.
+         Can be image embeddings or raw arrays.
+     p_val : float, default 0.05
+         Significance threshold for drift detection, between 0 and 1.
+         Default 0.05 limits false drift alerts to 5% when no drift exists (Type I error rate).
+     update_strategy : UpdateStrategy or None, default None
+         Strategy for updating reference data when new data arrives.
+         When None, reference data remains fixed throughout detection.
+         Default None maintains stable baseline for consistent comparison.
+     correction : {"bonferroni", "fdr"}, default "bonferroni"
+         Multiple testing correction method for multivariate drift detection.
+         "bonferroni" provides conservative family-wise error control.
+         "fdr" (False Discovery Rate) offers less conservative control.
+         Default "bonferroni" minimizes false positive drift detections.
+
+     Attributes
+     ----------
+     p_val : float
+         Significance threshold for statistical tests.
+     update_strategy : UpdateStrategy or None
+         Reference data update strategy.
+     correction : {"bonferroni", "fdr"}
+         Multiple testing correction method.
+     n : int
+         Number of samples in the reference dataset.
+     """
+
      p_val: float
      update_strategy: UpdateStrategy | None
      correction: Literal["bonferroni", "fdr"]
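The `p_val` and `correction` wording above can be made concrete with a few lines of NumPy: under no drift, per-feature p-values are roughly uniform on [0, 1], so an uncorrected 0.05 cutoff flags about 5% of features by chance, which is what the Bonferroni division guards against. An illustrative sketch independent of DataEval's internals (the 576-feature count mirrors the examples later in this diff):

```python
import numpy as np

rng = np.random.default_rng(0)

n_features = 576  # e.g. a flattened 1 x 24 x 24 image
p_val = 0.05

# Under the null hypothesis (no drift) p-values are approximately Uniform(0, 1).
null_p_values = rng.uniform(0.0, 1.0, size=n_features)

# Uncorrected thresholding flags roughly 5% of features purely by chance.
print("uncorrected false alarms:", int((null_p_values < p_val).sum()))

# Bonferroni shrinks the per-feature threshold so the chance of *any*
# false alarm across all 576 features stays near the 5% target.
print("bonferroni false alarms: ", int((null_p_values < p_val / n_features).sum()))
```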
@@ -83,19 +119,43 @@ class BaseDrift:

      @property
      def x_ref(self) -> NDArray[np.float32]:
-         """
-         Retrieve the reference data of the drift detector.
+         """Reference data for drift detection.
+
+         Lazily encodes the reference dataset on first access.
+         Data is flattened and converted to 32-bit floating point for
+         consistent numerical processing across different input types.

          Returns
          -------
          NDArray[np.float32]
-             The reference data as a 32-bit floating point numpy array.
+             Reference data as flattened 32-bit floating point array.
+             Shape is (n_samples, n_features_flattened).
+
+         Notes
+         -----
+         Data is cached after first access to avoid repeated encoding overhead.
          """
          if self._x_ref is None:
              self._x_ref = self._encode(self._data)
          return self._x_ref

      def _encode(self, data: Embeddings | Array) -> NDArray[np.float32]:
+         """
+         Encode input data to consistent numpy format.
+
+         Handles different input types (Embeddings, Arrays) and converts
+         them to flattened 32-bit floating point arrays for drift detection.
+
+         Parameters
+         ----------
+         data : Embeddings or Array
+             Input data to encode.
+
+         Returns
+         -------
+         NDArray[np.float32]
+             Encoded data as flattened 32-bit floating point array.
+         """
          array = (
              data.to_numpy().astype(np.float32)
              if isinstance(data, Embeddings)
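As a standalone illustration of the lazy encode-and-cache behavior the `x_ref` docstring describes (a sketch of the pattern only, not DataEval's actual class):

```python
import numpy as np
from numpy.typing import NDArray


class LazyReference:
    """Minimal sketch of lazy encoding with caching, mirroring the x_ref property above."""

    def __init__(self, data: NDArray[np.float32]) -> None:
        self._data = data
        self._x_ref: NDArray[np.float32] | None = None

    @property
    def x_ref(self) -> NDArray[np.float32]:
        # Encode only on first access; subsequent accesses reuse the cached array.
        if self._x_ref is None:
            self._x_ref = self._encode(self._data)
        return self._x_ref

    def _encode(self, data: NDArray[np.float32]) -> NDArray[np.float32]:
        # Flatten each sample to one row of float32, i.e. (n_samples, n_features_flattened).
        return np.asarray(data, dtype=np.float32).reshape(len(data), -1)


images = np.zeros((8, 1, 24, 24), dtype=np.float32)
print(LazyReference(images).x_ref.shape)  # (8, 576)
```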
@@ -107,6 +167,46 @@ class BaseDrift:


  class BaseDriftUnivariate(BaseDrift):
+     """
+     Base class for univariate drift detection algorithms.
+
+     Extends BaseDrift with feature-wise drift detection capabilities.
+     Applies statistical tests independently to each feature (pixel) and
+     uses multiple testing correction to control false discovery rates.
+
+     Parameters
+     ----------
+     data : Embeddings or Array
+         Reference dataset used as baseline for drift detection.
+     p_val : float, default 0.05
+         Significance threshold for drift detection, between 0 and 1.
+         Default 0.05 limits false drift alerts to 5% when no drift exists (Type I error rate).
+     update_strategy : UpdateStrategy or None, default None
+         Strategy for updating reference data when new data arrives.
+         When None, reference data remains fixed throughout detection.
+         Default None maintains stable baseline for consistent comparison.
+     correction : {"bonferroni", "fdr"}, default "bonferroni"
+         Multiple testing correction method for controlling false positives
+         across multiple features. "bonferroni" divides significance level
+         by number of features. "fdr" uses Benjamini-Hochberg procedure.
+         Default "bonferroni" provides conservative family-wise error control.
+     n_features : int or None, default None
+         Number of features to analyze. When None, automatically inferred
+         from the first sample's flattened shape. Default None enables
+         automatic feature detection for flexible input handling.
+
+     Attributes
+     ----------
+     p_val : float
+         Significance threshold for statistical tests.
+     update_strategy : UpdateStrategy or None
+         Reference data update strategy.
+     correction : {"bonferroni", "fdr"}
+         Multiple testing correction method.
+     n : int
+         Number of samples in the reference dataset.
+     """
+
      def __init__(
          self,
          data: Embeddings | Array,
@@ -121,16 +221,22 @@ class BaseDriftUnivariate(BaseDrift):

      @property
      def n_features(self) -> int:
-         """
-         Get the number of features in the reference data.
+         """Number of features in the reference data.

-         If the number of features is not provided during initialization, it will be inferred
-         from the reference data (``x_ref``).
+         Lazily computes the number of features from the first data sample
+         if not provided during initialization. Features correspond to the
+         flattened dimensionality of the input data (e.g., pixels for images).

          Returns
          -------
          int
-             Number of features in the reference data.
+             Number of features (flattened dimensions) in the reference data.
+             Always > 0 for valid datasets.
+
+         Notes
+         -----
+         For image data, this equals C x H x W.
+         Computed once and cached for efficiency.
          """
          # lazy process n_features as needed
          if self._n_features is None:
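To make the "C x H x W" note concrete, the feature count is just the flattened sample size; the 576 used in the examples further down corresponds to a single-channel 24 x 24 image (plain NumPy, shown only for illustration):

```python
from math import prod

import numpy as np

sample = np.zeros((1, 24, 24), dtype=np.float32)  # (C, H, W)
n_features = prod(sample.shape)                   # 1 * 24 * 24 = 576
assert n_features == sample.reshape(-1).size == 576
```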
@@ -139,18 +245,27 @@ class BaseDriftUnivariate(BaseDrift):
          return self._n_features

      def score(self, data: Embeddings | Array) -> tuple[NDArray[np.float32], NDArray[np.float32]]:
-         """
-         Calculates p-values and test statistics per feature.
+         """Calculate feature-wise p-values and test statistics.
+
+         Applies the detector's statistical test independently to each feature,
+         comparing the distribution of each feature between reference and test data.

          Parameters
          ----------
          data : Embeddings or Array
-             Batch of instances to score.
+             Test dataset to compare against reference data.

          Returns
          -------
-         tuple[NDArray, NDArray]
-             Feature level p-values and test statistics
+         tuple[NDArray[np.float32], NDArray[np.float32]]
+             First array contains p-values for each feature (all between 0 and 1).
+             Second array contains test statistics for each feature (all >= 0).
+             Both arrays have shape (n_features,).
+
+         Notes
+         -----
+         Lower p-values indicate stronger evidence of drift for that feature.
+         Higher test statistics indicate greater distributional differences.
          """
          x_np = self._encode(data)
          p_val = np.zeros(self.n_features, dtype=np.float32)
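The per-feature loop that `score` describes can be sketched with SciPy; the actual test depends on the subclass (KS, CVM, ...), and this standalone example uses `scipy.stats.ks_2samp` on synthetic data purely for illustration:

```python
import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(0)
n_features = 16

x_ref = rng.normal(size=(200, n_features)).astype(np.float32)   # reference batch
x_test = rng.normal(size=(200, n_features)).astype(np.float32)  # incoming batch
x_test[:, :4] += 1.0                                             # drift only the first four features

p_vals = np.zeros(n_features, dtype=np.float32)
stats = np.zeros(n_features, dtype=np.float32)
for f in range(n_features):
    # Compare the f-th feature's distribution between reference and test data.
    result = ks_2samp(x_ref[:, f], x_test[:, f])
    stats[f], p_vals[f] = result.statistic, result.pvalue

print("features flagged at p < 0.05:", np.flatnonzero(p_vals < 0.05))
```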
@@ -164,22 +279,29 @@ class BaseDriftUnivariate(BaseDrift):

      def _apply_correction(self, p_vals: NDArray[np.float32]) -> tuple[bool, float]:
          """
-         Apply the specified correction method (Bonferroni or FDR) to the p-values.
+         Apply multiple testing correction to feature-wise p-values.

-         If the correction method is Bonferroni, the threshold for detecting :term:`drift<Drift>`
-         is divided by the number of features. For FDR, the correction is applied
-         using the Benjamini-Hochberg procedure.
+         Corrects for multiple comparisons across features to control
+         false positive rates. Bonferroni correction divides the significance
+         threshold by the number of features. FDR correction uses the
+         Benjamini-Hochberg procedure for less conservative control.

          Parameters
          ----------
-         p_vals : NDArray
-             Array of p-values from the univariate tests for each feature.
+         p_vals : NDArray[np.float32]
+             Array of p-values from univariate tests for each feature.
+             All values should be between 0 and 1.

          Returns
          -------
          tuple[bool, float]
-             A tuple containing a boolean indicating if drift was detected and the
-             threshold after correction.
+             Boolean indicating whether drift was detected after correction.
+             Float is the effective threshold used for detection.
+
+         Notes
+         -----
+         Bonferroni correction: threshold = p_val / n_features
+         FDR correction: Uses Benjamini-Hochberg step-up procedure
          """
          if self.correction == "bonferroni":
              threshold = self.p_val / self.n_features
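The two corrections named in the Notes can be sketched in a few lines; this is an illustrative re-implementation of the description above, not necessarily DataEval's exact code (in particular, the returned FDR threshold may differ in detail):

```python
import numpy as np


def apply_correction(p_vals: np.ndarray, p_val: float, correction: str) -> tuple[bool, float]:
    n_features = p_vals.size
    if correction == "bonferroni":
        # Family-wise control: shrink the per-feature threshold.
        threshold = p_val / n_features
        return bool((p_vals < threshold).any()), float(threshold)

    # Benjamini-Hochberg step-up: compare sorted p-values to the ladder (i / n) * q
    # and treat the largest p-value still under its rung as the effective threshold.
    sorted_p = np.sort(p_vals)
    ladder = np.arange(1, n_features + 1) / n_features * p_val
    below = np.flatnonzero(sorted_p <= ladder)
    if below.size == 0:
        return False, float(ladder[0])
    return True, float(sorted_p[below[-1]])


p_values = np.array([0.001, 0.009, 0.04, 0.2, 0.7])
print(apply_correction(p_values, 0.05, "bonferroni"))  # (drifted, threshold) under Bonferroni
print(apply_correction(p_values, 0.05, "fdr"))         # (drifted, threshold) under Benjamini-Hochberg
```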
@@ -201,21 +323,24 @@ class BaseDriftUnivariate(BaseDrift):
      @set_metadata
      @update_strategy
      def predict(self, data: Embeddings | Array) -> DriftOutput:
-         """
-         Predict whether a batch of data has drifted from the reference data and update
-         reference data using specified update strategy.
+         """Predict drift and update reference data using specified strategy.
+
+         Performs feature-wise drift detection, applies multiple testing
+         correction, and optionally updates the reference dataset based
+         on the configured update strategy.

          Parameters
          ----------
          data : Embeddings or Array
-             Batch of instances to predict drift on.
+             Test dataset to analyze for drift against reference data.

          Returns
          -------
          DriftOutput
-             Dictionary containing the :term:`drift<Drift>` prediction and optionally the feature level
-             p-values, threshold after multivariate correction if needed and test :term:`statistics<Statistics>`.
+             Complete drift detection results including overall :term:`drift<Drift>` prediction,
+             corrected thresholds, feature-level analysis, and summary :term:`statistics<Statistics>`.
          """
+
          # compute drift scores
          p_vals, dist = self.score(data)

@@ -22,47 +22,66 @@ from dataeval.typing import Array


  class DriftCVM(BaseDriftUnivariate):
-     """
-     :term:`Drift` detector employing the :term:`Cramér-von Mises (CVM) Drift Detection` test.
+     """:term:`Drift` detector using the :term:`Cramér-von Mises (CVM) Test`.
+
+     Detects distributional changes in continuous data by comparing empirical
+     cumulative distribution functions between reference and test datasets.
+     For multivariate data, applies CVM test independently to each feature
+     and aggregates results using either the Bonferroni or
+     :term:`False Discovery Rate (FDR)` correction.

-     The CVM test detects changes in the distribution of continuous
-     univariate data. For multivariate data, a separate CVM test is applied to each
-     feature, and the obtained p-values are aggregated via the Bonferroni or
-     :term:`False Discovery Rate (FDR)` corrections.
+     The CVM test is particularly effective at detecting subtle
+     distributional shifts throughout the entire domain, providing higher
+     power than Kolmogorov-Smirnov for many types of drift.

      Parameters
      ----------
      data : Embeddings or Array
-         Data used as reference distribution.
-     p_val : float or None, default 0.05
-         :term:`p-value<P-Value>` used for significance of the statistical test for each feature.
-         If the FDR correction method is used, this corresponds to the acceptable
-         q-value.
+         Reference dataset used as baseline distribution for drift detection.
+         Should represent the expected data distribution.
+     p_val : float, default 0.05
+         Significance threshold for drift detection, between 0 and 1.
+         Default 0.05 limits false drift alerts to 5% when no drift exists (Type I error rate).
      update_strategy : UpdateStrategy or None, default None
-         Reference data can optionally be updated using an UpdateStrategy class. Update
-         using the last n instances seen by the detector with LastSeenUpdateStrategy
-         or via reservoir sampling with ReservoirSamplingUpdateStrategy.
+         Strategy for updating reference data when new data arrives.
+         When None, reference data remains fixed throughout detection.
      correction : "bonferroni" or "fdr", default "bonferroni"
-         Correction type for multivariate data. Either 'bonferroni' or 'fdr' (False
-         Discovery Rate).
+         Multiple testing correction method for multivariate drift detection.
+         "bonferroni" provides conservative family-wise error control by
+         dividing significance threshold by number of features.
+         "fdr" uses Benjamini-Hochberg procedure for less conservative control.
+         Default "bonferroni" minimizes false positive drift detections.
      n_features : int or None, default None
-         Number of features used in the univariate drift tests. If not provided, it will
-         be inferred from the data.
-
+         Number of features to analyze in univariate tests.
+         When None, automatically inferred from the flattened shape of first data sample.

      Example
      -------
+     Basic drift detection with image embeddings
+
      >>> from dataeval.data import Embeddings
+     >>> train_emb = Embeddings(train_images, model=encoder, batch_size=64)
+     >>> drift_detector = DriftCVM(train_emb)

-     Use Embeddings to encode images before testing for drift
+     Test incoming images for distributional drift

-     >>> train_emb = Embeddings(train_images, model=encoder, batch_size=64)
-     >>> drift = DriftCVM(train_emb)
+     >>> result = drift_detector.predict(test_images)
+     >>> print(f"Drift detected: {result.drifted}")
+     Drift detected: True
+
+     >>> print(f"Mean CVM statistic: {result.distance:.4f}")
+     Mean CVM statistic: 24.1325
+
+     Using different correction methods
+
+     >>> drift_fdr = DriftCVM(train_emb, correction="fdr", p_val=0.1)
+     >>> result = drift_fdr.predict(test_images)

-     Test incoming images for drift
+     Access feature level results

-     >>> drift.predict(test_images).drifted
-     True
+     >>> n_features = result.feature_drift
+     >>> print(f"Features showing drift: {n_features.sum()} / {len(n_features)}")
+     Features showing drift: 576 / 576
      """

      def __init__(
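The underlying two-sample Cramér-von Mises test is available directly in SciPy; a single-feature sketch of the statistic and p-value that DriftCVM aggregates across features (illustrative only, independent of DataEval internals):

```python
import numpy as np
from scipy.stats import cramervonmises_2samp

rng = np.random.default_rng(0)
reference = rng.normal(size=1_000)
# A variance change spreads mass over the whole domain; the CVM statistic integrates
# squared ECDF differences across that domain rather than taking a single maximum.
test = rng.normal(scale=1.5, size=1_000)

result = cramervonmises_2samp(reference, test)
print(f"CVM statistic = {result.statistic:.3f}, p-value = {result.pvalue:.4g}")
```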
@@ -22,49 +22,77 @@ from dataeval.typing import Array


  class DriftKS(BaseDriftUnivariate):
-     """
-     :term:`Drift` detector employing the :term:`Kolmogorov-Smirnov (KS) \
+     """:term:`Drift` detector employing the :term:`Kolmogorov-Smirnov (KS) \
      distribution<Kolmogorov-Smirnov (K-S) test>` test.

-     The KS test detects changes in the maximum distance between two data
-     distributions with Bonferroni or :term:`False Discovery Rate (FDR)` correction
-     for multivariate data.
+     Detects distributional changes by measuring the maximum distance between
+     empirical cumulative distribution functions of reference and test datasets.
+     For multivariate data, applies KS test independently to each feature
+     and aggregates results using multiple testing correction.
+
+     The Kolmogorov-Smirnov test is particularly sensitive to differences in
+     the middle portions of distributions but has reduced power in the tails
+     where cumulative distribution functions are constrained near 0 and 1.

      Parameters
      ----------
      data : Embeddings or Array
-         Data used as reference distribution.
-     p_val : float or None, default 0.05
-         :term:`p-value<P-Value>` used for significance of the statistical test for each feature.
-         If the FDR correction method is used, this corresponds to the acceptable
-         q-value.
+         Reference dataset used as baseline distribution for drift detection.
+         Should represent the expected data distribution.
+     p_val : float, default 0.05
+         Significance threshold for drift detection, between 0 and 1.
+         Default 0.05 limits false drift alerts to 5% when no drift exists (Type I error rate).
      update_strategy : UpdateStrategy or None, default None
-         Reference data can optionally be updated using an UpdateStrategy class. Update
-         using the last n instances seen by the detector with LastSeenUpdateStrategy
-         or via reservoir sampling with ReservoirSamplingUpdateStrategy.
+         Strategy for updating reference data when new data arrives.
+         When None, reference data remains fixed throughout detection.
      correction : "bonferroni" or "fdr", default "bonferroni"
-         Correction type for multivariate data. Either 'bonferroni' or 'fdr' (False
-         Discovery Rate).
+         Multiple testing correction method for multivariate drift detection.
+         "bonferroni" provides conservative family-wise error control by
+         dividing significance threshold by number of features.
+         "fdr" uses Benjamini-Hochberg procedure for less conservative control.
+         Default "bonferroni" minimizes false positive drift detections.
      alternative : "two-sided", "less" or "greater", default "two-sided"
-         Defines the alternative hypothesis. Options are 'two-sided', 'less' or
-         'greater'.
+         Alternative hypothesis for the statistical test. "two-sided" detects
+         any distributional difference. "less" tests if test distribution is
+         stochastically smaller. "greater" tests if test distribution is
+         stochastically larger. Default "two-sided" provides most general
+         drift detection without directional assumptions.
      n_features : int | None, default None
-         Number of features used in the univariate drift tests. If not provided, it will
-         be inferred from the data.
+         Number of features to analyze in univariate tests.
+         When None, automatically inferred from the flattened shape of first data sample.

      Example
      -------
-     >>> from dataeval.data import Embeddings
-
-     Use Embeddings to encode images before testing for drift
+     Basic drift detection with image embeddings:

+     >>> from dataeval.data import Embeddings
      >>> train_emb = Embeddings(train_images, model=encoder, batch_size=64)
-     >>> drift = DriftKS(train_emb)
-
-     Test incoming images for drift
-
-     >>> drift.predict(test_images).drifted
-     True
+     >>> drift_detector = DriftKS(train_emb)
+
+     Test incoming images for distributional drift
+
+     >>> result = drift_detector.predict(test_images)
+     >>> print(f"Drift detected: {result.drifted}")
+     Drift detected: True
+
+     >>> print(f"Mean KS statistic: {result.distance:.4f}")
+     Mean KS statistic: 0.8750
+
+     Detect if test data has systematically higher values
+
+     >>> drift_greater = DriftKS(train_emb, alternative="greater")
+     >>> result = drift_greater.predict(test_images)
+
+     Using different correction methods
+
+     >>> drift_fdr = DriftKS(train_emb, correction="fdr", p_val=0.1)
+     >>> result = drift_fdr.predict(test_images)
+
+     Access feature-level results
+
+     >>> n_features = result.feature_drift
+     >>> print(f"Features showing drift: {n_features.sum()} / {len(n_features)}")
+     Features showing drift: 576 / 576
      """

      def __init__(
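SciPy's two-sample KS test shows how one-sided alternatives behave on data shifted toward larger values; note that mapping a one-sided SciPy result onto DriftKS's `alternative` argument depends on the argument order the detector uses internally, so treat this purely as an illustration of the statistic itself:

```python
import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(1)
reference = rng.normal(size=2_000)
shifted_up = reference + 0.5  # test data with systematically higher values

# "two-sided" reacts to any difference between the empirical CDFs.
two_sided = ks_2samp(reference, shifted_up, alternative="two-sided")

# With (reference, test) argument order, SciPy's "greater" alternative asks whether
# the reference CDF lies above the test CDF, which is the case when the test data
# is shifted toward larger values.
one_sided = ks_2samp(reference, shifted_up, alternative="greater")

print(f"two-sided: statistic = {two_sided.statistic:.3f}, p = {two_sided.pvalue:.2e}")
print(f"greater:   statistic = {one_sided.statistic:.3f}, p = {one_sided.pvalue:.2e}")
```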
@@ -24,31 +24,57 @@ from dataeval.typing import Array


  class DriftMMD(BaseDrift):
-     """
-     :term:`Maximum Mean Discrepancy (MMD) Drift Detection` algorithm \
-     using a permutation test.
+     """Drift detector using :term:`Maximum Mean Discrepancy (MMD) Drift Detection` with permutation test.
+
+     Detects distributional differences by comparing kernel embeddings of reference
+     and test datasets in a reproducing kernel Hilbert space (RKHS). Uses permutation
+     testing to assess statistical significance of the observed MMD^2 statistic.
+
+     MMD is particularly effective for high-dimensional data like images as it can
+     capture complex distributional differences that univariate tests might miss.
+     The kernel-based approach enables detection of both marginal and dependency
+     changes between features.

      Parameters
      ----------
      data : Embeddings or Array
-         Data used as reference distribution.
-     p_val : float or None, default 0.05
-         :term:`P-value` used for significance of the statistical test for each feature.
-         If the FDR correction method is used, this corresponds to the acceptable
-         q-value.
+         Reference dataset used as baseline distribution for drift detection.
+         Should represent the expected data distribution.
+     p_val : float, default 0.05
+         Significance threshold for statistical tests, between 0 and 1.
+         For FDR correction, this represents the acceptable false discovery rate.
+         Default 0.05 provides 95% confidence level for drift detection.
      update_strategy : UpdateStrategy or None, default None
-         Reference data can optionally be updated using an UpdateStrategy class. Update
-         using the last n instances seen by the detector with LastSeenUpdateStrategy
-         or via reservoir sampling with ReservoirSamplingUpdateStrategy.
+         Strategy for updating reference data when new data arrives.
+         When None, reference data remains fixed throughout detection.
      sigma : Array or None, default None
-         Optionally set the internal GaussianRBF kernel bandwidth. Can also pass multiple
-         bandwidth values as an array. The kernel evaluation is then averaged over
-         those bandwidths.
+         Bandwidth parameter(s) for the Gaussian RBF kernel. Controls the
+         kernel's sensitivity to distance between data points. When None,
+         automatically selects bandwidth using median heuristic. Can provide
+         multiple values as array to average over different scales.
      n_permutations : int, default 100
-         Number of permutations used in the permutation test.
+         Number of random permutations used in the permutation test to estimate
+         the null distribution of MMD² under no drift. Higher values provide
+         more accurate p-value estimates but increase computation time.
+         Default 100 balances statistical accuracy with computational efficiency.
      device : DeviceLike or None, default None
-         The hardware device to use if specified, otherwise uses the DataEval
-         default or torch default.
+         Hardware device for computation. When None, automatically selects
+         DataEval's configured device, falling back to PyTorch's default.
+
+     Attributes
+     ----------
+     p_val : float
+         Significance threshold for statistical tests.
+     update_strategy : UpdateStrategy or None
+         Reference data update strategy.
+     n : int
+         Number of samples in the reference dataset.
+     sigma : Array or None
+         Gaussian RBF kernel bandwidth parameter(s).
+     n_permutations : int
+         Number of permutations for statistical testing.
+     device : torch.device
+         Hardware device used for computations.

      Example
      -------
@@ -56,7 +82,7 @@ class DriftMMD(BaseDrift):

      Use Embeddings to encode images before testing for drift

-     >>> train_emb = Embeddings(train_images, model=encoder, batch_size=64)
+     >>> train_emb = Embeddings(train_images, model=encoder, batch_size=16)
      >>> drift = DriftMMD(train_emb)

      Test incoming images for drift
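For readers unfamiliar with the MMD² permutation test described in the DriftMMD docstring, a compact NumPy sketch of the idea follows (biased MMD² estimate, one common form of the median heuristic for the bandwidth); DataEval's implementation runs on torch and differs in details:

```python
import numpy as np


def rbf_kernel(a: np.ndarray, b: np.ndarray, sigma: float) -> np.ndarray:
    # Gaussian RBF kernel matrix: k(x, y) = exp(-||x - y||^2 / (2 * sigma^2)).
    sq_dists = ((a[:, None, :] - b[None, :, :]) ** 2).sum(-1)
    return np.exp(-sq_dists / (2.0 * sigma**2))


def mmd2(x: np.ndarray, y: np.ndarray, sigma: float) -> float:
    # Biased MMD^2 estimate: mean k(x, x) + mean k(y, y) - 2 * mean k(x, y).
    return float(
        rbf_kernel(x, x, sigma).mean()
        + rbf_kernel(y, y, sigma).mean()
        - 2.0 * rbf_kernel(x, y, sigma).mean()
    )


rng = np.random.default_rng(0)
x_ref = rng.normal(size=(100, 8))            # reference embeddings
x_test = rng.normal(loc=0.4, size=(100, 8))  # drifted test embeddings

# Median heuristic for the kernel bandwidth (what sigma=None alludes to above).
pooled = np.vstack([x_ref, x_test])
pairwise = np.sqrt(((pooled[:, None, :] - pooled[None, :, :]) ** 2).sum(-1))
sigma = float(np.median(pairwise))

observed = mmd2(x_ref, x_test, sigma)

# Permutation test: reshuffle the pooled samples to estimate the null distribution
# of MMD^2 under "no drift", then locate the observed statistic within it.
n_permutations = 100
null = np.empty(n_permutations)
for i in range(n_permutations):
    perm = rng.permutation(len(pooled))
    null[i] = mmd2(pooled[perm[:100]], pooled[perm[100:]], sigma)

p_value = float((null >= observed).mean())
print(f"MMD^2 = {observed:.4f}, p-value = {p_value:.2f}")
```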