dataeval 0.86.0__py3-none-any.whl → 0.86.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. dataeval/__init__.py +1 -1
  2. dataeval/_log.py +1 -1
  3. dataeval/config.py +21 -4
  4. dataeval/data/_embeddings.py +2 -2
  5. dataeval/data/_images.py +2 -3
  6. dataeval/data/_metadata.py +48 -37
  7. dataeval/data/_selection.py +1 -2
  8. dataeval/data/_split.py +2 -3
  9. dataeval/data/_targets.py +17 -13
  10. dataeval/data/selections/_classfilter.py +2 -5
  11. dataeval/data/selections/_prioritize.py +6 -9
  12. dataeval/data/selections/_shuffle.py +3 -1
  13. dataeval/detectors/drift/_base.py +4 -5
  14. dataeval/detectors/drift/_mmd.py +3 -6
  15. dataeval/detectors/drift/_nml/_base.py +4 -2
  16. dataeval/detectors/drift/_nml/_chunk.py +11 -19
  17. dataeval/detectors/drift/_nml/_domainclassifier.py +8 -19
  18. dataeval/detectors/drift/_nml/_result.py +8 -9
  19. dataeval/detectors/drift/_nml/_thresholds.py +66 -77
  20. dataeval/detectors/linters/outliers.py +7 -7
  21. dataeval/metrics/bias/_parity.py +10 -13
  22. dataeval/metrics/estimators/_divergence.py +2 -4
  23. dataeval/metrics/stats/_base.py +103 -42
  24. dataeval/metrics/stats/_boxratiostats.py +21 -19
  25. dataeval/metrics/stats/_dimensionstats.py +14 -10
  26. dataeval/metrics/stats/_hashstats.py +1 -1
  27. dataeval/metrics/stats/_pixelstats.py +6 -6
  28. dataeval/metrics/stats/_visualstats.py +3 -3
  29. dataeval/outputs/_base.py +22 -7
  30. dataeval/outputs/_bias.py +26 -28
  31. dataeval/outputs/_drift.py +1 -9
  32. dataeval/outputs/_linters.py +11 -11
  33. dataeval/outputs/_stats.py +82 -23
  34. dataeval/outputs/_workflows.py +2 -2
  35. dataeval/utils/_array.py +6 -9
  36. dataeval/utils/_bin.py +1 -2
  37. dataeval/utils/_clusterer.py +7 -4
  38. dataeval/utils/_fast_mst.py +27 -13
  39. dataeval/utils/_image.py +65 -11
  40. dataeval/utils/_mst.py +1 -3
  41. dataeval/utils/_plot.py +15 -10
  42. dataeval/utils/data/_dataset.py +32 -20
  43. dataeval/utils/data/metadata.py +104 -82
  44. dataeval/utils/datasets/__init__.py +2 -0
  45. dataeval/utils/datasets/_antiuav.py +189 -0
  46. dataeval/utils/datasets/_base.py +11 -8
  47. dataeval/utils/datasets/_cifar10.py +104 -45
  48. dataeval/utils/datasets/_fileio.py +21 -47
  49. dataeval/utils/datasets/_milco.py +19 -11
  50. dataeval/utils/datasets/_mixin.py +2 -4
  51. dataeval/utils/datasets/_mnist.py +3 -4
  52. dataeval/utils/datasets/_ships.py +14 -7
  53. dataeval/utils/datasets/_voc.py +229 -42
  54. dataeval/utils/torch/models.py +5 -10
  55. dataeval/utils/torch/trainer.py +3 -3
  56. dataeval/workflows/sufficiency.py +2 -2
  57. {dataeval-0.86.0.dist-info → dataeval-0.86.1.dist-info}/METADATA +1 -1
  58. dataeval-0.86.1.dist-info/RECORD +114 -0
  59. dataeval/detectors/ood/vae.py +0 -74
  60. dataeval-0.86.0.dist-info/RECORD +0 -114
  61. {dataeval-0.86.0.dist-info → dataeval-0.86.1.dist-info}/LICENSE.txt +0 -0
  62. {dataeval-0.86.0.dist-info → dataeval-0.86.1.dist-info}/WHEEL +0 -0
dataeval/__init__.py CHANGED
@@ -8,7 +8,7 @@ shifts that impact performance of deployed models.
 from __future__ import annotations
 
 __all__ = ["config", "detectors", "log", "metrics", "typing", "utils", "workflows"]
-__version__ = "0.86.0"
+__version__ = "0.86.1"
 
 import logging
 
dataeval/_log.py CHANGED
@@ -8,7 +8,7 @@ class LogMessage:
     Deferred message callback for logging expensive messages.
     """
 
-    def __init__(self, fn: Callable[..., str]):
+    def __init__(self, fn: Callable[..., str]) -> None:
         self._fn = fn
         self._str = None
 
dataeval/config.py CHANGED
@@ -4,10 +4,10 @@ Global configuration settings for DataEval.
 
 from __future__ import annotations
 
-__all__ = ["get_device", "set_device", "get_max_processes", "set_max_processes", "DeviceLike"]
+__all__ = ["get_device", "set_device", "get_max_processes", "set_max_processes", "use_max_processes", "DeviceLike"]
 
 import sys
-from typing import Union
+from typing import Any, Union
 
 if sys.version_info >= (3, 10):
     from typing import TypeAlias
@@ -78,8 +78,7 @@ def get_device(override: DeviceLike | None = None) -> torch.device:
     if override is None:
         global _device
         return torch.get_default_device() if _device is None else _device
-    else:
-        return _todevice(override)
+    return _todevice(override)
 
 
 def set_max_processes(processes: int | None) -> None:
@@ -112,6 +111,24 @@ def get_max_processes() -> int | None:
     return _processes
 
 
+class MaxProcessesContextManager:
+    def __init__(self, processes: int) -> None:
+        self._processes = processes
+
+    def __enter__(self) -> None:
+        global _processes
+        self._old = _processes
+        set_max_processes(self._processes)
+
+    def __exit__(self, *args: tuple[Any, ...]) -> None:
+        global _processes
+        _processes = self._old
+
+
+def use_max_processes(processes: int) -> MaxProcessesContextManager:
+    return MaxProcessesContextManager(processes)
+
+
 def set_seed(seed: int | None, all_generators: bool = False) -> None:
     """
     Sets the seed for use by classes that allow for a random state or seed.
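Note (not part of the diff): a minimal usage sketch for the new use_max_processes context manager, using only the functions shown in this file.

from dataeval.config import get_max_processes, use_max_processes

# Temporarily cap the worker process count for a block of work; the prior
# setting is restored when the block exits.
with use_max_processes(4):
    assert get_max_processes() == 4
# the previous max-process setting applies again here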
dataeval/data/_embeddings.py CHANGED
@@ -144,8 +144,7 @@ class Embeddings:
         """
         if indices is not None:
             return torch.vstack(list(self._batch(indices))).to(self.device)
-        else:
-            return self[:]
+        return self[:]
 
     def to_numpy(self, indices: Sequence[int] | None = None) -> NDArray[Any]:
         """
@@ -248,6 +247,7 @@ class Embeddings:
             _logger.log(logging.DEBUG, f"Saved embeddings cache from {path}")
         except Exception as e:
             _logger.log(logging.ERROR, f"Failed to save embeddings cache: {e}")
+            raise e
 
     @classmethod
     def load(cls, path: Path | str) -> Embeddings:
dataeval/data/_images.py CHANGED
@@ -73,15 +73,14 @@ class Images(Generic[T]):
     def __getitem__(self, key: int | slice, /) -> Sequence[T] | T:
         if isinstance(key, slice):
             return [self._get_image(k) for k in range(len(self._dataset))[key]]
-        elif hasattr(key, "__int__"):
+        if hasattr(key, "__int__"):
             return self._get_image(int(key))
         raise TypeError(f"Key must be integers or slices, not {type(key)}")
 
     def _get_image(self, index: int) -> T:
         if self._is_tuple_datum:
             return cast(Dataset[tuple[T, Any, Any]], self._dataset)[index][0]
-        else:
-            return cast(Dataset[T], self._dataset)[index]
+        return cast(Dataset[T], self._dataset)[index]
 
     def __iter__(self) -> Iterator[T]:
         for i in range(len(self._dataset)):
dataeval/data/_metadata.py CHANGED
@@ -196,7 +196,7 @@ class Metadata:
         self._process()
         return int(self._image_indices.max() + 1)
 
-    def _collate(self, force: bool = False):
+    def _collate(self, force: bool = False) -> None:
         if self._collated and not force:
             return
 
@@ -243,7 +243,7 @@ class Metadata:
         self._class_names = [index2label.get(i, str(i)) for i in np.unique(self._class_labels)]
         self._collated = True
 
-    def _merge(self, force: bool = False):
+    def _merge(self, force: bool = False) -> None:
         if self._merged is not None and not force:
             return
 
@@ -266,48 +266,26 @@ class Metadata:
                     "Metadata dictionary needs to be a single dictionary whose values "
                     "are arraylike containing the metadata on a per image or per object basis."
                 )
-            else:
-                check_length = len(v) if check_length is None else check_length
-                if check_length != len(v):
-                    raise ValueError(
-                        "The lists/arrays in the metadata dict have varying lengths. "
-                        "Metadata requires them to be uniform in length."
-                    )
+            check_length = len(v) if check_length is None else check_length
+            if check_length != len(v):
+                raise ValueError(
+                    "The lists/arrays in the metadata dict have varying lengths. "
+                    "Metadata requires them to be uniform in length."
+                )
 
         if len(self._class_labels) != check_length:
             raise ValueError(
                 f"The length of the label array {len(self._class_labels)} is not the same as "
                 f"the length of the metadata arrays {check_length}."
             )
 
-    def _process(self, force: bool = False) -> None:
-        if self._processed and not force:
-            return
-
-        # Create image indices from targets
-        self._image_indices = np.arange(len(self.raw)) if self.targets.source is None else self.targets.source
-
-        # Validate the metadata dimensions
-        self._validate()
-
-        # Include specified metadata keys
-        if self.include:
-            metadata = {i: self.merged[i] for i in self.include if i in self.merged}
-            continuous_factor_bins = (
-                {i: self.continuous_factor_bins[i] for i in self.include if i in self.continuous_factor_bins}
-                if self.continuous_factor_bins
-                else {}
-            )
-        else:
-            metadata = self.merged
-            continuous_factor_bins = dict(self.continuous_factor_bins) if self.continuous_factor_bins else {}
-            for k in self.exclude:
-                metadata.pop(k, None)
-                continuous_factor_bins.pop(k, None)
-
-        # Remove generated "_image_index" if present
-        if "_image_index" in metadata:
-            metadata.pop("_image_index", None)
+    def _filter(self, d: Mapping[str, Any]) -> dict[str, Any]:
+        return (
+            {k: d[k] for k in self.include if k in d} if self.include else {k: d[k] for k in d if k not in self.exclude}
+        )
 
+    def _split_continuous_discrete(
+        self, metadata: dict[str, NDArray[Any]], continuous_factor_bins: dict[str, int | Sequence[float]]
+    ) -> tuple[dict[str, NDArray[Any]], dict[str, NDArray[np.int64]]]:
         # Bin according to user supplied bins
         continuous_metadata = {}
         discrete_metadata = {}
@@ -346,6 +324,28 @@ class Metadata:
             else:
                 _, discrete_metadata[key] = np.unique(data, return_inverse=True)
 
+        return continuous_metadata, discrete_metadata
+
+    def _process(self, force: bool = False) -> None:
+        if self._processed and not force:
+            return
+
+        # Create image indices from targets
+        self._image_indices = np.arange(len(self.raw)) if self.targets.source is None else self.targets.source
+
+        # Validate the metadata dimensions
+        self._validate()
+
+        # Filter the merged metadata and continuous factor bins
+        metadata = self._filter(self.merged)
+        continuous_factor_bins = self._filter(self.continuous_factor_bins)
+
+        # Remove generated "_image_index" if present
+        metadata.pop("_image_index", None)
+
+        # Split the metadata into continuous and discrete
+        continuous_metadata, discrete_metadata = self._split_continuous_discrete(metadata, continuous_factor_bins)
+
         # Split out the dictionaries into the keys and values
         self._discrete_factor_names = list(discrete_metadata.keys())
         self._discrete_data = (
@@ -363,6 +363,17 @@ class Metadata:
         self._processed = True
 
     def add_factors(self, factors: Mapping[str, ArrayLike]) -> None:
+        """
+        Add additional factors to the metadata.
+
+        The number of measures per factor must match the number of images
+        in the dataset or the number of detections in the dataset.
+
+        Parameters
+        ----------
+        factors : Mapping[str, ArrayLike]
+            Dictionary of factors to add to the metadata.
+        """
        self._merge()
 
         targets = len(self.targets.source) if self.targets.source is not None else len(self.targets)
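Note (not part of the diff): a self-contained re-implementation of the selection rule used by the new Metadata._filter helper above, so the include/exclude behavior is easy to see; the factor names are made up.

from typing import Any, Mapping, Sequence

def _filter(d: Mapping[str, Any], include: Sequence[str] = (), exclude: Sequence[str] = ()) -> dict[str, Any]:
    # Mirrors Metadata._filter: "include" wins when provided, otherwise keep
    # everything whose key is not listed in "exclude".
    return {k: d[k] for k in include if k in d} if include else {k: d[k] for k in d if k not in exclude}

merged = {"altitude": [10, 20], "sensor": ["a", "b"], "_image_index": [0, 1]}
print(_filter(merged, include=["altitude"]))      # {'altitude': [10, 20]}
print(_filter(merged, exclude=["_image_index"]))  # {'altitude': [10, 20], 'sensor': ['a', 'b']}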
dataeval/data/_selection.py CHANGED
@@ -110,8 +110,7 @@ class Select(AnnotatedDataset[_TDatum]):
         grouped: dict[int, list[Selection[_TDatum]]] = {}
         for selection in selections_list:
             grouped.setdefault(selection.stage, []).append(selection)
-        selection_list = [selection for category in sorted(grouped) for selection in grouped[category]]
-        return selection_list
+        return [selection for category in sorted(grouped) for selection in grouped[category]]
 
     def _apply_selections(self) -> None:
         for selection in self._selections:
dataeval/data/_split.py CHANGED
@@ -23,7 +23,7 @@ _logger = logging.getLogger(__name__)
 class KFoldSplitter(Protocol):
     """Protocol covering sklearn KFold variant splitters"""
 
-    def __init__(self, n_splits: int): ...
+    def __init__(self, n_splits: int) -> None: ...
     def split(self, X: Any, y: Any, groups: Any) -> Iterator[tuple[NDArray[Any], NDArray[Any]]]: ...
 
 
@@ -209,8 +209,7 @@ def get_groups(metadata: Metadata, split_on: Sequence[str] | None) -> NDArray[np
     split_set = set(split_on)
     indices = [i for i, name in enumerate(metadata.discrete_factor_names) if name in split_set]
     binned_features = metadata.discrete_data[:, indices]
-    group_ids = np.unique(binned_features, axis=0, return_inverse=True)[1]
-    return group_ids
+    return np.unique(binned_features, axis=0, return_inverse=True)[1]
 
 
 def make_splits(
dataeval/data/_targets.py CHANGED
@@ -24,11 +24,13 @@ class Targets:
     labels : NDArray[np.intp]
         Labels (N,) for N images or objects
     scores : NDArray[np.float32]
-        Probability scores (N,M) for N images of M classes or confidence score (N,) of objects
+        Probability scores (N, M) for N images of M classes or confidence score (N,) of objects
     bboxes : NDArray[np.float32] | None
-        Bounding boxes (N,4) for N objects in (x0,y0,x1,y1) format
+        Bounding boxes (N, 4) for N objects in (x0, y0, x1, y1) format
     source : NDArray[np.intp] | None
         Source image index (N,) for N objects
+    size : int
+        Count of objects
     """
 
     labels: NDArray[np.intp]
@@ -55,13 +57,16 @@ class Targets:
             )
 
         if self.bboxes is not None and len(self.bboxes) > 0 and self.bboxes.shape[-1] != 4:
-            raise ValueError("Bounding boxes must be in (x0,y0,x1,y1) format.")
+            raise ValueError("Bounding boxes must be in (x0, y0, x1, y1) format.")
+
+    @property
+    def size(self) -> int:
+        return len(self.labels)
 
     def __len__(self) -> int:
         if self.source is None:
             return len(self.labels)
-        else:
-            return len(np.unique(self.source))
+        return len(np.unique(self.source))
 
     def __getitem__(self, idx: int, /) -> Targets:
         if self.source is None or self.bboxes is None:
@@ -71,14 +76,13 @@
                 None,
                 None,
             )
-        else:
-            mask = np.where(self.source == idx, True, False)
-            return Targets(
-                np.atleast_1d(self.labels[mask]),
-                np.atleast_1d(self.scores[mask]),
-                np.atleast_2d(self.bboxes[mask]),
-                np.atleast_1d(self.source[mask]),
-            )
+        mask = np.where(self.source == idx, True, False)
+        return Targets(
+            np.atleast_1d(self.labels[mask]),
+            np.atleast_1d(self.scores[mask]),
+            np.atleast_2d(self.bboxes[mask]),
+            np.atleast_1d(self.source[mask]),
+        )
 
     def __iter__(self) -> Iterator[Targets]:
         for i in range(len(self.labels)) if self.source is None else np.unique(self.source):
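Note (not part of the diff): an illustrative sketch of the new Targets.size property next to the existing __len__ behavior; the import path is an assumption and the array values are synthetic.

import numpy as np
from dataeval.data import Targets  # import path assumed, not confirmed by this diff

targets = Targets(
    labels=np.array([0, 1, 1], dtype=np.intp),
    scores=np.ones(3, dtype=np.float32),        # per-object confidence scores (N,)
    bboxes=np.zeros((3, 4), dtype=np.float32),  # (x0, y0, x1, y1) per object
    source=np.array([0, 0, 1], dtype=np.intp),  # two objects in image 0, one in image 1
)
print(targets.size)  # 3 -> number of objects (new property)
print(len(targets))  # 2 -> number of distinct source images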
dataeval/data/selections/_classfilter.py CHANGED
@@ -68,11 +68,8 @@ _TTarget = TypeVar("_TTarget", ObjectDetectionTarget, SegmentationTarget)
 
 
 def _try_mask_object(obj: _T, mask: NDArray[np.bool_]) -> _T:
-    if isinstance(obj, Sized) and not isinstance(obj, (str, bytes, bytearray)) and len(obj) == len(mask):
-        if isinstance(obj, Array):
-            return obj[mask]
-        elif isinstance(obj, Sequence):
-            return cast(_T, [item for i, item in enumerate(obj) if mask[i]])
+    if not isinstance(obj, (str, bytes, bytearray)) and isinstance(obj, (Sequence, Array)) and len(obj) == len(mask):
+        return obj[mask] if isinstance(obj, Array) else cast(_T, [item for i, item in enumerate(obj) if mask[i]])
     return obj
 
 
dataeval/data/selections/_prioritize.py CHANGED
@@ -99,8 +99,7 @@ class _KNNSorter(_Sorter):
             np.fill_diagonal(dists, np.inf)
         else:
             dists = pairwise_distances(embeddings, reference)
-        inds = np.argsort(np.sort(dists, axis=1)[:, self._k])
-        return inds
+        return np.argsort(np.sort(dists, axis=1)[:, self._k])
 
 
 class _KMeansSorter(_Sorter):
@@ -124,15 +123,13 @@ class _KMeansSorter(_Sorter):
 class _KMeansDistanceSorter(_KMeansSorter):
     def _sort(self, embeddings: NDArray[Any], reference: NDArray[Any] | None = None) -> NDArray[np.intp]:
         clst = self._get_clusters(embeddings if reference is None else reference)
-        inds = np.argsort(clst._dist2center(embeddings))
-        return inds
+        return np.argsort(clst._dist2center(embeddings))
 
 
 class _KMeansComplexitySorter(_KMeansSorter):
     def _sort(self, embeddings: NDArray[Any], reference: NDArray[Any] | None = None) -> NDArray[np.intp]:
         clst = self._get_clusters(embeddings if reference is None else reference)
-        inds = clst._sort_by_weights(embeddings)
-        return inds
+        return clst._sort_by_weights(embeddings)
 
 
 class Prioritize(Selection[Any]):
@@ -266,10 +263,10 @@ class Prioritize(Selection[Any]):
     def _get_sorter(self, samples: int) -> _Sorter:
         if self._method == "knn":
             return _KNNSorter(samples, self._k)
-        elif self._method == "kmeans_distance":
+        if self._method == "kmeans_distance":
             return _KMeansDistanceSorter(samples, self._c)
-        else:  # self._method == "kmeans_complexity"
-            return _KMeansComplexitySorter(samples, self._c)
+        # self._method == "kmeans_complexity"
+        return _KMeansComplexitySorter(samples, self._c)
 
     def _to_normalized_ndarray(self, embeddings: Embeddings, selection: list[int] | None = None) -> NDArray[Any]:
         emb: NDArray[Any] = embeddings.to_numpy(selection)
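Note (not part of the diff): a standalone illustration of the k-NN ordering kept in _KNNSorter._sort above; the embeddings are synthetic.

import numpy as np
from sklearn.metrics import pairwise_distances

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(8, 4))
k = 2

dists = pairwise_distances(embeddings)
np.fill_diagonal(dists, np.inf)  # ignore self-distances, as in the no-reference branch
order = np.argsort(np.sort(dists, axis=1)[:, k])
print(order)  # sample indices ordered by ascending distance to their k-th nearest neighbor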
dataeval/data/selections/_shuffle.py CHANGED
@@ -30,7 +30,9 @@ class Shuffle(Selection[Any]):
     seed: int | NDArray[Any] | SeedSequence | BitGenerator | Generator | None
     stage = SelectionStage.ORDER
 
-    def __init__(self, seed: int | Sequence[int] | Array | SeedSequence | BitGenerator | Generator | None = None):
+    def __init__(
+        self, seed: int | Sequence[int] | Array | SeedSequence | BitGenerator | Generator | None = None
+    ) -> None:
         self.seed = as_numpy(seed) if isinstance(seed, (Sequence, Array)) else seed
 
     def __call__(self, dataset: Select[Any]) -> None:
dataeval/detectors/drift/_base.py CHANGED
@@ -13,7 +13,7 @@ __all__ = []
 import math
 from abc import abstractmethod
 from functools import wraps
-from typing import Callable, Literal, Protocol, TypeVar, runtime_checkable
+from typing import Any, Callable, Literal, Protocol, TypeVar, runtime_checkable
 
 import numpy as np
 from numpy.typing import NDArray
@@ -40,7 +40,7 @@ def update_strategy(fn: Callable[..., R]) -> Callable[..., R]:
     """Decorator to update x_ref with x using selected update methodology"""
 
     @wraps(fn)
-    def _(self: BaseDrift, data: Embeddings | Array, *args, **kwargs) -> R:
+    def _(self: BaseDrift, data: Embeddings | Array, *args: tuple[Any, ...], **kwargs: dict[str, Any]) -> R:
         output = fn(self, data, *args, **kwargs)
 
         # update reference dataset
@@ -184,7 +184,7 @@ class BaseDriftUnivariate(BaseDrift):
             threshold = self.p_val / self.n_features
             drift_pred = bool((p_vals < threshold).any())
             return drift_pred, threshold
-        elif self.correction == "fdr":
+        if self.correction == "fdr":
             n = p_vals.shape[0]
             i = np.arange(n) + np.int_(1)
             p_sorted = np.sort(p_vals)
@@ -195,8 +195,7 @@
             except ValueError:  # sorted p-values not below thresholds
                 return bool(below_threshold.any()), q_threshold.min()
             return bool(below_threshold.any()), q_threshold[idx_threshold]
-        else:
-            raise ValueError("`correction` needs to be either `bonferroni` or `fdr`.")
+        raise ValueError("`correction` needs to be either `bonferroni` or `fdr`.")
 
     @set_metadata
     @update_strategy
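Note (not part of the diff): a small standalone sketch of the Bonferroni branch retained above, which flags drift when any per-feature p-value falls below p_val / n_features.

import numpy as np

p_vals = np.array([0.30, 0.004, 0.20])  # made-up per-feature p-values
p_val, n_features = 0.05, p_vals.shape[0]

threshold = p_val / n_features
drift_pred = bool((p_vals < threshold).any())
print(drift_pred, round(threshold, 4))  # True 0.0167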
dataeval/detectors/drift/_mmd.py CHANGED
@@ -95,8 +95,7 @@ class DriftMMD(BaseDrift):
         k_xy = self._kernel(x, y)
         k_xx = self._k_xx if self._k_xx is not None and self.update_strategy is None else self._kernel(x, x)
         k_yy = self._kernel(y, y)
-        kernel_mat = torch.cat([torch.cat([k_xx, k_xy], 1), torch.cat([k_xy.T, k_yy], 1)], 0)
-        return kernel_mat
+        return torch.cat([torch.cat([k_xx, k_xy], 1), torch.cat([k_xy.T, k_yy], 1)], 0)
 
     def score(self, data: Embeddings | Array) -> tuple[float, float, float]:
         """
@@ -205,8 +204,7 @@ def sigma_median(x: torch.Tensor, y: torch.Tensor, dist: torch.Tensor) -> torch.
     n = min(x.shape[0], y.shape[0])
     n = n if (x[:n] == y[:n]).all() and x.shape == y.shape else 0
     n_median = n + (torch.prod(torch.as_tensor(dist.shape)) - n) // 2 - 1
-    sigma = (0.5 * dist.flatten().sort().values[int(n_median)].unsqueeze(dim=-1)) ** 0.5
-    return sigma
+    return (0.5 * dist.flatten().sort().values[int(n_median)].unsqueeze(dim=-1)) ** 0.5
 
 
 class GaussianRBF(torch.nn.Module):
@@ -310,5 +308,4 @@ def mmd2_from_kernel_matrix(
     kernel_mat = kernel_mat[idx][:, idx]
     k_xx, k_yy, k_xy = kernel_mat[:-m, :-m], kernel_mat[-m:, -m:], kernel_mat[-m:, :-m]
     c_xx, c_yy = 1 / (n * (n - 1)), 1 / (m * (m - 1))
-    mmd2 = c_xx * k_xx.sum() + c_yy * k_yy.sum() - 2.0 * k_xy.mean()
-    return mmd2
+    return c_xx * k_xx.sum() + c_yy * k_yy.sum() - 2.0 * k_xy.mean()
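Note (not part of the diff): the value returned by mmd2_from_kernel_matrix corresponds to the standard unbiased MMD^2 estimate, assuming the diagonal kernel entries have been zeroed upstream (not visible in this hunk):

\widehat{\mathrm{MMD}}^2 = \frac{1}{n(n-1)} \sum_{i \neq j} k(x_i, x_j) + \frac{1}{m(m-1)} \sum_{i \neq j} k(y_i, y_j) - \frac{2}{nm} \sum_{i=1}^{n} \sum_{j=1}^{m} k(x_i, y_j)

which matches c_xx * k_xx.sum() + c_yy * k_yy.sum() - 2.0 * k_xy.mean() with c_xx = 1 / (n * (n - 1)) and c_yy = 1 / (m * (m - 1)).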
dataeval/detectors/drift/_nml/_base.py CHANGED
@@ -27,7 +27,9 @@ def _validate(data: pd.DataFrame, expected_features: int | None = None) -> int:
     return data.shape[-1]
 
 
-def _create_multilevel_index(chunks: Sequence[Chunk], result_group_name: str, result_column_names: Sequence[str]):
+def _create_multilevel_index(
+    chunks: Sequence[Chunk], result_group_name: str, result_column_names: Sequence[str]
+) -> pd.MultiIndex:
     chunk_column_names = (*chunks[0].KEYS, "period")
     chunk_tuples = [("chunk", chunk_column_name) for chunk_column_name in chunk_column_names]
     result_tuples = [(result_group_name, column_name) for column_name in result_column_names]
@@ -37,7 +39,7 @@ def _create_multilevel_index(chunks: Sequence[Chunk], result_group_name: str, re
 class AbstractCalculator(ABC):
     """Base class for drift calculation."""
 
-    def __init__(self, chunker: Chunker | None = None, logger: Logger | None = None):
+    def __init__(self, chunker: Chunker | None = None, logger: Logger | None = None) -> None:
         self.chunker = chunker if isinstance(chunker, Chunker) else CountBasedChunker(10)
         self.result: DriftMVDCOutput | None = None
         self.n_features: int | None = None
dataeval/detectors/drift/_nml/_chunk.py CHANGED
@@ -16,7 +16,6 @@ from abc import ABC, abstractmethod
 from typing import Any, Generic, Literal, Sequence, TypeVar, cast
 
 import pandas as pd
-from dateutil.parser import ParserError
 from pandas import Index, Period
 from typing_extensions import Self
 
@@ -31,7 +30,7 @@ class Chunk(ABC):
     def __init__(
         self,
         data: pd.DataFrame,
-    ):
+    ) -> None:
         self.key: str
         self.data = data
 
@@ -39,11 +38,11 @@ class Chunk(ABC):
         self.end_index: int = -1
         self.chunk_index: int = -1
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         attr_str = ", ".join([f"{k}={v}" for k, v in self.dict().items()])
         return f"{self.__class__.__name__}(data=pd.DataFrame(shape={self.data.shape}), {attr_str})"
 
-    def __len__(self):
+    def __len__(self) -> int:
         return self.data.shape[0]
 
     @abstractmethod
@@ -76,7 +75,7 @@ class IndexChunk(Chunk):
         data: pd.DataFrame,
         start_index: int,
         end_index: int,
-    ):
+    ) -> None:
         super().__init__(data)
         self.key = f"[{start_index}:{end_index}]"
         self.start_index: int = start_index
@@ -113,7 +112,7 @@ class PeriodChunk(Chunk):
 
     KEYS = ("key", "chunk_index", "start_date", "end_date", "chunk_size")
 
-    def __init__(self, data: pd.DataFrame, period: Period, chunk_size: int):
+    def __init__(self, data: pd.DataFrame, period: Period, chunk_size: int) -> None:
         super().__init__(data)
         self.key = str(period)
         self.start_datetime = period.start_time
@@ -127,6 +126,7 @@ class PeriodChunk(Chunk):
         a, b = (self, other) if self < other else (other, self)
         result = copy.deepcopy(a)
         result.data = pd.concat([a.data, b.data])
+        result.end_index = b.end_index
         result.end_datetime = b.end_datetime
         result.chunk_size += b.chunk_size
         return result
@@ -237,13 +237,7 @@ class PeriodBasedChunker(Chunker[PeriodChunk]):
         if self.timestamp_column_name not in data:
             raise ValueError(f"timestamp column '{self.timestamp_column_name}' not in columns")
 
-        try:
-            grouped = data.groupby(pd.to_datetime(data[self.timestamp_column_name]).dt.to_period(self.offset))
-        except ParserError:
-            raise ValueError(
-                f"could not parse date_column '{self.timestamp_column_name}' values as dates."
-                f"Please verify if you've specified the correct date column."
-            )
+        grouped = data.groupby(pd.to_datetime(data[self.timestamp_column_name]).dt.to_period(self.offset))
 
         for k, v in grouped.groups.items():
             period, index = cast(Period, k), cast(Index, v)
@@ -281,7 +275,7 @@ class SizeBasedChunker(Chunker[IndexChunk]):
         self,
         chunk_size: int,
         incomplete: Literal["append", "drop", "keep"] = "keep",
-    ):
+    ) -> None:
         """Create a new SizeBasedChunker.
 
         Parameters
@@ -314,12 +308,11 @@ class SizeBasedChunker(Chunker[IndexChunk]):
     def _split(self, data: pd.DataFrame) -> list[IndexChunk]:
         def _create_chunk(index: int, data: pd.DataFrame, chunk_size: int) -> IndexChunk:
             chunk_data = data.iloc[index : index + chunk_size]
-            chunk = IndexChunk(
+            return IndexChunk(
                 data=chunk_data,
                 start_index=index,
                 end_index=index + chunk_size - 1,
             )
-            return chunk
 
         chunks = [
            _create_chunk(index=i, data=data, chunk_size=self.chunk_size)
@@ -364,7 +357,7 @@ class CountBasedChunker(Chunker[IndexChunk]):
         self,
         chunk_number: int,
         incomplete: Literal["append", "drop", "keep"] = "keep",
-    ):
+    ) -> None:
         """Creates a new CountBasedChunker.
 
         It will calculate the amount of observations per chunk based on the given chunk count.
@@ -400,5 +393,4 @@ class CountBasedChunker(Chunker[IndexChunk]):
     def _split(self, data: pd.DataFrame) -> list[IndexChunk]:
         chunk_size = data.shape[0] // self.chunk_number
         chunker = SizeBasedChunker(chunk_size, self.incomplete)
-        chunks = chunker.split(data=data)
-        return chunks
+        return chunker.split(data=data)
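Note (not part of the diff): an illustrative sketch of the two chunkers touched above; the constructor and split signatures are taken from this file, but the private module path and the exact chunk counts are assumptions.

import pandas as pd
from dataeval.detectors.drift._nml._chunk import CountBasedChunker, SizeBasedChunker  # private module

df = pd.DataFrame({"feature": range(25)})

size_chunks = SizeBasedChunker(chunk_size=10, incomplete="keep").split(data=df)
count_chunks = CountBasedChunker(chunk_number=5, incomplete="keep").split(data=df)

# With incomplete="keep" the 5-row remainder is expected to remain as its own chunk.
print(len(size_chunks), len(count_chunks))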
dataeval/detectors/drift/_nml/_domainclassifier.py CHANGED
@@ -20,7 +20,7 @@ from sklearn.model_selection import StratifiedKFold
 from dataeval.config import get_max_processes, get_seed
 from dataeval.detectors.drift._nml._base import AbstractCalculator, _create_multilevel_index
 from dataeval.detectors.drift._nml._chunk import Chunk, Chunker
-from dataeval.detectors.drift._nml._thresholds import ConstantThreshold, Threshold, calculate_threshold_values
+from dataeval.detectors.drift._nml._thresholds import ConstantThreshold, Threshold
 from dataeval.outputs._base import set_metadata
 from dataeval.outputs._drift import DriftMVDCOutput
 
@@ -38,10 +38,8 @@ DEFAULT_LGBM_HYPERPARAMS = {
     "min_child_weight": 0.001,
     "min_split_gain": 0.0,
     "n_estimators": 100,
-    "n_jobs": get_max_processes() or 0,
     "num_leaves": 31,
     "objective": None,
-    "random_state": get_seed(),
     "reg_alpha": 0.0,
     "reg_lambda": 0.0,
     "subsample": 1.0,
@@ -126,7 +124,7 @@ class DomainClassifierCalculator(AbstractCalculator):
             self.result._data = pd.concat([self.result._data, res], ignore_index=True)
         return self.result
 
-    def _calculate_chunk(self, chunk: Chunk):
+    def _calculate_chunk(self, chunk: Chunk) -> float:
         if self.result is None:
             # Use information from chunk indices to identify reference chunk's location. This is possible because
             # both the internal reference data copy and the chunk data were sorted by timestamp, so these
@@ -151,7 +149,7 @@ class DomainClassifierCalculator(AbstractCalculator):
             _try = y[train_index]
             _tsx = df_X.iloc[test_index]
             _tsy = y[test_index]
-            model = LGBMClassifier(**self.hyperparameters)
+            model = LGBMClassifier(**self.hyperparameters, n_jobs=get_max_processes(), random_state=get_seed())
             model.fit(_trx, _try)
             preds = np.asarray(model.predict_proba(_tsx), dtype=np.float32)[:, 1]
             all_preds.append(preds)
@@ -159,24 +157,15 @@ class DomainClassifierCalculator(AbstractCalculator):
 
         np_all_preds = np.concatenate(all_preds, axis=0)
         np_all_tgts = np.concatenate(all_tgts, axis=0)
-        try:
-            # catch case where all rows are duplicates
-            result = roc_auc_score(np_all_tgts, np_all_preds)
-        except ValueError as err:
-            if str(err) != "Only one class present in y_true. ROC AUC score is not defined in that case.":
-                raise
-            else:
-                # by definition if reference and chunk exactly match we can't discriminate
-                result = 0.5
-        return result
+        result = roc_auc_score(np_all_tgts, np_all_preds)
+        return 0.5 if result == np.nan else float(result)
 
     def _populate_alert_thresholds(self, result_data: pd.DataFrame) -> pd.DataFrame:
         if self.result is None:
-            self._threshold_values = calculate_threshold_values(
-                threshold=self.threshold,
+            self._threshold_values = self.threshold.calculate(
                data=result_data.loc[:, ("domain_classifier_auroc", "value")],  # type: ignore | dataframe loc
-                lower_threshold_value_limit=0.0,
-                upper_threshold_value_limit=1.0,
+                lower_limit=0.0,
+                upper_limit=1.0,
                 logger=self._logger,
             )
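Note (not part of the diff): with the change above, the LightGBM n_jobs and random_state are no longer pinned in DEFAULT_LGBM_HYPERPARAMS; they are read from the global DataEval configuration each time a chunk is fit. A minimal sketch of steering them through dataeval.config:

from dataeval.config import set_max_processes, set_seed

set_seed(42)          # consumed as LGBMClassifier(random_state=get_seed()) at fit time
set_max_processes(4)  # consumed as LGBMClassifier(n_jobs=get_max_processes()) at fit time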