dataeval 0.86.0__py3-none-any.whl → 0.86.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +1 -1
- dataeval/_log.py +1 -1
- dataeval/config.py +21 -4
- dataeval/data/_embeddings.py +2 -2
- dataeval/data/_images.py +2 -3
- dataeval/data/_metadata.py +48 -37
- dataeval/data/_selection.py +1 -2
- dataeval/data/_split.py +2 -3
- dataeval/data/_targets.py +17 -13
- dataeval/data/selections/_classfilter.py +2 -5
- dataeval/data/selections/_prioritize.py +6 -9
- dataeval/data/selections/_shuffle.py +3 -1
- dataeval/detectors/drift/_base.py +4 -5
- dataeval/detectors/drift/_mmd.py +3 -6
- dataeval/detectors/drift/_nml/_base.py +4 -2
- dataeval/detectors/drift/_nml/_chunk.py +11 -19
- dataeval/detectors/drift/_nml/_domainclassifier.py +8 -19
- dataeval/detectors/drift/_nml/_result.py +8 -9
- dataeval/detectors/drift/_nml/_thresholds.py +66 -77
- dataeval/detectors/linters/outliers.py +7 -7
- dataeval/metrics/bias/_parity.py +10 -13
- dataeval/metrics/estimators/_divergence.py +2 -4
- dataeval/metrics/stats/_base.py +103 -42
- dataeval/metrics/stats/_boxratiostats.py +21 -19
- dataeval/metrics/stats/_dimensionstats.py +14 -10
- dataeval/metrics/stats/_hashstats.py +1 -1
- dataeval/metrics/stats/_pixelstats.py +6 -6
- dataeval/metrics/stats/_visualstats.py +3 -3
- dataeval/outputs/_base.py +22 -7
- dataeval/outputs/_bias.py +26 -28
- dataeval/outputs/_drift.py +1 -9
- dataeval/outputs/_linters.py +11 -11
- dataeval/outputs/_stats.py +82 -23
- dataeval/outputs/_workflows.py +2 -2
- dataeval/utils/_array.py +6 -9
- dataeval/utils/_bin.py +1 -2
- dataeval/utils/_clusterer.py +7 -4
- dataeval/utils/_fast_mst.py +27 -13
- dataeval/utils/_image.py +65 -11
- dataeval/utils/_mst.py +1 -3
- dataeval/utils/_plot.py +15 -10
- dataeval/utils/data/_dataset.py +32 -20
- dataeval/utils/data/metadata.py +104 -82
- dataeval/utils/datasets/__init__.py +2 -0
- dataeval/utils/datasets/_antiuav.py +189 -0
- dataeval/utils/datasets/_base.py +11 -8
- dataeval/utils/datasets/_cifar10.py +104 -45
- dataeval/utils/datasets/_fileio.py +21 -47
- dataeval/utils/datasets/_milco.py +19 -11
- dataeval/utils/datasets/_mixin.py +2 -4
- dataeval/utils/datasets/_mnist.py +3 -4
- dataeval/utils/datasets/_ships.py +14 -7
- dataeval/utils/datasets/_voc.py +229 -42
- dataeval/utils/torch/models.py +5 -10
- dataeval/utils/torch/trainer.py +3 -3
- dataeval/workflows/sufficiency.py +2 -2
- {dataeval-0.86.0.dist-info → dataeval-0.86.1.dist-info}/METADATA +1 -1
- dataeval-0.86.1.dist-info/RECORD +114 -0
- dataeval/detectors/ood/vae.py +0 -74
- dataeval-0.86.0.dist-info/RECORD +0 -114
- {dataeval-0.86.0.dist-info → dataeval-0.86.1.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.86.0.dist-info → dataeval-0.86.1.dist-info}/WHEEL +0 -0
dataeval/__init__.py
CHANGED
dataeval/_log.py
CHANGED
dataeval/config.py
CHANGED
@@ -4,10 +4,10 @@ Global configuration settings for DataEval.
|
|
4
4
|
|
5
5
|
from __future__ import annotations
|
6
6
|
|
7
|
-
__all__ = ["get_device", "set_device", "get_max_processes", "set_max_processes", "DeviceLike"]
|
7
|
+
__all__ = ["get_device", "set_device", "get_max_processes", "set_max_processes", "use_max_processes", "DeviceLike"]
|
8
8
|
|
9
9
|
import sys
|
10
|
-
from typing import Union
|
10
|
+
from typing import Any, Union
|
11
11
|
|
12
12
|
if sys.version_info >= (3, 10):
|
13
13
|
from typing import TypeAlias
|
@@ -78,8 +78,7 @@ def get_device(override: DeviceLike | None = None) -> torch.device:
|
|
78
78
|
if override is None:
|
79
79
|
global _device
|
80
80
|
return torch.get_default_device() if _device is None else _device
|
81
|
-
|
82
|
-
return _todevice(override)
|
81
|
+
return _todevice(override)
|
83
82
|
|
84
83
|
|
85
84
|
def set_max_processes(processes: int | None) -> None:
|
@@ -112,6 +111,24 @@ def get_max_processes() -> int | None:
|
|
112
111
|
return _processes
|
113
112
|
|
114
113
|
|
114
|
+
class MaxProcessesContextManager:
|
115
|
+
def __init__(self, processes: int) -> None:
|
116
|
+
self._processes = processes
|
117
|
+
|
118
|
+
def __enter__(self) -> None:
|
119
|
+
global _processes
|
120
|
+
self._old = _processes
|
121
|
+
set_max_processes(self._processes)
|
122
|
+
|
123
|
+
def __exit__(self, *args: tuple[Any, ...]) -> None:
|
124
|
+
global _processes
|
125
|
+
_processes = self._old
|
126
|
+
|
127
|
+
|
128
|
+
def use_max_processes(processes: int) -> MaxProcessesContextManager:
|
129
|
+
return MaxProcessesContextManager(processes)
|
130
|
+
|
131
|
+
|
115
132
|
def set_seed(seed: int | None, all_generators: bool = False) -> None:
|
116
133
|
"""
|
117
134
|
Sets the seed for use by classes that allow for a random state or seed.
|
dataeval/data/_embeddings.py
CHANGED
@@ -144,8 +144,7 @@ class Embeddings:
|
|
144
144
|
"""
|
145
145
|
if indices is not None:
|
146
146
|
return torch.vstack(list(self._batch(indices))).to(self.device)
|
147
|
-
|
148
|
-
return self[:]
|
147
|
+
return self[:]
|
149
148
|
|
150
149
|
def to_numpy(self, indices: Sequence[int] | None = None) -> NDArray[Any]:
|
151
150
|
"""
|
@@ -248,6 +247,7 @@ class Embeddings:
|
|
248
247
|
_logger.log(logging.DEBUG, f"Saved embeddings cache from {path}")
|
249
248
|
except Exception as e:
|
250
249
|
_logger.log(logging.ERROR, f"Failed to save embeddings cache: {e}")
|
250
|
+
raise e
|
251
251
|
|
252
252
|
@classmethod
|
253
253
|
def load(cls, path: Path | str) -> Embeddings:
|
dataeval/data/_images.py
CHANGED
@@ -73,15 +73,14 @@ class Images(Generic[T]):
|
|
73
73
|
def __getitem__(self, key: int | slice, /) -> Sequence[T] | T:
|
74
74
|
if isinstance(key, slice):
|
75
75
|
return [self._get_image(k) for k in range(len(self._dataset))[key]]
|
76
|
-
|
76
|
+
if hasattr(key, "__int__"):
|
77
77
|
return self._get_image(int(key))
|
78
78
|
raise TypeError(f"Key must be integers or slices, not {type(key)}")
|
79
79
|
|
80
80
|
def _get_image(self, index: int) -> T:
|
81
81
|
if self._is_tuple_datum:
|
82
82
|
return cast(Dataset[tuple[T, Any, Any]], self._dataset)[index][0]
|
83
|
-
|
84
|
-
return cast(Dataset[T], self._dataset)[index]
|
83
|
+
return cast(Dataset[T], self._dataset)[index]
|
85
84
|
|
86
85
|
def __iter__(self) -> Iterator[T]:
|
87
86
|
for i in range(len(self._dataset)):
|
dataeval/data/_metadata.py
CHANGED
@@ -196,7 +196,7 @@ class Metadata:
|
|
196
196
|
self._process()
|
197
197
|
return int(self._image_indices.max() + 1)
|
198
198
|
|
199
|
-
def _collate(self, force: bool = False):
|
199
|
+
def _collate(self, force: bool = False) -> None:
|
200
200
|
if self._collated and not force:
|
201
201
|
return
|
202
202
|
|
@@ -243,7 +243,7 @@ class Metadata:
|
|
243
243
|
self._class_names = [index2label.get(i, str(i)) for i in np.unique(self._class_labels)]
|
244
244
|
self._collated = True
|
245
245
|
|
246
|
-
def _merge(self, force: bool = False):
|
246
|
+
def _merge(self, force: bool = False) -> None:
|
247
247
|
if self._merged is not None and not force:
|
248
248
|
return
|
249
249
|
|
@@ -266,48 +266,26 @@ class Metadata:
|
|
266
266
|
"Metadata dictionary needs to be a single dictionary whose values "
|
267
267
|
"are arraylike containing the metadata on a per image or per object basis."
|
268
268
|
)
|
269
|
-
else
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
)
|
269
|
+
check_length = len(v) if check_length is None else check_length
|
270
|
+
if check_length != len(v):
|
271
|
+
raise ValueError(
|
272
|
+
"The lists/arrays in the metadata dict have varying lengths. "
|
273
|
+
"Metadata requires them to be uniform in length."
|
274
|
+
)
|
276
275
|
if len(self._class_labels) != check_length:
|
277
276
|
raise ValueError(
|
278
277
|
f"The length of the label array {len(self._class_labels)} is not the same as "
|
279
278
|
f"the length of the metadata arrays {check_length}."
|
280
279
|
)
|
281
280
|
|
282
|
-
def
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
# Create image indices from targets
|
287
|
-
self._image_indices = np.arange(len(self.raw)) if self.targets.source is None else self.targets.source
|
288
|
-
|
289
|
-
# Validate the metadata dimensions
|
290
|
-
self._validate()
|
291
|
-
|
292
|
-
# Include specified metadata keys
|
293
|
-
if self.include:
|
294
|
-
metadata = {i: self.merged[i] for i in self.include if i in self.merged}
|
295
|
-
continuous_factor_bins = (
|
296
|
-
{i: self.continuous_factor_bins[i] for i in self.include if i in self.continuous_factor_bins}
|
297
|
-
if self.continuous_factor_bins
|
298
|
-
else {}
|
299
|
-
)
|
300
|
-
else:
|
301
|
-
metadata = self.merged
|
302
|
-
continuous_factor_bins = dict(self.continuous_factor_bins) if self.continuous_factor_bins else {}
|
303
|
-
for k in self.exclude:
|
304
|
-
metadata.pop(k, None)
|
305
|
-
continuous_factor_bins.pop(k, None)
|
306
|
-
|
307
|
-
# Remove generated "_image_index" if present
|
308
|
-
if "_image_index" in metadata:
|
309
|
-
metadata.pop("_image_index", None)
|
281
|
+
def _filter(self, d: Mapping[str, Any]) -> dict[str, Any]:
|
282
|
+
return (
|
283
|
+
{k: d[k] for k in self.include if k in d} if self.include else {k: d[k] for k in d if k not in self.exclude}
|
284
|
+
)
|
310
285
|
|
286
|
+
def _split_continuous_discrete(
|
287
|
+
self, metadata: dict[str, NDArray[Any]], continuous_factor_bins: dict[str, int | Sequence[float]]
|
288
|
+
) -> tuple[dict[str, NDArray[Any]], dict[str, NDArray[np.int64]]]:
|
311
289
|
# Bin according to user supplied bins
|
312
290
|
continuous_metadata = {}
|
313
291
|
discrete_metadata = {}
|
@@ -346,6 +324,28 @@ class Metadata:
|
|
346
324
|
else:
|
347
325
|
_, discrete_metadata[key] = np.unique(data, return_inverse=True)
|
348
326
|
|
327
|
+
return continuous_metadata, discrete_metadata
|
328
|
+
|
329
|
+
def _process(self, force: bool = False) -> None:
|
330
|
+
if self._processed and not force:
|
331
|
+
return
|
332
|
+
|
333
|
+
# Create image indices from targets
|
334
|
+
self._image_indices = np.arange(len(self.raw)) if self.targets.source is None else self.targets.source
|
335
|
+
|
336
|
+
# Validate the metadata dimensions
|
337
|
+
self._validate()
|
338
|
+
|
339
|
+
# Filter the merged metadata and continuous factor bins
|
340
|
+
metadata = self._filter(self.merged)
|
341
|
+
continuous_factor_bins = self._filter(self.continuous_factor_bins)
|
342
|
+
|
343
|
+
# Remove generated "_image_index" if present
|
344
|
+
metadata.pop("_image_index", None)
|
345
|
+
|
346
|
+
# Split the metadata into continuous and discrete
|
347
|
+
continuous_metadata, discrete_metadata = self._split_continuous_discrete(metadata, continuous_factor_bins)
|
348
|
+
|
349
349
|
# Split out the dictionaries into the keys and values
|
350
350
|
self._discrete_factor_names = list(discrete_metadata.keys())
|
351
351
|
self._discrete_data = (
|
@@ -363,6 +363,17 @@ class Metadata:
|
|
363
363
|
self._processed = True
|
364
364
|
|
365
365
|
def add_factors(self, factors: Mapping[str, ArrayLike]) -> None:
|
366
|
+
"""
|
367
|
+
Add additional factors to the metadata.
|
368
|
+
|
369
|
+
The number of measures per factor must match the number of images
|
370
|
+
in the dataset or the number of detections in the dataset.
|
371
|
+
|
372
|
+
Parameters
|
373
|
+
----------
|
374
|
+
factors : Mapping[str, ArrayLike]
|
375
|
+
Dictionary of factors to add to the metadata.
|
376
|
+
"""
|
366
377
|
self._merge()
|
367
378
|
|
368
379
|
targets = len(self.targets.source) if self.targets.source is not None else len(self.targets)
|
dataeval/data/_selection.py
CHANGED
@@ -110,8 +110,7 @@ class Select(AnnotatedDataset[_TDatum]):
|
|
110
110
|
grouped: dict[int, list[Selection[_TDatum]]] = {}
|
111
111
|
for selection in selections_list:
|
112
112
|
grouped.setdefault(selection.stage, []).append(selection)
|
113
|
-
|
114
|
-
return selection_list
|
113
|
+
return [selection for category in sorted(grouped) for selection in grouped[category]]
|
115
114
|
|
116
115
|
def _apply_selections(self) -> None:
|
117
116
|
for selection in self._selections:
|
dataeval/data/_split.py
CHANGED
@@ -23,7 +23,7 @@ _logger = logging.getLogger(__name__)
|
|
23
23
|
class KFoldSplitter(Protocol):
|
24
24
|
"""Protocol covering sklearn KFold variant splitters"""
|
25
25
|
|
26
|
-
def __init__(self, n_splits: int): ...
|
26
|
+
def __init__(self, n_splits: int) -> None: ...
|
27
27
|
def split(self, X: Any, y: Any, groups: Any) -> Iterator[tuple[NDArray[Any], NDArray[Any]]]: ...
|
28
28
|
|
29
29
|
|
@@ -209,8 +209,7 @@ def get_groups(metadata: Metadata, split_on: Sequence[str] | None) -> NDArray[np
|
|
209
209
|
split_set = set(split_on)
|
210
210
|
indices = [i for i, name in enumerate(metadata.discrete_factor_names) if name in split_set]
|
211
211
|
binned_features = metadata.discrete_data[:, indices]
|
212
|
-
|
213
|
-
return group_ids
|
212
|
+
return np.unique(binned_features, axis=0, return_inverse=True)[1]
|
214
213
|
|
215
214
|
|
216
215
|
def make_splits(
|
dataeval/data/_targets.py
CHANGED
@@ -24,11 +24,13 @@ class Targets:
|
|
24
24
|
labels : NDArray[np.intp]
|
25
25
|
Labels (N,) for N images or objects
|
26
26
|
scores : NDArray[np.float32]
|
27
|
-
Probability scores (N,M) for N images of M classes or confidence score (N,) of objects
|
27
|
+
Probability scores (N, M) for N images of M classes or confidence score (N,) of objects
|
28
28
|
bboxes : NDArray[np.float32] | None
|
29
|
-
Bounding boxes (N,4) for N objects in (x0,y0,x1,y1) format
|
29
|
+
Bounding boxes (N, 4) for N objects in (x0, y0, x1, y1) format
|
30
30
|
source : NDArray[np.intp] | None
|
31
31
|
Source image index (N,) for N objects
|
32
|
+
size : int
|
33
|
+
Count of objects
|
32
34
|
"""
|
33
35
|
|
34
36
|
labels: NDArray[np.intp]
|
@@ -55,13 +57,16 @@ class Targets:
|
|
55
57
|
)
|
56
58
|
|
57
59
|
if self.bboxes is not None and len(self.bboxes) > 0 and self.bboxes.shape[-1] != 4:
|
58
|
-
raise ValueError("Bounding boxes must be in (x0,y0,x1,y1) format.")
|
60
|
+
raise ValueError("Bounding boxes must be in (x0, y0, x1, y1) format.")
|
61
|
+
|
62
|
+
@property
|
63
|
+
def size(self) -> int:
|
64
|
+
return len(self.labels)
|
59
65
|
|
60
66
|
def __len__(self) -> int:
|
61
67
|
if self.source is None:
|
62
68
|
return len(self.labels)
|
63
|
-
|
64
|
-
return len(np.unique(self.source))
|
69
|
+
return len(np.unique(self.source))
|
65
70
|
|
66
71
|
def __getitem__(self, idx: int, /) -> Targets:
|
67
72
|
if self.source is None or self.bboxes is None:
|
@@ -71,14 +76,13 @@ class Targets:
|
|
71
76
|
None,
|
72
77
|
None,
|
73
78
|
)
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
)
|
79
|
+
mask = np.where(self.source == idx, True, False)
|
80
|
+
return Targets(
|
81
|
+
np.atleast_1d(self.labels[mask]),
|
82
|
+
np.atleast_1d(self.scores[mask]),
|
83
|
+
np.atleast_2d(self.bboxes[mask]),
|
84
|
+
np.atleast_1d(self.source[mask]),
|
85
|
+
)
|
82
86
|
|
83
87
|
def __iter__(self) -> Iterator[Targets]:
|
84
88
|
for i in range(len(self.labels)) if self.source is None else np.unique(self.source):
|
@@ -68,11 +68,8 @@ _TTarget = TypeVar("_TTarget", ObjectDetectionTarget, SegmentationTarget)
|
|
68
68
|
|
69
69
|
|
70
70
|
def _try_mask_object(obj: _T, mask: NDArray[np.bool_]) -> _T:
|
71
|
-
if isinstance(obj,
|
72
|
-
if isinstance(obj, Array)
|
73
|
-
return obj[mask]
|
74
|
-
elif isinstance(obj, Sequence):
|
75
|
-
return cast(_T, [item for i, item in enumerate(obj) if mask[i]])
|
71
|
+
if not isinstance(obj, (str, bytes, bytearray)) and isinstance(obj, (Sequence, Array)) and len(obj) == len(mask):
|
72
|
+
return obj[mask] if isinstance(obj, Array) else cast(_T, [item for i, item in enumerate(obj) if mask[i]])
|
76
73
|
return obj
|
77
74
|
|
78
75
|
|
@@ -99,8 +99,7 @@ class _KNNSorter(_Sorter):
|
|
99
99
|
np.fill_diagonal(dists, np.inf)
|
100
100
|
else:
|
101
101
|
dists = pairwise_distances(embeddings, reference)
|
102
|
-
|
103
|
-
return inds
|
102
|
+
return np.argsort(np.sort(dists, axis=1)[:, self._k])
|
104
103
|
|
105
104
|
|
106
105
|
class _KMeansSorter(_Sorter):
|
@@ -124,15 +123,13 @@ class _KMeansSorter(_Sorter):
|
|
124
123
|
class _KMeansDistanceSorter(_KMeansSorter):
|
125
124
|
def _sort(self, embeddings: NDArray[Any], reference: NDArray[Any] | None = None) -> NDArray[np.intp]:
|
126
125
|
clst = self._get_clusters(embeddings if reference is None else reference)
|
127
|
-
|
128
|
-
return inds
|
126
|
+
return np.argsort(clst._dist2center(embeddings))
|
129
127
|
|
130
128
|
|
131
129
|
class _KMeansComplexitySorter(_KMeansSorter):
|
132
130
|
def _sort(self, embeddings: NDArray[Any], reference: NDArray[Any] | None = None) -> NDArray[np.intp]:
|
133
131
|
clst = self._get_clusters(embeddings if reference is None else reference)
|
134
|
-
|
135
|
-
return inds
|
132
|
+
return clst._sort_by_weights(embeddings)
|
136
133
|
|
137
134
|
|
138
135
|
class Prioritize(Selection[Any]):
|
@@ -266,10 +263,10 @@ class Prioritize(Selection[Any]):
|
|
266
263
|
def _get_sorter(self, samples: int) -> _Sorter:
|
267
264
|
if self._method == "knn":
|
268
265
|
return _KNNSorter(samples, self._k)
|
269
|
-
|
266
|
+
if self._method == "kmeans_distance":
|
270
267
|
return _KMeansDistanceSorter(samples, self._c)
|
271
|
-
|
272
|
-
|
268
|
+
# self._method == "kmeans_complexity"
|
269
|
+
return _KMeansComplexitySorter(samples, self._c)
|
273
270
|
|
274
271
|
def _to_normalized_ndarray(self, embeddings: Embeddings, selection: list[int] | None = None) -> NDArray[Any]:
|
275
272
|
emb: NDArray[Any] = embeddings.to_numpy(selection)
|
@@ -30,7 +30,9 @@ class Shuffle(Selection[Any]):
|
|
30
30
|
seed: int | NDArray[Any] | SeedSequence | BitGenerator | Generator | None
|
31
31
|
stage = SelectionStage.ORDER
|
32
32
|
|
33
|
-
def __init__(
|
33
|
+
def __init__(
|
34
|
+
self, seed: int | Sequence[int] | Array | SeedSequence | BitGenerator | Generator | None = None
|
35
|
+
) -> None:
|
34
36
|
self.seed = as_numpy(seed) if isinstance(seed, (Sequence, Array)) else seed
|
35
37
|
|
36
38
|
def __call__(self, dataset: Select[Any]) -> None:
|
@@ -13,7 +13,7 @@ __all__ = []
|
|
13
13
|
import math
|
14
14
|
from abc import abstractmethod
|
15
15
|
from functools import wraps
|
16
|
-
from typing import Callable, Literal, Protocol, TypeVar, runtime_checkable
|
16
|
+
from typing import Any, Callable, Literal, Protocol, TypeVar, runtime_checkable
|
17
17
|
|
18
18
|
import numpy as np
|
19
19
|
from numpy.typing import NDArray
|
@@ -40,7 +40,7 @@ def update_strategy(fn: Callable[..., R]) -> Callable[..., R]:
|
|
40
40
|
"""Decorator to update x_ref with x using selected update methodology"""
|
41
41
|
|
42
42
|
@wraps(fn)
|
43
|
-
def _(self: BaseDrift, data: Embeddings | Array, *args, **kwargs) -> R:
|
43
|
+
def _(self: BaseDrift, data: Embeddings | Array, *args: tuple[Any, ...], **kwargs: dict[str, Any]) -> R:
|
44
44
|
output = fn(self, data, *args, **kwargs)
|
45
45
|
|
46
46
|
# update reference dataset
|
@@ -184,7 +184,7 @@ class BaseDriftUnivariate(BaseDrift):
|
|
184
184
|
threshold = self.p_val / self.n_features
|
185
185
|
drift_pred = bool((p_vals < threshold).any())
|
186
186
|
return drift_pred, threshold
|
187
|
-
|
187
|
+
if self.correction == "fdr":
|
188
188
|
n = p_vals.shape[0]
|
189
189
|
i = np.arange(n) + np.int_(1)
|
190
190
|
p_sorted = np.sort(p_vals)
|
@@ -195,8 +195,7 @@ class BaseDriftUnivariate(BaseDrift):
|
|
195
195
|
except ValueError: # sorted p-values not below thresholds
|
196
196
|
return bool(below_threshold.any()), q_threshold.min()
|
197
197
|
return bool(below_threshold.any()), q_threshold[idx_threshold]
|
198
|
-
|
199
|
-
raise ValueError("`correction` needs to be either `bonferroni` or `fdr`.")
|
198
|
+
raise ValueError("`correction` needs to be either `bonferroni` or `fdr`.")
|
200
199
|
|
201
200
|
@set_metadata
|
202
201
|
@update_strategy
|
dataeval/detectors/drift/_mmd.py
CHANGED
@@ -95,8 +95,7 @@ class DriftMMD(BaseDrift):
|
|
95
95
|
k_xy = self._kernel(x, y)
|
96
96
|
k_xx = self._k_xx if self._k_xx is not None and self.update_strategy is None else self._kernel(x, x)
|
97
97
|
k_yy = self._kernel(y, y)
|
98
|
-
|
99
|
-
return kernel_mat
|
98
|
+
return torch.cat([torch.cat([k_xx, k_xy], 1), torch.cat([k_xy.T, k_yy], 1)], 0)
|
100
99
|
|
101
100
|
def score(self, data: Embeddings | Array) -> tuple[float, float, float]:
|
102
101
|
"""
|
@@ -205,8 +204,7 @@ def sigma_median(x: torch.Tensor, y: torch.Tensor, dist: torch.Tensor) -> torch.
|
|
205
204
|
n = min(x.shape[0], y.shape[0])
|
206
205
|
n = n if (x[:n] == y[:n]).all() and x.shape == y.shape else 0
|
207
206
|
n_median = n + (torch.prod(torch.as_tensor(dist.shape)) - n) // 2 - 1
|
208
|
-
|
209
|
-
return sigma
|
207
|
+
return (0.5 * dist.flatten().sort().values[int(n_median)].unsqueeze(dim=-1)) ** 0.5
|
210
208
|
|
211
209
|
|
212
210
|
class GaussianRBF(torch.nn.Module):
|
@@ -310,5 +308,4 @@ def mmd2_from_kernel_matrix(
|
|
310
308
|
kernel_mat = kernel_mat[idx][:, idx]
|
311
309
|
k_xx, k_yy, k_xy = kernel_mat[:-m, :-m], kernel_mat[-m:, -m:], kernel_mat[-m:, :-m]
|
312
310
|
c_xx, c_yy = 1 / (n * (n - 1)), 1 / (m * (m - 1))
|
313
|
-
|
314
|
-
return mmd2
|
311
|
+
return c_xx * k_xx.sum() + c_yy * k_yy.sum() - 2.0 * k_xy.mean()
|
@@ -27,7 +27,9 @@ def _validate(data: pd.DataFrame, expected_features: int | None = None) -> int:
|
|
27
27
|
return data.shape[-1]
|
28
28
|
|
29
29
|
|
30
|
-
def _create_multilevel_index(
|
30
|
+
def _create_multilevel_index(
|
31
|
+
chunks: Sequence[Chunk], result_group_name: str, result_column_names: Sequence[str]
|
32
|
+
) -> pd.MultiIndex:
|
31
33
|
chunk_column_names = (*chunks[0].KEYS, "period")
|
32
34
|
chunk_tuples = [("chunk", chunk_column_name) for chunk_column_name in chunk_column_names]
|
33
35
|
result_tuples = [(result_group_name, column_name) for column_name in result_column_names]
|
@@ -37,7 +39,7 @@ def _create_multilevel_index(chunks: Sequence[Chunk], result_group_name: str, re
|
|
37
39
|
class AbstractCalculator(ABC):
|
38
40
|
"""Base class for drift calculation."""
|
39
41
|
|
40
|
-
def __init__(self, chunker: Chunker | None = None, logger: Logger | None = None):
|
42
|
+
def __init__(self, chunker: Chunker | None = None, logger: Logger | None = None) -> None:
|
41
43
|
self.chunker = chunker if isinstance(chunker, Chunker) else CountBasedChunker(10)
|
42
44
|
self.result: DriftMVDCOutput | None = None
|
43
45
|
self.n_features: int | None = None
|
@@ -16,7 +16,6 @@ from abc import ABC, abstractmethod
|
|
16
16
|
from typing import Any, Generic, Literal, Sequence, TypeVar, cast
|
17
17
|
|
18
18
|
import pandas as pd
|
19
|
-
from dateutil.parser import ParserError
|
20
19
|
from pandas import Index, Period
|
21
20
|
from typing_extensions import Self
|
22
21
|
|
@@ -31,7 +30,7 @@ class Chunk(ABC):
|
|
31
30
|
def __init__(
|
32
31
|
self,
|
33
32
|
data: pd.DataFrame,
|
34
|
-
):
|
33
|
+
) -> None:
|
35
34
|
self.key: str
|
36
35
|
self.data = data
|
37
36
|
|
@@ -39,11 +38,11 @@ class Chunk(ABC):
|
|
39
38
|
self.end_index: int = -1
|
40
39
|
self.chunk_index: int = -1
|
41
40
|
|
42
|
-
def __repr__(self):
|
41
|
+
def __repr__(self) -> str:
|
43
42
|
attr_str = ", ".join([f"{k}={v}" for k, v in self.dict().items()])
|
44
43
|
return f"{self.__class__.__name__}(data=pd.DataFrame(shape={self.data.shape}), {attr_str})"
|
45
44
|
|
46
|
-
def __len__(self):
|
45
|
+
def __len__(self) -> int:
|
47
46
|
return self.data.shape[0]
|
48
47
|
|
49
48
|
@abstractmethod
|
@@ -76,7 +75,7 @@ class IndexChunk(Chunk):
|
|
76
75
|
data: pd.DataFrame,
|
77
76
|
start_index: int,
|
78
77
|
end_index: int,
|
79
|
-
):
|
78
|
+
) -> None:
|
80
79
|
super().__init__(data)
|
81
80
|
self.key = f"[{start_index}:{end_index}]"
|
82
81
|
self.start_index: int = start_index
|
@@ -113,7 +112,7 @@ class PeriodChunk(Chunk):
|
|
113
112
|
|
114
113
|
KEYS = ("key", "chunk_index", "start_date", "end_date", "chunk_size")
|
115
114
|
|
116
|
-
def __init__(self, data: pd.DataFrame, period: Period, chunk_size: int):
|
115
|
+
def __init__(self, data: pd.DataFrame, period: Period, chunk_size: int) -> None:
|
117
116
|
super().__init__(data)
|
118
117
|
self.key = str(period)
|
119
118
|
self.start_datetime = period.start_time
|
@@ -127,6 +126,7 @@ class PeriodChunk(Chunk):
|
|
127
126
|
a, b = (self, other) if self < other else (other, self)
|
128
127
|
result = copy.deepcopy(a)
|
129
128
|
result.data = pd.concat([a.data, b.data])
|
129
|
+
result.end_index = b.end_index
|
130
130
|
result.end_datetime = b.end_datetime
|
131
131
|
result.chunk_size += b.chunk_size
|
132
132
|
return result
|
@@ -237,13 +237,7 @@ class PeriodBasedChunker(Chunker[PeriodChunk]):
|
|
237
237
|
if self.timestamp_column_name not in data:
|
238
238
|
raise ValueError(f"timestamp column '{self.timestamp_column_name}' not in columns")
|
239
239
|
|
240
|
-
|
241
|
-
grouped = data.groupby(pd.to_datetime(data[self.timestamp_column_name]).dt.to_period(self.offset))
|
242
|
-
except ParserError:
|
243
|
-
raise ValueError(
|
244
|
-
f"could not parse date_column '{self.timestamp_column_name}' values as dates."
|
245
|
-
f"Please verify if you've specified the correct date column."
|
246
|
-
)
|
240
|
+
grouped = data.groupby(pd.to_datetime(data[self.timestamp_column_name]).dt.to_period(self.offset))
|
247
241
|
|
248
242
|
for k, v in grouped.groups.items():
|
249
243
|
period, index = cast(Period, k), cast(Index, v)
|
@@ -281,7 +275,7 @@ class SizeBasedChunker(Chunker[IndexChunk]):
|
|
281
275
|
self,
|
282
276
|
chunk_size: int,
|
283
277
|
incomplete: Literal["append", "drop", "keep"] = "keep",
|
284
|
-
):
|
278
|
+
) -> None:
|
285
279
|
"""Create a new SizeBasedChunker.
|
286
280
|
|
287
281
|
Parameters
|
@@ -314,12 +308,11 @@ class SizeBasedChunker(Chunker[IndexChunk]):
|
|
314
308
|
def _split(self, data: pd.DataFrame) -> list[IndexChunk]:
|
315
309
|
def _create_chunk(index: int, data: pd.DataFrame, chunk_size: int) -> IndexChunk:
|
316
310
|
chunk_data = data.iloc[index : index + chunk_size]
|
317
|
-
|
311
|
+
return IndexChunk(
|
318
312
|
data=chunk_data,
|
319
313
|
start_index=index,
|
320
314
|
end_index=index + chunk_size - 1,
|
321
315
|
)
|
322
|
-
return chunk
|
323
316
|
|
324
317
|
chunks = [
|
325
318
|
_create_chunk(index=i, data=data, chunk_size=self.chunk_size)
|
@@ -364,7 +357,7 @@ class CountBasedChunker(Chunker[IndexChunk]):
|
|
364
357
|
self,
|
365
358
|
chunk_number: int,
|
366
359
|
incomplete: Literal["append", "drop", "keep"] = "keep",
|
367
|
-
):
|
360
|
+
) -> None:
|
368
361
|
"""Creates a new CountBasedChunker.
|
369
362
|
|
370
363
|
It will calculate the amount of observations per chunk based on the given chunk count.
|
@@ -400,5 +393,4 @@ class CountBasedChunker(Chunker[IndexChunk]):
|
|
400
393
|
def _split(self, data: pd.DataFrame) -> list[IndexChunk]:
|
401
394
|
chunk_size = data.shape[0] // self.chunk_number
|
402
395
|
chunker = SizeBasedChunker(chunk_size, self.incomplete)
|
403
|
-
|
404
|
-
return chunks
|
396
|
+
return chunker.split(data=data)
|
@@ -20,7 +20,7 @@ from sklearn.model_selection import StratifiedKFold
|
|
20
20
|
from dataeval.config import get_max_processes, get_seed
|
21
21
|
from dataeval.detectors.drift._nml._base import AbstractCalculator, _create_multilevel_index
|
22
22
|
from dataeval.detectors.drift._nml._chunk import Chunk, Chunker
|
23
|
-
from dataeval.detectors.drift._nml._thresholds import ConstantThreshold, Threshold
|
23
|
+
from dataeval.detectors.drift._nml._thresholds import ConstantThreshold, Threshold
|
24
24
|
from dataeval.outputs._base import set_metadata
|
25
25
|
from dataeval.outputs._drift import DriftMVDCOutput
|
26
26
|
|
@@ -38,10 +38,8 @@ DEFAULT_LGBM_HYPERPARAMS = {
|
|
38
38
|
"min_child_weight": 0.001,
|
39
39
|
"min_split_gain": 0.0,
|
40
40
|
"n_estimators": 100,
|
41
|
-
"n_jobs": get_max_processes() or 0,
|
42
41
|
"num_leaves": 31,
|
43
42
|
"objective": None,
|
44
|
-
"random_state": get_seed(),
|
45
43
|
"reg_alpha": 0.0,
|
46
44
|
"reg_lambda": 0.0,
|
47
45
|
"subsample": 1.0,
|
@@ -126,7 +124,7 @@ class DomainClassifierCalculator(AbstractCalculator):
|
|
126
124
|
self.result._data = pd.concat([self.result._data, res], ignore_index=True)
|
127
125
|
return self.result
|
128
126
|
|
129
|
-
def _calculate_chunk(self, chunk: Chunk):
|
127
|
+
def _calculate_chunk(self, chunk: Chunk) -> float:
|
130
128
|
if self.result is None:
|
131
129
|
# Use information from chunk indices to identify reference chunk's location. This is possible because
|
132
130
|
# both the internal reference data copy and the chunk data were sorted by timestamp, so these
|
@@ -151,7 +149,7 @@ class DomainClassifierCalculator(AbstractCalculator):
|
|
151
149
|
_try = y[train_index]
|
152
150
|
_tsx = df_X.iloc[test_index]
|
153
151
|
_tsy = y[test_index]
|
154
|
-
model = LGBMClassifier(**self.hyperparameters)
|
152
|
+
model = LGBMClassifier(**self.hyperparameters, n_jobs=get_max_processes(), random_state=get_seed())
|
155
153
|
model.fit(_trx, _try)
|
156
154
|
preds = np.asarray(model.predict_proba(_tsx), dtype=np.float32)[:, 1]
|
157
155
|
all_preds.append(preds)
|
@@ -159,24 +157,15 @@ class DomainClassifierCalculator(AbstractCalculator):
|
|
159
157
|
|
160
158
|
np_all_preds = np.concatenate(all_preds, axis=0)
|
161
159
|
np_all_tgts = np.concatenate(all_tgts, axis=0)
|
162
|
-
|
163
|
-
|
164
|
-
result = roc_auc_score(np_all_tgts, np_all_preds)
|
165
|
-
except ValueError as err:
|
166
|
-
if str(err) != "Only one class present in y_true. ROC AUC score is not defined in that case.":
|
167
|
-
raise
|
168
|
-
else:
|
169
|
-
# by definition if reference and chunk exactly match we can't discriminate
|
170
|
-
result = 0.5
|
171
|
-
return result
|
160
|
+
result = roc_auc_score(np_all_tgts, np_all_preds)
|
161
|
+
return 0.5 if result == np.nan else float(result)
|
172
162
|
|
173
163
|
def _populate_alert_thresholds(self, result_data: pd.DataFrame) -> pd.DataFrame:
|
174
164
|
if self.result is None:
|
175
|
-
self._threshold_values =
|
176
|
-
threshold=self.threshold,
|
165
|
+
self._threshold_values = self.threshold.calculate(
|
177
166
|
data=result_data.loc[:, ("domain_classifier_auroc", "value")], # type: ignore | dataframe loc
|
178
|
-
|
179
|
-
|
167
|
+
lower_limit=0.0,
|
168
|
+
upper_limit=1.0,
|
180
169
|
logger=self._logger,
|
181
170
|
)
|
182
171
|
|