dataeval 0.87.0__py3-none-any.whl → 0.88.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/_log.py +1 -1
- dataeval/_version.py +2 -2
- dataeval/data/_embeddings.py +78 -35
- dataeval/data/_images.py +41 -8
- dataeval/data/_metadata.py +294 -41
- dataeval/data/_selection.py +22 -7
- dataeval/data/_split.py +2 -1
- dataeval/data/selections/_classfilter.py +4 -3
- dataeval/data/selections/_indices.py +2 -1
- dataeval/data/selections/_shuffle.py +3 -2
- dataeval/detectors/drift/_base.py +2 -1
- dataeval/detectors/drift/_mmd.py +2 -1
- dataeval/detectors/drift/_nml/_base.py +1 -1
- dataeval/detectors/drift/_nml/_chunk.py +2 -1
- dataeval/detectors/drift/_nml/_result.py +3 -2
- dataeval/detectors/drift/_nml/_thresholds.py +6 -5
- dataeval/detectors/drift/_uncertainty.py +2 -1
- dataeval/detectors/linters/duplicates.py +2 -1
- dataeval/detectors/linters/outliers.py +4 -3
- dataeval/detectors/ood/ae.py +1 -1
- dataeval/detectors/ood/base.py +2 -1
- dataeval/detectors/ood/mixin.py +2 -1
- dataeval/metadata/_utils.py +1 -1
- dataeval/metrics/bias/_balance.py +1 -1
- dataeval/metrics/stats/_base.py +3 -29
- dataeval/metrics/stats/_boxratiostats.py +2 -1
- dataeval/metrics/stats/_dimensionstats.py +2 -1
- dataeval/metrics/stats/_hashstats.py +2 -1
- dataeval/metrics/stats/_pixelstats.py +2 -1
- dataeval/metrics/stats/_visualstats.py +2 -1
- dataeval/outputs/_base.py +2 -3
- dataeval/outputs/_bias.py +2 -1
- dataeval/outputs/_estimators.py +1 -1
- dataeval/outputs/_linters.py +3 -3
- dataeval/outputs/_stats.py +3 -3
- dataeval/outputs/_utils.py +1 -1
- dataeval/outputs/_workflows.py +29 -24
- dataeval/typing.py +11 -9
- dataeval/utils/_array.py +3 -2
- dataeval/utils/_bin.py +2 -1
- dataeval/utils/_method.py +2 -3
- dataeval/utils/_multiprocessing.py +34 -0
- dataeval/utils/_plot.py +2 -1
- dataeval/utils/data/__init__.py +4 -5
- dataeval/utils/data/{metadata.py → _merge.py} +3 -2
- dataeval/utils/data/_validate.py +2 -1
- dataeval/utils/data/collate.py +2 -1
- dataeval/utils/torch/_internal.py +2 -1
- dataeval/utils/torch/trainer.py +1 -1
- dataeval/workflows/sufficiency.py +13 -9
- {dataeval-0.87.0.dist-info → dataeval-0.88.0.dist-info}/METADATA +4 -5
- dataeval-0.88.0.dist-info/RECORD +105 -0
- dataeval/utils/data/_dataset.py +0 -253
- dataeval-0.87.0.dist-info/RECORD +0 -105
- {dataeval-0.87.0.dist-info → dataeval-0.88.0.dist-info}/WHEEL +0 -0
- {dataeval-0.87.0.dist-info → dataeval-0.88.0.dist-info}/licenses/LICENSE +0 -0
dataeval/_log.py
CHANGED
dataeval/_version.py
CHANGED
dataeval/data/_embeddings.py
CHANGED
@@ -5,8 +5,9 @@ __all__ = []
|
|
5
5
|
import logging
|
6
6
|
import math
|
7
7
|
import os
|
8
|
+
from collections.abc import Iterator, Sequence
|
8
9
|
from pathlib import Path
|
9
|
-
from typing import Any,
|
10
|
+
from typing import Any, cast
|
10
11
|
|
11
12
|
import torch
|
12
13
|
import xxhash as xxh
|
@@ -15,7 +16,14 @@ from torch.utils.data import DataLoader, Subset
|
|
15
16
|
from tqdm import tqdm
|
16
17
|
|
17
18
|
from dataeval.config import DeviceLike, get_device
|
18
|
-
from dataeval.typing import
|
19
|
+
from dataeval.typing import (
|
20
|
+
AnnotatedDataset,
|
21
|
+
AnnotatedModel,
|
22
|
+
Array,
|
23
|
+
ArrayLike,
|
24
|
+
Dataset,
|
25
|
+
Transform,
|
26
|
+
)
|
19
27
|
from dataeval.utils._array import as_numpy
|
20
28
|
from dataeval.utils.torch.models import SupportsEncode
|
21
29
|
|
@@ -26,38 +34,41 @@ class Embeddings:
|
|
26
34
|
"""
|
27
35
|
Collection of image embeddings from a dataset.
|
28
36
|
|
29
|
-
Embeddings are accessed by index or slice and are
|
37
|
+
Embeddings are accessed by index or slice and are loaded on-demand.
|
30
38
|
|
31
39
|
Parameters
|
32
40
|
----------
|
33
41
|
dataset : ImageClassificationDataset or ObjectDetectionDataset
|
34
42
|
Dataset to access original images from.
|
35
43
|
batch_size : int
|
36
|
-
Batch size to use when encoding images.
|
44
|
+
Batch size to use when encoding images. When less than 1, automatically sets to 1 for safe processing.
|
37
45
|
transforms : Transform or Sequence[Transform] or None, default None
|
38
|
-
|
46
|
+
Image transformations to apply before encoding. When None, uses raw images without
|
47
|
+
preprocessing.
|
39
48
|
model : torch.nn.Module or None, default None
|
40
|
-
|
49
|
+
Neural network model that generates embeddings from images. When None, uses Flatten layer for simple
|
50
|
+
baseline compatibility with all DataEval tools without requiring pre-trained weights or GPU resources.
|
41
51
|
device : DeviceLike or None, default None
|
42
|
-
|
43
|
-
|
52
|
+
Hardware device for computation. When None, automatically selects DataEval's configured device, falling
|
53
|
+
back to PyTorch's default.
|
44
54
|
cache : Path, str, or bool, default False
|
45
|
-
|
46
|
-
When
|
55
|
+
When True, caches embeddings in memory for faster repeated access.
|
56
|
+
When Path or string is provided, persists embeddings to disk for reuse across sessions.
|
57
|
+
Default False minimizes memory usage.
|
47
58
|
verbose : bool, default False
|
48
|
-
|
59
|
+
When True, displays a progress bar when encoding images. Default False reduces console output
|
60
|
+
for cleaner automated workflows.
|
49
61
|
|
50
62
|
Attributes
|
51
63
|
----------
|
52
64
|
batch_size : int
|
53
|
-
|
65
|
+
Number of images processed per batch during encoding. Minimum value of 1.
|
54
66
|
cache : Path or bool
|
55
|
-
|
67
|
+
Disk path where embeddings are stored, or True when cached in memory.
|
56
68
|
device : torch.device
|
57
|
-
|
58
|
-
default or torch default.
|
69
|
+
Hardware device used for tensor computations.
|
59
70
|
verbose : bool
|
60
|
-
Whether
|
71
|
+
Whether progress information is displayed during operations.
|
61
72
|
"""
|
62
73
|
|
63
74
|
device: torch.device
|
@@ -66,6 +77,7 @@ class Embeddings:
|
|
66
77
|
|
67
78
|
def __init__(
|
68
79
|
self,
|
80
|
+
# Technically more permissive than ImageClassificationDataset or ObjectDetectionDataset
|
69
81
|
dataset: Dataset[tuple[ArrayLike, Any, Any]] | Dataset[ArrayLike],
|
70
82
|
batch_size: int,
|
71
83
|
transforms: Transform[torch.Tensor] | Sequence[Transform[torch.Tensor]] | None = None,
|
@@ -80,8 +92,8 @@ class Embeddings:
|
|
80
92
|
|
81
93
|
self._embeddings_only: bool = False
|
82
94
|
self._dataset = dataset
|
83
|
-
model = torch.nn.Flatten() if model is None else model
|
84
95
|
self._transforms = [transforms] if isinstance(transforms, Transform) else transforms
|
96
|
+
model = torch.nn.Flatten() if model is None else model
|
85
97
|
self._model = model.to(self.device).eval() if isinstance(model, torch.nn.Module) else model
|
86
98
|
self._encoder = model.encode if isinstance(model, SupportsEncode) else model
|
87
99
|
self._collate_fn = lambda datum: [torch.as_tensor(d[0] if isinstance(d, tuple) else d) for d in datum]
|
@@ -110,7 +122,7 @@ class Embeddings:
|
|
110
122
|
if isinstance(value, bool) and not value:
|
111
123
|
self._cached_idx = set()
|
112
124
|
self._embeddings = torch.empty(())
|
113
|
-
elif isinstance(value,
|
125
|
+
elif isinstance(value, Path | str):
|
114
126
|
value = self._resolve_path(value)
|
115
127
|
|
116
128
|
if isinstance(value, Path) and value != getattr(self, "_cache", None):
|
@@ -127,20 +139,24 @@ class Embeddings:
|
|
127
139
|
|
128
140
|
def to_tensor(self, indices: Sequence[int] | None = None) -> torch.Tensor:
|
129
141
|
"""
|
130
|
-
|
142
|
+
Convert dataset items to embedding tensor.
|
143
|
+
|
144
|
+
Process specified dataset indices through the model in batches and
|
145
|
+
return concatenated embeddings as a single tensor.
|
131
146
|
|
132
147
|
Parameters
|
133
148
|
----------
|
134
149
|
indices : Sequence[int] or None, default None
|
135
|
-
|
150
|
+
Dataset indices to convert to embeddings. When None, processes entire dataset.
|
136
151
|
|
137
152
|
Returns
|
138
153
|
-------
|
139
154
|
torch.Tensor
|
155
|
+
Concatenated embeddings with shape (n_samples, embedding_dim).
|
140
156
|
|
141
|
-
|
142
|
-
|
143
|
-
Processing large
|
157
|
+
Warnings
|
158
|
+
--------
|
159
|
+
Processing large datasets can be memory and compute intensive.
|
144
160
|
"""
|
145
161
|
if indices is not None:
|
146
162
|
return torch.vstack(list(self._batch(indices))).to(self.device)
|
@@ -148,35 +164,45 @@ class Embeddings:
|
|
148
164
|
|
149
165
|
def to_numpy(self, indices: Sequence[int] | None = None) -> NDArray[Any]:
|
150
166
|
"""
|
151
|
-
|
167
|
+
Convert dataset items to embedding array.
|
152
168
|
|
153
169
|
Parameters
|
154
170
|
----------
|
155
171
|
indices : Sequence[int] or None, default None
|
156
|
-
|
172
|
+
Dataset indices to convert to embeddings. When None, processes entire dataset.
|
157
173
|
|
158
174
|
Returns
|
159
175
|
-------
|
160
176
|
NDArray[Any]
|
177
|
+
Embedding array with shape (n_samples, embedding_dim)
|
161
178
|
|
162
179
|
Warning
|
163
180
|
-------
|
164
|
-
Processing large
|
181
|
+
Processing large datasets can be memory and compute intensive.
|
165
182
|
"""
|
166
183
|
return self.to_tensor(indices).cpu().numpy()
|
167
184
|
|
168
185
|
def new(self, dataset: Dataset[tuple[ArrayLike, Any, Any]] | Dataset[ArrayLike]) -> Embeddings:
|
169
186
|
"""
|
170
|
-
|
187
|
+
Create new Embeddings instance with a different dataset.
|
188
|
+
|
189
|
+
Generate a new Embeddings object using the same model, transforms,
|
190
|
+
and configuration but with a different dataset.
|
171
191
|
|
172
192
|
Parameters
|
173
193
|
----------
|
174
194
|
dataset : ImageClassificationDataset or ObjectDetectionDataset
|
175
|
-
Dataset
|
195
|
+
Dataset that provides images for the new Embeddings instance.
|
176
196
|
|
177
197
|
Returns
|
178
198
|
-------
|
179
199
|
Embeddings
|
200
|
+
New Embeddings object configured identically to the current instance.
|
201
|
+
|
202
|
+
Raises
|
203
|
+
------
|
204
|
+
ValueError
|
205
|
+
When called on embeddings-only instance that lacks a model.
|
180
206
|
"""
|
181
207
|
if self._embeddings_only:
|
182
208
|
raise ValueError("Embeddings object does not have a model.")
|
@@ -187,15 +213,15 @@ class Embeddings:
|
|
187
213
|
@classmethod
|
188
214
|
def from_array(cls, array: ArrayLike, device: DeviceLike | None = None) -> Embeddings:
|
189
215
|
"""
|
190
|
-
|
216
|
+
Create Embeddings instance from an existing image array.
|
191
217
|
|
192
218
|
Parameters
|
193
219
|
----------
|
194
220
|
array : ArrayLike
|
195
|
-
|
221
|
+
In-memory image data to wrap in an Embeddings object.
|
196
222
|
device : DeviceLike or None, default None
|
197
|
-
|
198
|
-
|
223
|
+
Hardware device for computation. When None, automatically selects DataEval's configured device, falling
|
224
|
+
back to PyTorch's default.
|
199
225
|
|
200
226
|
Returns
|
201
227
|
-------
|
@@ -219,12 +245,15 @@ class Embeddings:
|
|
219
245
|
|
220
246
|
def save(self, path: Path | str) -> None:
|
221
247
|
"""
|
222
|
-
|
248
|
+
Save embeddings to disk.
|
249
|
+
|
250
|
+
Persist current embeddings to the specified file path for later
|
251
|
+
loading and reuse.
|
223
252
|
|
224
253
|
Parameters
|
225
254
|
----------
|
226
255
|
path : Path or str
|
227
|
-
|
256
|
+
File path where embeddings will be saved.
|
228
257
|
"""
|
229
258
|
self._save(self._resolve_path(path), True)
|
230
259
|
|
@@ -254,10 +283,24 @@ class Embeddings:
|
|
254
283
|
"""
|
255
284
|
Loads the embeddings from disk.
|
256
285
|
|
286
|
+
Create an Embeddings instance from previously saved embedding data.
|
287
|
+
|
257
288
|
Parameters
|
258
289
|
----------
|
259
290
|
path : Path or str
|
260
|
-
|
291
|
+
File path to load embeddings from.
|
292
|
+
|
293
|
+
Returns
|
294
|
+
-------
|
295
|
+
Embeddings
|
296
|
+
Embeddings-only instance containing the loaded data.
|
297
|
+
|
298
|
+
Raises
|
299
|
+
------
|
300
|
+
FileNotFoundError
|
301
|
+
When the specified file path does not exist.
|
302
|
+
Exception
|
303
|
+
When file loading or parsing fails.
|
261
304
|
"""
|
262
305
|
emb = Embeddings([], 0)
|
263
306
|
path = Path(os.path.abspath(path)) if isinstance(path, str) else path
|
dataeval/data/_images.py
CHANGED
@@ -2,7 +2,8 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
__all__ = []
|
4
4
|
|
5
|
-
from
|
5
|
+
from collections.abc import Iterator, Sequence
|
6
|
+
from typing import TYPE_CHECKING, Any, Generic, TypeVar, cast, overload
|
6
7
|
|
7
8
|
import numpy as np
|
8
9
|
|
@@ -19,12 +20,18 @@ class Images(Generic[T]):
|
|
19
20
|
"""
|
20
21
|
Collection of image data from a dataset.
|
21
22
|
|
22
|
-
Images are accessed by index or slice and are
|
23
|
+
Images are accessed by index or slice and are loaded on-demand for
|
24
|
+
memory-efficient processing of large datasets.
|
23
25
|
|
24
26
|
Parameters
|
25
27
|
----------
|
26
28
|
dataset : Dataset[tuple[T, ...]] or Dataset[T]
|
27
|
-
Dataset
|
29
|
+
Dataset that provides image data for access and visualization.
|
30
|
+
|
31
|
+
Attributes
|
32
|
+
----------
|
33
|
+
None
|
34
|
+
All dataset access is handled through indexing operations.
|
28
35
|
"""
|
29
36
|
|
30
37
|
def __init__(
|
@@ -36,16 +43,20 @@ class Images(Generic[T]):
|
|
36
43
|
|
37
44
|
def to_list(self) -> Sequence[T]:
|
38
45
|
"""
|
39
|
-
|
46
|
+
Convert entire dataset to a sequence of images.
|
40
47
|
|
41
|
-
|
42
|
-
|
43
|
-
Will load the entire dataset and return the images as a
|
44
|
-
single sequence of images in memory.
|
48
|
+
Load all images from the dataset and return a single sequence
|
49
|
+
in memory for batch processing or analysis.
|
45
50
|
|
46
51
|
Returns
|
47
52
|
-------
|
48
53
|
list[T]
|
54
|
+
Complete sequence of all images in the dataset
|
55
|
+
|
56
|
+
Warnings
|
57
|
+
--------
|
58
|
+
Loading entire dataset into memory can consume significant resources
|
59
|
+
for large image collections.
|
49
60
|
"""
|
50
61
|
return self[:]
|
51
62
|
|
@@ -55,6 +66,28 @@ class Images(Generic[T]):
|
|
55
66
|
images_per_row: int = 3,
|
56
67
|
figsize: tuple[int, int] = (10, 10),
|
57
68
|
) -> Figure:
|
69
|
+
"""
|
70
|
+
Display images in a grid layout.
|
71
|
+
|
72
|
+
Create matplotlib figure showing specified images arranged in a
|
73
|
+
grid format for visual inspection and comparison.
|
74
|
+
|
75
|
+
Parameters
|
76
|
+
----------
|
77
|
+
indices : Sequence[int]
|
78
|
+
Dataset indices of images to display in the plot.
|
79
|
+
images_per_row : int, default 3
|
80
|
+
Number of images displayed per row in the grid. Default 3 provides a balanced layout
|
81
|
+
for most screen sizes.
|
82
|
+
figsize : tuple[int, int], default (10, 10)
|
83
|
+
Figure dimensions as (width, height) in inches. Default (10, 10)
|
84
|
+
accommodates typical grid layouts with readable detail.
|
85
|
+
|
86
|
+
Returns
|
87
|
+
-------
|
88
|
+
Figure
|
89
|
+
Matplotlib figure object containing the image grid display.
|
90
|
+
"""
|
58
91
|
import matplotlib.pyplot as plt
|
59
92
|
|
60
93
|
num_images = len(indices)
|