dataeval 0.86.9__py3-none-any.whl → 0.88.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to their respective public registries. It is provided for informational purposes only.
Files changed (78)
  1. dataeval/__init__.py +1 -1
  2. dataeval/_log.py +1 -1
  3. dataeval/_version.py +2 -2
  4. dataeval/config.py +4 -19
  5. dataeval/data/_embeddings.py +78 -35
  6. dataeval/data/_images.py +41 -8
  7. dataeval/data/_metadata.py +348 -66
  8. dataeval/data/_selection.py +22 -7
  9. dataeval/data/_split.py +3 -2
  10. dataeval/data/selections/_classbalance.py +4 -3
  11. dataeval/data/selections/_classfilter.py +9 -8
  12. dataeval/data/selections/_indices.py +4 -3
  13. dataeval/data/selections/_prioritize.py +249 -29
  14. dataeval/data/selections/_reverse.py +1 -1
  15. dataeval/data/selections/_shuffle.py +5 -4
  16. dataeval/detectors/drift/_base.py +2 -1
  17. dataeval/detectors/drift/_mmd.py +2 -1
  18. dataeval/detectors/drift/_nml/_base.py +1 -1
  19. dataeval/detectors/drift/_nml/_chunk.py +2 -1
  20. dataeval/detectors/drift/_nml/_result.py +3 -2
  21. dataeval/detectors/drift/_nml/_thresholds.py +6 -5
  22. dataeval/detectors/drift/_uncertainty.py +2 -1
  23. dataeval/detectors/linters/duplicates.py +2 -1
  24. dataeval/detectors/linters/outliers.py +4 -3
  25. dataeval/detectors/ood/__init__.py +2 -1
  26. dataeval/detectors/ood/ae.py +1 -1
  27. dataeval/detectors/ood/base.py +39 -1
  28. dataeval/detectors/ood/knn.py +95 -0
  29. dataeval/detectors/ood/mixin.py +2 -1
  30. dataeval/metadata/_utils.py +1 -1
  31. dataeval/metrics/bias/_balance.py +29 -22
  32. dataeval/metrics/bias/_diversity.py +4 -4
  33. dataeval/metrics/bias/_parity.py +2 -2
  34. dataeval/metrics/stats/_base.py +3 -29
  35. dataeval/metrics/stats/_boxratiostats.py +2 -1
  36. dataeval/metrics/stats/_dimensionstats.py +2 -1
  37. dataeval/metrics/stats/_hashstats.py +21 -3
  38. dataeval/metrics/stats/_pixelstats.py +2 -1
  39. dataeval/metrics/stats/_visualstats.py +2 -1
  40. dataeval/outputs/_base.py +2 -3
  41. dataeval/outputs/_bias.py +2 -1
  42. dataeval/outputs/_estimators.py +1 -1
  43. dataeval/outputs/_linters.py +3 -3
  44. dataeval/outputs/_stats.py +3 -3
  45. dataeval/outputs/_utils.py +1 -1
  46. dataeval/outputs/_workflows.py +49 -31
  47. dataeval/typing.py +23 -9
  48. dataeval/utils/__init__.py +2 -2
  49. dataeval/utils/_array.py +3 -2
  50. dataeval/utils/_bin.py +9 -7
  51. dataeval/utils/_method.py +2 -3
  52. dataeval/utils/_multiprocessing.py +34 -0
  53. dataeval/utils/_plot.py +2 -1
  54. dataeval/utils/data/__init__.py +6 -5
  55. dataeval/utils/data/{metadata.py → _merge.py} +3 -2
  56. dataeval/utils/data/_validate.py +170 -0
  57. dataeval/utils/data/collate.py +2 -1
  58. dataeval/utils/torch/_internal.py +2 -1
  59. dataeval/utils/torch/trainer.py +1 -1
  60. dataeval/workflows/sufficiency.py +13 -9
  61. {dataeval-0.86.9.dist-info → dataeval-0.88.0.dist-info}/METADATA +8 -21
  62. dataeval-0.88.0.dist-info/RECORD +105 -0
  63. dataeval/utils/data/_dataset.py +0 -246
  64. dataeval/utils/datasets/__init__.py +0 -21
  65. dataeval/utils/datasets/_antiuav.py +0 -189
  66. dataeval/utils/datasets/_base.py +0 -266
  67. dataeval/utils/datasets/_cifar10.py +0 -201
  68. dataeval/utils/datasets/_fileio.py +0 -142
  69. dataeval/utils/datasets/_milco.py +0 -197
  70. dataeval/utils/datasets/_mixin.py +0 -54
  71. dataeval/utils/datasets/_mnist.py +0 -202
  72. dataeval/utils/datasets/_seadrone.py +0 -512
  73. dataeval/utils/datasets/_ships.py +0 -144
  74. dataeval/utils/datasets/_types.py +0 -48
  75. dataeval/utils/datasets/_voc.py +0 -583
  76. dataeval-0.86.9.dist-info/RECORD +0 -115
  77. {dataeval-0.86.9.dist-info → dataeval-0.88.0.dist-info}/WHEEL +0 -0
  78. /dataeval-0.86.9.dist-info/licenses/LICENSE.txt → /dataeval-0.88.0.dist-info/licenses/LICENSE +0 -0
dataeval/__init__.py CHANGED
@@ -9,7 +9,7 @@ from __future__ import annotations
 
 try:
     from ._version import __version__
-except ImportError:
+except ImportError:  # pragma: no cover
     __version__ = "unknown"
 
 # Strongly type for pyright
dataeval/_log.py CHANGED
@@ -1,6 +1,6 @@
 __all__ = []
 
-from typing import Callable
+from collections.abc import Callable
 
 
 class LogMessage:
dataeval/_version.py CHANGED
@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '0.86.9'
-__version_tuple__ = version_tuple = (0, 86, 9)
+__version__ = version = '0.88.0'
+__version_tuple__ = version_tuple = (0, 88, 0)
dataeval/config.py CHANGED
@@ -4,19 +4,15 @@ Global configuration settings for DataEval.
 
 from __future__ import annotations
 
-__all__ = ["get_device", "set_device", "get_max_processes", "set_max_processes", "use_max_processes", "DeviceLike"]
+__all__ = ["get_device", "set_device", "get_max_processes", "set_max_processes", "use_max_processes"]
 
-import sys
-from typing import Any, Union
-
-if sys.version_info >= (3, 10):
-    from typing import TypeAlias
-else:
-    from typing_extensions import TypeAlias
+from typing import Any
 
 import numpy as np
 import torch
 
+from dataeval.typing import DeviceLike
+
 ### GLOBALS ###
 
 _device: torch.device | None = None
@@ -27,17 +23,6 @@ _seed: int | None = None
 
 EPSILON = 1e-12
 
-### TYPES ###
-
-DeviceLike: TypeAlias = Union[int, str, tuple[str, int], torch.device]
-"""
-Type alias for types that are acceptable for specifying a torch.device.
-
-See Also
---------
-`torch.device <https://pytorch.org/docs/stable/tensor_attributes.html#torch.device>`_
-"""
-
 ### FUNCS ###
 
 
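Note on the change above: the DeviceLike alias now lives in dataeval.typing rather than dataeval.config. A minimal sketch of the updated import, assuming set_device accepts a single DeviceLike argument; the accepted forms come from the removed alias definition (int, str, a (str, int) tuple, or torch.device):

import torch

from dataeval.config import set_device
from dataeval.typing import DeviceLike  # moved here in this release


def configure(device: DeviceLike) -> None:
    # Accepts an int, a str, a (str, int) tuple, or a torch.device.
    set_device(device)


configure("cuda:0")              # string form
configure(("cuda", 0))           # (device type, index) tuple form
configure(torch.device("cpu"))   # explicit torch.device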
dataeval/data/_embeddings.py CHANGED
@@ -5,8 +5,9 @@ __all__ = []
 import logging
 import math
 import os
+from collections.abc import Iterator, Sequence
 from pathlib import Path
-from typing import Any, Iterator, Sequence, cast
+from typing import Any, cast
 
 import torch
 import xxhash as xxh
@@ -15,7 +16,14 @@ from torch.utils.data import DataLoader, Subset
 from tqdm import tqdm
 
 from dataeval.config import DeviceLike, get_device
-from dataeval.typing import AnnotatedDataset, AnnotatedModel, Array, ArrayLike, Dataset, Transform
+from dataeval.typing import (
+    AnnotatedDataset,
+    AnnotatedModel,
+    Array,
+    ArrayLike,
+    Dataset,
+    Transform,
+)
 from dataeval.utils._array import as_numpy
 from dataeval.utils.torch.models import SupportsEncode
 
@@ -26,38 +34,41 @@ class Embeddings:
     """
     Collection of image embeddings from a dataset.
 
-    Embeddings are accessed by index or slice and are only loaded on-demand.
+    Embeddings are accessed by index or slice and are loaded on-demand.
 
     Parameters
     ----------
     dataset : ImageClassificationDataset or ObjectDetectionDataset
         Dataset to access original images from.
     batch_size : int
-        Batch size to use when encoding images.
+        Batch size to use when encoding images. When less than 1, automatically sets to 1 for safe processing.
     transforms : Transform or Sequence[Transform] or None, default None
-        Transforms to apply to images before encoding.
+        Image transformations to apply before encoding. When None, uses raw images without
+        preprocessing.
     model : torch.nn.Module or None, default None
-        Model to use for encoding images.
+        Neural network model that generates embeddings from images. When None, uses Flatten layer for simple
+        baseline compatibility with all DataEval tools without requiring pre-trained weights or GPU resources.
     device : DeviceLike or None, default None
-        The hardware device to use if specified, otherwise uses the DataEval
-        default or torch default.
+        Hardware device for computation. When None, automatically selects DataEval's configured device, falling
+        back to PyTorch's default.
     cache : Path, str, or bool, default False
-        Whether to cache the embeddings to a file or in memory.
-        When a Path or string is provided, embeddings will be cached to disk.
+        When True, caches embeddings in memory for faster repeated access.
+        When Path or string is provided, persists embeddings to disk for reuse across sessions.
+        Default False minimizes memory usage.
     verbose : bool, default False
-        Whether to print progress bar when encoding images.
+        When True, displays a progress bar when encoding images. Default False reduces console output
+        for cleaner automated workflows.
 
     Attributes
     ----------
     batch_size : int
-        Batch size to use when encoding images.
+        Number of images processed per batch during encoding. Minimum value of 1.
     cache : Path or bool
-        The path to cache embeddings to file, or True if caching to memory.
+        Disk path where embeddings are stored, or True when cached in memory.
     device : torch.device
-        The hardware device to use if specified, otherwise uses the DataEval
-        default or torch default.
+        Hardware device used for tensor computations.
     verbose : bool
-        Whether to print progress bar when encoding images.
+        Whether progress information is displayed during operations.
     """
 
     device: torch.device
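Putting the rewritten parameter docs above into practice, a hedged construction sketch; the list-of-tuples dataset and the dataeval.data import path are illustrative assumptions, not taken from this diff:

import torch

from dataeval.data import Embeddings  # assumed public import path

# Stand-in dataset: (image, target, metadata) tuples satisfy
# Dataset[tuple[ArrayLike, Any, Any]] via __getitem__/__len__.
dataset = [(torch.rand(3, 32, 32), 0, {}) for _ in range(8)]

embeddings = Embeddings(
    dataset,
    batch_size=4,   # values below 1 are clamped to 1 per the docstring
    model=None,     # falls back to torch.nn.Flatten()
    device=None,    # DataEval's configured device, else the torch default
    cache=True,     # keep computed embeddings in memory
    verbose=False,  # no progress bar
)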
@@ -66,6 +77,7 @@ class Embeddings:
 
     def __init__(
         self,
+        # Technically more permissive than ImageClassificationDataset or ObjectDetectionDataset
         dataset: Dataset[tuple[ArrayLike, Any, Any]] | Dataset[ArrayLike],
         batch_size: int,
         transforms: Transform[torch.Tensor] | Sequence[Transform[torch.Tensor]] | None = None,
@@ -80,8 +92,8 @@
 
         self._embeddings_only: bool = False
         self._dataset = dataset
-        model = torch.nn.Flatten() if model is None else model
         self._transforms = [transforms] if isinstance(transforms, Transform) else transforms
+        model = torch.nn.Flatten() if model is None else model
         self._model = model.to(self.device).eval() if isinstance(model, torch.nn.Module) else model
         self._encoder = model.encode if isinstance(model, SupportsEncode) else model
         self._collate_fn = lambda datum: [torch.as_tensor(d[0] if isinstance(d, tuple) else d) for d in datum]
@@ -110,7 +122,7 @@
         if isinstance(value, bool) and not value:
             self._cached_idx = set()
             self._embeddings = torch.empty(())
-        elif isinstance(value, (Path, str)):
+        elif isinstance(value, Path | str):
             value = self._resolve_path(value)
 
         if isinstance(value, Path) and value != getattr(self, "_cache", None):
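The switch from isinstance(value, (Path, str)) to isinstance(value, Path | str) relies on PEP 604 union syntax at runtime, available on Python 3.10+, which lines up with the typing-import cleanups elsewhere in this release. A standalone illustration:

from pathlib import Path

# On Python 3.10+, a PEP 604 union works directly in isinstance checks,
# replacing the older tuple form isinstance(value, (Path, str)).
for value in (Path("emb.bin"), "emb.bin", 42):
    print(value, isinstance(value, Path | str))
# prints: emb.bin True / emb.bin True / 42 False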
@@ -127,20 +139,24 @@
 
     def to_tensor(self, indices: Sequence[int] | None = None) -> torch.Tensor:
         """
-        Converts dataset to embeddings.
+        Convert dataset items to embedding tensor.
+
+        Process specified dataset indices through the model in batches and
+        return concatenated embeddings as a single tensor.
 
         Parameters
         ----------
         indices : Sequence[int] or None, default None
-            The indices to convert to embeddings
+            Dataset indices to convert to embeddings. When None, processes entire dataset.
 
         Returns
         -------
         torch.Tensor
+            Concatenated embeddings with shape (n_samples, embedding_dim).
 
-        Warning
-        -------
-        Processing large quantities of data can be resource intensive.
+        Warnings
+        --------
+        Processing large datasets can be memory and compute intensive.
         """
         if indices is not None:
             return torch.vstack(list(self._batch(indices))).to(self.device)
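A short usage sketch of the shape contract documented above, continuing the hypothetical embeddings object from the earlier example (a Flatten model on 3x32x32 images yields 3072-dimensional embeddings):

# Encode a subset of indices; passing None processes the whole dataset.
subset = embeddings.to_tensor(indices=[0, 1, 2, 3])
print(subset.shape)  # torch.Size([4, 3072]) with the Flatten default

# to_numpy() mirrors to_tensor() but returns a CPU NDArray.
print(embeddings.to_numpy().shape)  # (8, 3072)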
@@ -148,35 +164,45 @@
 
     def to_numpy(self, indices: Sequence[int] | None = None) -> NDArray[Any]:
         """
-        Converts dataset to embeddings as numpy array.
+        Convert dataset items to embedding array.
 
         Parameters
         ----------
         indices : Sequence[int] or None, default None
-            The indices to convert to embeddings
+            Dataset indices to convert to embeddings. When None, processes entire dataset.
 
         Returns
         -------
         NDArray[Any]
+            Embedding array with shape (n_samples, embedding_dim).
 
         Warning
         -------
-        Processing large quantities of data can be resource intensive.
+        Processing large datasets can be memory and compute intensive.
         """
         return self.to_tensor(indices).cpu().numpy()
 
     def new(self, dataset: Dataset[tuple[ArrayLike, Any, Any]] | Dataset[ArrayLike]) -> Embeddings:
         """
-        Creates a new Embeddings object with the same parameters but a different dataset.
+        Create new Embeddings instance with a different dataset.
+
+        Generate a new Embeddings object using the same model, transforms,
+        and configuration but with a different dataset.
 
         Parameters
         ----------
         dataset : ImageClassificationDataset or ObjectDetectionDataset
-            Dataset to access original images from.
+            Dataset that provides images for the new Embeddings instance.
 
         Returns
         -------
         Embeddings
+            New Embeddings object configured identically to the current instance.
+
+        Raises
+        ------
+        ValueError
+            When called on embeddings-only instance that lacks a model.
         """
         if self._embeddings_only:
             raise ValueError("Embeddings object does not have a model.")
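Continuing the same hypothetical objects, a sketch of new(): the model, transforms, and settings carry over while only the dataset changes, and the embeddings-only case raises ValueError as documented above:

# Reuse the same configuration on a second, illustrative dataset.
val_dataset = [(torch.rand(3, 32, 32), 1, {}) for _ in range(4)]
val_embeddings = embeddings.new(val_dataset)

# An embeddings-only instance (no model attached) cannot spawn a new one;
# calling .new(...) on it raises the ValueError shown in the hunk above.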
@@ -187,15 +213,15 @@
     @classmethod
     def from_array(cls, array: ArrayLike, device: DeviceLike | None = None) -> Embeddings:
         """
-        Instantiates a shallow Embeddings object using an array.
+        Create Embeddings instance from an existing image array.
 
         Parameters
         ----------
         array : ArrayLike
-            The array to convert to embeddings.
+            In-memory image data to wrap in an Embeddings object.
         device : DeviceLike or None, default None
-            The hardware device to use if specified, otherwise uses the DataEval
-            default or torch default.
+            Hardware device for computation. When None, automatically selects DataEval's configured device, falling
+            back to PyTorch's default.
 
         Returns
         -------
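Since the hunk above confirms from_array is a classmethod, a brief sketch of wrapping precomputed data; whether the wrapped values round-trip through to_numpy() unchanged is an assumption about the embeddings-only path, not something this diff shows:

import numpy as np

from dataeval.data import Embeddings  # assumed public import path

# Wrap an in-memory array without attaching a model.
array = np.random.rand(10, 64).astype(np.float32)
wrapped = Embeddings.from_array(array)
print(wrapped.to_numpy().shape)  # (10, 64), assuming a simple pass-through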
@@ -219,12 +245,15 @@
 
     def save(self, path: Path | str) -> None:
         """
-        Saves the embeddings to disk.
+        Save embeddings to disk.
+
+        Persist current embeddings to the specified file path for later
+        loading and reuse.
 
         Parameters
         ----------
         path : Path or str
-            The file path to save the embeddings to.
+            File path where embeddings will be saved.
         """
         self._save(self._resolve_path(path), True)
 
@@ -254,10 +283,24 @@
         """
         Loads the embeddings from disk.
 
+        Create an Embeddings instance from previously saved embedding data.
+
         Parameters
         ----------
         path : Path or str
-            The file path to load the embeddings from.
+            File path to load embeddings from.
+
+        Returns
+        -------
+        Embeddings
+            Embeddings-only instance containing the loaded data.
+
+        Raises
+        ------
+        FileNotFoundError
+            When the specified file path does not exist.
+        Exception
+            When file loading or parsing fails.
         """
         emb = Embeddings([], 0)
         path = Path(os.path.abspath(path)) if isinstance(path, str) else path
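A save/load round-trip sketch based on the docstrings above; the file name is arbitrary, and exposing load at the class level is an inference from its Returns section rather than something this hunk shows directly:

# Persist the computed embeddings, then restore them in a later session.
embeddings.save("embeddings.bin")

restored = Embeddings.load("embeddings.bin")  # embeddings-only instance
# A missing path raises FileNotFoundError, per the Raises section above.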
dataeval/data/_images.py CHANGED
@@ -2,7 +2,8 @@ from __future__ import annotations
 
 __all__ = []
 
-from typing import TYPE_CHECKING, Any, Generic, Iterator, Sequence, TypeVar, cast, overload
+from collections.abc import Iterator, Sequence
+from typing import TYPE_CHECKING, Any, Generic, TypeVar, cast, overload
 
 import numpy as np
 
@@ -19,12 +20,18 @@ class Images(Generic[T]):
     """
     Collection of image data from a dataset.
 
-    Images are accessed by index or slice and are only loaded on-demand.
+    Images are accessed by index or slice and are loaded on-demand for
+    memory-efficient processing of large datasets.
 
     Parameters
    ----------
     dataset : Dataset[tuple[T, ...]] or Dataset[T]
-        Dataset to access images from.
+        Dataset that provides image data for access and visualization.
+
+    Attributes
+    ----------
+    None
+        All dataset access is handled through indexing operations.
     """
 
     def __init__(
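A small sketch of the lazy access pattern the docstring describes, reusing the illustrative dataset from the Embeddings example; dataeval.data is again an assumed import path:

from dataeval.data import Images  # assumed public import path

images = Images(dataset)

first = images[0]     # single image, loaded on demand
window = images[2:5]  # slice access loads only the requested items
print(len(window))    # 3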
@@ -36,16 +43,20 @@
 
     def to_list(self) -> Sequence[T]:
         """
-        Converts entire dataset to a sequence of images.
+        Convert entire dataset to a sequence of images.
 
-        Warning
-        -------
-        Will load the entire dataset and return the images as a
-        single sequence of images in memory.
+        Load all images from the dataset and return a single sequence
+        in memory for batch processing or analysis.
 
         Returns
         -------
         list[T]
+            Complete sequence of all images in the dataset.
+
+        Warnings
+        --------
+        Loading entire dataset into memory can consume significant resources
+        for large image collections.
         """
         return self[:]
 
@@ -55,6 +66,28 @@
         images_per_row: int = 3,
         figsize: tuple[int, int] = (10, 10),
     ) -> Figure:
+        """
+        Display images in a grid layout.
+
+        Create matplotlib figure showing specified images arranged in a
+        grid format for visual inspection and comparison.
+
+        Parameters
+        ----------
+        indices : Sequence[int]
+            Dataset indices of images to display in the plot.
+        images_per_row : int, default 3
+            Number of images displayed per row in the grid. Default 3 provides a balanced layout
+            for most screen sizes.
+        figsize : tuple[int, int], default (10, 10)
+            Figure dimensions as (width, height) in inches. Default (10, 10)
+            accommodates typical grid layouts with readable detail.
+
+        Returns
+        -------
+        Figure
+            Matplotlib figure object containing the image grid display.
+        """
         import matplotlib.pyplot as plt
 
         num_images = len(indices)
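Finally, a plotting sketch for the method documented above; its name is not visible in this hunk, so plot is an assumption about the class's public API, continuing the hypothetical images object from before:

# Render the first six images in the default 3-per-row grid.
fig = images.plot(indices=[0, 1, 2, 3, 4, 5], images_per_row=3, figsize=(10, 10))
fig.savefig("image_grid.png")  # standard matplotlib Figure handling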