dataeval 0.87.0__py3-none-any.whl → 0.88.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. dataeval/_log.py +1 -1
  2. dataeval/_version.py +2 -2
  3. dataeval/data/_embeddings.py +78 -35
  4. dataeval/data/_images.py +41 -8
  5. dataeval/data/_metadata.py +294 -41
  6. dataeval/data/_selection.py +22 -7
  7. dataeval/data/_split.py +2 -1
  8. dataeval/data/selections/_classfilter.py +4 -3
  9. dataeval/data/selections/_indices.py +2 -1
  10. dataeval/data/selections/_shuffle.py +3 -2
  11. dataeval/detectors/drift/_base.py +2 -1
  12. dataeval/detectors/drift/_mmd.py +2 -1
  13. dataeval/detectors/drift/_nml/_base.py +1 -1
  14. dataeval/detectors/drift/_nml/_chunk.py +2 -1
  15. dataeval/detectors/drift/_nml/_result.py +3 -2
  16. dataeval/detectors/drift/_nml/_thresholds.py +6 -5
  17. dataeval/detectors/drift/_uncertainty.py +2 -1
  18. dataeval/detectors/linters/duplicates.py +2 -1
  19. dataeval/detectors/linters/outliers.py +4 -3
  20. dataeval/detectors/ood/ae.py +1 -1
  21. dataeval/detectors/ood/base.py +2 -1
  22. dataeval/detectors/ood/mixin.py +2 -1
  23. dataeval/metadata/_utils.py +1 -1
  24. dataeval/metrics/bias/_balance.py +1 -1
  25. dataeval/metrics/stats/_base.py +3 -29
  26. dataeval/metrics/stats/_boxratiostats.py +2 -1
  27. dataeval/metrics/stats/_dimensionstats.py +2 -1
  28. dataeval/metrics/stats/_hashstats.py +2 -1
  29. dataeval/metrics/stats/_pixelstats.py +2 -1
  30. dataeval/metrics/stats/_visualstats.py +2 -1
  31. dataeval/outputs/_base.py +2 -3
  32. dataeval/outputs/_bias.py +2 -1
  33. dataeval/outputs/_estimators.py +1 -1
  34. dataeval/outputs/_linters.py +3 -3
  35. dataeval/outputs/_stats.py +3 -3
  36. dataeval/outputs/_utils.py +1 -1
  37. dataeval/outputs/_workflows.py +29 -24
  38. dataeval/typing.py +11 -9
  39. dataeval/utils/_array.py +3 -2
  40. dataeval/utils/_bin.py +2 -1
  41. dataeval/utils/_method.py +2 -3
  42. dataeval/utils/_multiprocessing.py +34 -0
  43. dataeval/utils/_plot.py +2 -1
  44. dataeval/utils/data/__init__.py +4 -5
  45. dataeval/utils/data/{metadata.py → _merge.py} +3 -2
  46. dataeval/utils/data/_validate.py +2 -1
  47. dataeval/utils/data/collate.py +2 -1
  48. dataeval/utils/torch/_internal.py +2 -1
  49. dataeval/utils/torch/trainer.py +1 -1
  50. dataeval/workflows/sufficiency.py +13 -9
  51. {dataeval-0.87.0.dist-info → dataeval-0.88.0.dist-info}/METADATA +4 -5
  52. dataeval-0.88.0.dist-info/RECORD +105 -0
  53. dataeval/utils/data/_dataset.py +0 -253
  54. dataeval-0.87.0.dist-info/RECORD +0 -105
  55. {dataeval-0.87.0.dist-info → dataeval-0.88.0.dist-info}/WHEEL +0 -0
  56. {dataeval-0.87.0.dist-info → dataeval-0.88.0.dist-info}/licenses/LICENSE +0 -0
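Note on file 45: the merge utility moves from dataeval/utils/data/metadata.py to the internal module dataeval/utils/data/_merge.py (the in-package import change is visible in the _metadata.py hunks below). A minimal, hypothetical version-spanning import for downstream code that used the old path; whether dataeval.utils.data re-exports merge publicly in 0.88.0 is not shown in this diff:

```python
# Hypothetical shim for the 0.87.0 -> 0.88.0 module rename (file 45 above).
try:
    from dataeval.utils.data._merge import merge  # 0.88.0: renamed, now internal
except ImportError:
    from dataeval.utils.data.metadata import merge  # 0.87.0 and earlier
```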
dataeval/data/_metadata.py CHANGED
@@ -3,12 +3,14 @@ from __future__ import annotations
 __all__ = []
 
 import warnings
+from collections.abc import Callable, Iterable, Mapping, Sequence, Sized
 from dataclasses import dataclass
-from typing import Any, Iterable, Literal, Mapping, Sequence, Sized
+from typing import Any, Literal
 
 import numpy as np
 import polars as pl
 from numpy.typing import NDArray
+from tqdm.auto import tqdm
 
 from dataeval.typing import (
     AnnotatedDataset,
@@ -17,7 +19,7 @@ from dataeval.typing import (
 )
 from dataeval.utils._array import as_numpy
 from dataeval.utils._bin import bin_data, digitize_data, is_continuous
-from dataeval.utils.data.metadata import merge
+from dataeval.utils.data._merge import merge
 
 
 def _binned(name: str) -> str:
@@ -44,21 +46,32 @@ def _to_col(name: str, info: FactorInfo, binned: bool = True) -> str:
 
 
 class Metadata:
-    """
-    Class containing binned metadata using Polars DataFrames.
+    """Collection of binned metadata using Polars DataFrames.
+
+    Processes dataset metadata by automatically binning continuous factors and digitizing
+    categorical factors for analysis and visualization workflows.
 
     Parameters
     ----------
     dataset : ImageClassificationDataset or ObjectDetectionDataset
-        Dataset to access original targets and metadata from.
+        Dataset that provides original targets and metadata for processing.
     continuous_factor_bins : Mapping[str, int | Sequence[float]] | None, default None
-        Mapping from continuous factor name to the number of bins or bin edges
+        Mapping from continuous factor names to bin counts or explicit bin edges.
+        When None, uses automatic discretization.
     auto_bin_method : Literal["uniform_width", "uniform_count", "clusters"], default "uniform_width"
-        Method for automatically determining the number of bins for continuous factors
+        Binning strategy for continuous factors without explicit bins. Default "uniform_width"
+        provides intuitive equal-width intervals for most distributions.
     exclude : Sequence[str] | None, default None
-        Filter metadata factors to exclude the specified factors, cannot be set with `include`
+        Factor names to exclude from processing. Cannot be used with `include` parameter.
+        When None, processes all available factors.
     include : Sequence[str] | None, default None
-        Filter metadata factors to include the specified factors, cannot be set with `exclude`
+        Factor names to include in processing. Cannot be used with `exclude` parameter.
+        When None, processes all available factors.
+
+    Raises
+    ------
+    ValueError
+        When both exclude and include parameters are specified simultaneously.
     """
 
     def __init__(
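The rewritten class docstring above now documents the full constructor contract, including the exclude/include mutual exclusion. A minimal usage sketch built only from the documented parameters; the import path, dataset object, and factor names here are assumptions, not taken from the diff:

```python
from dataeval.data import Metadata  # import path assumed from the package layout

# `dataset` is assumed to satisfy the documented ImageClassificationDataset
# or ObjectDetectionDataset interface; factor names are hypothetical.
meta = Metadata(
    dataset,
    continuous_factor_bins={
        "altitude": 5,                                 # five bins, edges chosen automatically
        "time_of_day": [0.0, 6.0, 12.0, 18.0, 24.0],   # explicit bin edges
    },
    auto_bin_method="uniform_width",  # applies to continuous factors not listed above
    exclude=["file_path"],            # mutually exclusive with `include`
)

# Per the new Raises section, passing both filters fails:
# Metadata(dataset, exclude=["a"], include=["b"])  # raises ValueError
```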
@@ -94,17 +107,48 @@ class Metadata:
 
     @property
     def raw(self) -> Sequence[Mapping[str, Any]]:
-        """The raw list of metadata dictionaries for the dataset."""
+        """Original metadata dictionaries extracted from the dataset.
+
+        Access the unprocessed metadata as it was provided in the original dataset before
+        any binning, filtering, or transformation operations.
+
+        Returns
+        -------
+        Sequence[Mapping[str, Any]]
+            List of metadata dictionaries, one per dataset item, containing the original key-value
+            pairs as provided in the source data
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        """
         self._structure()
         return self._raw
 
     @property
     def exclude(self) -> set[str]:
-        """Factors to exclude from the metadata."""
+        """Factor names excluded from metadata processing.
+
+        Returns
+        -------
+        set[str]
+            Set of factor names that are filtered out during processing.
+            Empty set when no exclusions are active.
+
+        """
         return self._exclude
 
     @exclude.setter
     def exclude(self, value: Sequence[str]) -> None:
+        """Set factor names to exclude from processing.
+
+        Automatically clears include filter and resets binning state when exclusion list changes.
+
+        Parameters
+        ----------
+        value : Sequence[str]
+            Factor names to exclude from metadata analysis.
+        """
         exclude = set(value)
         if self._exclude != exclude:
             self._exclude = exclude
@@ -113,11 +157,27 @@ class Metadata:
 
     @property
     def include(self) -> set[str]:
-        """Factors to include from the metadata."""
+        """Factor names included in metadata processing.
+
+        Returns
+        -------
+        set[str]
+            Set of factor names that are processed during analysis. Empty set when no inclusion filter is active.
+        """
         return self._include
 
     @include.setter
     def include(self, value: Sequence[str]) -> None:
+        """Set factor names to include in processing.
+
+        Automatically clears exclude filter and resets binning state when
+        inclusion list changes.
+
+        Parameters
+        ----------
+        value : Sequence[str]
+            Factor names to include in metadata analysis.
+        """
         include = set(value)
         if self._include != include:
             self._include = include
@@ -126,41 +186,120 @@ class Metadata:
 
     @property
     def continuous_factor_bins(self) -> Mapping[str, int | Sequence[float]]:
-        """Map of factor names to bin counts or bin edges."""
+        """Binning configuration for continuous factors.
+
+        Returns
+        -------
+        Mapping[str, int | Sequence[float]]
+            Dictionary mapping factor names to either the number of bins
+            (int) or explicit bin edges (sequence of floats).
+        """
         return self._continuous_factor_bins
 
     @continuous_factor_bins.setter
     def continuous_factor_bins(self, bins: Mapping[str, int | Sequence[float]]) -> None:
+        """Update binning configuration for continuous factors.
+
+        Triggers re-binning when configuration changes to ensure data
+        consistency with new bin specifications.
+
+        Parameters
+        ----------
+        bins : Mapping[str, int | Sequence[float]]
+            Dictionary mapping factor names to bin counts or explicit edges.
+        """
         if self._continuous_factor_bins != bins:
             self._continuous_factor_bins = dict(bins)
             self._reset_bins(bins)
 
     @property
     def auto_bin_method(self) -> Literal["uniform_width", "uniform_count", "clusters"]:
-        """Binning method to use when continuous_factor_bins is not defined."""
+        """Automatic binning strategy for continuous factors.
+
+        Returns
+        -------
+        {"uniform_width", "uniform_count", "clusters"}
+            Current method used for automatic discretization of continuous
+            factors that lack explicit bin specifications.
+        """
         return self._auto_bin_method
 
     @auto_bin_method.setter
     def auto_bin_method(self, method: Literal["uniform_width", "uniform_count", "clusters"]) -> None:
+        """Set automatic binning strategy for continuous factors.
+
+        Triggers re-binning with the new method when strategy changes to
+        ensure consistent discretization across all factors.
+
+        Parameters
+        ----------
+        method : {"uniform_width", "uniform_count", "clusters"}
+            Binning strategy to apply for continuous factors without
+            explicit bin configurations.
+        """
         if self._auto_bin_method != method:
             self._auto_bin_method = method
             self._reset_bins()
 
     @property
     def dataframe(self) -> pl.DataFrame:
-        """Dataframe containing target information and metadata factors."""
+        """Processed DataFrame containing targets and metadata factors.
+
+        Access the main data structure with target information (class labels,
+        scores, bounding boxes) and processed metadata factors ready for analysis.
+
+        Returns
+        -------
+        pl.DataFrame
+            DataFrame with columns for image indices, class labels, scores,
+            bounding boxes (when applicable), and all processed metadata factors.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        Factor binning occurs automatically when accessing factor-related data.
+        """
         self._structure()
         return self._dataframe
 
     @property
     def dropped_factors(self) -> Mapping[str, Sequence[str]]:
-        """Factors that were dropped during preprocessing and the reasons why they were dropped."""
+        """Factors removed during preprocessing with removal reasons.
+
+        Returns
+        -------
+        Mapping[str, Sequence[str]]
+            Dictionary mapping dropped factor names to lists of reasons
+            why they were excluded from the final dataset.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        Common removal reasons include incompatible data types, excessive
+        missing values, or insufficient variation.
+        """
         self._structure()
         return self._dropped_factors
 
     @property
     def digitized_data(self) -> NDArray[np.int64]:
-        """Factor data with digitized categorical data."""
+        """Factor data with categorical values converted to integer codes.
+
+        Access processed factor data where categorical factors are digitized
+        to integer codes but continuous factors remain in their original form.
+
+        Returns
+        -------
+        NDArray[np.int64]
+            Array with shape (n_samples, n_factors) containing integer-coded
+            categorical data. Returns empty array when no factors are available.
+
+        Notes
+        -----
+        This property triggers factor binning analysis on first access.
+        Use this for algorithms that can handle mixed categorical and
+        continuous data types.
+        """
         if not self.factor_names:
             return np.array([], dtype=np.int64)
 
@@ -173,7 +312,23 @@ class Metadata:
 
     @property
     def binned_data(self) -> NDArray[np.int64]:
-        """Factor data with binned continuous data."""
+        """Factor data with continuous values discretized into bins.
+
+        Access fully processed factor data where both categorical and
+        continuous factors are converted to integer bin indices.
+
+        Returns
+        -------
+        NDArray[np.int64]
+            Array with shape (n_samples, n_factors) containing binned integer
+            data ready for categorical analysis algorithms. Returns empty array
+            when no factors are available.
+
+        Notes
+        -----
+        This property triggers factor binning analysis on first access.
+        Use this for algorithms requiring purely discrete input data.
+        """
         if not self.factor_names:
             return np.array([], dtype=np.int64)
 
@@ -186,19 +341,59 @@ class Metadata:
 
     @property
     def factor_names(self) -> Sequence[str]:
-        """Factor names of the metadata."""
+        """Names of all processed metadata factors.
+
+        Returns
+        -------
+        Sequence[str]
+            List of factor names that passed filtering and preprocessing steps.
+            Order matches columns in factor_data, digitized_data, and binned_data.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        Factor names respect include/exclude filtering settings.
+        """
         self._structure()
         return list(filter(self._filter, self._factors))
 
     @property
     def factor_info(self) -> Mapping[str, FactorInfo]:
-        """Factor types of the metadata."""
+        """Type information and processing status for each factor.
+
+        Returns
+        -------
+        Mapping[str, FactorInfo]
+            Dictionary mapping factor names to FactorInfo objects containing
+            data type classification and processing flags (binned, digitized).
+
+        Notes
+        -----
+        This property triggers factor binning analysis on first access.
+        Only includes factors that survived preprocessing and filtering.
+        """
         self._bin()
         return dict(filter(self._filter, ((k, v) for k, v in self._factors.items() if v is not None)))
 
     @property
     def factor_data(self) -> NDArray[Any]:
-        """Factor data as a NumPy array."""
+        """Raw factor values before binning or digitization.
+
+        Access unprocessed factor data in its original numeric form before
+        any categorical encoding or binning transformations are applied.
+
+        Returns
+        -------
+        NDArray[Any]
+            Array with shape (n_samples, n_factors) containing original factor
+            values. Returns empty array when no factors are available.
+
+        Notes
+        -----
+        Use this for algorithms that can work with mixed data types or when
+        you need access to original continuous values. For analysis-ready
+        integer data, use binned_data or digitized_data instead.
+        """
         if not self.factor_names:
             return np.array([], dtype=np.float64)
 
@@ -207,24 +402,67 @@ class Metadata:
 
     @property
     def class_labels(self) -> NDArray[np.intp]:
-        """Class labels as a NumPy array."""
+        """Target class labels as integer indices.
+
+        Returns
+        -------
+        NDArray[np.intp]
+            Array of class indices corresponding to dataset targets. For
+            object detection datasets, contains one label per detection.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        Use class_names property to get human-readable label names.
+        """
         self._structure()
         return self._class_labels
 
     @property
     def class_names(self) -> Sequence[str]:
-        """Class names as a list of strings."""
+        """Human-readable names corresponding to class labels.
+
+        Returns
+        -------
+        Sequence[str]
+            List of class names where index corresponds to class label value.
+            Derived from dataset metadata or auto-generated from label indices.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        """
         self._structure()
         return self._class_names
 
     @property
     def image_indices(self) -> NDArray[np.intp]:
-        """Indices of images as a NumPy array."""
+        """Dataset indices linking targets back to source images.
+
+        Returns
+        -------
+        NDArray[np.intp]
+            Array mapping each target/detection back to its source image
+            index in the original dataset. Essential for object detection
+            datasets where multiple detections come from single images.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        """
         self._structure()
         return self._image_indices
 
     @property
     def image_count(self) -> int:
+        """Total number of images in the dataset.
+
+        Returns
+        -------
+        int
+            Count of unique images in the source dataset, regardless of
+            how many targets/detections each image contains.
+        """
         if self._count == 0:
             self._structure()
         return self._count
@@ -252,7 +490,7 @@ class Metadata:
         scores = []
         srcidx = []
         is_od = None
-        for i in range(len(self._dataset)):
+        for i in tqdm(range(len(self._dataset))):
             _, target, metadata = self._dataset[i]
 
             raw.append(metadata)
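With tqdm.auto imported at the top of the module, the structure pass now shows a progress bar while iterating the dataset. tqdm.auto picks the notebook widget under Jupyter and a plain console bar otherwise; a standalone illustration of the wrapper used above (the loop body is a placeholder):

```python
from tqdm.auto import tqdm  # same import the diff adds to _metadata.py

# tqdm wraps any iterable and reports progress; this mirrors the
# `for i in tqdm(range(len(self._dataset)))` pattern in the hunk above.
for i in tqdm(range(10_000), desc="structuring dataset"):
    pass  # per-item work goes here
```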
@@ -261,15 +499,15 @@ class Metadata:
                 target_labels = as_numpy(target.labels)
                 target_len = len(target_labels)
                 if target_len:
-                    labels.extend(target_labels.tolist())
-                    bboxes.extend(as_numpy(target.boxes).tolist())
-                    scores.extend(as_numpy(target.scores).tolist())
+                    labels.append(target_labels)
+                    bboxes.append(as_numpy(target.boxes))
+                    scores.append(as_numpy(target.scores))
                     srcidx.extend([i] * target_len)
             elif isinstance(target, Array):
-                if len(target):
-                    target_len = 1
-                    labels.append(int(np.argmax(as_numpy(target))))
-                    scores.append(target)
+                target_scores = as_numpy(target)
+                if len(target_scores):
+                    labels.append([np.argmax(target_scores)])
+                    scores.append([target_scores])
                     srcidx.append(i)
             else:
                 raise TypeError("Encountered unsupported target type in dataset")
@@ -278,10 +516,11 @@ class Metadata:
             if is_od != is_od_target:
                 raise ValueError("Encountered unexpected target type in dataset")
 
-        labels = as_numpy(labels).astype(np.intp)
-        scores = as_numpy(scores).astype(np.float32)
-        bboxes = as_numpy(bboxes).astype(np.float32) if is_od else None
-        srcidx = as_numpy(srcidx).astype(np.intp)
+        np_asarray: Callable[..., np.ndarray] = np.concatenate if srcidx else np.asarray
+        labels = np_asarray(labels, dtype=np.intp)
+        scores = np_asarray(scores, dtype=np.float32)
+        bboxes = np_asarray(bboxes, dtype=np.float32) if is_od else None
+        srcidx = np.asarray(srcidx, dtype=np.intp)
 
         index2label = self._dataset.metadata.get("index2label", {i: str(i) for i in np.unique(labels)})
 
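The hunk above replaces the extend/tolist collation with appending NumPy arrays and a single join at the end: np.concatenate when any targets were collected, np.asarray for the empty case. A self-contained sketch of the pattern, not taken from the package, showing the two routes agree:

```python
import numpy as np

batches = [np.random.rand(4, 3).astype(np.float32) for _ in range(3)]

# 0.87.0-style: round-trip every array through Python lists.
flat: list[list[float]] = []
for b in batches:
    flat.extend(b.tolist())
via_lists = np.asarray(flat, dtype=np.float32)

# 0.88.0-style: keep arrays intact and join once (the dtype keyword on
# np.concatenate requires NumPy >= 1.20).
via_concat = np.concatenate(batches, dtype=np.float32)

assert np.array_equal(via_lists, via_concat)
```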
@@ -370,16 +609,30 @@ class Metadata:
         self._is_binned = True
 
     def add_factors(self, factors: Mapping[str, Array | Sequence[Any]]) -> None:
-        """
-        Add additional factors to the metadata.
+        """Add additional factors to metadata collection.
 
-        The number of measures per factor must match the number of images
-        in the dataset or the number of detections in the dataset.
+        Extend the current metadata with new factors, automatically handling
+        length validation and integration with existing data structures.
 
         Parameters
         ----------
         factors : Mapping[str, Array | Sequence[Any]]
-            Dictionary of factors to add to the metadata.
+            Dictionary mapping factor names to their values. Factor length must
+            match either the number of images or number of detections in the dataset.
+
+        Raises
+        ------
+        ValueError
+            When factor lengths do not match dataset dimensions.
+
+        Examples
+        --------
+        >>> metadata = Metadata(dataset)
+        >>> new_factors = {
+        ...     "brightness": [0.2, 0.8, 0.5, 0.3, 0.4, 0.1, 0.3, 0.2],
+        ...     "contrast": [1.1, 0.9, 1.0, 0.8, 1.2, 1.0, 0.7, 1.3],
+        ... }
+        >>> metadata.add_factors(new_factors)
         """
         self._structure()
 
dataeval/data/_selection.py CHANGED
@@ -2,8 +2,9 @@ from __future__ import annotations
 
 __all__ = []
 
+from collections.abc import Iterator, Sequence
 from enum import IntEnum
-from typing import Generic, Iterator, Sequence, TypeVar
+from typing import Generic, TypeVar
 
 from dataeval.typing import AnnotatedDataset, DatasetMetadata
 
@@ -31,14 +32,21 @@ class Subselection(Generic[_TDatum]):
 
 class Select(AnnotatedDataset[_TDatum]):
     """
-    Wraps a dataset and applies selection criteria to it.
+    Dataset wrapper that applies selection criteria for filtering.
+
+    Wraps an existing dataset and applies one or more selection filters to
+    create a subset view without modifying the original dataset. Supports
+    chaining multiple selection criteria for complex filtering operations.
 
     Parameters
     ----------
-    dataset : Dataset
-        The dataset to wrap.
-    selections : Selection or list[Selection], optional
-        The selection criteria to apply to the dataset.
+    dataset : AnnotatedDataset[_TDatum]
+        Source dataset to wrap and filter. Must implement AnnotatedDataset
+        interface with indexed access to data tuples.
+    selections : Selection or Sequence[Selection] or None, default None
+        Selection criteria to apply for filtering the dataset. When None,
+        returns all items from the source dataset. Default None creates
+        unfiltered view for consistent interface.
 
     Examples
     --------
@@ -49,7 +57,7 @@ class Select(AnnotatedDataset[_TDatum]):
     >>> # - f"data_{idx}", one_hot_encoded(idx % class_count), {"id": idx}
     >>> dataset = SampleDataset(size=100, class_count=10)
 
-    >>> # Apply a selection criteria to the dataset
+    >>> # Apply selection criteria to the dataset
    >>> selections = [Limit(size=5), ClassFilter(classes=[0, 2])]
    >>> selected_dataset = Select(dataset, selections=selections)
 
@@ -61,6 +69,12 @@ class Select(AnnotatedDataset[_TDatum]):
     (data_10, 0, {'id': 10})
     (data_12, 2, {'id': 12})
     (data_20, 0, {'id': 20})
+
+    Notes
+    -----
+    Selection criteria are applied in the order provided, allowing for
+    efficient sequential filtering. The wrapper maintains all metadata
+    and interface compatibility with the original dataset.
     """
 
     _dataset: AnnotatedDataset[_TDatum]
@@ -91,6 +105,7 @@ class Select(AnnotatedDataset[_TDatum]):
 
     @property
     def metadata(self) -> DatasetMetadata:
+        """Dataset metadata information including identifier and configuration."""
         return self._metadata
 
     def __str__(self) -> str:
dataeval/data/_split.py CHANGED
@@ -4,7 +4,8 @@ __all__ = []
 
 import logging
 import warnings
-from typing import Any, Iterator, Protocol, Sequence
+from collections.abc import Iterator, Sequence
+from typing import Any, Protocol
 
 import numpy as np
 from numpy.typing import NDArray
dataeval/data/selections/_classfilter.py CHANGED
@@ -2,7 +2,8 @@ from __future__ import annotations
 
 __all__ = []
 
-from typing import Any, Generic, Iterable, Mapping, Sequence, Sized, TypeVar, cast
+from collections.abc import Iterable, Mapping, Sequence, Sized
+from typing import Any, Generic, TypeVar, cast
 
 import numpy as np
 from numpy.typing import NDArray
@@ -45,7 +46,7 @@ class ClassFilter(Selection[Any]):
             if label in self.classes:
                 # Include the image index
                 selection.append(idx)
-            elif isinstance(target, (ObjectDetectionTarget, SegmentationTarget)):
+            elif isinstance(target, ObjectDetectionTarget | SegmentationTarget):
                 # Get the set of labels from the target
                 labels = set(target.labels if isinstance(target.labels, Iterable) else [target.labels])
                 # Check to see if any labels are in the classes to filter for
@@ -68,7 +69,7 @@ _TTarget = TypeVar("_TTarget", ObjectDetectionTarget, SegmentationTarget)
 
 
 def _try_mask_object(obj: _T, mask: NDArray[np.bool_]) -> _T:
-    if not isinstance(obj, (str, bytes, bytearray)) and isinstance(obj, (Sequence, Array)) and len(obj) == len(mask):
+    if not isinstance(obj, str | bytes | bytearray) and isinstance(obj, Sequence | Array) and len(obj) == len(mask):
         return obj[mask] if isinstance(obj, Array) else cast(_T, [item for i, item in enumerate(obj) if mask[i]])
     return obj
 
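This file also adopts PEP 604 union syntax inside isinstance checks, a change repeated in _shuffle.py below. `X | Y` builds a types.UnionType that isinstance accepts natively on Python 3.10+, so these hunks implicitly assume that version floor. A quick standalone equivalence check:

```python
# Python 3.10+: isinstance accepts both the tuple and the union form.
value = 42
assert isinstance(value, (int, str))  # 0.87.0 style
assert isinstance(value, int | str)   # 0.88.0 style (PEP 604 union)
```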
dataeval/data/selections/_indices.py CHANGED
@@ -2,7 +2,8 @@ from __future__ import annotations
 
 __all__ = []
 
-from typing import Any, Sequence
+from collections.abc import Sequence
+from typing import Any
 
 from dataeval.data._selection import Select, Selection, SelectionStage
 
dataeval/data/selections/_shuffle.py CHANGED
@@ -2,7 +2,8 @@ from __future__ import annotations
 
 __all__ = []
 
-from typing import Any, Sequence
+from collections.abc import Sequence
+from typing import Any
 
 import numpy as np
 from numpy.random import BitGenerator, Generator, SeedSequence
@@ -33,7 +34,7 @@ class Shuffle(Selection[Any]):
     def __init__(
         self, seed: int | Sequence[int] | Array | SeedSequence | BitGenerator | Generator | None = None
     ) -> None:
-        self.seed = as_numpy(seed) if isinstance(seed, (Sequence, Array)) else seed
+        self.seed = as_numpy(seed) if isinstance(seed, Sequence | Array) else seed
 
     def __call__(self, dataset: Select[Any]) -> None:
         rng = np.random.default_rng(self.seed)
dataeval/detectors/drift/_base.py CHANGED
@@ -12,8 +12,9 @@ __all__ = []
 
 import math
 from abc import abstractmethod
+from collections.abc import Callable
 from functools import wraps
-from typing import Any, Callable, Literal, Protocol, TypeVar, runtime_checkable
+from typing import Any, Literal, Protocol, TypeVar, runtime_checkable
 
 import numpy as np
 from numpy.typing import NDArray
@@ -10,7 +10,8 @@ from __future__ import annotations
 
 __all__ = []
 
-from typing import Any, Callable
+from collections.abc import Callable
+from typing import Any
 
 import torch
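The remaining import hunks in this diff repeat one mechanical migration: abstract container and callable types (Callable, Iterable, Iterator, Mapping, Sequence, Sized) now come from collections.abc instead of typing, whose aliases for these were deprecated by PEP 585. A small runnable sketch of the new style, with hypothetical names:

```python
# 0.88.0 import style: ABCs from collections.abc, typing only for the rest.
from collections.abc import Callable, Sequence
from typing import Any

def apply_all(funcs: Sequence[Callable[[int], int]], x: int) -> list[Any]:
    # collections.abc generics subscript directly on Python 3.9+.
    return [f(x) for f in funcs]

print(apply_all([lambda v: v + 1, lambda v: v * 2], 10))  # [11, 20]
```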