dataeval 0.87.0__py3-none-any.whl → 0.88.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/_log.py +1 -1
- dataeval/_version.py +2 -2
- dataeval/data/_embeddings.py +78 -35
- dataeval/data/_images.py +41 -8
- dataeval/data/_metadata.py +294 -41
- dataeval/data/_selection.py +22 -7
- dataeval/data/_split.py +2 -1
- dataeval/data/selections/_classfilter.py +4 -3
- dataeval/data/selections/_indices.py +2 -1
- dataeval/data/selections/_shuffle.py +3 -2
- dataeval/detectors/drift/_base.py +2 -1
- dataeval/detectors/drift/_mmd.py +2 -1
- dataeval/detectors/drift/_nml/_base.py +1 -1
- dataeval/detectors/drift/_nml/_chunk.py +2 -1
- dataeval/detectors/drift/_nml/_result.py +3 -2
- dataeval/detectors/drift/_nml/_thresholds.py +6 -5
- dataeval/detectors/drift/_uncertainty.py +2 -1
- dataeval/detectors/linters/duplicates.py +2 -1
- dataeval/detectors/linters/outliers.py +4 -3
- dataeval/detectors/ood/ae.py +1 -1
- dataeval/detectors/ood/base.py +2 -1
- dataeval/detectors/ood/mixin.py +2 -1
- dataeval/metadata/_utils.py +1 -1
- dataeval/metrics/bias/_balance.py +1 -1
- dataeval/metrics/stats/_base.py +3 -29
- dataeval/metrics/stats/_boxratiostats.py +2 -1
- dataeval/metrics/stats/_dimensionstats.py +2 -1
- dataeval/metrics/stats/_hashstats.py +2 -1
- dataeval/metrics/stats/_pixelstats.py +2 -1
- dataeval/metrics/stats/_visualstats.py +2 -1
- dataeval/outputs/_base.py +2 -3
- dataeval/outputs/_bias.py +2 -1
- dataeval/outputs/_estimators.py +1 -1
- dataeval/outputs/_linters.py +3 -3
- dataeval/outputs/_stats.py +3 -3
- dataeval/outputs/_utils.py +1 -1
- dataeval/outputs/_workflows.py +85 -30
- dataeval/typing.py +11 -9
- dataeval/utils/_array.py +3 -2
- dataeval/utils/_bin.py +2 -1
- dataeval/utils/_method.py +2 -3
- dataeval/utils/_multiprocessing.py +34 -0
- dataeval/utils/_plot.py +2 -1
- dataeval/utils/data/__init__.py +4 -5
- dataeval/utils/data/{metadata.py → _merge.py} +3 -2
- dataeval/utils/data/_validate.py +2 -1
- dataeval/utils/data/collate.py +2 -1
- dataeval/utils/torch/_internal.py +2 -1
- dataeval/utils/torch/trainer.py +1 -1
- dataeval/workflows/sufficiency.py +12 -9
- {dataeval-0.87.0.dist-info → dataeval-0.88.1.dist-info}/METADATA +4 -5
- dataeval-0.88.1.dist-info/RECORD +105 -0
- dataeval/utils/data/_dataset.py +0 -253
- dataeval-0.87.0.dist-info/RECORD +0 -105
- {dataeval-0.87.0.dist-info → dataeval-0.88.1.dist-info}/WHEEL +0 -0
- {dataeval-0.87.0.dist-info → dataeval-0.88.1.dist-info}/licenses/LICENSE +0 -0
dataeval/data/_metadata.py
CHANGED
@@ -3,12 +3,14 @@ from __future__ import annotations
 __all__ = []

 import warnings
+from collections.abc import Callable, Iterable, Mapping, Sequence, Sized
 from dataclasses import dataclass
-from typing import Any,
+from typing import Any, Literal

 import numpy as np
 import polars as pl
 from numpy.typing import NDArray
+from tqdm.auto import tqdm

 from dataeval.typing import (
     AnnotatedDataset,
@@ -17,7 +19,7 @@ from dataeval.typing import (
 )
 from dataeval.utils._array import as_numpy
 from dataeval.utils._bin import bin_data, digitize_data, is_continuous
-from dataeval.utils.data.
+from dataeval.utils.data._merge import merge


 def _binned(name: str) -> str:
@@ -44,21 +46,32 @@ def _to_col(name: str, info: FactorInfo, binned: bool = True) -> str:


 class Metadata:
-    """
-
+    """Collection of binned metadata using Polars DataFrames.
+
+    Processes dataset metadata by automatically binning continuous factors and digitizing
+    categorical factors for analysis and visualization workflows.

     Parameters
     ----------
     dataset : ImageClassificationDataset or ObjectDetectionDataset
-        Dataset
+        Dataset that provides original targets and metadata for processing.
     continuous_factor_bins : Mapping[str, int | Sequence[float]] | None, default None
-        Mapping from continuous factor
+        Mapping from continuous factor names to bin counts or explicit bin edges.
+        When None, uses automatic discretization.
     auto_bin_method : Literal["uniform_width", "uniform_count", "clusters"], default "uniform_width"
-
+        Binning strategy for continuous factors without explicit bins. Default "uniform_width"
+        provides intuitive equal-width intervals for most distributions.
     exclude : Sequence[str] | None, default None
-
+        Factor names to exclude from processing. Cannot be used with `include` parameter.
+        When None, processes all available factors.
     include : Sequence[str] | None, default None
-
+        Factor names to include in processing. Cannot be used with `exclude` parameter.
+        When None, processes all available factors.
+
+    Raises
+    ------
+    ValueError
+        When both exclude and include parameters are specified simultaneously.
     """

     def __init__(
@@ -94,17 +107,48 @@ class Metadata:

     @property
     def raw(self) -> Sequence[Mapping[str, Any]]:
-        """
+        """Original metadata dictionaries extracted from the dataset.
+
+        Access the unprocessed metadata as it was provided in the original dataset before
+        any binning, filtering, or transformation operations.
+
+        Returns
+        -------
+        Sequence[Mapping[str, Any]]
+            List of metadata dictionaries, one per dataset item, containing the original key-value
+            pairs as provided in the source data
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        """
         self._structure()
         return self._raw

     @property
     def exclude(self) -> set[str]:
-        """
+        """Factor names excluded from metadata processing.
+
+        Returns
+        -------
+        set[str]
+            Set of factor names that are filtered out during processing.
+            Empty set when no exclusions are active.
+
+        """
         return self._exclude

     @exclude.setter
     def exclude(self, value: Sequence[str]) -> None:
+        """Set factor names to exclude from processing.
+
+        Automatically clears include filter and resets binning state when exclusion list changes.
+
+        Parameters
+        ----------
+        value : Sequence[str]
+            Factor names to exclude from metadata analysis.
+        """
         exclude = set(value)
         if self._exclude != exclude:
             self._exclude = exclude
@@ -113,11 +157,27 @@ class Metadata:

     @property
     def include(self) -> set[str]:
-        """
+        """Factor names included in metadata processing.
+
+        Returns
+        -------
+        set[str]
+            Set of factor names that are processed during analysis. Empty set when no inclusion filter is active.
+        """
         return self._include

     @include.setter
     def include(self, value: Sequence[str]) -> None:
+        """Set factor names to include in processing.
+
+        Automatically clears exclude filter and resets binning state when
+        inclusion list changes.
+
+        Parameters
+        ----------
+        value : Sequence[str]
+            Factor names to include in metadata analysis.
+        """
         include = set(value)
         if self._include != include:
             self._include = include
@@ -126,41 +186,120 @@ class Metadata:

     @property
     def continuous_factor_bins(self) -> Mapping[str, int | Sequence[float]]:
-        """
+        """Binning configuration for continuous factors.
+
+        Returns
+        -------
+        Mapping[str, int | Sequence[float]]
+            Dictionary mapping factor names to either the number of bins
+            (int) or explicit bin edges (sequence of floats).
+        """
         return self._continuous_factor_bins

     @continuous_factor_bins.setter
     def continuous_factor_bins(self, bins: Mapping[str, int | Sequence[float]]) -> None:
+        """Update binning configuration for continuous factors.
+
+        Triggers re-binning when configuration changes to ensure data
+        consistency with new bin specifications.
+
+        Parameters
+        ----------
+        bins : Mapping[str, int | Sequence[float]]
+            Dictionary mapping factor names to bin counts or explicit edges.
+        """
         if self._continuous_factor_bins != bins:
             self._continuous_factor_bins = dict(bins)
             self._reset_bins(bins)

     @property
     def auto_bin_method(self) -> Literal["uniform_width", "uniform_count", "clusters"]:
-        """
+        """Automatic binning strategy for continuous factors.
+
+        Returns
+        -------
+        {"uniform_width", "uniform_count", "clusters"}
+            Current method used for automatic discretization of continuous
+            factors that lack explicit bin specifications.
+        """
         return self._auto_bin_method

     @auto_bin_method.setter
     def auto_bin_method(self, method: Literal["uniform_width", "uniform_count", "clusters"]) -> None:
+        """Set automatic binning strategy for continuous factors.
+
+        Triggers re-binning with the new method when strategy changes to
+        ensure consistent discretization across all factors.
+
+        Parameters
+        ----------
+        method : {"uniform_width", "uniform_count", "clusters"}
+            Binning strategy to apply for continuous factors without
+            explicit bin configurations.
+        """
         if self._auto_bin_method != method:
             self._auto_bin_method = method
             self._reset_bins()

     @property
     def dataframe(self) -> pl.DataFrame:
-        """
+        """Processed DataFrame containing targets and metadata factors.
+
+        Access the main data structure with target information (class labels,
+        scores, bounding boxes) and processed metadata factors ready for analysis.
+
+        Returns
+        -------
+        pl.DataFrame
+            DataFrame with columns for image indices, class labels, scores,
+            bounding boxes (when applicable), and all processed metadata factors.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        Factor binning occurs automatically when accessing factor-related data.
+        """
         self._structure()
         return self._dataframe

     @property
     def dropped_factors(self) -> Mapping[str, Sequence[str]]:
-        """Factors
+        """Factors removed during preprocessing with removal reasons.
+
+        Returns
+        -------
+        Mapping[str, Sequence[str]]
+            Dictionary mapping dropped factor names to lists of reasons
+            why they were excluded from the final dataset.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        Common removal reasons include incompatible data types, excessive
+        missing values, or insufficient variation.
+        """
         self._structure()
         return self._dropped_factors

     @property
     def digitized_data(self) -> NDArray[np.int64]:
-        """Factor data with
+        """Factor data with categorical values converted to integer codes.
+
+        Access processed factor data where categorical factors are digitized
+        to integer codes but continuous factors remain in their original form.
+
+        Returns
+        -------
+        NDArray[np.int64]
+            Array with shape (n_samples, n_factors) containing integer-coded
+            categorical data. Returns empty array when no factors are available.
+
+        Notes
+        -----
+        This property triggers factor binning analysis on first access.
+        Use this for algorithms that can handle mixed categorical and
+        continuous data types.
+        """
         if not self.factor_names:
             return np.array([], dtype=np.int64)

@@ -173,7 +312,23 @@ class Metadata:

     @property
     def binned_data(self) -> NDArray[np.int64]:
-        """Factor data with
+        """Factor data with continuous values discretized into bins.
+
+        Access fully processed factor data where both categorical and
+        continuous factors are converted to integer bin indices.
+
+        Returns
+        -------
+        NDArray[np.int64]
+            Array with shape (n_samples, n_factors) containing binned integer
+            data ready for categorical analysis algorithms. Returns empty array
+            when no factors are available.
+
+        Notes
+        -----
+        This property triggers factor binning analysis on first access.
+        Use this for algorithms requiring purely discrete input data.
+        """
         if not self.factor_names:
             return np.array([], dtype=np.int64)

@@ -186,19 +341,59 @@ class Metadata:

     @property
     def factor_names(self) -> Sequence[str]:
-        """
+        """Names of all processed metadata factors.
+
+        Returns
+        -------
+        Sequence[str]
+            List of factor names that passed filtering and preprocessing steps.
+            Order matches columns in factor_data, digitized_data, and binned_data.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        Factor names respect include/exclude filtering settings.
+        """
         self._structure()
         return list(filter(self._filter, self._factors))

     @property
     def factor_info(self) -> Mapping[str, FactorInfo]:
-        """
+        """Type information and processing status for each factor.
+
+        Returns
+        -------
+        Mapping[str, FactorInfo]
+            Dictionary mapping factor names to FactorInfo objects containing
+            data type classification and processing flags (binned, digitized).
+
+        Notes
+        -----
+        This property triggers factor binning analysis on first access.
+        Only includes factors that survived preprocessing and filtering.
+        """
         self._bin()
         return dict(filter(self._filter, ((k, v) for k, v in self._factors.items() if v is not None)))

     @property
     def factor_data(self) -> NDArray[Any]:
-        """
+        """Raw factor values before binning or digitization.
+
+        Access unprocessed factor data in its original numeric form before
+        any categorical encoding or binning transformations are applied.
+
+        Returns
+        -------
+        NDArray[Any]
+            Array with shape (n_samples, n_factors) containing original factor
+            values. Returns empty array when no factors are available.
+
+        Notes
+        -----
+        Use this for algorithms that can work with mixed data types or when
+        you need access to original continuous values. For analysis-ready
+        integer data, use binned_data or digitized_data instead.
+        """
         if not self.factor_names:
             return np.array([], dtype=np.float64)

@@ -207,24 +402,67 @@ class Metadata:

     @property
     def class_labels(self) -> NDArray[np.intp]:
-        """
+        """Target class labels as integer indices.
+
+        Returns
+        -------
+        NDArray[np.intp]
+            Array of class indices corresponding to dataset targets. For
+            object detection datasets, contains one label per detection.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        Use class_names property to get human-readable label names.
+        """
         self._structure()
         return self._class_labels

     @property
     def class_names(self) -> Sequence[str]:
-        """
+        """Human-readable names corresponding to class labels.
+
+        Returns
+        -------
+        Sequence[str]
+            List of class names where index corresponds to class label value.
+            Derived from dataset metadata or auto-generated from label indices.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        """
         self._structure()
         return self._class_names

     @property
     def image_indices(self) -> NDArray[np.intp]:
-        """
+        """Dataset indices linking targets back to source images.
+
+        Returns
+        -------
+        NDArray[np.intp]
+            Array mapping each target/detection back to its source image
+            index in the original dataset. Essential for object detection
+            datasets where multiple detections come from single images.
+
+        Notes
+        -----
+        This property triggers dataset structure analysis on first access.
+        """
         self._structure()
         return self._image_indices

     @property
     def image_count(self) -> int:
+        """Total number of images in the dataset.
+
+        Returns
+        -------
+        int
+            Count of unique images in the source dataset, regardless of
+            how many targets/detections each image contains.
+        """
         if self._count == 0:
             self._structure()
         return self._count
@@ -252,7 +490,7 @@ class Metadata:
         scores = []
         srcidx = []
         is_od = None
-        for i in range(len(self._dataset)):
+        for i in tqdm(range(len(self._dataset))):
             _, target, metadata = self._dataset[i]

             raw.append(metadata)
@@ -261,15 +499,15 @@ class Metadata:
                 target_labels = as_numpy(target.labels)
                 target_len = len(target_labels)
                 if target_len:
-                    labels.
-                    bboxes.
-                    scores.
+                    labels.append(target_labels)
+                    bboxes.append(as_numpy(target.boxes))
+                    scores.append(as_numpy(target.scores))
                     srcidx.extend([i] * target_len)
             elif isinstance(target, Array):
-
-
-                labels.append(
-                scores.append(
+                target_scores = as_numpy(target)
+                if len(target_scores):
+                    labels.append([np.argmax(target_scores)])
+                    scores.append([target_scores])
                 srcidx.append(i)
             else:
                 raise TypeError("Encountered unsupported target type in dataset")
@@ -278,10 +516,11 @@ class Metadata:
             if is_od != is_od_target:
                 raise ValueError("Encountered unexpected target type in dataset")

-
-
-
-
+        np_asarray: Callable[..., np.ndarray] = np.concatenate if srcidx else np.asarray
+        labels = np_asarray(labels, dtype=np.intp)
+        scores = np_asarray(scores, dtype=np.float32)
+        bboxes = np_asarray(bboxes, dtype=np.float32) if is_od else None
+        srcidx = np.asarray(srcidx, dtype=np.intp)

         index2label = self._dataset.metadata.get("index2label", {i: str(i) for i in np.unique(labels)})

@@ -370,16 +609,30 @@ class Metadata:
         self._is_binned = True

     def add_factors(self, factors: Mapping[str, Array | Sequence[Any]]) -> None:
-        """
-        Add additional factors to the metadata.
+        """Add additional factors to metadata collection.

-
-
+        Extend the current metadata with new factors, automatically handling
+        length validation and integration with existing data structures.

         Parameters
         ----------
         factors : Mapping[str, Array | Sequence[Any]]
-            Dictionary
+            Dictionary mapping factor names to their values. Factor length must
+            match either the number of images or number of detections in the dataset.
+
+        Raises
+        ------
+        ValueError
+            When factor lengths do not match dataset dimensions.
+
+        Examples
+        --------
+        >>> metadata = Metadata(dataset)
+        >>> new_factors = {
+        ...     "brightness": [0.2, 0.8, 0.5, 0.3, 0.4, 0.1, 0.3, 0.2],
+        ...     "contrast": [1.1, 0.9, 1.0, 0.8, 1.2, 1.0, 0.7, 1.3],
+        ... }
+        >>> metadata.add_factors(new_factors)
         """
         self._structure()

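For orientation, a minimal usage sketch of the Metadata API documented in the docstrings above (not part of the package diff). The dataset object, factor names, and bin counts are placeholders, and Metadata is assumed to be re-exported from dataeval.data:

from dataeval.data import Metadata

# Build metadata from an annotated dataset (image classification or object detection).
metadata = Metadata(
    dataset,                                  # placeholder: your own AnnotatedDataset instance
    continuous_factor_bins={"altitude": 5},   # hypothetical continuous factor with an explicit bin count
    auto_bin_method="uniform_width",          # strategy for factors without explicit bins
    exclude=["timestamp"],                    # mutually exclusive with include
)

# First access of these properties triggers dataset structure analysis and binning.
names = metadata.factor_names      # factors that survived filtering
binned = metadata.binned_data      # (n_samples, n_factors) integer bin indices
frame = metadata.dataframe         # Polars DataFrame with targets and processed factors

# Additional per-item factors can be appended later; lengths must match image or detection counts.
metadata.add_factors({"brightness": [0.2, 0.8, 0.5, 0.3]})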
dataeval/data/_selection.py
CHANGED
@@ -2,8 +2,9 @@ from __future__ import annotations

 __all__ = []

+from collections.abc import Iterator, Sequence
 from enum import IntEnum
-from typing import Generic,
+from typing import Generic, TypeVar

 from dataeval.typing import AnnotatedDataset, DatasetMetadata

@@ -31,14 +32,21 @@ class Subselection(Generic[_TDatum]):


 class Select(AnnotatedDataset[_TDatum]):
     """
-
+    Dataset wrapper that applies selection criteria for filtering.
+
+    Wraps an existing dataset and applies one or more selection filters to
+    create a subset view without modifying the original dataset. Supports
+    chaining multiple selection criteria for complex filtering operations.

     Parameters
     ----------
-    dataset :
-
-
-
+    dataset : AnnotatedDataset[_TDatum]
+        Source dataset to wrap and filter. Must implement AnnotatedDataset
+        interface with indexed access to data tuples.
+    selections : Selection or Sequence[Selection] or None, default None
+        Selection criteria to apply for filtering the dataset. When None,
+        returns all items from the source dataset. Default None creates
+        unfiltered view for consistent interface.

     Examples
     --------
@@ -49,7 +57,7 @@ class Select(AnnotatedDataset[_TDatum]):
     >>> # - f"data_{idx}", one_hot_encoded(idx % class_count), {"id": idx}
     >>> dataset = SampleDataset(size=100, class_count=10)

-    >>> # Apply
+    >>> # Apply selection criteria to the dataset
     >>> selections = [Limit(size=5), ClassFilter(classes=[0, 2])]
     >>> selected_dataset = Select(dataset, selections=selections)

@@ -61,6 +69,12 @@ class Select(AnnotatedDataset[_TDatum]):
     (data_10, 0, {'id': 10})
     (data_12, 2, {'id': 12})
     (data_20, 0, {'id': 20})
+
+    Notes
+    -----
+    Selection criteria are applied in the order provided, allowing for
+    efficient sequential filtering. The wrapper maintains all metadata
+    and interface compatibility with the original dataset.
     """

     _dataset: AnnotatedDataset[_TDatum]
@@ -91,6 +105,7 @@ class Select(AnnotatedDataset[_TDatum]):

     @property
     def metadata(self) -> DatasetMetadata:
+        """Dataset metadata information including identifier and configuration."""
         return self._metadata

     def __str__(self) -> str:
dataeval/data/_split.py
CHANGED
@@ -2,7 +2,8 @@ from __future__ import annotations

 __all__ = []

-from
+from collections.abc import Iterable, Mapping, Sequence, Sized
+from typing import Any, Generic, TypeVar, cast

 import numpy as np
 from numpy.typing import NDArray
dataeval/data/selections/_classfilter.py
CHANGED
@@ -45,7 +46,7 @@ class ClassFilter(Selection[Any]):
                 if label in self.classes:
                     # Include the image index
                     selection.append(idx)
-            elif isinstance(target,
+            elif isinstance(target, ObjectDetectionTarget | SegmentationTarget):
                 # Get the set of labels from the target
                 labels = set(target.labels if isinstance(target.labels, Iterable) else [target.labels])
                 # Check to see if any labels are in the classes to filter for
@@ -68,7 +69,7 @@ _TTarget = TypeVar("_TTarget", ObjectDetectionTarget, SegmentationTarget)


 def _try_mask_object(obj: _T, mask: NDArray[np.bool_]) -> _T:
-    if not isinstance(obj,
+    if not isinstance(obj, str | bytes | bytearray) and isinstance(obj, Sequence | Array) and len(obj) == len(mask):
         return obj[mask] if isinstance(obj, Array) else cast(_T, [item for i, item in enumerate(obj) if mask[i]])
     return obj

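A recurring pattern in these one-line changes is swapping tuple-style isinstance checks for PEP 604 union syntax (X | Y), which isinstance accepts on Python 3.10 and later. A standalone illustration, not taken from the package:

from collections.abc import Sequence

value = [1, 2, 3]

# Tuple form, accepted on every Python 3 version.
print(isinstance(value, (str, bytes, bytearray)))   # False

# Union form, accepted by isinstance since Python 3.10 (PEP 604).
print(isinstance(value, Sequence | bytes))          # True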
dataeval/data/selections/_shuffle.py
CHANGED
@@ -2,7 +2,8 @@ from __future__ import annotations

 __all__ = []

-from
+from collections.abc import Sequence
+from typing import Any

 import numpy as np
 from numpy.random import BitGenerator, Generator, SeedSequence
@@ -33,7 +34,7 @@ class Shuffle(Selection[Any]):
     def __init__(
         self, seed: int | Sequence[int] | Array | SeedSequence | BitGenerator | Generator | None = None
     ) -> None:
-        self.seed = as_numpy(seed) if isinstance(seed,
+        self.seed = as_numpy(seed) if isinstance(seed, Sequence | Array) else seed

     def __call__(self, dataset: Select[Any]) -> None:
         rng = np.random.default_rng(self.seed)
@@ -12,8 +12,9 @@ __all__ = []

 import math
 from abc import abstractmethod
+from collections.abc import Callable
 from functools import wraps
-from typing import Any,
+from typing import Any, Literal, Protocol, TypeVar, runtime_checkable

 import numpy as np
 from numpy.typing import NDArray
|