dataeval 0.82.1__py3-none-any.whl → 0.84.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +7 -2
- dataeval/config.py +13 -3
- dataeval/metadata/__init__.py +2 -2
- dataeval/metadata/_ood.py +144 -27
- dataeval/metrics/bias/__init__.py +11 -1
- dataeval/metrics/bias/_balance.py +3 -3
- dataeval/metrics/bias/_completeness.py +130 -0
- dataeval/metrics/estimators/_ber.py +2 -1
- dataeval/metrics/stats/_base.py +31 -36
- dataeval/metrics/stats/_dimensionstats.py +2 -2
- dataeval/metrics/stats/_hashstats.py +2 -2
- dataeval/metrics/stats/_imagestats.py +4 -4
- dataeval/metrics/stats/_labelstats.py +4 -45
- dataeval/metrics/stats/_pixelstats.py +2 -2
- dataeval/metrics/stats/_visualstats.py +2 -2
- dataeval/outputs/__init__.py +4 -2
- dataeval/outputs/_bias.py +31 -22
- dataeval/outputs/_metadata.py +7 -0
- dataeval/outputs/_stats.py +2 -3
- dataeval/typing.py +43 -12
- dataeval/utils/_array.py +26 -1
- dataeval/utils/_mst.py +1 -2
- dataeval/utils/data/_dataset.py +2 -0
- dataeval/utils/data/_embeddings.py +115 -32
- dataeval/utils/data/_images.py +38 -15
- dataeval/utils/data/_selection.py +7 -8
- dataeval/utils/data/_split.py +76 -129
- dataeval/utils/data/datasets/_base.py +4 -2
- dataeval/utils/data/datasets/_cifar10.py +17 -9
- dataeval/utils/data/datasets/_milco.py +18 -12
- dataeval/utils/data/datasets/_mnist.py +24 -8
- dataeval/utils/data/datasets/_ships.py +18 -8
- dataeval/utils/data/datasets/_types.py +1 -5
- dataeval/utils/data/datasets/_voc.py +47 -24
- dataeval/utils/data/selections/__init__.py +2 -0
- dataeval/utils/data/selections/_classfilter.py +1 -1
- dataeval/utils/data/selections/_prioritize.py +296 -0
- dataeval/utils/data/selections/_shuffle.py +13 -4
- dataeval/utils/metadata.py +1 -1
- dataeval/utils/torch/_gmm.py +3 -2
- {dataeval-0.82.1.dist-info → dataeval-0.84.0.dist-info}/METADATA +4 -4
- {dataeval-0.82.1.dist-info → dataeval-0.84.0.dist-info}/RECORD +44 -43
- dataeval/detectors/ood/metadata_ood_mi.py +0 -91
- {dataeval-0.82.1.dist-info → dataeval-0.84.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.82.1.dist-info → dataeval-0.84.0.dist-info}/WHEEL +0 -0
dataeval/utils/data/_split.py
CHANGED
@@ -2,19 +2,22 @@ from __future__ import annotations
 
 __all__ = []
 
+import logging
 import warnings
-from typing import Any, Iterator, Protocol
+from typing import Any, Iterator, Protocol, Sequence
 
 import numpy as np
 from numpy.typing import NDArray
-from sklearn.cluster import KMeans
-from sklearn.metrics import silhouette_score
 from sklearn.model_selection import GroupKFold, KFold, StratifiedGroupKFold, StratifiedKFold
 from sklearn.utils.multiclass import type_of_target
 
-from dataeval.config import
+from dataeval.config import EPSILON
 from dataeval.outputs._base import set_metadata
 from dataeval.outputs._utils import SplitDatasetOutput, TrainValSplit
+from dataeval.typing import AnnotatedDataset
+from dataeval.utils.data._metadata import Metadata
+
+_logger = logging.getLogger(__name__)
 
 
 class KFoldSplitter(Protocol):
@@ -85,7 +88,7 @@ def calculate_validation_fraction(num_folds: int, test_frac: float, val_frac: fl
     return val_base * (1.0 / num_folds) * (1.0 - test_frac)
 
 
-def
+def validate_labels(labels: NDArray[np.intp], total_partitions: int) -> None:
     """
     Check to make sure there is more input data than the total number of partitions requested
 
@@ -116,7 +119,7 @@ def _validate_labels(labels: NDArray[np.intp], total_partitions: int) -> None:
         raise ValueError("Detected continuous labels. Labels must be discrete for proper stratification")
 
 
-def
+def validate_stratifiable(labels: NDArray[np.intp], num_partitions: int) -> None:
     """
     Check if the dataset can be stratified by class label over the given number of partitions
 
@@ -132,26 +135,23 @@ def is_stratifiable(labels: NDArray[np.intp], num_partitions: int) -> bool:
     bool
         True if dataset can be stratified else False
 
-
-
-
-
+    Raises
+    ------
+    ValueError
+        If the dataset cannot be stratified due to the total number of [train, val, test]
         partitions exceeding the number of instances of the rarest class label.
     """
 
     # Get the minimum count of all labels
     lowest_label_count = np.unique(labels, return_counts=True)[1].min()
     if lowest_label_count < num_partitions:
-
+        raise ValueError(
            f"Unable to stratify due to label frequency. The lowest label count ({lowest_label_count}) is fewer "
-            f"than the total number of partitions ({num_partitions}) requested."
-            UserWarning,
+            f"than the total number of partitions ({num_partitions}) requested."
        )
-        return False
-    return True
 
 
-def
+def validate_groupable(groups: NDArray[np.intp], num_partitions: int) -> None:
     """
     Warns user if the number of unique group_ids is incompatible with a grouped partition containing
     num_folds folds. If this is the case, returns groups=None, which tells the partitioner not to
@@ -159,7 +159,7 @@ def is_groupable(group_ids: NDArray[np.intp], num_partitions: int) -> bool:
 
     Parameters
     ----------
-
+    groups : NDArray of ints
         The id of the group each sample at the corresponding index belongs to
     num_partitions : int
         Total number of train, val, and test splits requested
@@ -169,60 +169,24 @@ def is_groupable(group_ids: NDArray[np.intp], num_partitions: int) -> bool:
     bool
         True if the dataset can be grouped by the given group ids else False
 
-
-
-
-
+    Raises
+    ------
+    ValueError
+        If there are is only one unique group.
+    ValueError
+        If there are fewer groups than the requested number of partitions plus one
     """
 
-    num_unique_groups = len(np.unique(
+    num_unique_groups = len(np.unique(groups))
     # Cannot separate if only one group exists
     if num_unique_groups == 1:
-
+        raise ValueError(f"Unique groups ({num_unique_groups}) must be greater than 1.")
 
     if num_unique_groups < num_partitions:
-
-            f"Groups must be greater than num partitions. Got {num_unique_groups} and {num_partitions}. "
-            "Reverting to ungrouped partitioning",
-            UserWarning,
-        )
-        return False
-    return True
-
-
-def bin_kmeans(array: NDArray[Any]) -> NDArray[np.intp]:
-    """
-    Find bins of continuous data by iteratively applying k-means clustering, and keeping the
-    clustering with the highest silhouette score.
-
-    Parameters
-    ----------
-    array : NDArray
-        continuous data to bin
+        raise ValueError(f"Unique groups ({num_unique_groups}) must be greater than num partitions ({num_partitions}).")
 
-    Returns
-    -------
-    NDArray[int]:
-        bin numbers assigned by the kmeans best clusterer.
-    """
 
-
-    array = array.reshape([-1, 1])
-        best_score = 0.60
-    else:
-        best_score = 0.50
-    bin_index = np.zeros(len(array), dtype=np.intp)
-    for k in range(2, 20):
-        clusterer = KMeans(n_clusters=k, random_state=get_seed())
-        cluster_labels = clusterer.fit_predict(array)
-        score = silhouette_score(array, cluster_labels, sample_size=25_000, random_state=get_seed())
-        if score > best_score:
-            best_score = score
-            bin_index = cluster_labels.astype(np.intp)
-    return bin_index
-
-
-def get_group_ids(metadata: dict[str, Any], group_names: list[str], num_samples: int) -> NDArray[np.intp]:
+def get_groups(metadata: Metadata, split_on: Sequence[str] | None) -> NDArray[np.intp] | None:
     """
     Returns individual group numbers based on a subset of metadata defined by groupnames
 
@@ -232,32 +196,20 @@ def get_group_ids(metadata: dict[str, Any], group_names: list[str], num_samples:
         dictionary containing all metadata
     groupnames : list
         which groups from the metadata dictionary to consider for dataset grouping
-    num_samples : int
-        number of labels. Used to ensure agreement between input data/labels and metadata entries.
-
-    Raises
-    ------
-    IndexError
-        raised if an entry in the metadata dictionary doesn't have the same length as num_samples
 
     Returns
     -------
     np.ndarray
         group identifiers from metadata
     """
-
-    if
-        return
-
-
-
-
-
-
-        if type_of_target(feature) == "continuous":
-            features2group[name] = bin_kmeans(feature)
-    binned_features = np.stack(list(features2group.values()), axis=1)
-    _, group_ids = np.unique(binned_features, axis=0, return_inverse=True)
+    # get only the factors that are present in the metadata
+    if split_on is None:
+        return None
+
+    split_set = set(split_on)
+    indices = [i for i, name in enumerate(metadata.discrete_factor_names) if name in split_set]
+    binned_features = metadata.discrete_data[:, indices]
+    group_ids = np.unique(binned_features, axis=0, return_inverse=True)[1]
     return group_ids
 
 
@@ -294,10 +246,18 @@ def make_splits(
     split_defs: list[TrainValSplit] = []
     n_labels = len(np.unique(labels))
     splitter = KFOLD_GROUP_STRATIFIED_MAP[(groups is not None, stratified)](n_folds)
+    _logger.log(logging.DEBUG, f"splitter={splitter.__class__.__name__}(n_splits={n_folds})")
     good = False
     attempts = 0
     while not good and attempts < 3:
         attempts += 1
+        _logger.log(
+            logging.DEBUG,
+            f"attempt={attempts}: splitter.split("
+            + f"index=arr(len={len(index)}, unique={np.unique(index)}), "
+            + f"labels=arr(len={len(index)}, unique={np.unique(index)}), "
+            + ("groups=None" if groups is None else f"groups=arr(len={len(groups)}, unique={np.unique(groups)}))"),
+        )
         splits = splitter.split(index, labels, groups)
         split_defs.clear()
         for train_idx, eval_idx in splits:
@@ -341,20 +301,20 @@ def find_best_split(
         counts = np.bincount(arr, minlength=minlength)
         return counts / np.sum(counts)
 
-    def weight(arr: NDArray, class_freq: NDArray) ->
-        return np.sum(np.abs(freq(arr, len(class_freq)) - class_freq))
+    def weight(arr: NDArray, class_freq: NDArray) -> float:
+        return float(np.sum(np.abs(freq(arr, len(class_freq)) - class_freq)))
 
-    def class_freq_diff(split: TrainValSplit) ->
+    def class_freq_diff(split: TrainValSplit) -> float:
         class_freq = freq(labels)
         return weight(labels[split.train], class_freq) + weight(labels[split.val], class_freq)
 
-    def split_ratio(split: TrainValSplit) ->
-        return
+    def split_ratio(split: TrainValSplit) -> float:
+        return len(split.val) / (len(split.val) + len(split.train))
 
-    def split_diff(split: TrainValSplit) ->
+    def split_diff(split: TrainValSplit) -> float:
         return abs(split_frac - split_ratio(split))
 
-    def split_inv_diff(split: TrainValSplit) ->
+    def split_inv_diff(split: TrainValSplit) -> float:
         return abs(1 - split_frac - split_ratio(split))
 
     # Selects minimization function based on inputs
@@ -399,11 +359,12 @@ def single_split(
         Indices of data partitioned for training and evaluation
     """
 
-
-    max_folds =
-
-    divisor = split_frac
-    n_folds =
+    unique_groups = 2 if groups is None else len(np.unique(groups))
+    max_folds = min(min(np.unique(labels, return_counts=True)[1]), unique_groups) if stratified else unique_groups
+
+    divisor = split_frac if split_frac <= 2 / 3 else 1 - split_frac
+    n_folds = min(max(round(1 / (divisor + EPSILON)), 2), max_folds)  # Clips value between 2 and max_folds
+    _logger.log(logging.DEBUG, f"n_folds={n_folds} clipped between[2, {max_folds}]")
 
     split_candidates = make_splits(index, labels, n_folds, groups, stratified)
     return find_best_split(labels, split_candidates, stratified, split_frac)
@@ -411,22 +372,20 @@ def single_split(
 
 @set_metadata
 def split_dataset(
-
+    dataset: AnnotatedDataset[Any] | Metadata,
     num_folds: int = 1,
     stratify: bool = False,
-    split_on:
-    metadata: dict[str, Any] | None = None,
+    split_on: Sequence[str] | None = None,
     test_frac: float = 0.0,
     val_frac: float = 0.0,
 ) -> SplitDatasetOutput:
     """
-
-    Indices for a test holdout may also be optionally included
+    Dataset splitting function. Returns a dataclass containing a list of train and validation indices.
 
     Parameters
     ----------
-
-
+    dataset : AnnotatedDataset or Metadata
+        Dataset to split.
     num_folds : int, default 1
         Number of [train, val] folds. If equal to 1, val_frac must be greater than 0.0
     stratify : bool, default False
@@ -436,8 +395,6 @@ def split_dataset(
         Keys of the metadata dictionary upon which to group the dataset.
         A grouped partition is divided such that no group is present within both the training and
        validation set. Split_on groups should be selected to mitigate validation bias
-    metadata : dict or None, default None
-        Dict containing data for potential dataset grouping. See split_on above
     test_frac : float, default 0.0
         Fraction of data to be optionally held out for test set
     val_frac : float, default 0.0
@@ -450,13 +407,8 @@ def split_dataset(
         Output class containing a list of indices of training
         and validation data for each fold and optional test indices
 
-
-
-    TypeError
-        Raised if split_on is passed, but metadata is None or empty
-
-    Note
-    ----
+    Notes
+    -----
     When specifying groups and/or stratification, ratios for test and validation splits can vary
     as the stratification and grouping take higher priority than the percentages
     """
@@ -464,30 +416,25 @@ def split_dataset(
     val_frac = calculate_validation_fraction(num_folds, test_frac, val_frac)
     total_partitions = num_folds + 1 if test_frac else num_folds
 
-    if isinstance(
-
+    metadata = dataset if isinstance(dataset, Metadata) else Metadata(dataset)
+    labels = metadata.class_labels
 
-
+    validate_labels(labels, total_partitions)
+    if stratify:
+        validate_stratifiable(labels, total_partitions)
 
-
-
-    groups = None
-    if split_on:
-        if metadata is None or metadata == {}:
-            raise TypeError("If split_on is specified, metadata must also be provided, got None")
-        possible_groups = get_group_ids(metadata, split_on, label_length)
+    groups = get_groups(metadata, split_on)
+    if groups is not None:
         # Accounts for a test set that is 100 % of the data
         group_partitions = total_partitions + 1 if val_frac else total_partitions
-
-        groups = possible_groups
+        validate_groupable(groups, group_partitions)
 
-    index = np.arange(
+    index = np.arange(len(labels))
 
-
-    single_split(index
-
-
-    )
+    if test_frac:
+        tvs = single_split(index, labels, test_frac, groups, stratify)
+    else:
+        tvs = TrainValSplit(index, np.array([], dtype=np.intp))
 
     tv_labels = labels[tvs.train]
     tv_groups = groups[tvs.train] if groups is not None else None
dataeval/utils/data/datasets/_base.py
CHANGED
@@ -4,7 +4,7 @@ __all__ = []
 
 from abc import abstractmethod
 from pathlib import Path
-from typing import Any, Generic, Iterator, Literal, NamedTuple, Sequence, TypeVar
+from typing import TYPE_CHECKING, Any, Generic, Iterator, Literal, NamedTuple, Sequence, TypeVar
 
 from dataeval.utils.data.datasets._fileio import _ensure_exists
 from dataeval.utils.data.datasets._mixin import BaseDatasetMixin
@@ -16,9 +16,11 @@ from dataeval.utils.data.datasets._types import (
     ObjectDetectionTarget,
     SegmentationDataset,
     SegmentationTarget,
-    Transform,
 )
 
+if TYPE_CHECKING:
+    from dataeval.typing import Transform
+
 _TArray = TypeVar("_TArray")
 _TTarget = TypeVar("_TTarget")
 _TRawTarget = TypeVar("_TRawTarget", list[int], list[str])
dataeval/utils/data/datasets/_cifar10.py
CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
 __all__ = []
 
 from pathlib import Path
-from typing import Any, Literal, Sequence, TypeVar
+from typing import TYPE_CHECKING, Any, Literal, Sequence, TypeVar
 
 import numpy as np
 from numpy.typing import NDArray
@@ -11,7 +11,9 @@ from PIL import Image
 
 from dataeval.utils.data.datasets._base import BaseICDataset, DataLocation
 from dataeval.utils.data.datasets._mixin import BaseDatasetNumpyMixin
-
+
+if TYPE_CHECKING:
+    from dataeval.typing import Transform
 
 CIFARClassStringMap = Literal["airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck"]
 TCIFARClassMap = TypeVar("TCIFARClassMap", CIFARClassStringMap, int, list[CIFARClassStringMap], list[int])
@@ -30,21 +32,27 @@ class CIFAR10(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
         Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
     image_set : "train", "test" or "base", default "train"
         If "base", returns all of the data to allow the user to create their own splits.
-    transforms : Transform
+    transforms : Transform, Sequence[Transform] or None, default None
         Transform(s) to apply to the data.
     verbose : bool, default False
         If True, outputs print statements.
 
     Attributes
     ----------
-
+    path : pathlib.Path
+        Location of the folder containing the data.
+    image_set : "train", "test" or "base"
+        The selected image set from the dataset.
+    index2label : dict[int, str]
         Dictionary which translates from class integers to the associated class strings.
-    label2index : dict
+    label2index : dict[str, int]
         Dictionary which translates from class strings to the associated class integers.
-
-
-
-
+    metadata : DatasetMetadata
+        Typed dictionary containing dataset metadata, such as `id` which returns the dataset class name.
+    transforms : Sequence[Transform]
+        The transforms to be applied to the data.
+    size : int
+        The size of the dataset.
     """
 
     _resources = [
dataeval/utils/data/datasets/_milco.py
CHANGED
@@ -1,23 +1,23 @@
 from __future__ import annotations
 
-from dataeval.utils.data.datasets._mixin import BaseDatasetNumpyMixin
-
 __all__ = []
 
 from pathlib import Path
-from typing import Any, Sequence
+from typing import TYPE_CHECKING, Any, Sequence
 
 from numpy.typing import NDArray
 
 from dataeval.utils.data.datasets._base import BaseODDataset, DataLocation
-from dataeval.utils.data.datasets.
+from dataeval.utils.data.datasets._mixin import BaseDatasetNumpyMixin
+
+if TYPE_CHECKING:
+    from dataeval.typing import Transform
 
 
 class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     """
     A side-scan sonar dataset focused on mine (object) detection.
 
-
     The dataset comes from the paper
     `Side-scan sonar imaging data of underwater vehicles for mine detection <https://doi.org/10.1016/j.dib.2024.110132>`_
     by N.P. Santos et. al. (2024).
@@ -43,21 +43,27 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     download : bool, default False
         If True, downloads the dataset from the internet and puts it in root directory.
         Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
-    transforms : Transform
+    transforms : Transform, Sequence[Transform] or None, default None
         Transform(s) to apply to the data.
     verbose : bool, default False
         If True, outputs print statements.
 
     Attributes
     ----------
-
+    path : pathlib.Path
+        Location of the folder containing the data.
+    image_set : "base"
+        The base image set is the only available image set for the MILCO dataset.
+    index2label : dict[int, str]
         Dictionary which translates from class integers to the associated class strings.
-    label2index : dict
+    label2index : dict[str, int]
         Dictionary which translates from class strings to the associated class integers.
-
-
-
-
+    metadata : DatasetMetadata
+        Typed dictionary containing dataset metadata, such as `id` which returns the dataset class name.
+    transforms : Sequence[Transform]
+        The transforms to be applied to the data.
+    size : int
+        The size of the dataset.
     """
 
     _resources = [
dataeval/utils/data/datasets/_mnist.py
CHANGED
@@ -3,14 +3,16 @@ from __future__ import annotations
 __all__ = []
 
 from pathlib import Path
-from typing import Any, Literal, Sequence, TypeVar
+from typing import TYPE_CHECKING, Any, Literal, Sequence, TypeVar
 
 import numpy as np
 from numpy.typing import NDArray
 
 from dataeval.utils.data.datasets._base import BaseICDataset, DataLocation
 from dataeval.utils.data.datasets._mixin import BaseDatasetNumpyMixin
-
+
+if TYPE_CHECKING:
+    from dataeval.typing import Transform
 
 MNISTClassStringMap = Literal["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
 TMNISTClassMap = TypeVar("TMNISTClassMap", MNISTClassStringMap, int, list[MNISTClassStringMap], list[int])
@@ -52,19 +54,33 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
         Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
     image_set : "train", "test" or "base", default "train"
         If "base", returns all of the data to allow the user to create their own splits.
+    corruption : "identity", "shot_noise", "impulse_noise", "glass_blur", "motion_blur", \
+        "shear", "scale", "rotate", "brightness", "translate", "stripe", "fog", "spatter", \
+        "dotted_line", "zigzag", "canny_edges" or None, default None
+        Corruption to apply to the data.
+    transforms : Transform, Sequence[Transform] or None, default None
+        Transform(s) to apply to the data.
     verbose : bool, default False
         If True, outputs print statements.
 
     Attributes
     ----------
-
+    path : pathlib.Path
+        Location of the folder containing the data.
+    image_set : "train", "test" or "base"
+        The selected image set from the dataset.
+    index2label : dict[int, str]
         Dictionary which translates from class integers to the associated class strings.
-    label2index : dict
+    label2index : dict[str, int]
         Dictionary which translates from class strings to the associated class integers.
-
-
-
-
+    metadata : DatasetMetadata
+        Typed dictionary containing dataset metadata, such as `id` which returns the dataset class name.
+    corruption : str or None
+        Corruption applied to the data.
+    transforms : Sequence[Transform]
+        The transforms to be applied to the data.
+    size : int
+        The size of the dataset.
     """
 
     _resources = [
dataeval/utils/data/datasets/_ships.py
CHANGED
@@ -3,14 +3,16 @@ from __future__ import annotations
 __all__ = []
 
 from pathlib import Path
-from typing import Any, Sequence
+from typing import TYPE_CHECKING, Any, Sequence
 
 import numpy as np
 from numpy.typing import NDArray
 
 from dataeval.utils.data.datasets._base import BaseICDataset, DataLocation
 from dataeval.utils.data.datasets._mixin import BaseDatasetNumpyMixin
-
+
+if TYPE_CHECKING:
+    from dataeval.typing import Transform
 
 
 class Ships(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
@@ -32,19 +34,27 @@ class Ships(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     download : bool, default False
         If True, downloads the dataset from the internet and puts it in root directory.
         Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
+    transforms : Transform, Sequence[Transform] or None, default None
+        Transform(s) to apply to the data.
     verbose : bool, default False
         If True, outputs print statements.
 
     Attributes
     ----------
-
+    path : pathlib.Path
+        Location of the folder containing the data.
+    image_set : "base"
+        The base image set is the only available image set for the Ships dataset.
+    index2label : dict[int, str]
         Dictionary which translates from class integers to the associated class strings.
-    label2index : dict
+    label2index : dict[str, int]
         Dictionary which translates from class strings to the associated class integers.
-
-
-
-
+    metadata : DatasetMetadata
+        Typed dictionary containing dataset metadata, such as `id` which returns the dataset class name.
+    transforms : Sequence[Transform]
+        The transforms to be applied to the data.
+    size : int
+        The size of the dataset.
     """
 
     _resources = [
dataeval/utils/data/datasets/_types.py
CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
 __all__ = []
 
 from dataclasses import dataclass
-from typing import Any, Generic,
+from typing import Any, Generic, TypedDict, TypeVar
 
 from torch.utils.data import Dataset
 from typing_extensions import NotRequired, Required
@@ -46,7 +46,3 @@ class SegmentationTarget(Generic[_TArray]):
 
 
 class SegmentationDataset(AnnotatedDataset[tuple[_TArray, SegmentationTarget[_TArray], dict[str, Any]]]): ...
-
-
-class Transform(Generic[_TArray], Protocol):
-    def __call__(self, data: _TArray, /) -> _TArray: ...