dataeval 0.83.0__py3-none-any.whl → 0.84.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +1 -1
- dataeval/config.py +3 -3
- dataeval/detectors/drift/__init__.py +2 -2
- dataeval/detectors/drift/_base.py +55 -203
- dataeval/detectors/drift/_cvm.py +19 -30
- dataeval/detectors/drift/_ks.py +18 -30
- dataeval/detectors/drift/_mmd.py +189 -53
- dataeval/detectors/drift/_uncertainty.py +52 -56
- dataeval/detectors/drift/updates.py +13 -12
- dataeval/detectors/linters/duplicates.py +5 -3
- dataeval/detectors/linters/outliers.py +2 -2
- dataeval/detectors/ood/ae.py +1 -1
- dataeval/metrics/bias/__init__.py +11 -1
- dataeval/metrics/bias/_completeness.py +130 -0
- dataeval/metrics/stats/_base.py +28 -32
- dataeval/metrics/stats/_dimensionstats.py +2 -2
- dataeval/metrics/stats/_hashstats.py +2 -2
- dataeval/metrics/stats/_imagestats.py +4 -4
- dataeval/metrics/stats/_labelstats.py +4 -45
- dataeval/metrics/stats/_pixelstats.py +2 -2
- dataeval/metrics/stats/_visualstats.py +2 -2
- dataeval/outputs/__init__.py +2 -1
- dataeval/outputs/_bias.py +31 -22
- dataeval/outputs/_stats.py +2 -3
- dataeval/typing.py +25 -22
- dataeval/utils/_array.py +43 -7
- dataeval/utils/data/_dataset.py +8 -4
- dataeval/utils/data/_embeddings.py +141 -24
- dataeval/utils/data/_images.py +38 -15
- dataeval/utils/data/_metadata.py +5 -4
- dataeval/utils/data/_selection.py +3 -15
- dataeval/utils/data/_split.py +76 -129
- dataeval/utils/data/datasets/_base.py +7 -4
- dataeval/utils/data/datasets/_cifar10.py +9 -9
- dataeval/utils/data/datasets/_milco.py +42 -14
- dataeval/utils/data/datasets/_mnist.py +9 -5
- dataeval/utils/data/datasets/_ships.py +8 -4
- dataeval/utils/data/datasets/_voc.py +40 -19
- dataeval/utils/data/selections/__init__.py +2 -0
- dataeval/utils/data/selections/_classbalance.py +38 -0
- dataeval/utils/data/selections/_classfilter.py +14 -29
- dataeval/utils/data/selections/_prioritize.py +1 -1
- dataeval/utils/data/selections/_shuffle.py +2 -2
- dataeval/utils/metadata.py +1 -1
- dataeval/utils/torch/_internal.py +12 -35
- {dataeval-0.83.0.dist-info → dataeval-0.84.1.dist-info}/METADATA +2 -3
- {dataeval-0.83.0.dist-info → dataeval-0.84.1.dist-info}/RECORD +49 -48
- dataeval/detectors/drift/_torch.py +0 -222
- {dataeval-0.83.0.dist-info → dataeval-0.84.1.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.83.0.dist-info → dataeval-0.84.1.dist-info}/WHEEL +0 -0
dataeval/utils/data/_split.py
CHANGED
@@ -2,19 +2,22 @@ from __future__ import annotations
 
 __all__ = []
 
+import logging
 import warnings
-from typing import Any, Iterator, Protocol
+from typing import Any, Iterator, Protocol, Sequence
 
 import numpy as np
 from numpy.typing import NDArray
-from sklearn.cluster import KMeans
-from sklearn.metrics import silhouette_score
 from sklearn.model_selection import GroupKFold, KFold, StratifiedGroupKFold, StratifiedKFold
 from sklearn.utils.multiclass import type_of_target
 
-from dataeval.config import
+from dataeval.config import EPSILON
 from dataeval.outputs._base import set_metadata
 from dataeval.outputs._utils import SplitDatasetOutput, TrainValSplit
+from dataeval.typing import AnnotatedDataset
+from dataeval.utils.data._metadata import Metadata
+
+_logger = logging.getLogger(__name__)
 
 
 class KFoldSplitter(Protocol):
@@ -85,7 +88,7 @@ def calculate_validation_fraction(num_folds: int, test_frac: float, val_frac: fl
     return val_base * (1.0 / num_folds) * (1.0 - test_frac)
 
 
-def
+def validate_labels(labels: NDArray[np.intp], total_partitions: int) -> None:
     """
     Check to make sure there is more input data than the total number of partitions requested
 
@@ -116,7 +119,7 @@ def _validate_labels(labels: NDArray[np.intp], total_partitions: int) -> None:
         raise ValueError("Detected continuous labels. Labels must be discrete for proper stratification")
 
 
-def
+def validate_stratifiable(labels: NDArray[np.intp], num_partitions: int) -> None:
     """
     Check if the dataset can be stratified by class label over the given number of partitions
 
@@ -132,26 +135,23 @@ def is_stratifiable(labels: NDArray[np.intp], num_partitions: int) -> bool:
     bool
         True if dataset can be stratified else False
 
-
-
-
-
+    Raises
+    ------
+    ValueError
+        If the dataset cannot be stratified due to the total number of [train, val, test]
         partitions exceeding the number of instances of the rarest class label.
     """
 
     # Get the minimum count of all labels
    lowest_label_count = np.unique(labels, return_counts=True)[1].min()
    if lowest_label_count < num_partitions:
-
+        raise ValueError(
            f"Unable to stratify due to label frequency. The lowest label count ({lowest_label_count}) is fewer "
-            f"than the total number of partitions ({num_partitions}) requested."
-            UserWarning,
+            f"than the total number of partitions ({num_partitions}) requested."
        )
-        return False
-    return True
 
 
-def
+def validate_groupable(groups: NDArray[np.intp], num_partitions: int) -> None:
    """
    Warns user if the number of unique group_ids is incompatible with a grouped partition containing
    num_folds folds. If this is the case, returns groups=None, which tells the partitioner not to
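Editor's note: the validation helpers renamed above (validate_labels, validate_stratifiable, validate_groupable) now raise ValueError instead of emitting a UserWarning and returning a bool, so callers that branched on the old return value must catch the exception instead. A minimal sketch of the new calling pattern, using a hypothetical label array and the module path shown in this diff:

from dataeval.utils.data._split import validate_stratifiable
import numpy as np

labels = np.array([0, 0, 1, 1, 2], dtype=np.intp)  # hypothetical labels; class 2 occurs only once

try:
    # 0.84.1 behavior: raises when the rarest class has fewer instances than the requested partitions
    validate_stratifiable(labels, 4)
except ValueError as err:
    print(f"cannot stratify: {err}")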
@@ -159,7 +159,7 @@ def is_groupable(group_ids: NDArray[np.intp], num_partitions: int) -> bool:
 
     Parameters
     ----------
-
+    groups : NDArray of ints
         The id of the group each sample at the corresponding index belongs to
     num_partitions : int
         Total number of train, val, and test splits requested
@@ -169,60 +169,24 @@ def is_groupable(group_ids: NDArray[np.intp], num_partitions: int) -> bool:
     bool
         True if the dataset can be grouped by the given group ids else False
 
-
-
-
-
+    Raises
+    ------
+    ValueError
+        If there are is only one unique group.
+    ValueError
+        If there are fewer groups than the requested number of partitions plus one
     """
 
-    num_unique_groups = len(np.unique(
+    num_unique_groups = len(np.unique(groups))
     # Cannot separate if only one group exists
     if num_unique_groups == 1:
-
+        raise ValueError(f"Unique groups ({num_unique_groups}) must be greater than 1.")
 
     if num_unique_groups < num_partitions:
-
-            f"Groups must be greater than num partitions. Got {num_unique_groups} and {num_partitions}. "
-            "Reverting to ungrouped partitioning",
-            UserWarning,
-        )
-        return False
-    return True
-
-
-def bin_kmeans(array: NDArray[Any]) -> NDArray[np.intp]:
-    """
-    Find bins of continuous data by iteratively applying k-means clustering, and keeping the
-    clustering with the highest silhouette score.
-
-    Parameters
-    ----------
-    array : NDArray
-        continuous data to bin
+        raise ValueError(f"Unique groups ({num_unique_groups}) must be greater than num partitions ({num_partitions}).")
 
-    Returns
-    -------
-    NDArray[int]:
-        bin numbers assigned by the kmeans best clusterer.
-    """
 
-
-    array = array.reshape([-1, 1])
-        best_score = 0.60
-    else:
-        best_score = 0.50
-    bin_index = np.zeros(len(array), dtype=np.intp)
-    for k in range(2, 20):
-        clusterer = KMeans(n_clusters=k, random_state=get_seed())
-        cluster_labels = clusterer.fit_predict(array)
-        score = silhouette_score(array, cluster_labels, sample_size=25_000, random_state=get_seed())
-        if score > best_score:
-            best_score = score
-            bin_index = cluster_labels.astype(np.intp)
-    return bin_index
-
-
-def get_group_ids(metadata: dict[str, Any], group_names: list[str], num_samples: int) -> NDArray[np.intp]:
+def get_groups(metadata: Metadata, split_on: Sequence[str] | None) -> NDArray[np.intp] | None:
     """
     Returns individual group numbers based on a subset of metadata defined by groupnames
 
@@ -232,32 +196,20 @@ def get_group_ids(metadata: dict[str, Any], group_names: list[str], num_samples:
         dictionary containing all metadata
     groupnames : list
         which groups from the metadata dictionary to consider for dataset grouping
-    num_samples : int
-        number of labels. Used to ensure agreement between input data/labels and metadata entries.
-
-    Raises
-    ------
-    IndexError
-        raised if an entry in the metadata dictionary doesn't have the same length as num_samples
 
     Returns
     -------
     np.ndarray
         group identifiers from metadata
     """
-
-    if
-    return
-
-
-
-
-
-        if type_of_target(feature) == "continuous":
-            features2group[name] = bin_kmeans(feature)
-    binned_features = np.stack(list(features2group.values()), axis=1)
-    _, group_ids = np.unique(binned_features, axis=0, return_inverse=True)
+    # get only the factors that are present in the metadata
+    if split_on is None:
+        return None
+
+    split_set = set(split_on)
+    indices = [i for i, name in enumerate(metadata.discrete_factor_names) if name in split_set]
+    binned_features = metadata.discrete_data[:, indices]
+    group_ids = np.unique(binned_features, axis=0, return_inverse=True)[1]
     return group_ids
 
 
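Editor's note: the new get_groups drops the k-means binning and instead selects the requested discrete metadata factors and collapses each unique combination of factor values to one group id via np.unique(..., return_inverse=True). A small self-contained sketch of that collapsing step in plain NumPy (hypothetical factor values, not the dataeval Metadata object):

import numpy as np

# Hypothetical discrete factor values for six samples and two factors (e.g. site, sensor).
binned_features = np.array([[0, 1], [0, 1], [1, 0], [1, 1], [0, 1], [1, 0]])

# Each unique row (combination of factor values) is collapsed to a single group id,
# mirroring the np.unique(..., return_inverse=True) call in get_groups above.
group_ids = np.unique(binned_features, axis=0, return_inverse=True)[1]
# group_ids assigns the same id to every sample that shares a factor combination.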
@@ -294,10 +246,18 @@ def make_splits(
     split_defs: list[TrainValSplit] = []
     n_labels = len(np.unique(labels))
     splitter = KFOLD_GROUP_STRATIFIED_MAP[(groups is not None, stratified)](n_folds)
+    _logger.log(logging.DEBUG, f"splitter={splitter.__class__.__name__}(n_splits={n_folds})")
     good = False
     attempts = 0
     while not good and attempts < 3:
         attempts += 1
+        _logger.log(
+            logging.DEBUG,
+            f"attempt={attempts}: splitter.split("
+            + f"index=arr(len={len(index)}, unique={np.unique(index)}), "
+            + f"labels=arr(len={len(index)}, unique={np.unique(index)}), "
+            + ("groups=None" if groups is None else f"groups=arr(len={len(groups)}, unique={np.unique(groups)}))"),
+        )
         splits = splitter.split(index, labels, groups)
         split_defs.clear()
         for train_idx, eval_idx in splits:
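Editor's note: the _logger calls added above use the standard library logging module at DEBUG level, so the new split diagnostics only appear when the caller opts in. A minimal sketch; the module logger is created with logging.getLogger(__name__), so it lives under the "dataeval" namespace:

import logging

logging.basicConfig(level=logging.INFO)                 # attach a console handler to the root logger
logging.getLogger("dataeval").setLevel(logging.DEBUG)   # enable DEBUG records from dataeval modules only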
@@ -341,20 +301,20 @@ def find_best_split(
         counts = np.bincount(arr, minlength=minlength)
         return counts / np.sum(counts)
 
-    def weight(arr: NDArray, class_freq: NDArray) ->
-        return np.sum(np.abs(freq(arr, len(class_freq)) - class_freq))
+    def weight(arr: NDArray, class_freq: NDArray) -> float:
+        return float(np.sum(np.abs(freq(arr, len(class_freq)) - class_freq)))
 
-    def class_freq_diff(split: TrainValSplit) ->
+    def class_freq_diff(split: TrainValSplit) -> float:
         class_freq = freq(labels)
         return weight(labels[split.train], class_freq) + weight(labels[split.val], class_freq)
 
-    def split_ratio(split: TrainValSplit) ->
-        return
+    def split_ratio(split: TrainValSplit) -> float:
+        return len(split.val) / (len(split.val) + len(split.train))
 
-    def split_diff(split: TrainValSplit) ->
+    def split_diff(split: TrainValSplit) -> float:
         return abs(split_frac - split_ratio(split))
 
-    def split_inv_diff(split: TrainValSplit) ->
+    def split_inv_diff(split: TrainValSplit) -> float:
         return abs(1 - split_frac - split_ratio(split))
 
     # Selects minimization function based on inputs
@@ -399,11 +359,12 @@ def single_split(
         Indices of data partitioned for training and evaluation
     """
 
-
-    max_folds =
-
-    divisor = split_frac
-    n_folds =
+    unique_groups = 2 if groups is None else len(np.unique(groups))
+    max_folds = min(min(np.unique(labels, return_counts=True)[1]), unique_groups) if stratified else unique_groups
+
+    divisor = split_frac if split_frac <= 2 / 3 else 1 - split_frac
+    n_folds = min(max(round(1 / (divisor + EPSILON)), 2), max_folds)  # Clips value between 2 and max_folds
+    _logger.log(logging.DEBUG, f"n_folds={n_folds} clipped between[2, {max_folds}]")
 
     split_candidates = make_splits(index, labels, n_folds, groups, stratified)
     return find_best_split(labels, split_candidates, stratified, split_frac)
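Editor's note: as a worked example of the fold-count selection above, split_frac = 0.25 keeps the divisor at 0.25 (it is at most 2/3), so round(1 / (0.25 + EPSILON)) gives 4 candidate folds; split_frac = 0.8 exceeds 2/3, so the divisor becomes 1 - 0.8 = 0.2 and the formula gives 5; the result is then clipped to [2, max_folds]. A small self-checking sketch, with EPSILON assumed to be a tiny constant like dataeval.config.EPSILON:

EPSILON = 1e-12  # assumption: a very small constant, as in dataeval.config.EPSILON

def candidate_folds(split_frac: float, max_folds: int) -> int:
    # mirror of the logic in single_split: use split_frac directly when it is at most 2/3,
    # otherwise use its complement, then clip the fold count to [2, max_folds]
    divisor = split_frac if split_frac <= 2 / 3 else 1 - split_frac
    return min(max(round(1 / (divisor + EPSILON)), 2), max_folds)

assert candidate_folds(0.25, max_folds=10) == 4
assert candidate_folds(0.8, max_folds=10) == 5
assert candidate_folds(0.5, max_folds=3) == 2  # 1 / 0.5 = 2, already at the lower clip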
@@ -411,22 +372,20 @@ def single_split(
 
 @set_metadata
 def split_dataset(
-
+    dataset: AnnotatedDataset[Any] | Metadata,
     num_folds: int = 1,
     stratify: bool = False,
-    split_on:
-    metadata: dict[str, Any] | None = None,
+    split_on: Sequence[str] | None = None,
     test_frac: float = 0.0,
     val_frac: float = 0.0,
 ) -> SplitDatasetOutput:
     """
-
-    Indices for a test holdout may also be optionally included
+    Dataset splitting function. Returns a dataclass containing a list of train and validation indices.
 
     Parameters
     ----------
-
-
+    dataset : AnnotatedDataset or Metadata
+        Dataset to split.
     num_folds : int, default 1
         Number of [train, val] folds. If equal to 1, val_frac must be greater than 0.0
     stratify : bool, default False
@@ -436,8 +395,6 @@ def split_dataset(
         Keys of the metadata dictionary upon which to group the dataset.
         A grouped partition is divided such that no group is present within both the training and
         validation set. Split_on groups should be selected to mitigate validation bias
-    metadata : dict or None, default None
-        Dict containing data for potential dataset grouping. See split_on above
     test_frac : float, default 0.0
         Fraction of data to be optionally held out for test set
     val_frac : float, default 0.0
@@ -450,13 +407,8 @@ def split_dataset(
         Output class containing a list of indices of training
         and validation data for each fold and optional test indices
 
-
-
-    TypeError
-        Raised if split_on is passed, but metadata is None or empty
-
-    Note
-    ----
+    Notes
+    -----
     When specifying groups and/or stratification, ratios for test and validation splits can vary
     as the stratification and grouping take higher priority than the percentages
     """
@@ -464,30 +416,25 @@ def split_dataset(
     val_frac = calculate_validation_fraction(num_folds, test_frac, val_frac)
     total_partitions = num_folds + 1 if test_frac else num_folds
 
-    if isinstance(
-
+    metadata = dataset if isinstance(dataset, Metadata) else Metadata(dataset)
+    labels = metadata.class_labels
 
-
+    validate_labels(labels, total_partitions)
+    if stratify:
+        validate_stratifiable(labels, total_partitions)
 
-
-
-    groups = None
-    if split_on:
-        if metadata is None or metadata == {}:
-            raise TypeError("If split_on is specified, metadata must also be provided, got None")
-        possible_groups = get_group_ids(metadata, split_on, label_length)
+    groups = get_groups(metadata, split_on)
+    if groups is not None:
         # Accounts for a test set that is 100 % of the data
         group_partitions = total_partitions + 1 if val_frac else total_partitions
-
-        groups = possible_groups
+        validate_groupable(groups, group_partitions)
 
-    index = np.arange(
+    index = np.arange(len(labels))
 
-
-    single_split(index
-
-
-    )
+    if test_frac:
+        tvs = single_split(index, labels, test_frac, groups, stratify)
+    else:
+        tvs = TrainValSplit(index, np.array([], dtype=np.intp))
 
     tv_labels = labels[tvs.train]
     tv_groups = groups[tvs.train] if groups is not None else None
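Editor's note: the net effect for callers is that split_dataset no longer takes a labels array plus a metadata dict; it takes an AnnotatedDataset (or a prebuilt Metadata object) and derives the class labels and grouping factors from it. A hedged usage sketch against the new signature; the dataset object and factor name are placeholders, and the output field names are assumptions based on the TrainValSplit usage visible in this diff:

from dataeval.utils.data._split import split_dataset  # module shown in the diff above

# `train_ds` is a placeholder for any AnnotatedDataset; a Metadata object also works.
splits = split_dataset(
    train_ds,
    num_folds=5,
    stratify=True,
    split_on=["site"],   # hypothetical discrete metadata factor used for grouping
    test_frac=0.2,
)
for fold in splits.folds:            # assumption: SplitDatasetOutput exposes per-fold TrainValSplit entries
    train_idx, val_idx = fold.train, fold.val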
dataeval/utils/data/datasets/_base.py
CHANGED
@@ -19,9 +19,12 @@ from dataeval.utils.data.datasets._types import (
 )
 
 if TYPE_CHECKING:
-    from dataeval.typing import Transform
+    from dataeval.typing import Array, Transform
+
+    _TArray = TypeVar("_TArray", bound=Array)
+else:
+    _TArray = TypeVar("_TArray")
 
-_TArray = TypeVar("_TArray")
 _TTarget = TypeVar("_TTarget")
 _TRawTarget = TypeVar("_TRawTarget", list[int], list[str])
 
@@ -51,9 +54,9 @@ class BaseDataset(AnnotatedDataset[tuple[_TArray, _TTarget, dict[str, Any]]], Ge
     def __init__(
         self,
         root: str | Path,
-
-        image_set: Literal["train", "val", "test", "base"] = "train",
+        image_set: Literal["train", "val", "test", "operational", "base"] = "train",
         transforms: Transform[_TArray] | Sequence[Transform[_TArray]] | None = None,
+        download: bool = False,
         verbose: bool = False,
     ) -> None:
         self._root: Path = root.absolute() if isinstance(root, Path) else Path(root).absolute()
dataeval/utils/data/datasets/_cifar10.py
CHANGED
@@ -27,13 +27,13 @@ class CIFAR10(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     ----------
     root : str or pathlib.Path
         Root directory of dataset where the ``mnist`` folder exists.
-    download : bool, default False
-        If True, downloads the dataset from the internet and puts it in root directory.
-        Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
     image_set : "train", "test" or "base", default "train"
         If "base", returns all of the data to allow the user to create their own splits.
     transforms : Transform, Sequence[Transform] or None, default None
         Transform(s) to apply to the data.
+    download : bool, default False
+        If True, downloads the dataset from the internet and puts it in root directory.
+        Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
     verbose : bool, default False
         If True, outputs print statements.
 
@@ -43,16 +43,16 @@ class CIFAR10(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
         Location of the folder containing the data.
     image_set : "train", "test" or "base"
         The selected image set from the dataset.
+    transforms : Sequence[Transform]
+        The transforms to be applied to the data.
+    size : int
+        The size of the dataset.
     index2label : dict[int, str]
         Dictionary which translates from class integers to the associated class strings.
     label2index : dict[str, int]
         Dictionary which translates from class strings to the associated class integers.
     metadata : DatasetMetadata
         Typed dictionary containing dataset metadata, such as `id` which returns the dataset class name.
-    transforms : Sequence[Transform]
-        The transforms to be applied to the data.
-    size : int
-        The size of the dataset.
     """
 
     _resources = [
@@ -80,16 +80,16 @@ class CIFAR10(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     def __init__(
         self,
         root: str | Path,
-        download: bool = False,
         image_set: Literal["train", "test", "base"] = "train",
         transforms: Transform[NDArray[Any]] | Sequence[Transform[NDArray[Any]]] | None = None,
+        download: bool = False,
         verbose: bool = False,
     ) -> None:
         super().__init__(
             root,
-            download,
             image_set,
             transforms,
+            download,
             verbose,
         )
 
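Editor's note: across the dataset classes touched in this release (BaseDataset, CIFAR10, MILCO, MNIST, Ships) the positional parameter order changes: download moves from immediately after root to after transforms. Code that passed download positionally needs updating; keyword arguments sidestep the issue. A hedged sketch using CIFAR10; the import path is an assumption based on the module layout shown here:

from dataeval.utils.data.datasets import CIFAR10  # assumption: public import path for the dataset classes

# 0.83.0 accepted CIFAR10(root, download, image_set, transforms, verbose) positionally;
# in 0.84.1 the positional order is (root, image_set, transforms, download, verbose).
ds = CIFAR10("./data", image_set="train", transforms=None, download=True, verbose=False)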
dataeval/utils/data/datasets/_milco.py
CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
 __all__ = []
 
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Sequence
+from typing import TYPE_CHECKING, Any, Literal, Sequence
 
 from numpy.typing import NDArray
 
@@ -16,21 +16,20 @@ if TYPE_CHECKING:
 
 class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     """
-    A side-scan sonar dataset focused on mine
+    A side-scan sonar dataset focused on mine-like object detection.
 
     The dataset comes from the paper
     `Side-scan sonar imaging data of underwater vehicles for mine detection <https://doi.org/10.1016/j.dib.2024.110132>`_
     by N.P. Santos et. al. (2024).
 
-    This class only accesses a portion of the above dataset due to size constraints.
     The full dataset contains 1170 side-scan sonar images collected using a 900-1800 kHz Marine Sonic
     dual frequency side-scan sonar of a Teledyne Marine Gavia Autonomous Underwater Vehicle.
     All the images were carefully analyzed and annotated, including the image coordinates of the
     Bounding Box (BB) of the detected objects divided into NOn-Mine-like BOttom Objects (NOMBO)
     and MIne-Like COntacts (MILCO) classes.
 
-    This dataset is consists of
-    In these
+    This dataset is consists of 345 images from 2010, 120 images from 2015, 93 images from 2017, 564 images from 2018,
+    and 48 images from 2021). In these 1170 images, there are 432 MILCO objects, and 235 NOMBO objects.
     The class “0” corresponds to a MILCO object and the class “1” corresponds to a NOMBO object.
     The raw BB coordinates provided in the downloaded text files are (x, y, w, h),
     given as percentages of the image (x_BB = x/img_width, y_BB = y/img_height, etc.).
@@ -40,11 +39,17 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     ----------
     root : str or pathlib.Path
         Root directory of dataset where the ``milco`` folder exists.
+    image_set: "train", "operational", or "base", default "train"
+        If "train", then the images from 2015, 2017 and 2021 are selected,
+        resulting in 315 MILCO objects and 177 NOMBO objects.
+        If "operational", then the images from 2010 and 2018 are selected,
+        resulting in 117 MILCO objects and 58 NOMBO objects.
+        If "base", then the full dataset is selected.
+    transforms : Transform, Sequence[Transform] or None, default None
+        Transform(s) to apply to the data.
     download : bool, default False
         If True, downloads the dataset from the internet and puts it in root directory.
         Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
-    transforms : Transform, Sequence[Transform] or None, default None
-        Transform(s) to apply to the data.
     verbose : bool, default False
         If True, outputs print statements.
 
@@ -52,8 +57,8 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     ----------
     path : pathlib.Path
         Location of the folder containing the data.
-    image_set : "base"
-        The
+    image_set : "train", "operational" or "base"
+        The selected image set from the dataset.
     index2label : dict[int, str]
         Dictionary which translates from class integers to the associated class strings.
     label2index : dict[str, int]
@@ -64,6 +69,10 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
         The transforms to be applied to the data.
     size : int
         The size of the dataset.
+
+    Note
+    ----
+    Data License: `CC BY 4.0 <https://creativecommons.org/licenses/by/4.0/>`_
     """
 
     _resources = [
@@ -85,6 +94,18 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
             md5=True,
             checksum="b84749b21fa95a4a4c7de3741db78bc7",
         ),
+        DataLocation(
+            url="https://figshare.com/ndownloader/files/43169008",
+            filename="2010.zip",
+            md5=True,
+            checksum="43347a0cc383c0d3dbe0d24ae56f328d",
+        ),
+        DataLocation(
+            url="https://figshare.com/ndownloader/files/43169011",
+            filename="2018.zip",
+            md5=True,
+            checksum="25d091044a10c78674fedad655023e3b",
+        ),
     ]
 
     index2label: dict[int, str] = {
@@ -95,15 +116,16 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     def __init__(
         self,
         root: str | Path,
-
+        image_set: Literal["train", "operational", "base"] = "train",
         transforms: Transform[NDArray[Any]] | Sequence[Transform[NDArray[Any]]] | None = None,
+        download: bool = False,
         verbose: bool = False,
     ) -> None:
         super().__init__(
             root,
-
-            "base",
+            image_set,
             transforms,
+            download,
             verbose,
         )
 
@@ -112,10 +134,16 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
         targets: list[str] = []
         datum_metadata: dict[str, list[Any]] = {}
         metadata_list: list[dict[str, Any]] = []
+        image_sets: dict[str, list[int]] = {
+            "base": list(range(len(self._resources))),
+            "train": list(range(3)),
+            "operational": list(range(3, len(self._resources))),
+        }
 
         # Load the data
-
-
+        resource_indices = image_sets[self.image_set]
+        for idx in resource_indices:
+            self._resource = self._resources[idx]
             filepath, target, metadata = super()._load_data()
             filepaths.extend(filepath)
             targets.extend(target)
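Editor's note: MILCO previously loaded only the "base" split; 0.84.1 adds year-based "train" and "operational" image sets and two additional download resources (2010.zip and 2018.zip) to back them. A hedged usage sketch; the import path is assumed to match the other dataset classes:

from dataeval.utils.data.datasets import MILCO  # assumption: same public path as the other datasets

# "train" -> 2015/2017/2021 imagery, "operational" -> 2010/2018 imagery, "base" -> everything
train_ds = MILCO("./data", image_set="train", download=True)
op_ds = MILCO("./data", image_set="operational", download=True)
print(train_ds.size, op_ds.size)  # `size` attribute documented in the class docstring above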
dataeval/utils/data/datasets/_mnist.py
CHANGED
@@ -49,9 +49,6 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     ----------
     root : str or pathlib.Path
         Root directory of dataset where the ``mnist`` folder exists.
-    download : bool, default False
-        If True, downloads the dataset from the internet and puts it in root directory.
-        Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
     image_set : "train", "test" or "base", default "train"
         If "base", returns all of the data to allow the user to create their own splits.
     corruption : "identity", "shot_noise", "impulse_noise", "glass_blur", "motion_blur", \
@@ -60,6 +57,9 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
         Corruption to apply to the data.
     transforms : Transform, Sequence[Transform] or None, default None
         Transform(s) to apply to the data.
+    download : bool, default False
+        If True, downloads the dataset from the internet and puts it in root directory.
+        Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
     verbose : bool, default False
         If True, outputs print statements.
 
@@ -81,6 +81,10 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
         The transforms to be applied to the data.
     size : int
         The size of the dataset.
+
+    Note
+    ----
+    Data License: `CC BY 4.0 <https://creativecommons.org/licenses/by/4.0/>`_ for corruption dataset
     """
 
     _resources = [
@@ -114,10 +118,10 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     def __init__(
         self,
         root: str | Path,
-        download: bool = False,
         image_set: Literal["train", "test", "base"] = "train",
         corruption: CorruptionStringMap | None = None,
         transforms: Transform[NDArray[Any]] | Sequence[Transform[NDArray[Any]]] | None = None,
+        download: bool = False,
         verbose: bool = False,
     ) -> None:
         self.corruption = corruption
@@ -127,9 +131,9 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
 
         super().__init__(
             root,
-            download,
             image_set,
             transforms,
+            download,
             verbose,
         )
 
dataeval/utils/data/datasets/_ships.py
CHANGED
@@ -31,11 +31,11 @@ class Ships(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     ----------
     root : str or pathlib.Path
         Root directory of dataset where the ``shipdataset`` folder exists.
+    transforms : Transform, Sequence[Transform] or None, default None
+        Transform(s) to apply to the data.
     download : bool, default False
         If True, downloads the dataset from the internet and puts it in root directory.
         Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
-    transforms : Transform, Sequence[Transform] or None, default None
-        Transform(s) to apply to the data.
     verbose : bool, default False
         If True, outputs print statements.
 
@@ -55,6 +55,10 @@ class Ships(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
         The transforms to be applied to the data.
     size : int
         The size of the dataset.
+
+    Note
+    ----
+    Data License: `CC BY-SA 4.0 <https://creativecommons.org/licenses/by-sa/4.0/>`_
     """
 
     _resources = [
@@ -74,15 +78,15 @@ class Ships(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
     def __init__(
         self,
         root: str | Path,
-        download: bool = False,
         transforms: Transform[NDArray[Any]] | Sequence[Transform[NDArray[Any]]] | None = None,
+        download: bool = False,
         verbose: bool = False,
     ) -> None:
         super().__init__(
             root,
-            download,
             "base",
             transforms,
+            download,
             verbose,
         )
         self._scenes: list[str] = self._load_scenes()