dataeval 0.83.0__py3-none-any.whl → 0.84.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. dataeval/__init__.py +1 -1
  2. dataeval/config.py +3 -3
  3. dataeval/detectors/drift/__init__.py +2 -2
  4. dataeval/detectors/drift/_base.py +55 -203
  5. dataeval/detectors/drift/_cvm.py +19 -30
  6. dataeval/detectors/drift/_ks.py +18 -30
  7. dataeval/detectors/drift/_mmd.py +189 -53
  8. dataeval/detectors/drift/_uncertainty.py +52 -56
  9. dataeval/detectors/drift/updates.py +13 -12
  10. dataeval/detectors/linters/duplicates.py +5 -3
  11. dataeval/detectors/linters/outliers.py +2 -2
  12. dataeval/detectors/ood/ae.py +1 -1
  13. dataeval/metrics/bias/__init__.py +11 -1
  14. dataeval/metrics/bias/_completeness.py +130 -0
  15. dataeval/metrics/stats/_base.py +28 -32
  16. dataeval/metrics/stats/_dimensionstats.py +2 -2
  17. dataeval/metrics/stats/_hashstats.py +2 -2
  18. dataeval/metrics/stats/_imagestats.py +4 -4
  19. dataeval/metrics/stats/_labelstats.py +4 -45
  20. dataeval/metrics/stats/_pixelstats.py +2 -2
  21. dataeval/metrics/stats/_visualstats.py +2 -2
  22. dataeval/outputs/__init__.py +2 -1
  23. dataeval/outputs/_bias.py +31 -22
  24. dataeval/outputs/_stats.py +2 -3
  25. dataeval/typing.py +25 -22
  26. dataeval/utils/_array.py +43 -7
  27. dataeval/utils/data/_dataset.py +8 -4
  28. dataeval/utils/data/_embeddings.py +141 -24
  29. dataeval/utils/data/_images.py +38 -15
  30. dataeval/utils/data/_metadata.py +5 -4
  31. dataeval/utils/data/_selection.py +3 -15
  32. dataeval/utils/data/_split.py +76 -129
  33. dataeval/utils/data/datasets/_base.py +7 -4
  34. dataeval/utils/data/datasets/_cifar10.py +9 -9
  35. dataeval/utils/data/datasets/_milco.py +42 -14
  36. dataeval/utils/data/datasets/_mnist.py +9 -5
  37. dataeval/utils/data/datasets/_ships.py +8 -4
  38. dataeval/utils/data/datasets/_voc.py +40 -19
  39. dataeval/utils/data/selections/__init__.py +2 -0
  40. dataeval/utils/data/selections/_classbalance.py +38 -0
  41. dataeval/utils/data/selections/_classfilter.py +14 -29
  42. dataeval/utils/data/selections/_prioritize.py +1 -1
  43. dataeval/utils/data/selections/_shuffle.py +2 -2
  44. dataeval/utils/metadata.py +1 -1
  45. dataeval/utils/torch/_internal.py +12 -35
  46. {dataeval-0.83.0.dist-info → dataeval-0.84.1.dist-info}/METADATA +2 -3
  47. {dataeval-0.83.0.dist-info → dataeval-0.84.1.dist-info}/RECORD +49 -48
  48. dataeval/detectors/drift/_torch.py +0 -222
  49. {dataeval-0.83.0.dist-info → dataeval-0.84.1.dist-info}/LICENSE.txt +0 -0
  50. {dataeval-0.83.0.dist-info → dataeval-0.84.1.dist-info}/WHEEL +0 -0
dataeval/utils/data/_split.py
@@ -2,19 +2,22 @@ from __future__ import annotations

  __all__ = []

+ import logging
  import warnings
- from typing import Any, Iterator, Protocol
+ from typing import Any, Iterator, Protocol, Sequence

  import numpy as np
  from numpy.typing import NDArray
- from sklearn.cluster import KMeans
- from sklearn.metrics import silhouette_score
  from sklearn.model_selection import GroupKFold, KFold, StratifiedGroupKFold, StratifiedKFold
  from sklearn.utils.multiclass import type_of_target

- from dataeval.config import get_seed
+ from dataeval.config import EPSILON
  from dataeval.outputs._base import set_metadata
  from dataeval.outputs._utils import SplitDatasetOutput, TrainValSplit
+ from dataeval.typing import AnnotatedDataset
+ from dataeval.utils.data._metadata import Metadata
+
+ _logger = logging.getLogger(__name__)


  class KFoldSplitter(Protocol):
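With the module-level logger in place, the split internals can be traced through the standard logging machinery. A minimal sketch, assuming getLogger(__name__) resolves to the module path listed above (dataeval/utils/data/_split.py):

    import logging

    # Surface the new DEBUG records emitted by the split helpers.
    logging.basicConfig(level=logging.DEBUG)
    logging.getLogger("dataeval.utils.data._split").setLevel(logging.DEBUG)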
@@ -85,7 +88,7 @@ def calculate_validation_fraction(num_folds: int, test_frac: float, val_frac: fl
  return val_base * (1.0 / num_folds) * (1.0 - test_frac)


- def _validate_labels(labels: NDArray[np.intp], total_partitions: int) -> None:
+ def validate_labels(labels: NDArray[np.intp], total_partitions: int) -> None:
  """
  Check to make sure there is more input data than the total number of partitions requested

@@ -116,7 +119,7 @@ def _validate_labels(labels: NDArray[np.intp], total_partitions: int) -> None:
  raise ValueError("Detected continuous labels. Labels must be discrete for proper stratification")


- def is_stratifiable(labels: NDArray[np.intp], num_partitions: int) -> bool:
+ def validate_stratifiable(labels: NDArray[np.intp], num_partitions: int) -> None:
  """
  Check if the dataset can be stratified by class label over the given number of partitions

@@ -132,26 +135,23 @@ def is_stratifiable(labels: NDArray[np.intp], num_partitions: int) -> bool:
  bool
  True if dataset can be stratified else False

- Warns
- -----
- UserWarning
- Warns user if the dataset cannot be stratified due to the total number of [train, val, test]
+ Raises
+ ------
+ ValueError
+ If the dataset cannot be stratified due to the total number of [train, val, test]
  partitions exceeding the number of instances of the rarest class label.
  """

  # Get the minimum count of all labels
  lowest_label_count = np.unique(labels, return_counts=True)[1].min()
  if lowest_label_count < num_partitions:
- warnings.warn(
+ raise ValueError(
  f"Unable to stratify due to label frequency. The lowest label count ({lowest_label_count}) is fewer "
- f"than the total number of partitions ({num_partitions}) requested.",
- UserWarning,
+ f"than the total number of partitions ({num_partitions}) requested."
  )
- return False
- return True


- def is_groupable(group_ids: NDArray[np.intp], num_partitions: int) -> bool:
+ def validate_groupable(groups: NDArray[np.intp], num_partitions: int) -> None:
  """
  Warns user if the number of unique group_ids is incompatible with a grouped partition containing
  num_folds folds. If this is the case, returns groups=None, which tells the partitioner not to
@@ -159,7 +159,7 @@ def is_groupable(group_ids: NDArray[np.intp], num_partitions: int) -> bool:

  Parameters
  ----------
- group_ids : NDArray of ints
+ groups : NDArray of ints
  The id of the group each sample at the corresponding index belongs to
  num_partitions : int
  Total number of train, val, and test splits requested
@@ -169,60 +169,24 @@ def is_groupable(group_ids: NDArray[np.intp], num_partitions: int) -> bool:
  bool
  True if the dataset can be grouped by the given group ids else False

- Warns
- -----
- UserWarning
- Warns if there are fewer groups than the requested number of partitions plus one
+ Raises
+ ------
+ ValueError
+ If there are is only one unique group.
+ ValueError
+ If there are fewer groups than the requested number of partitions plus one
  """

- num_unique_groups = len(np.unique(group_ids))
+ num_unique_groups = len(np.unique(groups))
  # Cannot separate if only one group exists
  if num_unique_groups == 1:
- return False
+ raise ValueError(f"Unique groups ({num_unique_groups}) must be greater than 1.")

  if num_unique_groups < num_partitions:
- warnings.warn(
- f"Groups must be greater than num partitions. Got {num_unique_groups} and {num_partitions}. "
- "Reverting to ungrouped partitioning",
- UserWarning,
- )
- return False
- return True
-
-
- def bin_kmeans(array: NDArray[Any]) -> NDArray[np.intp]:
- """
- Find bins of continuous data by iteratively applying k-means clustering, and keeping the
- clustering with the highest silhouette score.
-
- Parameters
- ----------
- array : NDArray
- continuous data to bin
+ raise ValueError(f"Unique groups ({num_unique_groups}) must be greater than num partitions ({num_partitions}).")

- Returns
- -------
- NDArray[int]:
- bin numbers assigned by the kmeans best clusterer.
- """

- if array.ndim == 1:
- array = array.reshape([-1, 1])
- best_score = 0.60
- else:
- best_score = 0.50
- bin_index = np.zeros(len(array), dtype=np.intp)
- for k in range(2, 20):
- clusterer = KMeans(n_clusters=k, random_state=get_seed())
- cluster_labels = clusterer.fit_predict(array)
- score = silhouette_score(array, cluster_labels, sample_size=25_000, random_state=get_seed())
- if score > best_score:
- best_score = score
- bin_index = cluster_labels.astype(np.intp)
- return bin_index
-
-
- def get_group_ids(metadata: dict[str, Any], group_names: list[str], num_samples: int) -> NDArray[np.intp]:
+ def get_groups(metadata: Metadata, split_on: Sequence[str] | None) -> NDArray[np.intp] | None:
  """
  Returns individual group numbers based on a subset of metadata defined by groupnames

@@ -232,32 +196,20 @@ def get_group_ids(metadata: dict[str, Any], group_names: list[str], num_samples:
  dictionary containing all metadata
  groupnames : list
  which groups from the metadata dictionary to consider for dataset grouping
- num_samples : int
- number of labels. Used to ensure agreement between input data/labels and metadata entries.
-
- Raises
- ------
- IndexError
- raised if an entry in the metadata dictionary doesn't have the same length as num_samples

  Returns
  -------
  np.ndarray
  group identifiers from metadata
  """
- features2group = {k: np.array(v) for k, v in metadata.items() if k in group_names}
- if not features2group:
- return np.zeros(num_samples, dtype=np.intp)
- for name, feature in features2group.items():
- if len(feature) != num_samples:
- raise ValueError(
- f"Feature length does not match number of labels. Got {len(feature)} features and {num_samples} samples"
- )
-
- if type_of_target(feature) == "continuous":
- features2group[name] = bin_kmeans(feature)
- binned_features = np.stack(list(features2group.values()), axis=1)
- _, group_ids = np.unique(binned_features, axis=0, return_inverse=True)
+ # get only the factors that are present in the metadata
+ if split_on is None:
+ return None
+
+ split_set = set(split_on)
+ indices = [i for i, name in enumerate(metadata.discrete_factor_names) if name in split_set]
+ binned_features = metadata.discrete_data[:, indices]
+ group_ids = np.unique(binned_features, axis=0, return_inverse=True)[1]
  return group_ids

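The rewritten get_groups leans on Metadata's pre-binned discrete factors; a toy illustration of how the unique-row inverse from np.unique becomes one group id per sample (the factor values are made up):

    import numpy as np

    # One row per sample, one column per factor named in split_on (already binned).
    binned_features = np.array([[0, 1],
                                [0, 1],
                                [1, 0],
                                [0, 1]])
    # return_inverse maps every row to the index of its unique combination,
    # so samples 0, 1 and 3 land in the same group.
    group_ids = np.unique(binned_features, axis=0, return_inverse=True)[1]
    print(group_ids)  # [0 0 1 0]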
 
@@ -294,10 +246,18 @@ def make_splits(
  split_defs: list[TrainValSplit] = []
  n_labels = len(np.unique(labels))
  splitter = KFOLD_GROUP_STRATIFIED_MAP[(groups is not None, stratified)](n_folds)
+ _logger.log(logging.DEBUG, f"splitter={splitter.__class__.__name__}(n_splits={n_folds})")
  good = False
  attempts = 0
  while not good and attempts < 3:
  attempts += 1
+ _logger.log(
+ logging.DEBUG,
+ f"attempt={attempts}: splitter.split("
+ f"index=arr(len={len(index)}, unique={np.unique(index)}), "
+ f"labels=arr(len={len(index)}, unique={np.unique(index)}), "
+ ("groups=None" if groups is None else f"groups=arr(len={len(groups)}, unique={np.unique(groups)}))"),
+ )
  splits = splitter.split(index, labels, groups)
  split_defs.clear()
  for train_idx, eval_idx in splits:
@@ -341,20 +301,20 @@ def find_best_split(
  counts = np.bincount(arr, minlength=minlength)
  return counts / np.sum(counts)

- def weight(arr: NDArray, class_freq: NDArray) -> np.float64:
- return np.sum(np.abs(freq(arr, len(class_freq)) - class_freq))
+ def weight(arr: NDArray, class_freq: NDArray) -> float:
+ return float(np.sum(np.abs(freq(arr, len(class_freq)) - class_freq)))

- def class_freq_diff(split: TrainValSplit) -> np.float64:
+ def class_freq_diff(split: TrainValSplit) -> float:
  class_freq = freq(labels)
  return weight(labels[split.train], class_freq) + weight(labels[split.val], class_freq)

- def split_ratio(split: TrainValSplit) -> np.float64:
- return np.float64(len(split.val) / (len(split.val) + len(split.train)))
+ def split_ratio(split: TrainValSplit) -> float:
+ return len(split.val) / (len(split.val) + len(split.train))

- def split_diff(split: TrainValSplit) -> np.float64:
+ def split_diff(split: TrainValSplit) -> float:
  return abs(split_frac - split_ratio(split))

- def split_inv_diff(split: TrainValSplit) -> np.float64:
+ def split_inv_diff(split: TrainValSplit) -> float:
  return abs(1 - split_frac - split_ratio(split))

  # Selects minimization function based on inputs
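For reference, the class-frequency weighting above boils down to an L1 distance between per-partition and overall class frequencies; a toy calculation (labels and indices are made up):

    import numpy as np

    labels = np.array([0, 0, 0, 1, 1, 2])

    def freq(arr, minlength=0):
        counts = np.bincount(arr, minlength=minlength)
        return counts / np.sum(counts)

    class_freq = freq(labels)                       # overall class frequencies
    train, val = np.array([0, 1, 3, 5]), np.array([2, 4])
    # Candidate splits with the smallest combined deviation are preferred.
    score = float(np.sum(np.abs(freq(labels[train], len(class_freq)) - class_freq)))
    score += float(np.sum(np.abs(freq(labels[val], len(class_freq)) - class_freq)))
    print(round(score, 3))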
@@ -399,11 +359,12 @@ def single_split(
  Indices of data partitioned for training and evaluation
  """

- _, label_counts = np.unique(labels, return_counts=True)
- max_folds = label_counts.min()
- min_folds = np.unique(groups).shape[0] if groups is not None else 2
- divisor = split_frac + 1e-06 if split_frac <= 2 / 3 else 1 - split_frac - 1e-06
- n_folds = round(min(max(1 / divisor, min_folds), max_folds)) # Clips value between min_folds and max_folds
+ unique_groups = 2 if groups is None else len(np.unique(groups))
+ max_folds = min(min(np.unique(labels, return_counts=True)[1]), unique_groups) if stratified else unique_groups
+
+ divisor = split_frac if split_frac <= 2 / 3 else 1 - split_frac
+ n_folds = min(max(round(1 / (divisor + EPSILON)), 2), max_folds) # Clips value between 2 and max_folds
+ _logger.log(logging.DEBUG, f"n_folds={n_folds} clipped between[2, {max_folds}]")

  split_candidates = make_splits(index, labels, n_folds, groups, stratified)
  return find_best_split(labels, split_candidates, stratified, split_frac)
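A worked instance of the new fold calculation (EPSILON's real value comes from dataeval.config; the constant and counts below are illustrative):

    EPSILON = 1e-12                  # stand-in for dataeval.config.EPSILON
    split_frac = 0.2                 # e.g. a 20 % holdout
    unique_groups = 7                # illustrative count of unique group ids
    max_folds = unique_groups        # not stratified, so only groups bound the fold count
    divisor = split_frac if split_frac <= 2 / 3 else 1 - split_frac
    n_folds = min(max(round(1 / (divisor + EPSILON)), 2), max_folds)
    print(n_folds)                   # 5 -> round(1 / 0.2), clipped to the range [2, 7]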
@@ -411,22 +372,20 @@ def single_split(

  @set_metadata
  def split_dataset(
- labels: list[int] | NDArray[np.intp],
+ dataset: AnnotatedDataset[Any] | Metadata,
  num_folds: int = 1,
  stratify: bool = False,
- split_on: list[str] | None = None,
- metadata: dict[str, Any] | None = None,
+ split_on: Sequence[str] | None = None,
  test_frac: float = 0.0,
  val_frac: float = 0.0,
  ) -> SplitDatasetOutput:
  """
- Top level splitting function. Returns a dataclass containing a list of train and validation indices.
- Indices for a test holdout may also be optionally included
+ Dataset splitting function. Returns a dataclass containing a list of train and validation indices.

  Parameters
  ----------
- labels : list or NDArray of ints
- Classification Labels used to generate splits. Determines the size of the dataset
+ dataset : AnnotatedDataset or Metadata
+ Dataset to split.
  num_folds : int, default 1
  Number of [train, val] folds. If equal to 1, val_frac must be greater than 0.0
  stratify : bool, default False
@@ -436,8 +395,6 @@ def split_dataset(
  Keys of the metadata dictionary upon which to group the dataset.
  A grouped partition is divided such that no group is present within both the training and
  validation set. Split_on groups should be selected to mitigate validation bias
- metadata : dict or None, default None
- Dict containing data for potential dataset grouping. See split_on above
  test_frac : float, default 0.0
  Fraction of data to be optionally held out for test set
  val_frac : float, default 0.0
@@ -450,13 +407,8 @@ def split_dataset(
  Output class containing a list of indices of training
  and validation data for each fold and optional test indices

- Raises
- ------
- TypeError
- Raised if split_on is passed, but metadata is None or empty
-
- Note
- ----
+ Notes
+ -----
  When specifying groups and/or stratification, ratios for test and validation splits can vary
  as the stratification and grouping take higher priority than the percentages
  """
@@ -464,30 +416,25 @@ def split_dataset(
  val_frac = calculate_validation_fraction(num_folds, test_frac, val_frac)
  total_partitions = num_folds + 1 if test_frac else num_folds

- if isinstance(labels, list):
- labels = np.array(labels, dtype=np.intp)
+ metadata = dataset if isinstance(dataset, Metadata) else Metadata(dataset)
+ labels = metadata.class_labels

- label_length: int = len(labels)
+ validate_labels(labels, total_partitions)
+ if stratify:
+ validate_stratifiable(labels, total_partitions)

- _validate_labels(labels, total_partitions)
- stratify &= is_stratifiable(labels, total_partitions)
- groups = None
- if split_on:
- if metadata is None or metadata == {}:
- raise TypeError("If split_on is specified, metadata must also be provided, got None")
- possible_groups = get_group_ids(metadata, split_on, label_length)
+ groups = get_groups(metadata, split_on)
+ if groups is not None:
  # Accounts for a test set that is 100 % of the data
  group_partitions = total_partitions + 1 if val_frac else total_partitions
- if is_groupable(possible_groups, group_partitions):
- groups = possible_groups
+ validate_groupable(groups, group_partitions)

- index = np.arange(label_length)
+ index = np.arange(len(labels))

- tvs = (
- single_split(index=index, labels=labels, split_frac=test_frac, groups=groups, stratified=stratify)
- if test_frac
- else TrainValSplit(index, np.array([], dtype=np.intp))
- )
+ if test_frac:
+ tvs = single_split(index, labels, test_frac, groups, stratify)
+ else:
+ tvs = TrainValSplit(index, np.array([], dtype=np.intp))

  tv_labels = labels[tvs.train]
  tv_groups = groups[tvs.train] if groups is not None else None
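Taken together, a hedged usage sketch of the new entry point; the dataset object and the "scene" factor are placeholders, and the public import path may differ from the private module shown in the file list. Note that unstratifiable or ungroupable inputs now raise ValueError instead of warning and falling back:

    from dataeval.utils.data._split import split_dataset

    # `dataset` stands in for an AnnotatedDataset (or a precomputed Metadata object).
    splits = split_dataset(
        dataset,
        num_folds=5,
        stratify=True,
        split_on=["scene"],   # hypothetical discrete metadata factor to group on
        test_frac=0.2,
    )
    # Each fold exposes train/val index arrays via TrainValSplit.train / .val.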
dataeval/utils/data/datasets/_base.py
@@ -19,9 +19,12 @@ from dataeval.utils.data.datasets._types import (
  )

  if TYPE_CHECKING:
- from dataeval.typing import Transform
+ from dataeval.typing import Array, Transform
+
+ _TArray = TypeVar("_TArray", bound=Array)
+ else:
+ _TArray = TypeVar("_TArray")

- _TArray = TypeVar("_TArray")
  _TTarget = TypeVar("_TTarget")
  _TRawTarget = TypeVar("_TRawTarget", list[int], list[str])

@@ -51,9 +54,9 @@ class BaseDataset(AnnotatedDataset[tuple[_TArray, _TTarget, dict[str, Any]]], Ge
  def __init__(
  self,
  root: str | Path,
- download: bool = False,
- image_set: Literal["train", "val", "test", "base"] = "train",
+ image_set: Literal["train", "val", "test", "operational", "base"] = "train",
  transforms: Transform[_TArray] | Sequence[Transform[_TArray]] | None = None,
+ download: bool = False,
  verbose: bool = False,
  ) -> None:
  self._root: Path = root.absolute() if isinstance(root, Path) else Path(root).absolute()
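Every dataset constructor below inherits this reordering: `download` now comes after `transforms`, so positional calls written against 0.83.0 silently shift meaning. Passing arguments by keyword stays valid on both versions; a sketch using CIFAR10 (the import path and root directory are assumptions):

    from dataeval.utils.data.datasets import CIFAR10

    # Keyword arguments are order-independent, so the 0.84.1 parameter order is harmless.
    ds = CIFAR10(
        root="./data",       # illustrative root directory
        image_set="train",
        transforms=None,
        download=True,
    )
    print(ds.size)           # documented attribute: number of samples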
dataeval/utils/data/datasets/_cifar10.py
@@ -27,13 +27,13 @@ class CIFAR10(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  ----------
  root : str or pathlib.Path
  Root directory of dataset where the ``mnist`` folder exists.
- download : bool, default False
- If True, downloads the dataset from the internet and puts it in root directory.
- Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
  image_set : "train", "test" or "base", default "train"
  If "base", returns all of the data to allow the user to create their own splits.
  transforms : Transform, Sequence[Transform] or None, default None
  Transform(s) to apply to the data.
+ download : bool, default False
+ If True, downloads the dataset from the internet and puts it in root directory.
+ Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
  verbose : bool, default False
  If True, outputs print statements.

@@ -43,16 +43,16 @@ class CIFAR10(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  Location of the folder containing the data.
  image_set : "train", "test" or "base"
  The selected image set from the dataset.
+ transforms : Sequence[Transform]
+ The transforms to be applied to the data.
+ size : int
+ The size of the dataset.
  index2label : dict[int, str]
  Dictionary which translates from class integers to the associated class strings.
  label2index : dict[str, int]
  Dictionary which translates from class strings to the associated class integers.
  metadata : DatasetMetadata
  Typed dictionary containing dataset metadata, such as `id` which returns the dataset class name.
- transforms : Sequence[Transform]
- The transforms to be applied to the data.
- size : int
- The size of the dataset.
  """

  _resources = [
@@ -80,16 +80,16 @@ class CIFAR10(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  def __init__(
  self,
  root: str | Path,
- download: bool = False,
  image_set: Literal["train", "test", "base"] = "train",
  transforms: Transform[NDArray[Any]] | Sequence[Transform[NDArray[Any]]] | None = None,
+ download: bool = False,
  verbose: bool = False,
  ) -> None:
  super().__init__(
  root,
- download,
  image_set,
  transforms,
+ download,
  verbose,
  )

dataeval/utils/data/datasets/_milco.py
@@ -3,7 +3,7 @@ from __future__ import annotations
  __all__ = []

  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Sequence
+ from typing import TYPE_CHECKING, Any, Literal, Sequence

  from numpy.typing import NDArray

@@ -16,21 +16,20 @@ if TYPE_CHECKING:

  class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  """
- A side-scan sonar dataset focused on mine (object) detection.
+ A side-scan sonar dataset focused on mine-like object detection.

  The dataset comes from the paper
  `Side-scan sonar imaging data of underwater vehicles for mine detection <https://doi.org/10.1016/j.dib.2024.110132>`_
  by N.P. Santos et. al. (2024).

- This class only accesses a portion of the above dataset due to size constraints.
  The full dataset contains 1170 side-scan sonar images collected using a 900-1800 kHz Marine Sonic
  dual frequency side-scan sonar of a Teledyne Marine Gavia Autonomous Underwater Vehicle.
  All the images were carefully analyzed and annotated, including the image coordinates of the
  Bounding Box (BB) of the detected objects divided into NOn-Mine-like BOttom Objects (NOMBO)
  and MIne-Like COntacts (MILCO) classes.

- This dataset is consists of 261 images (120 images from 2015, 93 images from 2017, and 48 images from 2021).
- In these 261 images, there are 315 MILCO objects, and 175 NOMBO objects.
+ This dataset is consists of 345 images from 2010, 120 images from 2015, 93 images from 2017, 564 images from 2018,
+ and 48 images from 2021). In these 1170 images, there are 432 MILCO objects, and 235 NOMBO objects.
  The class “0” corresponds to a MILCO object and the class “1” corresponds to a NOMBO object.
  The raw BB coordinates provided in the downloaded text files are (x, y, w, h),
  given as percentages of the image (x_BB = x/img_width, y_BB = y/img_height, etc.).
@@ -40,11 +39,17 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  ----------
  root : str or pathlib.Path
  Root directory of dataset where the ``milco`` folder exists.
+ image_set: "train", "operational", or "base", default "train"
+ If "train", then the images from 2015, 2017 and 2021 are selected,
+ resulting in 315 MILCO objects and 177 NOMBO objects.
+ If "operational", then the images from 2010 and 2018 are selected,
+ resulting in 117 MILCO objects and 58 NOMBO objects.
+ If "base", then the full dataset is selected.
+ transforms : Transform, Sequence[Transform] or None, default None
+ Transform(s) to apply to the data.
  download : bool, default False
  If True, downloads the dataset from the internet and puts it in root directory.
  Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
- transforms : Transform, Sequence[Transform] or None, default None
- Transform(s) to apply to the data.
  verbose : bool, default False
  If True, outputs print statements.

@@ -52,8 +57,8 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  ----------
  path : pathlib.Path
  Location of the folder containing the data.
- image_set : "base"
- The base image set is the only available image set for the MILCO dataset.
+ image_set : "train", "operational" or "base"
+ The selected image set from the dataset.
  index2label : dict[int, str]
  Dictionary which translates from class integers to the associated class strings.
  label2index : dict[str, int]
@@ -64,6 +69,10 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  The transforms to be applied to the data.
  size : int
  The size of the dataset.
+
+ Note
+ ----
+ Data License: `CC BY 4.0 <https://creativecommons.org/licenses/by/4.0/>`_
  """

  _resources = [
@@ -85,6 +94,18 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  md5=True,
  checksum="b84749b21fa95a4a4c7de3741db78bc7",
  ),
+ DataLocation(
+ url="https://figshare.com/ndownloader/files/43169008",
+ filename="2010.zip",
+ md5=True,
+ checksum="43347a0cc383c0d3dbe0d24ae56f328d",
+ ),
+ DataLocation(
+ url="https://figshare.com/ndownloader/files/43169011",
+ filename="2018.zip",
+ md5=True,
+ checksum="25d091044a10c78674fedad655023e3b",
+ ),
  ]

  index2label: dict[int, str] = {
@@ -95,15 +116,16 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  def __init__(
  self,
  root: str | Path,
- download: bool = False,
+ image_set: Literal["train", "operational", "base"] = "train",
  transforms: Transform[NDArray[Any]] | Sequence[Transform[NDArray[Any]]] | None = None,
+ download: bool = False,
  verbose: bool = False,
  ) -> None:
  super().__init__(
  root,
- download,
- "base",
+ image_set,
  transforms,
+ download,
  verbose,
  )

@@ -112,10 +134,16 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  targets: list[str] = []
  datum_metadata: dict[str, list[Any]] = {}
  metadata_list: list[dict[str, Any]] = []
+ image_sets: dict[str, list[int]] = {
+ "base": list(range(len(self._resources))),
+ "train": list(range(3)),
+ "operational": list(range(3, len(self._resources))),
+ }

  # Load the data
- for resource in self._resources:
- self._resource = resource
+ resource_indices = image_sets[self.image_set]
+ for idx in resource_indices:
+ self._resource = self._resources[idx]
  filepath, target, metadata = super()._load_data()
  filepaths.extend(filepath)
  targets.extend(target)
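With the image_set plumbing above, the 2010/2018 imagery can be loaded as its own operational split; a hedged sketch (import path and root directory are assumptions):

    from dataeval.utils.data.datasets import MILCO

    # "train" -> 2015/2017/2021 resources, "operational" -> 2010/2018, "base" -> all five.
    operational = MILCO(
        root="./data",            # illustrative root directory
        image_set="operational",
        download=True,
    )
    print(operational.size, operational.index2label)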
dataeval/utils/data/datasets/_mnist.py
@@ -49,9 +49,6 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  ----------
  root : str or pathlib.Path
  Root directory of dataset where the ``mnist`` folder exists.
- download : bool, default False
- If True, downloads the dataset from the internet and puts it in root directory.
- Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
  image_set : "train", "test" or "base", default "train"
  If "base", returns all of the data to allow the user to create their own splits.
  corruption : "identity", "shot_noise", "impulse_noise", "glass_blur", "motion_blur", \
@@ -60,6 +57,9 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  Corruption to apply to the data.
  transforms : Transform, Sequence[Transform] or None, default None
  Transform(s) to apply to the data.
+ download : bool, default False
+ If True, downloads the dataset from the internet and puts it in root directory.
+ Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
  verbose : bool, default False
  If True, outputs print statements.

@@ -81,6 +81,10 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  The transforms to be applied to the data.
  size : int
  The size of the dataset.
+
+ Note
+ ----
+ Data License: `CC BY 4.0 <https://creativecommons.org/licenses/by/4.0/>`_ for corruption dataset
  """

  _resources = [
@@ -114,10 +118,10 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  def __init__(
  self,
  root: str | Path,
- download: bool = False,
  image_set: Literal["train", "test", "base"] = "train",
  corruption: CorruptionStringMap | None = None,
  transforms: Transform[NDArray[Any]] | Sequence[Transform[NDArray[Any]]] | None = None,
+ download: bool = False,
  verbose: bool = False,
  ) -> None:
  self.corruption = corruption
@@ -127,9 +131,9 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):

  super().__init__(
  root,
- download,
  image_set,
  transforms,
+ download,
  verbose,
  )

dataeval/utils/data/datasets/_ships.py
@@ -31,11 +31,11 @@ class Ships(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  ----------
  root : str or pathlib.Path
  Root directory of dataset where the ``shipdataset`` folder exists.
+ transforms : Transform, Sequence[Transform] or None, default None
+ Transform(s) to apply to the data.
  download : bool, default False
  If True, downloads the dataset from the internet and puts it in root directory.
  Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
- transforms : Transform, Sequence[Transform] or None, default None
- Transform(s) to apply to the data.
  verbose : bool, default False
  If True, outputs print statements.

@@ -55,6 +55,10 @@ class Ships(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  The transforms to be applied to the data.
  size : int
  The size of the dataset.
+
+ Note
+ ----
+ Data License: `CC BY-SA 4.0 <https://creativecommons.org/licenses/by-sa/4.0/>`_
  """

  _resources = [
@@ -74,15 +78,15 @@ class Ships(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
  def __init__(
  self,
  root: str | Path,
- download: bool = False,
  transforms: Transform[NDArray[Any]] | Sequence[Transform[NDArray[Any]]] | None = None,
+ download: bool = False,
  verbose: bool = False,
  ) -> None:
  super().__init__(
  root,
- download,
  "base",
  transforms,
+ download,
  verbose,
  )
  self._scenes: list[str] = self._load_scenes()