dataeval 0.84.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. dataeval/__init__.py +1 -1
  2. dataeval/data/__init__.py +19 -0
  3. dataeval/data/_embeddings.py +345 -0
  4. dataeval/{utils/data → data}/_images.py +2 -2
  5. dataeval/{utils/data → data}/_metadata.py +8 -7
  6. dataeval/{utils/data → data}/_selection.py +22 -9
  7. dataeval/{utils/data → data}/_split.py +1 -1
  8. dataeval/data/selections/__init__.py +19 -0
  9. dataeval/data/selections/_classbalance.py +37 -0
  10. dataeval/data/selections/_classfilter.py +109 -0
  11. dataeval/{utils/data → data}/selections/_indices.py +1 -1
  12. dataeval/{utils/data → data}/selections/_limit.py +1 -1
  13. dataeval/{utils/data → data}/selections/_prioritize.py +3 -3
  14. dataeval/{utils/data → data}/selections/_reverse.py +1 -1
  15. dataeval/{utils/data → data}/selections/_shuffle.py +3 -3
  16. dataeval/detectors/drift/__init__.py +2 -2
  17. dataeval/detectors/drift/_base.py +55 -203
  18. dataeval/detectors/drift/_cvm.py +19 -30
  19. dataeval/detectors/drift/_ks.py +18 -30
  20. dataeval/detectors/drift/_mmd.py +189 -53
  21. dataeval/detectors/drift/_uncertainty.py +52 -56
  22. dataeval/detectors/drift/updates.py +13 -12
  23. dataeval/detectors/linters/duplicates.py +6 -4
  24. dataeval/detectors/linters/outliers.py +3 -3
  25. dataeval/detectors/ood/ae.py +1 -1
  26. dataeval/metadata/_distance.py +1 -1
  27. dataeval/metadata/_ood.py +4 -4
  28. dataeval/metrics/bias/_balance.py +1 -1
  29. dataeval/metrics/bias/_diversity.py +1 -1
  30. dataeval/metrics/bias/_parity.py +1 -1
  31. dataeval/metrics/stats/_base.py +7 -7
  32. dataeval/metrics/stats/_dimensionstats.py +2 -2
  33. dataeval/metrics/stats/_hashstats.py +2 -2
  34. dataeval/metrics/stats/_imagestats.py +4 -4
  35. dataeval/metrics/stats/_labelstats.py +2 -2
  36. dataeval/metrics/stats/_pixelstats.py +2 -2
  37. dataeval/metrics/stats/_visualstats.py +2 -2
  38. dataeval/outputs/_bias.py +1 -1
  39. dataeval/typing.py +53 -19
  40. dataeval/utils/__init__.py +2 -2
  41. dataeval/utils/_array.py +18 -7
  42. dataeval/utils/data/__init__.py +5 -20
  43. dataeval/utils/data/_dataset.py +6 -4
  44. dataeval/utils/data/collate.py +2 -0
  45. dataeval/utils/datasets/__init__.py +17 -0
  46. dataeval/utils/{data/datasets → datasets}/_base.py +10 -7
  47. dataeval/utils/{data/datasets → datasets}/_cifar10.py +11 -11
  48. dataeval/utils/{data/datasets → datasets}/_milco.py +44 -16
  49. dataeval/utils/{data/datasets → datasets}/_mnist.py +11 -7
  50. dataeval/utils/{data/datasets → datasets}/_ships.py +10 -6
  51. dataeval/utils/{data/datasets → datasets}/_voc.py +43 -22
  52. dataeval/utils/torch/_internal.py +12 -35
  53. {dataeval-0.84.0.dist-info → dataeval-1.0.0.dist-info}/METADATA +2 -3
  54. dataeval-1.0.0.dist-info/RECORD +107 -0
  55. dataeval/detectors/drift/_torch.py +0 -222
  56. dataeval/utils/data/_embeddings.py +0 -186
  57. dataeval/utils/data/datasets/__init__.py +0 -17
  58. dataeval/utils/data/selections/__init__.py +0 -17
  59. dataeval/utils/data/selections/_classfilter.py +0 -59
  60. dataeval-0.84.0.dist-info/RECORD +0 -106
  61. /dataeval/{utils/data → data}/_targets.py +0 -0
  62. /dataeval/utils/{metadata.py → data/metadata.py} +0 -0
  63. /dataeval/utils/{data/datasets → datasets}/_fileio.py +0 -0
  64. /dataeval/utils/{data/datasets → datasets}/_mixin.py +0 -0
  65. /dataeval/utils/{data/datasets → datasets}/_types.py +0 -0
  66. {dataeval-0.84.0.dist-info → dataeval-1.0.0.dist-info}/LICENSE.txt +0 -0
  67. {dataeval-0.84.0.dist-info → dataeval-1.0.0.dist-info}/WHEEL +0 -0
@@ -52,10 +52,12 @@ def _validate_data(
52
52
 
53
53
 
54
54
  def _find_max(arr: ArrayLike) -> Any:
55
- if isinstance(arr[0], (Iterable, Sequence, Array)):
56
- return max([_find_max(x) for x in arr]) # type: ignore
57
- else:
58
- return max(arr)
55
+ if isinstance(arr, (Iterable, Sequence, Array)):
56
+ if isinstance(arr[0], (Iterable, Sequence, Array)):
57
+ return max([_find_max(x) for x in arr]) # type: ignore
58
+ else:
59
+ return max(arr)
60
+ return arr
59
61
 
60
62
 
61
63
  _TLabels = TypeVar("_TLabels", Sequence[int], Sequence[Sequence[int]])
@@ -4,6 +4,8 @@ Collate functions used with a PyTorch DataLoader to load data from MAITE complia
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
+ __all__ = ["list_collate_fn", "numpy_collate_fn", "torch_collate_fn"]
8
+
7
9
  from typing import Any, Iterable, Sequence, TypeVar
8
10
 
9
11
  import numpy as np
@@ -0,0 +1,17 @@
1
+ """Provides access to common Computer Vision datasets."""
2
+
3
+ from dataeval.utils.datasets._cifar10 import CIFAR10
4
+ from dataeval.utils.datasets._milco import MILCO
5
+ from dataeval.utils.datasets._mnist import MNIST
6
+ from dataeval.utils.datasets._ships import Ships
7
+ from dataeval.utils.datasets._voc import VOCDetection, VOCDetectionTorch, VOCSegmentation
8
+
9
+ __all__ = [
10
+ "MNIST",
11
+ "Ships",
12
+ "CIFAR10",
13
+ "MILCO",
14
+ "VOCDetection",
15
+ "VOCDetectionTorch",
16
+ "VOCSegmentation",
17
+ ]
@@ -6,9 +6,9 @@ from abc import abstractmethod
6
6
  from pathlib import Path
7
7
  from typing import TYPE_CHECKING, Any, Generic, Iterator, Literal, NamedTuple, Sequence, TypeVar
8
8
 
9
- from dataeval.utils.data.datasets._fileio import _ensure_exists
10
- from dataeval.utils.data.datasets._mixin import BaseDatasetMixin
11
- from dataeval.utils.data.datasets._types import (
9
+ from dataeval.utils.datasets._fileio import _ensure_exists
10
+ from dataeval.utils.datasets._mixin import BaseDatasetMixin
11
+ from dataeval.utils.datasets._types import (
12
12
  AnnotatedDataset,
13
13
  DatasetMetadata,
14
14
  ImageClassificationDataset,
@@ -19,9 +19,12 @@ from dataeval.utils.data.datasets._types import (
19
19
  )
20
20
 
21
21
  if TYPE_CHECKING:
22
- from dataeval.typing import Transform
22
+ from dataeval.typing import Array, Transform
23
+
24
+ _TArray = TypeVar("_TArray", bound=Array)
25
+ else:
26
+ _TArray = TypeVar("_TArray")
23
27
 
24
- _TArray = TypeVar("_TArray")
25
28
  _TTarget = TypeVar("_TTarget")
26
29
  _TRawTarget = TypeVar("_TRawTarget", list[int], list[str])
27
30
 
@@ -51,9 +54,9 @@ class BaseDataset(AnnotatedDataset[tuple[_TArray, _TTarget, dict[str, Any]]], Ge
51
54
  def __init__(
52
55
  self,
53
56
  root: str | Path,
54
- download: bool = False,
55
- image_set: Literal["train", "val", "test", "base"] = "train",
57
+ image_set: Literal["train", "val", "test", "operational", "base"] = "train",
56
58
  transforms: Transform[_TArray] | Sequence[Transform[_TArray]] | None = None,
59
+ download: bool = False,
57
60
  verbose: bool = False,
58
61
  ) -> None:
59
62
  self._root: Path = root.absolute() if isinstance(root, Path) else Path(root).absolute()
@@ -9,8 +9,8 @@ import numpy as np
9
9
  from numpy.typing import NDArray
10
10
  from PIL import Image
11
11
 
12
- from dataeval.utils.data.datasets._base import BaseICDataset, DataLocation
13
- from dataeval.utils.data.datasets._mixin import BaseDatasetNumpyMixin
12
+ from dataeval.utils.datasets._base import BaseICDataset, DataLocation
13
+ from dataeval.utils.datasets._mixin import BaseDatasetNumpyMixin
14
14
 
15
15
  if TYPE_CHECKING:
16
16
  from dataeval.typing import Transform
@@ -27,13 +27,13 @@ class CIFAR10(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
27
27
  ----------
28
28
  root : str or pathlib.Path
29
29
  Root directory of dataset where the ``mnist`` folder exists.
30
- download : bool, default False
31
- If True, downloads the dataset from the internet and puts it in root directory.
32
- Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
33
30
  image_set : "train", "test" or "base", default "train"
34
31
  If "base", returns all of the data to allow the user to create their own splits.
35
32
  transforms : Transform, Sequence[Transform] or None, default None
36
33
  Transform(s) to apply to the data.
34
+ download : bool, default False
35
+ If True, downloads the dataset from the internet and puts it in root directory.
36
+ Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
37
37
  verbose : bool, default False
38
38
  If True, outputs print statements.
39
39
 
@@ -43,16 +43,16 @@ class CIFAR10(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
43
43
  Location of the folder containing the data.
44
44
  image_set : "train", "test" or "base"
45
45
  The selected image set from the dataset.
46
+ transforms : Sequence[Transform]
47
+ The transforms to be applied to the data.
48
+ size : int
49
+ The size of the dataset.
46
50
  index2label : dict[int, str]
47
51
  Dictionary which translates from class integers to the associated class strings.
48
52
  label2index : dict[str, int]
49
53
  Dictionary which translates from class strings to the associated class integers.
50
54
  metadata : DatasetMetadata
51
55
  Typed dictionary containing dataset metadata, such as `id` which returns the dataset class name.
52
- transforms : Sequence[Transform]
53
- The transforms to be applied to the data.
54
- size : int
55
- The size of the dataset.
56
56
  """
57
57
 
58
58
  _resources = [
@@ -80,16 +80,16 @@ class CIFAR10(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
80
80
  def __init__(
81
81
  self,
82
82
  root: str | Path,
83
- download: bool = False,
84
83
  image_set: Literal["train", "test", "base"] = "train",
85
84
  transforms: Transform[NDArray[Any]] | Sequence[Transform[NDArray[Any]]] | None = None,
85
+ download: bool = False,
86
86
  verbose: bool = False,
87
87
  ) -> None:
88
88
  super().__init__(
89
89
  root,
90
- download,
91
90
  image_set,
92
91
  transforms,
92
+ download,
93
93
  verbose,
94
94
  )
95
95
 
@@ -3,12 +3,12 @@ from __future__ import annotations
3
3
  __all__ = []
4
4
 
5
5
  from pathlib import Path
6
- from typing import TYPE_CHECKING, Any, Sequence
6
+ from typing import TYPE_CHECKING, Any, Literal, Sequence
7
7
 
8
8
  from numpy.typing import NDArray
9
9
 
10
- from dataeval.utils.data.datasets._base import BaseODDataset, DataLocation
11
- from dataeval.utils.data.datasets._mixin import BaseDatasetNumpyMixin
10
+ from dataeval.utils.datasets._base import BaseODDataset, DataLocation
11
+ from dataeval.utils.datasets._mixin import BaseDatasetNumpyMixin
12
12
 
13
13
  if TYPE_CHECKING:
14
14
  from dataeval.typing import Transform
@@ -16,21 +16,20 @@ if TYPE_CHECKING:
16
16
 
17
17
  class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
18
18
  """
19
- A side-scan sonar dataset focused on mine (object) detection.
19
+ A side-scan sonar dataset focused on mine-like object detection.
20
20
 
21
21
  The dataset comes from the paper
22
22
  `Side-scan sonar imaging data of underwater vehicles for mine detection <https://doi.org/10.1016/j.dib.2024.110132>`_
23
23
  by N.P. Santos et al. (2024).
24
24
 
25
- This class only accesses a portion of the above dataset due to size constraints.
26
25
  The full dataset contains 1170 side-scan sonar images collected using a 900-1800 kHz Marine Sonic
27
26
  dual frequency side-scan sonar of a Teledyne Marine Gavia Autonomous Underwater Vehicle.
28
27
  All the images were carefully analyzed and annotated, including the image coordinates of the
29
28
  Bounding Box (BB) of the detected objects divided into NOn-Mine-like BOttom Objects (NOMBO)
30
29
  and MIne-Like COntacts (MILCO) classes.
31
30
 
32
- This dataset is consists of 261 images (120 images from 2015, 93 images from 2017, and 48 images from 2021).
33
- In these 261 images, there are 315 MILCO objects, and 175 NOMBO objects.
31
+ This dataset consists of 345 images from 2010, 120 images from 2015, 93 images from 2017, 564 images from 2018,
32
+ and 48 images from 2021. In these 1170 images, there are 432 MILCO objects and 235 NOMBO objects.
34
33
  The class “0” corresponds to a MILCO object and the class “1” corresponds to a NOMBO object.
35
34
  The raw BB coordinates provided in the downloaded text files are (x, y, w, h),
36
35
  given as percentages of the image (x_BB = x/img_width, y_BB = y/img_height, etc.).
@@ -40,11 +39,17 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
40
39
  ----------
41
40
  root : str or pathlib.Path
42
41
  Root directory of dataset where the ``milco`` folder exists.
42
+ image_set : "train", "operational", or "base", default "train"
43
+ If "train", then the images from 2015, 2017 and 2021 are selected,
44
+ resulting in 315 MILCO objects and 177 NOMBO objects.
45
+ If "operational", then the images from 2010 and 2018 are selected,
46
+ resulting in 117 MILCO objects and 58 NOMBO objects.
47
+ If "base", then the full dataset is selected.
48
+ transforms : Transform, Sequence[Transform] or None, default None
49
+ Transform(s) to apply to the data.
43
50
  download : bool, default False
44
51
  If True, downloads the dataset from the internet and puts it in root directory.
45
52
  Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
46
- transforms : Transform, Sequence[Transform] or None, default None
47
- Transform(s) to apply to the data.
48
53
  verbose : bool, default False
49
54
  If True, outputs print statements.
50
55
 
@@ -52,8 +57,8 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
52
57
  ----------
53
58
  path : pathlib.Path
54
59
  Location of the folder containing the data.
55
- image_set : "base"
56
- The base image set is the only available image set for the MILCO dataset.
60
+ image_set : "train", "operational" or "base"
61
+ The selected image set from the dataset.
57
62
  index2label : dict[int, str]
58
63
  Dictionary which translates from class integers to the associated class strings.
59
64
  label2index : dict[str, int]
@@ -64,6 +69,10 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
64
69
  The transforms to be applied to the data.
65
70
  size : int
66
71
  The size of the dataset.
72
+
73
+ Note
74
+ ----
75
+ Data License: `CC BY 4.0 <https://creativecommons.org/licenses/by/4.0/>`_
67
76
  """
68
77
 
69
78
  _resources = [
@@ -85,6 +94,18 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
85
94
  md5=True,
86
95
  checksum="b84749b21fa95a4a4c7de3741db78bc7",
87
96
  ),
97
+ DataLocation(
98
+ url="https://figshare.com/ndownloader/files/43169008",
99
+ filename="2010.zip",
100
+ md5=True,
101
+ checksum="43347a0cc383c0d3dbe0d24ae56f328d",
102
+ ),
103
+ DataLocation(
104
+ url="https://figshare.com/ndownloader/files/43169011",
105
+ filename="2018.zip",
106
+ md5=True,
107
+ checksum="25d091044a10c78674fedad655023e3b",
108
+ ),
88
109
  ]
89
110
 
90
111
  index2label: dict[int, str] = {
@@ -95,15 +116,16 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
95
116
  def __init__(
96
117
  self,
97
118
  root: str | Path,
98
- download: bool = False,
119
+ image_set: Literal["train", "operational", "base"] = "train",
99
120
  transforms: Transform[NDArray[Any]] | Sequence[Transform[NDArray[Any]]] | None = None,
121
+ download: bool = False,
100
122
  verbose: bool = False,
101
123
  ) -> None:
102
124
  super().__init__(
103
125
  root,
104
- download,
105
- "base",
126
+ image_set,
106
127
  transforms,
128
+ download,
107
129
  verbose,
108
130
  )
109
131
 
@@ -112,10 +134,16 @@ class MILCO(BaseODDataset[NDArray[Any]], BaseDatasetNumpyMixin):
112
134
  targets: list[str] = []
113
135
  datum_metadata: dict[str, list[Any]] = {}
114
136
  metadata_list: list[dict[str, Any]] = []
137
+ image_sets: dict[str, list[int]] = {
138
+ "base": list(range(len(self._resources))),
139
+ "train": list(range(3)),
140
+ "operational": list(range(3, len(self._resources))),
141
+ }
115
142
 
116
143
  # Load the data
117
- for resource in self._resources:
118
- self._resource = resource
144
+ resource_indices = image_sets[self.image_set]
145
+ for idx in resource_indices:
146
+ self._resource = self._resources[idx]
119
147
  filepath, target, metadata = super()._load_data()
120
148
  filepaths.extend(filepath)
121
149
  targets.extend(target)
@@ -8,8 +8,8 @@ from typing import TYPE_CHECKING, Any, Literal, Sequence, TypeVar
8
8
  import numpy as np
9
9
  from numpy.typing import NDArray
10
10
 
11
- from dataeval.utils.data.datasets._base import BaseICDataset, DataLocation
12
- from dataeval.utils.data.datasets._mixin import BaseDatasetNumpyMixin
11
+ from dataeval.utils.datasets._base import BaseICDataset, DataLocation
12
+ from dataeval.utils.datasets._mixin import BaseDatasetNumpyMixin
13
13
 
14
14
  if TYPE_CHECKING:
15
15
  from dataeval.typing import Transform
@@ -49,9 +49,6 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
49
49
  ----------
50
50
  root : str or pathlib.Path
51
51
  Root directory of dataset where the ``mnist`` folder exists.
52
- download : bool, default False
53
- If True, downloads the dataset from the internet and puts it in root directory.
54
- Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
55
52
  image_set : "train", "test" or "base", default "train"
56
53
  If "base", returns all of the data to allow the user to create their own splits.
57
54
  corruption : "identity", "shot_noise", "impulse_noise", "glass_blur", "motion_blur", \
@@ -60,6 +57,9 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
60
57
  Corruption to apply to the data.
61
58
  transforms : Transform, Sequence[Transform] or None, default None
62
59
  Transform(s) to apply to the data.
60
+ download : bool, default False
61
+ If True, downloads the dataset from the internet and puts it in root directory.
62
+ Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
63
63
  verbose : bool, default False
64
64
  If True, outputs print statements.
65
65
 
@@ -81,6 +81,10 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
81
81
  The transforms to be applied to the data.
82
82
  size : int
83
83
  The size of the dataset.
84
+
85
+ Note
86
+ ----
87
+ Data License: `CC BY 4.0 <https://creativecommons.org/licenses/by/4.0/>`_ for corruption dataset
84
88
  """
85
89
 
86
90
  _resources = [
@@ -114,10 +118,10 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
114
118
  def __init__(
115
119
  self,
116
120
  root: str | Path,
117
- download: bool = False,
118
121
  image_set: Literal["train", "test", "base"] = "train",
119
122
  corruption: CorruptionStringMap | None = None,
120
123
  transforms: Transform[NDArray[Any]] | Sequence[Transform[NDArray[Any]]] | None = None,
124
+ download: bool = False,
121
125
  verbose: bool = False,
122
126
  ) -> None:
123
127
  self.corruption = corruption
@@ -127,9 +131,9 @@ class MNIST(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
127
131
 
128
132
  super().__init__(
129
133
  root,
130
- download,
131
134
  image_set,
132
135
  transforms,
136
+ download,
133
137
  verbose,
134
138
  )
135
139
 
@@ -8,8 +8,8 @@ from typing import TYPE_CHECKING, Any, Sequence
8
8
  import numpy as np
9
9
  from numpy.typing import NDArray
10
10
 
11
- from dataeval.utils.data.datasets._base import BaseICDataset, DataLocation
12
- from dataeval.utils.data.datasets._mixin import BaseDatasetNumpyMixin
11
+ from dataeval.utils.datasets._base import BaseICDataset, DataLocation
12
+ from dataeval.utils.datasets._mixin import BaseDatasetNumpyMixin
13
13
 
14
14
  if TYPE_CHECKING:
15
15
  from dataeval.typing import Transform
@@ -31,11 +31,11 @@ class Ships(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
31
31
  ----------
32
32
  root : str or pathlib.Path
33
33
  Root directory of dataset where the ``shipdataset`` folder exists.
34
+ transforms : Transform, Sequence[Transform] or None, default None
35
+ Transform(s) to apply to the data.
34
36
  download : bool, default False
35
37
  If True, downloads the dataset from the internet and puts it in root directory.
36
38
  Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
37
- transforms : Transform, Sequence[Transform] or None, default None
38
- Transform(s) to apply to the data.
39
39
  verbose : bool, default False
40
40
  If True, outputs print statements.
41
41
 
@@ -55,6 +55,10 @@ class Ships(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
55
55
  The transforms to be applied to the data.
56
56
  size : int
57
57
  The size of the dataset.
58
+
59
+ Note
60
+ ----
61
+ Data License: `CC BY-SA 4.0 <https://creativecommons.org/licenses/by-sa/4.0/>`_
58
62
  """
59
63
 
60
64
  _resources = [
@@ -74,15 +78,15 @@ class Ships(BaseICDataset[NDArray[Any]], BaseDatasetNumpyMixin):
74
78
  def __init__(
75
79
  self,
76
80
  root: str | Path,
77
- download: bool = False,
78
81
  transforms: Transform[NDArray[Any]] | Sequence[Transform[NDArray[Any]]] | None = None,
82
+ download: bool = False,
79
83
  verbose: bool = False,
80
84
  ) -> None:
81
85
  super().__init__(
82
86
  root,
83
- download,
84
87
  "base",
85
88
  transforms,
89
+ download,
86
90
  verbose,
87
91
  )
88
92
  self._scenes: list[str] = self._load_scenes()
@@ -9,21 +9,20 @@ import torch
9
9
  from defusedxml.ElementTree import parse
10
10
  from numpy.typing import NDArray
11
11
 
12
- from dataeval.utils.data.datasets._base import (
12
+ from dataeval.utils.datasets._base import (
13
13
  BaseDataset,
14
14
  BaseODDataset,
15
15
  BaseSegDataset,
16
16
  DataLocation,
17
+ _TArray,
18
+ _TTarget,
17
19
  )
18
- from dataeval.utils.data.datasets._mixin import BaseDatasetNumpyMixin, BaseDatasetTorchMixin
19
- from dataeval.utils.data.datasets._types import ObjectDetectionTarget, SegmentationTarget
20
+ from dataeval.utils.datasets._mixin import BaseDatasetNumpyMixin, BaseDatasetTorchMixin
21
+ from dataeval.utils.datasets._types import ObjectDetectionTarget, SegmentationTarget
20
22
 
21
23
  if TYPE_CHECKING:
22
24
  from dataeval.typing import Transform
23
25
 
24
- _TArray = TypeVar("_TArray")
25
- _TTarget = TypeVar("_TTarget")
26
-
27
26
  VOCClassStringMap = Literal[
28
27
  "aeroplane",
29
28
  "bicycle",
@@ -121,19 +120,19 @@ class BaseVOCDataset(BaseDataset[_TArray, _TTarget, list[str]]):
121
120
  def __init__(
122
121
  self,
123
122
  root: str | Path,
124
- year: Literal["2007", "2008", "2009", "2010", "2011", "2012"] = "2012",
125
123
  image_set: Literal["train", "val", "test", "base"] = "train",
126
- download: bool = False,
124
+ year: Literal["2007", "2008", "2009", "2010", "2011", "2012"] = "2012",
127
125
  transforms: Transform[_TArray] | Sequence[Transform[_TArray]] | None = None,
126
+ download: bool = False,
128
127
  verbose: bool = False,
129
128
  ) -> None:
130
129
  self.year = year
131
130
  self._resource_index = self._get_year_image_set_index(year, image_set)
132
131
  super().__init__(
133
132
  root,
134
- download,
135
133
  image_set,
136
134
  transforms,
135
+ download,
137
136
  verbose,
138
137
  )
139
138
 
@@ -191,10 +190,14 @@ class BaseVOCDataset(BaseDataset[_TArray, _TTarget, list[str]]):
191
190
  for entry in data:
192
191
  file_name = Path(entry).name
193
192
  file_stem = Path(entry).stem
194
- # Remove file extension and split by "_"
195
- parts = file_stem.split("_")
196
- file_meta["year"].append(parts[0])
197
- file_meta["image_id"].append(parts[1])
193
+ if self.year != "2007":
194
+ # Remove file extension and split by "_"
195
+ parts = file_stem.split("_")
196
+ file_meta["year"].append(parts[0])
197
+ file_meta["image_id"].append(parts[1])
198
+ else:
199
+ file_meta["year"].append(self.year)
200
+ file_meta["image_id"].append(file_stem)
198
201
  file_meta["mask_path"].append(str(seg_folder / file_name))
199
202
  annotations.append(str(ann_folder / file_stem) + ".xml")
200
203
 
@@ -250,9 +253,6 @@ class VOCDetection(
250
253
  ----------
251
254
  root : str or pathlib.Path
252
255
  Root directory of dataset where the ``vocdataset`` folder exists.
253
- download : bool, default False
254
- If True, downloads the dataset from the internet and puts it in root directory.
255
- Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
256
256
  image_set : "train", "val", "test", or "base", default "train"
257
257
  If "test", then dataset year must be "2007".
258
258
  If "base", then the combined dataset of "train" and "val" is returned.
@@ -260,6 +260,9 @@ class VOCDetection(
260
260
  The dataset year.
261
261
  transforms : Transform, Sequence[Transform] or None, default None
262
262
  Transform(s) to apply to the data.
263
+ download : bool, default False
264
+ If True, downloads the dataset from the internet and puts it in root directory.
265
+ Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
263
266
  verbose : bool, default False
264
267
  If True, outputs print statements.
265
268
 
@@ -267,6 +270,8 @@ class VOCDetection(
267
270
  ----------
268
271
  path : pathlib.Path
269
272
  Location of the folder containing the data.
273
+ year : "2007", "2008", "2009", "2010", "2011" or "2012"
274
+ The selected dataset year.
270
275
  image_set : "train", "val", "test" or "base"
271
276
  The selected image set from the dataset.
272
277
  index2label : dict[int, str]
@@ -279,6 +284,10 @@ class VOCDetection(
279
284
  The transforms to be applied to the data.
280
285
  size : int
281
286
  The size of the dataset.
287
+
288
+ Note
289
+ ----
290
+ Data License: `Flickr Terms of Use <http://www.flickr.com/terms.gne?legacy=1>`_
282
291
  """
283
292
 
284
293
 
@@ -294,9 +303,6 @@ class VOCDetectionTorch(
294
303
  ----------
295
304
  root : str or pathlib.Path
296
305
  Root directory of dataset where the ``vocdataset`` folder exists.
297
- download : bool, default False
298
- If True, downloads the dataset from the internet and puts it in root directory.
299
- Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
300
306
  image_set : "train", "val", "test", or "base", default "train"
301
307
  If "test", then dataset year must be "2007".
302
308
  If "base", then the combined dataset of "train" and "val" is returned.
@@ -304,6 +310,9 @@ class VOCDetectionTorch(
304
310
  The dataset year.
305
311
  transforms : Transform, Sequence[Transform] or None, default None
306
312
  Transform(s) to apply to the data.
313
+ download : bool, default False
314
+ If True, downloads the dataset from the internet and puts it in root directory.
315
+ Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
307
316
  verbose : bool, default False
308
317
  If True, outputs print statements.
309
318
 
@@ -311,6 +320,8 @@ class VOCDetectionTorch(
311
320
  ----------
312
321
  path : pathlib.Path
313
322
  Location of the folder containing the data.
323
+ year : "2007", "2008", "2009", "2010", "2011" or "2012"
324
+ The selected dataset year.
314
325
  image_set : "train", "val", "test" or "base"
315
326
  The selected image set from the dataset.
316
327
  index2label : dict[int, str]
@@ -323,6 +334,10 @@ class VOCDetectionTorch(
323
334
  The transforms to be applied to the data.
324
335
  size : int
325
336
  The size of the dataset.
337
+
338
+ Note
339
+ ----
340
+ Data License: `Flickr Terms of Use <http://www.flickr.com/terms.gne?legacy=1>`_
326
341
  """
327
342
 
328
343
 
@@ -338,9 +353,6 @@ class VOCSegmentation(
338
353
  ----------
339
354
  root : str or pathlib.Path
340
355
  Root directory of dataset where the ``vocdataset`` folder exists.
341
- download : bool, default False
342
- If True, downloads the dataset from the internet and puts it in root directory.
343
- Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
344
356
  image_set : "train", "val", "test", or "base", default "train"
345
357
  If "test", then dataset year must be "2007".
346
358
  If "base", then the combined dataset of "train" and "val" is returned.
@@ -348,6 +360,9 @@ class VOCSegmentation(
348
360
  The dataset year.
349
361
  transforms : Transform, Sequence[Transform] or None, default None
350
362
  Transform(s) to apply to the data.
363
+ download : bool, default False
364
+ If True, downloads the dataset from the internet and puts it in root directory.
365
+ Class checks to see if data is already downloaded to ensure it does not create a duplicate download.
351
366
  verbose : bool, default False
352
367
  If True, outputs print statements.
353
368
 
@@ -355,6 +370,8 @@ class VOCSegmentation(
355
370
  ----------
356
371
  path : pathlib.Path
357
372
  Location of the folder containing the data.
373
+ year : "2007", "2008", "2009", "2010", "2011" or "2012"
374
+ The selected dataset year.
358
375
  image_set : "train", "val", "test" or "base"
359
376
  The selected image set from the dataset.
360
377
  index2label : dict[int, str]
@@ -367,6 +384,10 @@ class VOCSegmentation(
367
384
  The transforms to be applied to the data.
368
385
  size : int
369
386
  The size of the dataset.
387
+
388
+ Note
389
+ ----
390
+ Data License: `Flickr Terms of Use <http://www.flickr.com/terms.gne?legacy=1>`_
370
391
  """
371
392
 
372
393
  def _load_data(self) -> tuple[list[str], list[str], dict[str, list[Any]]]: