dataeval 0.76.1__py3-none-any.whl → 0.82.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. dataeval/__init__.py +3 -3
  2. dataeval/config.py +77 -0
  3. dataeval/detectors/__init__.py +1 -1
  4. dataeval/detectors/drift/__init__.py +6 -6
  5. dataeval/detectors/drift/{base.py → _base.py} +40 -85
  6. dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
  7. dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
  8. dataeval/detectors/drift/{mmd.py → _mmd.py} +31 -43
  9. dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
  10. dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +24 -7
  11. dataeval/detectors/drift/updates.py +20 -3
  12. dataeval/detectors/linters/__init__.py +3 -5
  13. dataeval/detectors/linters/duplicates.py +13 -36
  14. dataeval/detectors/linters/outliers.py +23 -148
  15. dataeval/detectors/ood/__init__.py +1 -1
  16. dataeval/detectors/ood/ae.py +30 -9
  17. dataeval/detectors/ood/base.py +5 -4
  18. dataeval/detectors/ood/mixin.py +21 -7
  19. dataeval/detectors/ood/vae.py +73 -0
  20. dataeval/metadata/__init__.py +6 -0
  21. dataeval/metadata/_distance.py +167 -0
  22. dataeval/metadata/_ood.py +217 -0
  23. dataeval/metadata/_utils.py +44 -0
  24. dataeval/metrics/__init__.py +1 -1
  25. dataeval/metrics/bias/__init__.py +6 -4
  26. dataeval/metrics/bias/{balance.py → _balance.py} +15 -101
  27. dataeval/metrics/bias/_coverage.py +98 -0
  28. dataeval/metrics/bias/{diversity.py → _diversity.py} +18 -111
  29. dataeval/metrics/bias/{parity.py → _parity.py} +39 -77
  30. dataeval/metrics/estimators/__init__.py +15 -4
  31. dataeval/metrics/estimators/{ber.py → _ber.py} +42 -29
  32. dataeval/metrics/estimators/_clusterer.py +44 -0
  33. dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -30
  34. dataeval/metrics/estimators/{uap.py → _uap.py} +4 -18
  35. dataeval/metrics/stats/__init__.py +16 -13
  36. dataeval/metrics/stats/{base.py → _base.py} +82 -133
  37. dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +15 -18
  38. dataeval/metrics/stats/_dimensionstats.py +75 -0
  39. dataeval/metrics/stats/{hashstats.py → _hashstats.py} +21 -37
  40. dataeval/metrics/stats/_imagestats.py +94 -0
  41. dataeval/metrics/stats/_labelstats.py +131 -0
  42. dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +19 -50
  43. dataeval/metrics/stats/{visualstats.py → _visualstats.py} +23 -54
  44. dataeval/outputs/__init__.py +53 -0
  45. dataeval/{output.py → outputs/_base.py} +55 -25
  46. dataeval/outputs/_bias.py +381 -0
  47. dataeval/outputs/_drift.py +83 -0
  48. dataeval/outputs/_estimators.py +114 -0
  49. dataeval/outputs/_linters.py +184 -0
  50. dataeval/{detectors/ood/output.py → outputs/_ood.py} +22 -22
  51. dataeval/outputs/_stats.py +387 -0
  52. dataeval/outputs/_utils.py +44 -0
  53. dataeval/outputs/_workflows.py +364 -0
  54. dataeval/typing.py +234 -0
  55. dataeval/utils/__init__.py +2 -2
  56. dataeval/utils/_array.py +169 -0
  57. dataeval/utils/_bin.py +199 -0
  58. dataeval/utils/_clusterer.py +144 -0
  59. dataeval/utils/_fast_mst.py +189 -0
  60. dataeval/utils/{image.py → _image.py} +6 -4
  61. dataeval/utils/_method.py +14 -0
  62. dataeval/utils/{shared.py → _mst.py} +3 -65
  63. dataeval/utils/{plot.py → _plot.py} +6 -6
  64. dataeval/utils/data/__init__.py +26 -0
  65. dataeval/utils/data/_dataset.py +217 -0
  66. dataeval/utils/data/_embeddings.py +104 -0
  67. dataeval/utils/data/_images.py +68 -0
  68. dataeval/utils/data/_metadata.py +360 -0
  69. dataeval/utils/data/_selection.py +126 -0
  70. dataeval/utils/{dataset/split.py → data/_split.py} +12 -38
  71. dataeval/utils/data/_targets.py +85 -0
  72. dataeval/utils/data/collate.py +103 -0
  73. dataeval/utils/data/datasets/__init__.py +17 -0
  74. dataeval/utils/data/datasets/_base.py +254 -0
  75. dataeval/utils/data/datasets/_cifar10.py +134 -0
  76. dataeval/utils/data/datasets/_fileio.py +168 -0
  77. dataeval/utils/data/datasets/_milco.py +153 -0
  78. dataeval/utils/data/datasets/_mixin.py +56 -0
  79. dataeval/utils/data/datasets/_mnist.py +183 -0
  80. dataeval/utils/data/datasets/_ships.py +123 -0
  81. dataeval/utils/data/datasets/_types.py +52 -0
  82. dataeval/utils/data/datasets/_voc.py +352 -0
  83. dataeval/utils/data/selections/__init__.py +15 -0
  84. dataeval/utils/data/selections/_classfilter.py +57 -0
  85. dataeval/utils/data/selections/_indices.py +26 -0
  86. dataeval/utils/data/selections/_limit.py +26 -0
  87. dataeval/utils/data/selections/_reverse.py +18 -0
  88. dataeval/utils/data/selections/_shuffle.py +29 -0
  89. dataeval/utils/metadata.py +51 -376
  90. dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
  91. dataeval/utils/torch/{internal.py → _internal.py} +21 -51
  92. dataeval/utils/torch/models.py +43 -2
  93. dataeval/workflows/__init__.py +2 -1
  94. dataeval/workflows/sufficiency.py +11 -346
  95. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/METADATA +5 -2
  96. dataeval-0.82.0.dist-info/RECORD +104 -0
  97. dataeval/detectors/linters/clusterer.py +0 -512
  98. dataeval/detectors/linters/merged_stats.py +0 -49
  99. dataeval/detectors/ood/metadata_ks_compare.py +0 -129
  100. dataeval/detectors/ood/metadata_least_likely.py +0 -119
  101. dataeval/interop.py +0 -69
  102. dataeval/metrics/bias/coverage.py +0 -194
  103. dataeval/metrics/stats/datasetstats.py +0 -202
  104. dataeval/metrics/stats/dimensionstats.py +0 -115
  105. dataeval/metrics/stats/labelstats.py +0 -210
  106. dataeval/utils/dataset/__init__.py +0 -7
  107. dataeval/utils/dataset/datasets.py +0 -412
  108. dataeval/utils/dataset/read.py +0 -63
  109. dataeval-0.76.1.dist-info/RECORD +0 -67
  110. /dataeval/{log.py → _log.py} +0 -0
  111. /dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
  112. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/LICENSE.txt +0 -0
  113. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/WHEEL +0 -0
@@ -1,202 +0,0 @@
1
- from __future__ import annotations
2
-
3
- __all__ = []
4
-
5
- from dataclasses import dataclass
6
- from typing import Any, Iterable
7
-
8
- from numpy.typing import ArrayLike
9
-
10
- from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, _is_plottable, run_stats
11
- from dataeval.metrics.stats.dimensionstats import (
12
- DimensionStatsOutput,
13
- DimensionStatsProcessor,
14
- )
15
- from dataeval.metrics.stats.labelstats import LabelStatsOutput, labelstats
16
- from dataeval.metrics.stats.pixelstats import PixelStatsOutput, PixelStatsProcessor
17
- from dataeval.metrics.stats.visualstats import VisualStatsOutput, VisualStatsProcessor
18
- from dataeval.output import Output, set_metadata
19
- from dataeval.utils.plot import channel_histogram_plot
20
-
21
-
22
@dataclass(frozen=True)
class DatasetStatsOutput(Output, HistogramPlotMixin):
    """
    Output class for :func:`datasetstats` stats metric.

    Bundles the outputs of several stats functions that were run against one
    dataset. Index ``i`` of every contained stats output is expected to refer to
    the same source image; modifying or mixing outputs will result in inaccurate
    outlier calculations if not created correctly.

    Attributes
    ----------
    dimensionstats : DimensionStatsOutput
    pixelstats : PixelStatsOutput
    visualstats : VisualStatsOutput
    labelstats : LabelStatsOutput or None
    """

    dimensionstats: DimensionStatsOutput
    pixelstats: PixelStatsOutput
    visualstats: VisualStatsOutput
    labelstats: LabelStatsOutput | None = None

    # Unannotated on purpose, so it is a plain class attribute and not a dataclass field.
    # NOTE(review): presumably read by HistogramPlotMixin to skip these keys - confirm.
    _excluded_keys = ["histogram", "percentiles"]

    def _outputs(self) -> list[Output]:
        """Return the contained outputs in declaration order, skipping any that are None."""
        candidates = (self.dimensionstats, self.pixelstats, self.visualstats, self.labelstats)
        return [out for out in candidates if out is not None]

    def dict(self) -> dict[str, Any]:
        """Merge the ``dict()`` of every contained output; later outputs win on duplicate keys."""
        merged = {}
        for out in self._outputs():
            merged.update(out.dict())
        return merged

    def __post_init__(self) -> None:
        """Verify that every ``BaseStatsOutput`` member reports the same length."""
        sizes = [len(out) for out in self._outputs() if isinstance(out, BaseStatsOutput)]
        if any(size != sizes[0] for size in sizes):
            raise ValueError("All StatsOutput classes must contain the same number of image sources.")
57
-
58
-
59
- def _get_channels(cls, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None):
60
- raw_channels = max([si.channel for si in cls.dict()["source_index"]]) + 1
61
- if isinstance(channel_index, int):
62
- max_channels = 1 if channel_index < raw_channels else raw_channels
63
- ch_mask = cls.pixelstats.get_channel_mask(channel_index)
64
- elif isinstance(channel_index, Iterable) and all(isinstance(val, int) for val in list(channel_index)):
65
- max_channels = len(list(channel_index))
66
- ch_mask = cls.pixelstats.get_channel_mask(channel_index)
67
- elif isinstance(channel_limit, int):
68
- max_channels = channel_limit
69
- ch_mask = cls.pixelstats.get_channel_mask(None, channel_limit)
70
- else:
71
- max_channels = raw_channels
72
- ch_mask = None
73
-
74
- if max_channels > raw_channels:
75
- max_channels = raw_channels
76
- if ch_mask is not None and not any(ch_mask):
77
- ch_mask = None
78
-
79
- return max_channels, ch_mask
80
-
81
-
82
@dataclass(frozen=True)
class ChannelStatsOutput(Output):
    """
    Output class for :func:`channelstats` stats metric.

    Bundles the outputs of the per-channel stats functions that were run against
    one dataset. Index ``i`` of both contained outputs is expected to refer to the
    same source image; modifying or mixing outputs will result in inaccurate
    outlier calculations if not created correctly.

    Attributes
    ----------
    pixelstats : PixelStatsOutput
    visualstats : VisualStatsOutput
    """

    pixelstats: PixelStatsOutput
    visualstats: VisualStatsOutput

    def _outputs(self) -> tuple[PixelStatsOutput, VisualStatsOutput]:
        """Return both contained outputs as a ``(pixelstats, visualstats)`` pair."""
        return self.pixelstats, self.visualstats

    def dict(self) -> dict[str, Any]:
        """Merge both outputs into one mapping; visual stats win on duplicate keys."""
        merged = {}
        merged.update(self.pixelstats.dict())
        merged.update(self.visualstats.dict())
        return merged

    def __post_init__(self) -> None:
        """Verify that both outputs report the same length."""
        pixel_len, visual_len = (len(out) for out in self._outputs())
        if pixel_len != visual_len:
            raise ValueError("All StatsOutput classes must contain the same number of image sources.")

    def plot(
        self, log: bool, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None
    ) -> None:
        """
        Plot per-channel histograms of the plottable statistics.

        Parameters
        ----------
        log : bool
            Forwarded unchanged to ``channel_histogram_plot``.
        channel_limit : int or None, default None
            Forwarded to ``_get_channels`` to cap the number of channels.
        channel_index : int, Iterable[int] or None, default None
            Forwarded to ``_get_channels`` to select specific channels.
        """
        max_channels, ch_mask = _get_channels(self, channel_limit, channel_index)
        excluded = ("histogram", "percentiles")
        plottable = {}
        for key, value in self.dict().items():
            if _is_plottable(key, value, excluded):
                plottable[key] = value
        channel_histogram_plot(plottable, log, max_channels, ch_mask)
118
-
119
-
120
@set_metadata
def datasetstats(
    images: Iterable[ArrayLike],
    bboxes: Iterable[ArrayLike] | None = None,
    labels: Iterable[ArrayLike] | None = None,
) -> DatasetStatsOutput:
    """
    Calculates various :term:`statistics<Statistics>` for each image.

    This function computes dimension, pixel and visual metrics
    on the images or individual bounding boxes for each image as
    well as label statistics if provided.

    Parameters
    ----------
    images : Iterable[ArrayLike]
        Images to perform calculations on
    bboxes : Iterable[ArrayLike] or None
        Bounding boxes in `xyxy` format for each image to perform calculations on
    labels : Iterable[ArrayLike] or None
        Labels of images or boxes to perform calculations on. ``None`` and empty
        sized containers both skip the label statistics.

    Returns
    -------
    DatasetStatsOutput
        Output class containing the outputs of various stats functions

    See Also
    --------
    dimensionstats, labelstats, pixelstats, visualstats, Outliers

    Examples
    --------
    Calculating the dimension, pixel and visual stats for a dataset with bounding boxes

    >>> stats = datasetstats(stats_images, bboxes)
    >>> print(stats.dimensionstats.aspect_ratio)
    [ 0.864 0.5884 16. 1.143 1.692 0.5835 0.6665 2.555 1.3 ]
    >>> print(stats.visualstats.sharpness)
    [4.04 4.434 0.2778 4.957 5.145 5.22 4.957 3.076 2.855 ]
    """
    outputs = run_stats(images, bboxes, False, [DimensionStatsProcessor, PixelStatsProcessor, VisualStatsProcessor])

    # A plain truthiness test (`if labels`) raises ValueError for NumPy arrays with
    # more than one element, so emptiness is probed through len() instead.
    if labels is None:
        has_labels = False
    else:
        try:
            has_labels = len(labels) > 0  # type: ignore
        except TypeError:
            # Unsized inputs cannot be probed without consuming them; labelstats validates them.
            has_labels = True

    return DatasetStatsOutput(*outputs, labelstats=labelstats(labels) if has_labels else None)  # type: ignore
163
-
164
-
165
@set_metadata
def channelstats(
    images: Iterable[ArrayLike],
    bboxes: Iterable[ArrayLike] | None = None,
) -> ChannelStatsOutput:
    """
    Calculates various per-channel :term:`statistics` for each image.

    Pixel and visual metrics are computed on the images, or on the individual
    bounding boxes of each image when boxes are supplied.

    Parameters
    ----------
    images : Iterable[ArrayLike]
        Images to perform calculations on
    bboxes : Iterable[ArrayLike] or None
        Bounding boxes in `xyxy` format for each image to perform calculations on

    Returns
    -------
    ChannelStatsOutput
        Output class containing the per-channel outputs of various stats functions

    See Also
    --------
    pixelstats, visualstats

    Examples
    --------
    Calculating the per-channel pixel and visual stats for a dataset

    >>> stats = channelstats(stats_images)
    >>> print(stats.visualstats.darkness)
    [0.1499 0.3499 0.55 0.2094 0.2219 0.2344 0.4194 0.6094 0.622 0.6343
    0.8154]
    """
    # NOTE(review): the third positional flag presumably enables per-channel
    # computation in run_stats (datasetstats passes False) - confirm against run_stats.
    per_channel = True
    processors = [PixelStatsProcessor, VisualStatsProcessor]
    return ChannelStatsOutput(*run_stats(images, bboxes, per_channel, processors))  # type: ignore
@@ -1,115 +0,0 @@
1
- from __future__ import annotations
2
-
3
- __all__ = []
4
-
5
- from dataclasses import dataclass
6
- from typing import Any, Callable, Iterable
7
-
8
- import numpy as np
9
- from numpy.typing import ArrayLike, NDArray
10
-
11
- from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
12
- from dataeval.output import set_metadata
13
- from dataeval.utils.image import get_bitdepth
14
-
15
-
16
@dataclass(frozen=True)
class DimensionStatsOutput(BaseStatsOutput, HistogramPlotMixin):
    """
    Output class for :func:`dimensionstats` stats metric.

    Attributes
    ----------
    left : NDArray[np.int32]
        Offsets from the left edge of images in pixels
    top : NDArray[np.int32]
        Offsets from the top edge of images in pixels
    width : NDArray[np.uint32]
        Width of the images in pixels
    height : NDArray[np.uint32]
        Height of the images in pixels
    channels : NDArray[np.uint8]
        Channel count of the images
    size : NDArray[np.uint32]
        Size of the images in pixels
    aspect_ratio : NDArray[np.float16]
        :term:`Aspect Ratio<Aspect Ratio>` of the images (width/height)
    depth : NDArray[np.uint8]
        Color depth of the images in bits
    center : NDArray[np.int16]
        Offset from center in [x,y] coordinates of the images in pixels
    distance : NDArray[np.float16]
        Distance in pixels from center
    """

    left: NDArray[np.int32]
    top: NDArray[np.int32]
    width: NDArray[np.uint32]
    height: NDArray[np.uint32]
    channels: NDArray[np.uint8]
    size: NDArray[np.uint32]
    aspect_ratio: NDArray[np.float16]
    depth: NDArray[np.uint8]
    center: NDArray[np.int16]
    distance: NDArray[np.float16]
55
-
56
-
57
def _box_width(x: Any) -> Any:
    """Horizontal extent of the processor's box (``box[2] - box[0]``)."""
    return x.box[2] - x.box[0]


def _box_height(x: Any) -> Any:
    """Vertical extent of the processor's box (``box[3] - box[1]``)."""
    return x.box[3] - x.box[1]


def _box_center_x(x: Any) -> Any:
    """Midpoint of the box along the first coordinate pair (``box[0]``, ``box[2]``)."""
    return (x.box[0] + x.box[2]) / 2


def _box_center_y(x: Any) -> Any:
    """Midpoint of the box along the second coordinate pair (``box[1]``, ``box[3]``)."""
    return (x.box[1] + x.box[3]) / 2


class DimensionStatsProcessor(StatsProcessor[DimensionStatsOutput]):
    """
    Stats processor that derives the dimension statistics from ``box``, ``shape`` and ``image``.

    Each entry of ``image_function_map`` maps a ``DimensionStatsOutput`` field name
    to a callable that receives the processor and returns the value for that field.
    """

    output_class: type = DimensionStatsOutput
    image_function_map: dict[str, Callable[[StatsProcessor[DimensionStatsOutput]], Any]] = {
        "left": lambda x: x.box[0],
        "top": lambda x: x.box[1],
        "width": _box_width,
        "height": _box_height,
        "channels": lambda x: x.shape[-3],
        "size": lambda x: _box_width(x) * _box_height(x),
        "aspect_ratio": lambda x: _box_width(x) / _box_height(x),
        "depth": lambda x: get_bitdepth(x.image).depth,
        "center": lambda x: np.asarray([_box_center_x(x), _box_center_y(x)]),
        # Euclidean distance between the box midpoint and the midpoint of the last two shape axes.
        "distance": lambda x: np.sqrt(
            np.square(_box_center_x(x) - (x.shape[-1] / 2)) + np.square(_box_center_y(x) - (x.shape[-2] / 2))
        ),
    }
74
-
75
-
76
@set_metadata
def dimensionstats(
    images: Iterable[ArrayLike],
    bboxes: Iterable[ArrayLike] | None = None,
) -> DimensionStatsOutput:
    """
    Calculates dimension :term:`statistics<Statistics>` for each image.

    Dimensional metrics (e.g., width, height, channels) are computed on the
    images, or on the individual bounding boxes of each image when boxes are supplied.

    Parameters
    ----------
    images : Iterable[ArrayLike]
        Images to perform calculations on
    bboxes : Iterable[ArrayLike] or None
        Bounding boxes in `xyxy` format for each image to perform calculations on

    Returns
    -------
    DimensionStatsOutput
        A dictionary-like object containing the computed dimension statistics for each image or bounding
        box. The keys correspond to the names of the statistics (e.g., 'width', 'height'), and the values
        are lists of results for each image or :term:`NumPy` arrays when the results are multi-dimensional.

    See Also
    --------
    pixelstats, visualstats, Outliers

    Examples
    --------
    Calculating the dimension statistics on the images, whose shape is (C, H, W)

    >>> results = dimensionstats(stats_images)
    >>> print(results.aspect_ratio)
    [1. 1. 1.333 1. 0.6665]
    >>> print(results.channels)
    [3 3 1 3 1]
    """
    # Only one processor is requested, so the result of interest is the first output.
    outputs = run_stats(images, bboxes, False, [DimensionStatsProcessor])
    return outputs[0]
@@ -1,210 +0,0 @@
1
- from __future__ import annotations
2
-
3
- __all__ = []
4
-
5
- import contextlib
6
- from collections import Counter, defaultdict
7
- from dataclasses import dataclass
8
- from typing import Any, Iterable, Mapping, TypeVar
9
-
10
- import numpy as np
11
- from numpy.typing import ArrayLike
12
-
13
- from dataeval.interop import as_numpy
14
- from dataeval.output import Output, set_metadata
15
-
16
- with contextlib.suppress(ImportError):
17
- import pandas as pd
18
-
19
-
20
@dataclass(frozen=True)
class LabelStatsOutput(Output):
    """
    Output class for :func:`labelstats` stats metric.

    Attributes
    ----------
    label_counts_per_class : dict[str | int, int]
        Dictionary whose keys are the different label classes and
        values are total counts of each class
    label_counts_per_image : list[int]
        Number of labels per image
    image_counts_per_label : dict[str | int, int]
        Dictionary whose keys are the different label classes and
        values are total counts of each image the class is present in
    image_indices_per_label : dict[str | int, list[int]]
        Dictionary whose keys are the different label classes and
        values are lists containing the images that have that label
    image_count : int
        Total number of images present
    class_count : int
        Total number of classes present
    label_count : int
        Total number of labels present
    """

    label_counts_per_class: dict[str | int, int]
    label_counts_per_image: list[int]
    image_counts_per_label: dict[str | int, int]
    image_indices_per_label: dict[str | int, list[int]]
    image_count: int
    class_count: int
    label_count: int

    def to_table(self) -> str:
        """
        Render the label statistics as a plain-text table.

        Returns
        -------
        str
            Summary lines (class count, label count, average labels per image)
            followed by one ``label: total count - image count`` row per class.
            Every line, including the last, ends with a newline.
        """
        per_class = self.label_counts_per_class
        per_image = self.image_counts_per_label

        # Label column: as wide as the longest printed label, never narrower than
        # the "Label" header. len(str(key)) gives the printed width for both str
        # and int keys; the previous `key // 10 + 1` overstated it for ints >= 20.
        label_width = max((len(str(key)) for key in per_class), default=0)
        label_width = max(label_width, 5)

        # Count columns: as wide as the largest printed count, never narrower than
        # the "Total Count"/"Image Count" headers (11 characters). Counting digits
        # via str() is exact at powers of ten and safe for a count of zero.
        largest_count = max([*per_class.values(), *per_image.values()], default=0)
        count_width = max(len(str(largest_count)), 11)

        lines = [
            # Display basic counts
            f"Class Count: {self.class_count}",
            f"Label Count: {self.label_count}",
            f"Average # Labels per Image: {round(np.mean(self.label_counts_per_image), 2)}",
            "--------------------------------------",
            # Display counts per class
            f"{'Label':>{label_width}}: Total Count - Image Count",
        ]
        lines.extend(
            f"{cls:>{label_width}}: {per_class[cls]:^{count_width}} - {per_image[cls]:^{count_width}}"
            for cls in per_class
        )
        return "\n".join(lines) + "\n"

    def to_dataframe(self) -> pd.DataFrame:
        """
        Return the per-class counts as a pandas DataFrame.

        Returns
        -------
        pd.DataFrame
            One row per class with the columns ``Label``, ``Total Count`` and
            ``Image Count``, in the iteration order of ``label_counts_per_class``.
        """
        # Imported here because the module-level pandas import is optional.
        import pandas as pd

        classes = list(self.label_counts_per_class)
        return pd.DataFrame(
            {
                "Label": classes,
                "Total Count": [self.label_counts_per_class[cls] for cls in classes],
                "Image Count": [self.image_counts_per_label[cls] for cls in classes],
            }
        )
94
-
95
-
96
- TKey = TypeVar("TKey", int, str)
97
-
98
-
99
def sort(d: Mapping[TKey, Any]) -> dict[TKey, Any]:
    """
    Return a new dict holding the items of ``d`` ordered by ascending key.
    """
    return {key: d[key] for key in sorted(d)}
104
-
105
-
106
- def _ensure_2d(labels: Iterable[ArrayLike]) -> Iterable[ArrayLike]:
107
- if isinstance(labels, np.ndarray):
108
- return labels[:, None]
109
- else:
110
- return [[lbl] for lbl in labels] # type: ignore
111
-
112
-
113
- def _get_list_depth(lst):
114
- if isinstance(lst, list) and lst:
115
- return 1 + max(_get_list_depth(item) for item in lst)
116
- return 0
117
-
118
-
119
- def _check_labels_dimension(labels: Iterable[ArrayLike]) -> Iterable[ArrayLike]:
120
- # Check for nested lists beyond 2 levels
121
-
122
- if isinstance(labels, np.ndarray):
123
- if labels.ndim == 1:
124
- return _ensure_2d(labels)
125
- elif labels.ndim == 2:
126
- return labels
127
- else:
128
- raise ValueError("The label array must not have more than 2 dimensions.")
129
- elif isinstance(labels, list):
130
- depth = _get_list_depth(labels)
131
- if depth == 1:
132
- return _ensure_2d(labels)
133
- elif depth == 2:
134
- return labels
135
- else:
136
- raise ValueError("The label list must not be empty or have more than 2 levels of nesting.")
137
- else:
138
- raise TypeError("Labels must be either a NumPy array or a list.")
139
-
140
-
141
@set_metadata
def labelstats(
    labels: Iterable[ArrayLike],
) -> LabelStatsOutput:
    """
    Calculates :term:`statistics<Statistics>` for data labels.

    This function computes counting metrics (e.g., total per class, total per image)
    on the labels.

    Parameters
    ----------
    labels : ArrayLike, shape - [label] | [[label]] or (N,M) | (N,)
        Lists or :term:`NumPy` array of labels.
        A set of lists where each list contains all labels per image -
        (e.g. [[label1, label2], [label2], [label1, label3]] or [label1, label2, label1, label3]).
        If a numpy array, N is the number of images, M is the number of labels per image.

    Returns
    -------
    LabelStatsOutput
        A dictionary-like object containing the computed counting metrics for the labels.

    Raises
    ------
    ValueError
        If the labels are empty or nested more than 2 levels deep.
    TypeError
        If the labels are neither a NumPy array nor a list.

    Examples
    --------
    Calculating the :term:`statistics<Statistics>` on labels for a set of data

    >>> stats = labelstats(labels)
    >>> stats.label_counts_per_class
    {'chicken': 12, 'cow': 5, 'horse': 4, 'pig': 7, 'sheep': 4}
    >>> stats.label_counts_per_image
    [3, 3, 5, 3, 2, 5, 5, 2, 2, 2]
    >>> stats.image_counts_per_label
    {'chicken': 8, 'cow': 4, 'horse': 4, 'pig': 7, 'sheep': 4}
    >>> (stats.image_count, stats.class_count, stats.label_count)
    (10, 5, 32)
    """
    label_counts = Counter()
    image_counts = Counter()
    index_location = defaultdict(list)
    label_per_image: list[int] = []

    labels_2d = _check_labels_dimension(labels)

    for i, group in enumerate(labels_2d):
        # tolist() turns NumPy scalars into builtin Python values, so the result
        # keys match the declared `str | int` type instead of np.str_/np.int64.
        # Builtin and NumPy scalars hash and compare equal, so lookups are unaffected.
        group_labels = np.asarray(as_numpy(group)).tolist()

        # Count occurrences of each label in all sublists
        label_counts.update(group_labels)

        # Get the number of labels per image
        label_per_image.append(len(group_labels))

        # Create a set of unique items in the current sublist
        unique_items = set(group_labels)

        # Update image counts and index locations
        image_counts.update(unique_items)
        for item in unique_items:
            index_location[item].append(i)

    return LabelStatsOutput(
        label_counts_per_class=sort(label_counts),
        label_counts_per_image=label_per_image,
        image_counts_per_label=sort(image_counts),
        image_indices_per_label=sort(index_location),
        image_count=len(label_per_image),
        class_count=len(label_counts),
        label_count=sum(label_counts.values()),
    )
@@ -1,7 +0,0 @@
1
- """Provides utility functions for interacting with Computer Vision datasets."""
2
-
3
- __all__ = ["datasets", "read_dataset", "SplitDatasetOutput", "split_dataset"]
4
-
5
- from dataeval.utils.dataset import datasets
6
- from dataeval.utils.dataset.read import read_dataset
7
- from dataeval.utils.dataset.split import SplitDatasetOutput, split_dataset