dataeval 0.76.1__py3-none-any.whl → 0.82.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +3 -3
- dataeval/config.py +77 -0
- dataeval/detectors/__init__.py +1 -1
- dataeval/detectors/drift/__init__.py +6 -6
- dataeval/detectors/drift/{base.py → _base.py} +40 -85
- dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
- dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
- dataeval/detectors/drift/{mmd.py → _mmd.py} +31 -43
- dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
- dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +24 -7
- dataeval/detectors/drift/updates.py +20 -3
- dataeval/detectors/linters/__init__.py +3 -5
- dataeval/detectors/linters/duplicates.py +13 -36
- dataeval/detectors/linters/outliers.py +23 -148
- dataeval/detectors/ood/__init__.py +1 -1
- dataeval/detectors/ood/ae.py +30 -9
- dataeval/detectors/ood/base.py +5 -4
- dataeval/detectors/ood/mixin.py +21 -7
- dataeval/detectors/ood/vae.py +73 -0
- dataeval/metadata/__init__.py +6 -0
- dataeval/metadata/_distance.py +167 -0
- dataeval/metadata/_ood.py +217 -0
- dataeval/metadata/_utils.py +44 -0
- dataeval/metrics/__init__.py +1 -1
- dataeval/metrics/bias/__init__.py +6 -4
- dataeval/metrics/bias/{balance.py → _balance.py} +15 -101
- dataeval/metrics/bias/_coverage.py +98 -0
- dataeval/metrics/bias/{diversity.py → _diversity.py} +18 -111
- dataeval/metrics/bias/{parity.py → _parity.py} +39 -77
- dataeval/metrics/estimators/__init__.py +15 -4
- dataeval/metrics/estimators/{ber.py → _ber.py} +42 -29
- dataeval/metrics/estimators/_clusterer.py +44 -0
- dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -30
- dataeval/metrics/estimators/{uap.py → _uap.py} +4 -18
- dataeval/metrics/stats/__init__.py +16 -13
- dataeval/metrics/stats/{base.py → _base.py} +82 -133
- dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +15 -18
- dataeval/metrics/stats/_dimensionstats.py +75 -0
- dataeval/metrics/stats/{hashstats.py → _hashstats.py} +21 -37
- dataeval/metrics/stats/_imagestats.py +94 -0
- dataeval/metrics/stats/_labelstats.py +131 -0
- dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +19 -50
- dataeval/metrics/stats/{visualstats.py → _visualstats.py} +23 -54
- dataeval/outputs/__init__.py +53 -0
- dataeval/{output.py → outputs/_base.py} +55 -25
- dataeval/outputs/_bias.py +381 -0
- dataeval/outputs/_drift.py +83 -0
- dataeval/outputs/_estimators.py +114 -0
- dataeval/outputs/_linters.py +184 -0
- dataeval/{detectors/ood/output.py → outputs/_ood.py} +22 -22
- dataeval/outputs/_stats.py +387 -0
- dataeval/outputs/_utils.py +44 -0
- dataeval/outputs/_workflows.py +364 -0
- dataeval/typing.py +234 -0
- dataeval/utils/__init__.py +2 -2
- dataeval/utils/_array.py +169 -0
- dataeval/utils/_bin.py +199 -0
- dataeval/utils/_clusterer.py +144 -0
- dataeval/utils/_fast_mst.py +189 -0
- dataeval/utils/{image.py → _image.py} +6 -4
- dataeval/utils/_method.py +14 -0
- dataeval/utils/{shared.py → _mst.py} +3 -65
- dataeval/utils/{plot.py → _plot.py} +6 -6
- dataeval/utils/data/__init__.py +26 -0
- dataeval/utils/data/_dataset.py +217 -0
- dataeval/utils/data/_embeddings.py +104 -0
- dataeval/utils/data/_images.py +68 -0
- dataeval/utils/data/_metadata.py +360 -0
- dataeval/utils/data/_selection.py +126 -0
- dataeval/utils/{dataset/split.py → data/_split.py} +12 -38
- dataeval/utils/data/_targets.py +85 -0
- dataeval/utils/data/collate.py +103 -0
- dataeval/utils/data/datasets/__init__.py +17 -0
- dataeval/utils/data/datasets/_base.py +254 -0
- dataeval/utils/data/datasets/_cifar10.py +134 -0
- dataeval/utils/data/datasets/_fileio.py +168 -0
- dataeval/utils/data/datasets/_milco.py +153 -0
- dataeval/utils/data/datasets/_mixin.py +56 -0
- dataeval/utils/data/datasets/_mnist.py +183 -0
- dataeval/utils/data/datasets/_ships.py +123 -0
- dataeval/utils/data/datasets/_types.py +52 -0
- dataeval/utils/data/datasets/_voc.py +352 -0
- dataeval/utils/data/selections/__init__.py +15 -0
- dataeval/utils/data/selections/_classfilter.py +57 -0
- dataeval/utils/data/selections/_indices.py +26 -0
- dataeval/utils/data/selections/_limit.py +26 -0
- dataeval/utils/data/selections/_reverse.py +18 -0
- dataeval/utils/data/selections/_shuffle.py +29 -0
- dataeval/utils/metadata.py +51 -376
- dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
- dataeval/utils/torch/{internal.py → _internal.py} +21 -51
- dataeval/utils/torch/models.py +43 -2
- dataeval/workflows/__init__.py +2 -1
- dataeval/workflows/sufficiency.py +11 -346
- {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/METADATA +5 -2
- dataeval-0.82.0.dist-info/RECORD +104 -0
- dataeval/detectors/linters/clusterer.py +0 -512
- dataeval/detectors/linters/merged_stats.py +0 -49
- dataeval/detectors/ood/metadata_ks_compare.py +0 -129
- dataeval/detectors/ood/metadata_least_likely.py +0 -119
- dataeval/interop.py +0 -69
- dataeval/metrics/bias/coverage.py +0 -194
- dataeval/metrics/stats/datasetstats.py +0 -202
- dataeval/metrics/stats/dimensionstats.py +0 -115
- dataeval/metrics/stats/labelstats.py +0 -210
- dataeval/utils/dataset/__init__.py +0 -7
- dataeval/utils/dataset/datasets.py +0 -412
- dataeval/utils/dataset/read.py +0 -63
- dataeval-0.76.1.dist-info/RECORD +0 -67
- /dataeval/{log.py → _log.py} +0 -0
- /dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
- {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/WHEEL +0 -0
@@ -1,202 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
__all__ = []
|
4
|
-
|
5
|
-
from dataclasses import dataclass
|
6
|
-
from typing import Any, Iterable
|
7
|
-
|
8
|
-
from numpy.typing import ArrayLike
|
9
|
-
|
10
|
-
from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, _is_plottable, run_stats
|
11
|
-
from dataeval.metrics.stats.dimensionstats import (
|
12
|
-
DimensionStatsOutput,
|
13
|
-
DimensionStatsProcessor,
|
14
|
-
)
|
15
|
-
from dataeval.metrics.stats.labelstats import LabelStatsOutput, labelstats
|
16
|
-
from dataeval.metrics.stats.pixelstats import PixelStatsOutput, PixelStatsProcessor
|
17
|
-
from dataeval.metrics.stats.visualstats import VisualStatsOutput, VisualStatsProcessor
|
18
|
-
from dataeval.output import Output, set_metadata
|
19
|
-
from dataeval.utils.plot import channel_histogram_plot
|
20
|
-
|
21
|
-
|
22
|
-
@dataclass(frozen=True)
class DatasetStatsOutput(Output, HistogramPlotMixin):
    """
    Combined result object produced by :func:`datasetstats`.

    Holds the outputs of several stats functions run over one dataset. Every
    index in every contained stats output refers to the same source image, so
    instances assembled by hand from unrelated outputs will lead to inaccurate
    outlier calculations.

    Attributes
    ----------
    dimensionstats : DimensionStatsOutput
    pixelstats: PixelStatsOutput
    visualstats: VisualStatsOutput
    labelstats: LabelStatsOutput or None
    """

    dimensionstats: DimensionStatsOutput
    pixelstats: PixelStatsOutput
    visualstats: VisualStatsOutput
    labelstats: LabelStatsOutput | None = None

    # Not annotated, so the dataclass machinery does not treat it as a field.
    _excluded_keys = ["histogram", "percentiles"]

    def _outputs(self) -> list[Output]:
        """Return the member outputs that are present, in declaration order."""
        members = (self.dimensionstats, self.pixelstats, self.visualstats, self.labelstats)
        return [member for member in members if member is not None]

    def dict(self) -> dict[str, Any]:
        """Flatten every member's ``dict()`` into one mapping; later members win on key clashes."""
        merged = {}
        for member in self._outputs():
            merged.update(member.dict())
        return merged

    def __post_init__(self) -> None:
        # Only BaseStatsOutput members are compared; other members
        # (e.g. label stats) are skipped by the isinstance filter.
        distinct_sizes = {len(member) for member in self._outputs() if isinstance(member, BaseStatsOutput)}
        if len(distinct_sizes) > 1:
            raise ValueError("All StatsOutput classes must contain the same number of image sources.")
|
57
|
-
|
58
|
-
|
59
|
-
def _get_channels(cls, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None):
    """
    Work out how many channels to plot and which channel mask to use.

    Parameters
    ----------
    cls : stats output object
        Must provide ``dict()`` returning a mapping with a ``"source_index"``
        entry (items expose a ``channel`` attribute) and a ``pixelstats``
        attribute with a ``get_channel_mask`` method.
    channel_limit : int or None
        Upper bound on the number of channels; only consulted when
        ``channel_index`` does not select channels.
    channel_index : int, Iterable[int] or None
        A single channel or a collection of channels to select. One-shot
        iterables (e.g. generators) are supported.

    Returns
    -------
    tuple
        ``(max_channels, ch_mask)`` where ``max_channels`` is capped at the
        number of channels found in the data and ``ch_mask`` is ``None`` when
        no mask applies or the computed mask selects nothing.
    """
    raw_channels = max(si.channel for si in cls.dict()["source_index"]) + 1

    # Materialize a non-int iterable exactly once. The original validated with
    # list(channel_index) and then iterated again, which silently produced an
    # empty selection for generators and other single-pass iterables.
    indices = None
    if not isinstance(channel_index, int) and isinstance(channel_index, Iterable):
        indices = list(channel_index)

    if isinstance(channel_index, int):
        max_channels = 1 if channel_index < raw_channels else raw_channels
        ch_mask = cls.pixelstats.get_channel_mask(channel_index)
    elif indices is not None and all(isinstance(val, int) for val in indices):
        max_channels = len(indices)
        ch_mask = cls.pixelstats.get_channel_mask(indices)
    elif isinstance(channel_limit, int):
        max_channels = channel_limit
        ch_mask = cls.pixelstats.get_channel_mask(None, channel_limit)
    else:
        max_channels = raw_channels
        ch_mask = None

    # Never report more channels than the data actually contains.
    max_channels = min(max_channels, raw_channels)
    # A mask that selects nothing is treated the same as having no mask.
    if ch_mask is not None and not any(ch_mask):
        ch_mask = None

    return max_channels, ch_mask
|
80
|
-
|
81
|
-
|
82
|
-
@dataclass(frozen=True)
class ChannelStatsOutput(Output):
    """
    Combined result object produced by :func:`channelstats`.

    Holds the per-channel outputs of several stats functions run over one
    dataset. Every index in every contained stats output refers to the same
    source image, so instances assembled by hand from unrelated outputs will
    lead to inaccurate outlier calculations.

    Attributes
    ----------
    pixelstats: PixelStatsOutput
    visualstats: VisualStatsOutput
    """

    pixelstats: PixelStatsOutput
    visualstats: VisualStatsOutput

    def _outputs(self) -> tuple[PixelStatsOutput, VisualStatsOutput]:
        """Return both member outputs as a ``(pixelstats, visualstats)`` pair."""
        return (self.pixelstats, self.visualstats)

    def dict(self) -> dict[str, Any]:
        """Flatten both members into one mapping; visual stats win on key clashes."""
        merged = {}
        merged.update(self.pixelstats.dict())
        merged.update(self.visualstats.dict())
        return merged

    def __post_init__(self) -> None:
        pixel_count, visual_count = (len(member) for member in self._outputs())
        if pixel_count != visual_count:
            raise ValueError("All StatsOutput classes must contain the same number of image sources.")

    def plot(
        self, log: bool, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None
    ) -> None:
        """
        Plot per-channel histograms of the plottable statistics.

        Parameters
        ----------
        log : bool
            Forwarded unchanged to ``channel_histogram_plot``.
        channel_limit : int or None
            Forwarded to ``_get_channels`` to cap the number of channels.
        channel_index : int, Iterable[int] or None
            Forwarded to ``_get_channels`` to select specific channels.
        """
        max_channels, ch_mask = _get_channels(self, channel_limit, channel_index)
        skipped = ("histogram", "percentiles")
        plottable = {}
        for name, values in self.dict().items():
            if _is_plottable(name, values, skipped):
                plottable[name] = values
        channel_histogram_plot(plottable, log, max_channels, ch_mask)
|
118
|
-
|
119
|
-
|
120
|
-
@set_metadata
def datasetstats(
    images: Iterable[ArrayLike],
    bboxes: Iterable[ArrayLike] | None = None,
    labels: Iterable[ArrayLike] | None = None,
) -> DatasetStatsOutput:
    """
    Calculates various :term:`statistics<Statistics>` for each image.

    This function computes dimension, pixel and visual metrics
    on the images or individual bounding boxes for each image as
    well as label statistics if provided.

    Parameters
    ----------
    images : Iterable[ArrayLike]
        Images to perform calculations on
    bboxes : Iterable[ArrayLike] or None
        Bounding boxes in `xyxy` format for each image to perform calculations on
    labels : Iterable[ArrayLike] or None
        Labels of images or boxes to perform calculations on. ``None`` and
        empty collections are treated as "no labels"; NumPy arrays are accepted.

    Returns
    -------
    DatasetStatsOutput
        Output class containing the outputs of various stats functions

    See Also
    --------
    dimensionstats, labelstats, pixelstats, visualstats, Outliers

    Examples
    --------
    Calculating the dimension, pixel and visual stats for a dataset with bounding boxes

    >>> stats = datasetstats(stats_images, bboxes)
    >>> print(stats.dimensionstats.aspect_ratio)
    [ 0.864 0.5884 16. 1.143 1.692 0.5835 0.6665 2.555 1.3 ]
    >>> print(stats.visualstats.sharpness)
    [4.04 4.434 0.2778 4.957 5.145 5.22 4.957 3.076 2.855 ]
    """
    outputs = run_stats(images, bboxes, False, [DimensionStatsProcessor, PixelStatsProcessor, VisualStatsProcessor])

    # Do not truth-test ``labels`` directly: ``bool(ndarray)`` raises ValueError
    # for arrays with more than one element, yet labelstats accepts arrays.
    if labels is None:
        has_labels = False
    else:
        try:
            has_labels = len(labels) > 0  # type: ignore[arg-type]
        except TypeError:
            # Unsized iterables were always truthy in the original check.
            has_labels = True

    return DatasetStatsOutput(*outputs, labelstats=labelstats(labels) if has_labels else None)  # type: ignore
|
163
|
-
|
164
|
-
|
165
|
-
@set_metadata
def channelstats(
    images: Iterable[ArrayLike],
    bboxes: Iterable[ArrayLike] | None = None,
) -> ChannelStatsOutput:
    """
    Compute per-channel :term:`statistics` for every image.

    Pixel and visual metrics are evaluated on the images, or on each
    bounding box of each image when boxes are supplied.

    Parameters
    ----------
    images : Iterable[ArrayLike]
        Images to perform calculations on
    bboxes : Iterable[ArrayLike] or None
        Bounding boxes in `xyxy` format for each image to perform calculations on

    Returns
    -------
    ChannelStatsOutput
        Output class containing the per-channel outputs of various stats functions

    See Also
    --------
    pixelstats, visualstats

    Examples
    --------
    Calculating the per-channel pixel and visual stats for a dataset

    >>> stats = channelstats(stats_images)
    >>> print(stats.visualstats.darkness)
    [0.1499 0.3499 0.55 0.2094 0.2219 0.2344 0.4194 0.6094 0.622 0.6343
    0.8154]
    """
    # Processor order must match the field order of ChannelStatsOutput because
    # the results are passed to its constructor positionally.
    processors = [PixelStatsProcessor, VisualStatsProcessor]
    results = run_stats(images, bboxes, True, processors)
    return ChannelStatsOutput(*results)  # type: ignore
|
@@ -1,115 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
__all__ = []
|
4
|
-
|
5
|
-
from dataclasses import dataclass
|
6
|
-
from typing import Any, Callable, Iterable
|
7
|
-
|
8
|
-
import numpy as np
|
9
|
-
from numpy.typing import ArrayLike, NDArray
|
10
|
-
|
11
|
-
from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
|
12
|
-
from dataeval.output import set_metadata
|
13
|
-
from dataeval.utils.image import get_bitdepth
|
14
|
-
|
15
|
-
|
16
|
-
@dataclass(frozen=True)
class DimensionStatsOutput(BaseStatsOutput, HistogramPlotMixin):
    """
    Output class for :func:`dimensionstats` stats metric.

    Attributes
    ----------
    left : NDArray[np.int32]
        Offsets from the left edge of images in pixels
    top : NDArray[np.int32]
        Offsets from the top edge of images in pixels
    width : NDArray[np.uint32]
        Width of the images in pixels
    height : NDArray[np.uint32]
        Height of the images in pixels
    channels : NDArray[np.uint8]
        Channel count of the images
    size : NDArray[np.uint32]
        Size of the images in pixels
    aspect_ratio : NDArray[np.float16]
        :term:`Aspect Ratio` of the images (width/height)
    depth : NDArray[np.uint8]
        Color depth of the images in bits
    center : NDArray[np.int16]
        Offset from center in [x,y] coordinates of the images in pixels
    distance : NDArray[np.float16]
        Distance in pixels from center
    """

    left: NDArray[np.int32]
    top: NDArray[np.int32]
    width: NDArray[np.uint32]
    height: NDArray[np.uint32]
    channels: NDArray[np.uint8]
    size: NDArray[np.uint32]
    aspect_ratio: NDArray[np.float16]
    depth: NDArray[np.uint8]
    center: NDArray[np.int16]
    distance: NDArray[np.float16]
|
55
|
-
|
56
|
-
|
57
|
-
def _box_width(proc: Any) -> Any:
    """Horizontal extent of the processor's current box (x1 - x0)."""
    return proc.box[2] - proc.box[0]


def _box_height(proc: Any) -> Any:
    """Vertical extent of the processor's current box (y1 - y0)."""
    return proc.box[3] - proc.box[1]


def _box_center_x(proc: Any) -> Any:
    """X coordinate of the midpoint of the processor's current box."""
    return (proc.box[0] + proc.box[2]) / 2


def _box_center_y(proc: Any) -> Any:
    """Y coordinate of the midpoint of the processor's current box."""
    return (proc.box[1] + proc.box[3]) / 2


class DimensionStatsProcessor(StatsProcessor[DimensionStatsOutput]):
    """
    Stats processor that derives geometric measurements for each image or box.

    Each entry of ``image_function_map`` maps a ``DimensionStatsOutput`` field
    name to a callable that receives the processor and returns that field's
    value. Boxes are indexed as ``(x0, y0, x1, y1)`` and the image shape is
    read from its trailing axes (``shape[-3]`` channels, ``shape[-2]`` and
    ``shape[-1]`` for the vertical and horizontal extents).
    """

    output_class: type = DimensionStatsOutput
    image_function_map: dict[str, Callable[[StatsProcessor[DimensionStatsOutput]], Any]] = {
        "left": lambda x: x.box[0],
        "top": lambda x: x.box[1],
        "width": lambda x: _box_width(x),
        "height": lambda x: _box_height(x),
        "channels": lambda x: x.shape[-3],
        "size": lambda x: _box_width(x) * _box_height(x),
        "aspect_ratio": lambda x: _box_width(x) / _box_height(x),
        "depth": lambda x: get_bitdepth(x.image).depth,
        "center": lambda x: np.asarray([_box_center_x(x), _box_center_y(x)]),
        # Euclidean distance between the box midpoint and the image midpoint.
        "distance": lambda x: np.sqrt(
            np.square(_box_center_x(x) - (x.shape[-1] / 2)) + np.square(_box_center_y(x) - (x.shape[-2] / 2))
        ),
    }
|
74
|
-
|
75
|
-
|
76
|
-
@set_metadata
def dimensionstats(
    images: Iterable[ArrayLike],
    bboxes: Iterable[ArrayLike] | None = None,
) -> DimensionStatsOutput:
    """
    Compute dimension :term:`statistics<Statistics>` for every image.

    Dimensional metrics such as width, height and channel count are evaluated
    on the images, or on each bounding box of each image when boxes are given.

    Parameters
    ----------
    images : Iterable[ArrayLike]
        Images to perform calculations on
    bboxes : Iterable[ArrayLike] or None
        Bounding boxes in `xyxy` format for each image to perform calculations on

    Returns
    -------
    DimensionStatsOutput
        A dictionary-like object containing the computed dimension statistics for each image or bounding
        box. The keys correspond to the names of the statistics (e.g., 'width', 'height'), and the values
        are lists of results for each image or :term:`NumPy` arrays when the results are multi-dimensional.

    See Also
    --------
    pixelstats, visualstats, Outliers

    Examples
    --------
    Calculating the dimension statistics on the images, whose shape is (C, H, W)

    >>> results = dimensionstats(stats_images)
    >>> print(results.aspect_ratio)
    [1. 1. 1.333 1. 0.6665]
    >>> print(results.channels)
    [3 3 1 3 1]
    """
    # A single processor is requested, so the first result is the one we want.
    results = run_stats(images, bboxes, False, [DimensionStatsProcessor])
    return results[0]
|
@@ -1,210 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
__all__ = []
|
4
|
-
|
5
|
-
import contextlib
|
6
|
-
from collections import Counter, defaultdict
|
7
|
-
from dataclasses import dataclass
|
8
|
-
from typing import Any, Iterable, Mapping, TypeVar
|
9
|
-
|
10
|
-
import numpy as np
|
11
|
-
from numpy.typing import ArrayLike
|
12
|
-
|
13
|
-
from dataeval.interop import as_numpy
|
14
|
-
from dataeval.output import Output, set_metadata
|
15
|
-
|
16
|
-
with contextlib.suppress(ImportError):
|
17
|
-
import pandas as pd
|
18
|
-
|
19
|
-
|
20
|
-
@dataclass(frozen=True)
class LabelStatsOutput(Output):
    """
    Output class for :func:`labelstats` stats metric.

    Attributes
    ----------
    label_counts_per_class : dict[str | int, int]
        Dictionary whose keys are the different label classes and
        values are total counts of each class
    label_counts_per_image : list[int]
        Number of labels per image
    image_counts_per_label : dict[str | int, int]
        Dictionary whose keys are the different label classes and
        values are total counts of each image the class is present in
    image_indices_per_label : dict[str | int, list]
        Dictionary whose keys are the different label classes and
        values are lists containing the images that have that label
    image_count : int
        Total number of images present
    class_count : int
        Total number of classes present
    label_count : int
        Total number of labels present
    """

    label_counts_per_class: dict[str | int, int]
    label_counts_per_image: list[int]
    image_counts_per_label: dict[str | int, int]
    image_indices_per_label: dict[str | int, list[int]]
    image_count: int
    class_count: int
    label_count: int

    def to_table(self) -> str:
        """
        Render the label statistics as a plain-text table.

        Returns
        -------
        str
            Summary counts followed by one row per class showing its total
            label count and the number of images it appears in. Every line,
            including the last, ends with a newline.
        """
        # Column widths are the printed width of the widest entry, with the
        # same minimums as before (5 for labels, 11 for counts). ``default``
        # keeps an output with no classes from raising on an empty ``max``.
        label_width = max((len(str(key)) for key in self.label_counts_per_class), default=0)
        label_width = max(label_width, 5)
        largest_count = max(
            max(self.label_counts_per_class.values(), default=0),
            max(self.image_counts_per_label.values(), default=0),
        )
        count_width = max(len(str(largest_count)), 11)

        lines = [
            # Display basic counts
            f"Class Count: {self.class_count}",
            f"Label Count: {self.label_count}",
            f"Average # Labels per Image: {round(np.mean(self.label_counts_per_image), 2)}",
            "--------------------------------------",
            # Display counts per class
            f"{'Label':>{label_width}}: Total Count - Image Count",
        ]
        lines.extend(
            f"{cls:>{label_width}}: {total:^{count_width}} - {self.image_counts_per_label[cls]:^{count_width}}"
            for cls, total in self.label_counts_per_class.items()
        )

        return "\n".join(lines) + "\n"

    def to_dataframe(self) -> pd.DataFrame:
        """
        Build a pandas DataFrame with one row per class.

        Returns
        -------
        pd.DataFrame
            Columns ``Label``, ``Total Count`` and ``Image Count``.

        Raises
        ------
        ImportError
            If pandas is not installed (it is imported lazily here).
        """
        import pandas as pd

        classes = list(self.label_counts_per_class)
        return pd.DataFrame(
            {
                "Label": classes,
                "Total Count": [self.label_counts_per_class[cls] for cls in classes],
                "Image Count": [self.image_counts_per_label[cls] for cls in classes],
            }
        )
|
94
|
-
|
95
|
-
|
96
|
-
TKey = TypeVar("TKey", int, str)


def sort(d: Mapping[TKey, Any]) -> dict[TKey, Any]:
    """
    Return a new dict with the entries of ``d`` ordered by ascending key.
    """
    return {key: d[key] for key in sorted(d)}
|
104
|
-
|
105
|
-
|
106
|
-
def _ensure_2d(labels: Iterable[ArrayLike]) -> Iterable[ArrayLike]:
    """
    Wrap every label in its own single-element group.

    NumPy arrays gain a trailing axis; any other iterable becomes a list of
    one-item lists.
    """
    if not isinstance(labels, np.ndarray):
        return [[entry] for entry in labels]  # type: ignore
    return labels[:, None]
|
111
|
-
|
112
|
-
|
113
|
-
def _get_list_depth(lst):
    """
    Return the deepest level of ``list`` nesting in ``lst``.

    Anything that is not a list, and any empty list, counts as depth 0.
    """
    if not isinstance(lst, list) or not lst:
        return 0
    deepest_child = max(_get_list_depth(child) for child in lst)
    return deepest_child + 1
|
117
|
-
|
118
|
-
|
119
|
-
def _check_labels_dimension(labels: Iterable[ArrayLike]) -> Iterable[ArrayLike]:
    """
    Validate the nesting of ``labels`` and normalize it to two levels.

    Flat input (1-D array or flat list) is expanded so that each label forms
    its own group; input that is already two levels deep is returned as is.

    Raises
    ------
    ValueError
        If an array is not 1-D or 2-D, or a list is empty or nested more
        than two levels deep.
    TypeError
        If ``labels`` is neither a NumPy array nor a list.
    """
    if isinstance(labels, np.ndarray):
        levels = labels.ndim
        bad_depth_msg = "The label array must not have more than 2 dimensions."
    elif isinstance(labels, list):
        levels = _get_list_depth(labels)
        bad_depth_msg = "The label list must not be empty or have more than 2 levels of nesting."
    else:
        raise TypeError("Labels must be either a NumPy array or a list.")

    if levels == 1:
        return _ensure_2d(labels)
    if levels == 2:
        return labels
    raise ValueError(bad_depth_msg)
|
139
|
-
|
140
|
-
|
141
|
-
@set_metadata
def labelstats(
    labels: Iterable[ArrayLike],
) -> LabelStatsOutput:
    """
    Compute counting :term:`statistics<Statistics>` for data labels.

    Tallies labels per class, labels per image, and the images in which each
    class occurs.

    Parameters
    ----------
    labels : ArrayLike, shape - [label] | [[label]] or (N,M) | (N,)
        Lists or :term:`NumPy` array of labels.
        A set of lists where each list contains all labels per image -
        (e.g. [[label1, label2], [label2], [label1, label3]] or [label1, label2, label1, label3]).
        If a numpy array, N is the number of images, M is the number of labels per image.

    Returns
    -------
    LabelStatsOutput
        A dictionary-like object containing the computed counting metrics for the labels.

    Examples
    --------
    Calculating the :term:`statistics<Statistics>` on labels for a set of data

    >>> stats = labelstats(labels)
    >>> stats.label_counts_per_class
    {'chicken': 12, 'cow': 5, 'horse': 4, 'pig': 7, 'sheep': 4}
    >>> stats.label_counts_per_image
    [3, 3, 5, 3, 2, 5, 5, 2, 2, 2]
    >>> stats.image_counts_per_label
    {'chicken': 8, 'cow': 4, 'horse': 4, 'pig': 7, 'sheep': 4}
    >>> (stats.image_count, stats.class_count, stats.label_count)
    (10, 5, 32)
    """
    per_class_totals = Counter()
    per_class_images = Counter()
    images_by_class = defaultdict(list)
    labels_in_image: list[int] = []

    for image_idx, image_labels in enumerate(_check_labels_dimension(labels)):
        image_labels = as_numpy(image_labels)

        # Every occurrence counts toward the class total ...
        per_class_totals.update(image_labels)
        labels_in_image.append(len(image_labels))

        # ... but each class counts at most once per image.
        present = set(image_labels)
        per_class_images.update(present)
        for label in present:
            images_by_class[label].append(image_idx)

    return LabelStatsOutput(
        label_counts_per_class=sort(per_class_totals),
        label_counts_per_image=labels_in_image,
        image_counts_per_label=sort(per_class_images),
        image_indices_per_label=sort(images_by_class),
        image_count=len(labels_in_image),
        class_count=len(per_class_totals),
        label_count=sum(per_class_totals.values()),
    )
|
@@ -1,7 +0,0 @@
|
|
1
|
-
"""Provides utility functions for interacting with Computer Vision datasets."""
|
2
|
-
|
3
|
-
__all__ = ["datasets", "read_dataset", "SplitDatasetOutput", "split_dataset"]
|
4
|
-
|
5
|
-
from dataeval.utils.dataset import datasets
|
6
|
-
from dataeval.utils.dataset.read import read_dataset
|
7
|
-
from dataeval.utils.dataset.split import SplitDatasetOutput, split_dataset
|