dataeval 0.74.2__py3-none-any.whl → 0.76.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +27 -23
- dataeval/detectors/__init__.py +2 -2
- dataeval/detectors/drift/__init__.py +14 -12
- dataeval/detectors/drift/base.py +3 -3
- dataeval/detectors/drift/cvm.py +1 -1
- dataeval/detectors/drift/ks.py +3 -2
- dataeval/detectors/drift/mmd.py +9 -7
- dataeval/detectors/drift/torch.py +12 -12
- dataeval/detectors/drift/uncertainty.py +5 -4
- dataeval/detectors/drift/updates.py +1 -1
- dataeval/detectors/linters/__init__.py +4 -4
- dataeval/detectors/linters/clusterer.py +5 -9
- dataeval/detectors/linters/duplicates.py +10 -14
- dataeval/detectors/linters/outliers.py +100 -5
- dataeval/detectors/ood/__init__.py +4 -11
- dataeval/detectors/ood/{ae_torch.py → ae.py} +6 -4
- dataeval/detectors/ood/base.py +47 -160
- dataeval/detectors/ood/metadata_ks_compare.py +34 -42
- dataeval/detectors/ood/metadata_least_likely.py +3 -3
- dataeval/detectors/ood/metadata_ood_mi.py +6 -5
- dataeval/detectors/ood/mixin.py +146 -0
- dataeval/detectors/ood/output.py +63 -0
- dataeval/interop.py +7 -6
- dataeval/{logging.py → log.py} +2 -0
- dataeval/metrics/__init__.py +3 -3
- dataeval/metrics/bias/__init__.py +10 -13
- dataeval/metrics/bias/balance.py +13 -11
- dataeval/metrics/bias/coverage.py +53 -5
- dataeval/metrics/bias/diversity.py +56 -24
- dataeval/metrics/bias/parity.py +20 -17
- dataeval/metrics/estimators/__init__.py +2 -2
- dataeval/metrics/estimators/ber.py +7 -4
- dataeval/metrics/estimators/divergence.py +4 -4
- dataeval/metrics/estimators/uap.py +4 -4
- dataeval/metrics/stats/__init__.py +19 -19
- dataeval/metrics/stats/base.py +28 -12
- dataeval/metrics/stats/boxratiostats.py +13 -14
- dataeval/metrics/stats/datasetstats.py +49 -20
- dataeval/metrics/stats/dimensionstats.py +8 -8
- dataeval/metrics/stats/hashstats.py +14 -10
- dataeval/metrics/stats/labelstats.py +94 -11
- dataeval/metrics/stats/pixelstats.py +11 -14
- dataeval/metrics/stats/visualstats.py +10 -13
- dataeval/output.py +23 -14
- dataeval/utils/__init__.py +5 -14
- dataeval/utils/dataset/__init__.py +7 -0
- dataeval/utils/{torch → dataset}/datasets.py +2 -0
- dataeval/utils/dataset/read.py +63 -0
- dataeval/utils/{split_dataset.py → dataset/split.py} +38 -30
- dataeval/utils/image.py +2 -2
- dataeval/utils/metadata.py +317 -14
- dataeval/{metrics/bias/metadata_utils.py → utils/plot.py} +91 -71
- dataeval/utils/torch/__init__.py +2 -17
- dataeval/utils/torch/gmm.py +29 -6
- dataeval/utils/torch/{utils.py → internal.py} +82 -58
- dataeval/utils/torch/models.py +10 -8
- dataeval/utils/torch/trainer.py +6 -85
- dataeval/workflows/__init__.py +2 -5
- dataeval/workflows/sufficiency.py +18 -8
- {dataeval-0.74.2.dist-info → dataeval-0.76.0.dist-info}/LICENSE.txt +2 -2
- dataeval-0.76.0.dist-info/METADATA +137 -0
- dataeval-0.76.0.dist-info/RECORD +67 -0
- dataeval/detectors/ood/base_torch.py +0 -109
- dataeval/metrics/bias/metadata_preprocessing.py +0 -285
- dataeval/utils/gmm.py +0 -26
- dataeval-0.74.2.dist-info/METADATA +0 -120
- dataeval-0.74.2.dist-info/RECORD +0 -66
- {dataeval-0.74.2.dist-info → dataeval-0.76.0.dist-info}/WHEEL +0 -0
dataeval/metrics/stats/base.py
CHANGED
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+from dataeval.utils.plot import histogram_plot
+
 __all__ = []
 
 import re
@@ -100,19 +102,33 @@ class BaseStatsOutput(Output):
         for source_index in list(self.source_index) + [None]:
             if source_index is None or source_index.image > cur_image:
                 mask.extend(cur_mask if matches(cur_max_channel + 1, channel_count) else [False for _ in cur_mask])
-                if source_index is None:
-
-
-
-
-
-
+                if source_index is not None:
+                    cur_image = source_index.image
+                    cur_max_channel = 0
+                    cur_mask.clear()
+            if source_index is not None:
+                cur_mask.append(matches(source_index.channel, channel_index))
+                cur_max_channel = max(cur_max_channel, source_index.channel or 0)
         return mask
 
     def __len__(self) -> int:
         return len(self.source_index)
 
 
+def _is_plottable(k: str, v: Any, excluded_keys: Iterable[str]) -> bool:
+    return isinstance(v, np.ndarray) and v[v != 0].size > 0 and all(k != x for x in excluded_keys)
+
+
+class HistogramPlotMixin:
+    _excluded_keys: Iterable[str] = []
+
+    def dict(self) -> dict[str, Any]: ...
+
+    def plot(self, log: bool) -> None:
+        data_dict = {k: v for k, v in self.dict().items() if _is_plottable(k, v, self._excluded_keys)}
+        histogram_plot(data_dict, log)
+
+
 TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput, covariant=True)
 
 
@@ -126,7 +142,7 @@ class StatsProcessor(Generic[TStatsOutput]):
         self.raw = image
         self.width: int = image.shape[-1]
         self.height: int = image.shape[-2]
-        self.box: NDArray[
+        self.box: NDArray[np.int64] = np.array([0, 0, self.width, self.height]) if box is None else box.astype(np.int64)
         self._per_channel = per_channel
         self._image = None
         self._shape = None
@@ -193,7 +209,7 @@ class StatsProcessorOutput(NamedTuple):
     results: list[dict[str, Any]]
     source_indices: list[SourceIndex]
     box_counts: list[int]
-    warnings_list: list[
+    warnings_list: list[str]
 
 
 def process_stats(
@@ -206,13 +222,13 @@ def process_stats(
     results_list: list[dict[str, Any]] = []
     source_indices: list[SourceIndex] = []
     box_counts: list[int] = []
-    warnings_list: list[
+    warnings_list: list[str] = []
     nboxes = [None] if boxes is None else normalize_box_shape(boxes)
     for i_b, box in enumerate(nboxes):
         i_b = None if box is None else i_b
         processor_list = [p(image, box, per_channel) for p in stats_processor_cls]
         if any(not p._is_valid_slice for p in processor_list) and i_b is not None and box is not None:
-            warnings_list.append(
+            warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} is out of bounds of {image.shape}.")
         results_list.append({k: v for p in processor_list for k, v in p.process().items()})
         if per_channel:
             source_indices.extend([SourceIndex(i, i_b, c) for c in range(image_boxes[0].shape[-3])])
@@ -302,7 +318,7 @@ def run_stats(
 
     # warnings are not emitted while in multiprocessing pools so we emit after gathering all warnings
     for w in warning_list:
-        warnings.warn(
+        warnings.warn(w, UserWarning)
 
     output = {}
     for results in results_list:
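To make the new plotting hook easier to follow, here is a self-contained sketch of the mixin pattern introduced in this file. The `histogram_plot` stub below only stands in for `dataeval.utils.plot.histogram_plot` (whose implementation is not part of this diff), and `DemoOutput` is a hypothetical output class used purely for illustration.

```python
from typing import Any, Iterable

import numpy as np


def histogram_plot(data_dict: dict[str, Any], log: bool) -> None:
    # stand-in for dataeval.utils.plot.histogram_plot: just report what would be plotted
    for name, values in data_dict.items():
        print(f"would plot histogram of {name!r} ({values.size} values, log={log})")


def _is_plottable(k: str, v: Any, excluded_keys: Iterable[str]) -> bool:
    # same filter as in the diff: non-empty numeric arrays that are not explicitly excluded
    return isinstance(v, np.ndarray) and v[v != 0].size > 0 and all(k != x for x in excluded_keys)


class HistogramPlotMixin:
    _excluded_keys: Iterable[str] = []

    def dict(self) -> dict[str, Any]: ...

    def plot(self, log: bool) -> None:
        data_dict = {k: v for k, v in self.dict().items() if _is_plottable(k, v, self._excluded_keys)}
        histogram_plot(data_dict, log)


class DemoOutput(HistogramPlotMixin):
    # hypothetical output class; excludes the raw histogram the same way PixelStatsOutput does
    _excluded_keys = ["histogram"]

    def dict(self) -> dict[str, Any]:
        return {"mean": np.array([0.2, 0.5, 0.7]), "histogram": np.ones((3, 256))}


DemoOutput().plot(log=False)  # only "mean" passes the filter; "histogram" is excluded
```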
dataeval/metrics/stats/boxratiostats.py
CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-__all__ = [
+__all__ = []
 
 import copy
 from typing import Any, Callable, Generic, TypeVar, cast
@@ -26,7 +26,7 @@ class BoxImageStatsOutputSlice(Generic[TStatOutput]):
     def __getitem__(self, key: str) -> NDArray[np.float64]:
         _stat = cast(np.ndarray, getattr(self._stats, key)).astype(np.float64)
         _shape = _stat[0].shape
-        _slice = _stat[self._slice[0] : self._slice[1]]
+        _slice = _stat[int(self._slice[0]) : int(self._slice[1])]
         return _slice.reshape(-1, self._channels, *_shape) if self._channels else _slice.reshape(-1, *_shape)
 
     box: StatSlicer
@@ -102,7 +102,7 @@ def boxratiostats(
     imgstats: TStatOutput,
 ) -> TStatOutput:
     """
-    Calculates ratio :term:`statistics<Statistics>` of box outputs over image outputs
+    Calculates ratio :term:`statistics<Statistics>` of box outputs over image outputs.
 
     Parameters
     ----------
@@ -130,17 +130,16 @@ def boxratiostats(
    --------
    Calculating the box ratio statistics using the dimension stats of the boxes and images
 
-    >>>
-    >>>
+    >>> from dataeval.metrics.stats import dimensionstats
+    >>> imagestats = dimensionstats(stats_images)
+    >>> boxstats = dimensionstats(stats_images, bboxes)
    >>> ratiostats = boxratiostats(boxstats, imagestats)
    >>> print(ratiostats.aspect_ratio)
-    [
-    0.
-    0.69596354 20. 5.11197917 2.33333333 0.75 0.70019531]
+    [ 0.86376953 0.58837891 16. 0.85714286 1.26959707 0.43772894
+     0.66650391 3.83296703 1.95018315]
    >>> print(ratiostats.size)
-    [0.
-    0.
-    0.02246094 0.0012207 0.01123047 0.00911458 0.02636719 0.06835938]
+    [0.0255127 0.01037598 0.00097656 0.01822917 0.02327474 0.00683594
+     0.00915527 0.03369141 0.02115885]
    """
    output_cls = type(boxstats)
    if type(boxstats) is not type(imgstats):
@@ -148,13 +147,13 @@ def boxratiostats(
     if boxstats.source_index[-1].image != imgstats.source_index[-1].image:
         raise ValueError("Stats index_map length mismatch. Check if the correct box and image stats were provided.")
     if all(count == 0 for count in boxstats.box_count):
-        raise
+        raise ValueError("Input for boxstats must contain box information.")
     if any(count != 0 for count in imgstats.box_count):
-        raise
+        raise ValueError("Input for imgstats must not contain box information.")
     boxstats_has_channels = any(si.channel is None for si in boxstats.source_index)
     imgstats_has_channels = any(si.channel is None for si in imgstats.source_index)
     if boxstats_has_channels != imgstats_has_channels:
-        raise
+        raise ValueError("Input for boxstats and imgstats must have matching channel information.")
 
     output_dict = {}
     for key in boxstats.dict():
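A short usage sketch of the stricter validation added above. It assumes `boxratiostats` and `dimensionstats` are both exported from `dataeval.metrics.stats`, that bounding boxes are `[x0, y0, x1, y1]` arrays, and it uses synthetic images rather than the `stats_images` fixture referenced by the doctest.

```python
import numpy as np
from dataeval.metrics.stats import boxratiostats, dimensionstats

rng = np.random.default_rng(0)
images = [rng.random((3, 64, 64)) for _ in range(4)]                   # synthetic (C, H, W) images
bboxes = [np.array([[0, 0, 16, 16], [8, 8, 32, 24]]) for _ in images]  # assumed [x0, y0, x1, y1] boxes

imagestats = dimensionstats(images)        # per-image stats, no box information
boxstats = dimensionstats(images, bboxes)  # per-box stats

print(boxratiostats(boxstats, imagestats).aspect_ratio)

try:
    boxratiostats(imagestats, imagestats)  # first argument carries no box information
except ValueError as err:
    print(err)  # "Input for boxstats must contain box information."
```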
dataeval/metrics/stats/datasetstats.py
CHANGED
@@ -1,13 +1,13 @@
 from __future__ import annotations
 
-__all__ = [
+__all__ = []
 
 from dataclasses import dataclass
 from typing import Any, Iterable
 
 from numpy.typing import ArrayLike
 
-from dataeval.metrics.stats.base import BaseStatsOutput, run_stats
+from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, _is_plottable, run_stats
 from dataeval.metrics.stats.dimensionstats import (
     DimensionStatsOutput,
     DimensionStatsProcessor,
@@ -16,16 +16,17 @@ from dataeval.metrics.stats.labelstats import LabelStatsOutput, labelstats
 from dataeval.metrics.stats.pixelstats import PixelStatsOutput, PixelStatsProcessor
 from dataeval.metrics.stats.visualstats import VisualStatsOutput, VisualStatsProcessor
 from dataeval.output import Output, set_metadata
+from dataeval.utils.plot import channel_histogram_plot
 
 
 @dataclass(frozen=True)
-class DatasetStatsOutput(Output):
+class DatasetStatsOutput(Output, HistogramPlotMixin):
     """
-    Output class for :func:`datasetstats` stats metric
+    Output class for :func:`datasetstats` stats metric.
 
     This class represents the outputs of various stats functions against a single
     dataset, such that each index across all stat outputs are representative of
-    the same source image.
+    the same source image. Modifying or mixing outputs will result in inaccurate
     outlier calculations if not created correctly.
 
     Attributes
@@ -41,6 +42,8 @@ class DatasetStatsOutput(Output):
     visualstats: VisualStatsOutput
     labelstats: LabelStatsOutput | None = None
 
+    _excluded_keys = ["histogram", "percentiles"]
+
     def _outputs(self) -> list[Output]:
         return [s for s in (self.dimensionstats, self.pixelstats, self.visualstats, self.labelstats) if s is not None]
 
@@ -53,14 +56,37 @@ class DatasetStatsOutput(Output):
             raise ValueError("All StatsOutput classes must contain the same number of image sources.")
 
 
+def _get_channels(cls, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None):
+    raw_channels = max([si.channel for si in cls.dict()["source_index"]]) + 1
+    if isinstance(channel_index, int):
+        max_channels = 1 if channel_index < raw_channels else raw_channels
+        ch_mask = cls.pixelstats.get_channel_mask(channel_index)
+    elif isinstance(channel_index, Iterable) and all(isinstance(val, int) for val in list(channel_index)):
+        max_channels = len(list(channel_index))
+        ch_mask = cls.pixelstats.get_channel_mask(channel_index)
+    elif isinstance(channel_limit, int):
+        max_channels = channel_limit
+        ch_mask = cls.pixelstats.get_channel_mask(None, channel_limit)
+    else:
+        max_channels = raw_channels
+        ch_mask = None
+
+    if max_channels > raw_channels:
+        max_channels = raw_channels
+    if ch_mask is not None and not any(ch_mask):
+        ch_mask = None
+
+    return max_channels, ch_mask
+
+
 @dataclass(frozen=True)
 class ChannelStatsOutput(Output):
     """
-    Output class for :func:`channelstats` stats metric
+    Output class for :func:`channelstats` stats metric.
 
     This class represents the outputs of various per-channel stats functions against
     a single dataset, such that each index across all stat outputs are representative
-    of the same source image.
+    of the same source image. Modifying or mixing outputs will result in inaccurate
     outlier calculations if not created correctly.
 
     Attributes
@@ -83,6 +109,13 @@ class ChannelStatsOutput(Output):
         if not all(length == lengths[0] for length in lengths):
             raise ValueError("All StatsOutput classes must contain the same number of image sources.")
 
+    def plot(
+        self, log: bool, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None
+    ) -> None:
+        max_channels, ch_mask = _get_channels(self, channel_limit, channel_index)
+        data_dict = {k: v for k, v in self.dict().items() if _is_plottable(k, v, ("histogram", "percentiles"))}
+        channel_histogram_plot(data_dict, log, max_channels, ch_mask)
+
 
 @set_metadata
 def datasetstats(
@@ -91,7 +124,7 @@ def datasetstats(
     labels: Iterable[ArrayLike] | None = None,
 ) -> DatasetStatsOutput:
     """
-    Calculates various :term:`statistics<Statistics>` for each image
+    Calculates various :term:`statistics<Statistics>` for each image.
 
     This function computes dimension, pixel and visual metrics
     on the images or individual bounding boxes for each image as
@@ -119,13 +152,11 @@ def datasetstats(
    --------
    Calculating the dimension, pixel and visual stats for a dataset with bounding boxes
 
-    >>> stats = datasetstats(
+    >>> stats = datasetstats(stats_images, bboxes)
    >>> print(stats.dimensionstats.aspect_ratio)
-    [ 0.864 0.5884 16. 1.143 1.692 0.5835 0.6665 2.555 1.3
-
-
-    [1.744 1.946 0.1164 0.0635 0.0633 0.06274 0.0429 0.0317 0.0317
-    0.02576 0.02081 0.02171 0.01915 0.01767 0.01799 0.01595 0.01433 0.01478]
+    [ 0.864 0.5884 16. 1.143 1.692 0.5835 0.6665 2.555 1.3 ]
+    >>> print(stats.visualstats.sharpness)
+    [4.04 4.434 0.2778 4.957 5.145 5.22 4.957 3.076 2.855 ]
    """
    outputs = run_stats(images, bboxes, False, [DimensionStatsProcessor, PixelStatsProcessor, VisualStatsProcessor])
    return DatasetStatsOutput(*outputs, labelstats=labelstats(labels) if labels else None)  # type: ignore
@@ -137,7 +168,7 @@ def channelstats(
     bboxes: Iterable[ArrayLike] | None = None,
 ) -> ChannelStatsOutput:
     """
-    Calculates various per-channel statistics for each image
+    Calculates various per-channel :term:`statistics` for each image.
 
     This function computes pixel and visual metrics on the images
     or individual bounding boxes for each image.
@@ -162,12 +193,10 @@ def channelstats(
    --------
    Calculating the per-channel pixel and visual stats for a dataset
 
-    >>> stats = channelstats(
+    >>> stats = channelstats(stats_images)
    >>> print(stats.visualstats.darkness)
-    [0.
-    0.
-    0.6045 0.611 0.617 0.7046 0.711 0.7173 0.8047 0.811 0.8174
-    0.905 0.911 0.917 ]
+    [0.1499 0.3499 0.55 0.2094 0.2219 0.2344 0.4194 0.6094 0.622 0.6343
+     0.8154]
    """
    outputs = run_stats(images, bboxes, True, [PixelStatsProcessor, VisualStatsProcessor])
    return ChannelStatsOutput(*outputs)  # type: ignore
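A sketch of how the new `ChannelStatsOutput.plot` hook might be called, with `channel_limit` and `channel_index` routed through the `_get_channels` helper shown above; the input images here are synthetic placeholders rather than the `stats_images` fixture used in the doctest.

```python
import numpy as np
from dataeval.metrics.stats import channelstats

rng = np.random.default_rng(0)
stats_images = [rng.random((3, 64, 64)) for _ in range(5)]  # synthetic (C, H, W) images

stats = channelstats(stats_images)
stats.plot(log=True)                        # every channel of every plottable stat
stats.plot(log=True, channel_limit=1)       # cap the number of channels shown
stats.plot(log=True, channel_index=[0, 2])  # only the listed channel indices
```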
dataeval/metrics/stats/dimensionstats.py
CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-__all__ = [
+__all__ = []
 
 from dataclasses import dataclass
 from typing import Any, Callable, Iterable
@@ -8,15 +8,15 @@ from typing import Any, Callable, Iterable
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 
-from dataeval.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
+from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
 from dataeval.output import set_metadata
 from dataeval.utils.image import get_bitdepth
 
 
 @dataclass(frozen=True)
-class DimensionStatsOutput(BaseStatsOutput):
+class DimensionStatsOutput(BaseStatsOutput, HistogramPlotMixin):
     """
-    Output class for :func:`dimensionstats` stats metric
+    Output class for :func:`dimensionstats` stats metric.
 
     Attributes
     ----------
@@ -79,7 +79,7 @@ def dimensionstats(
     bboxes: Iterable[ArrayLike] | None = None,
 ) -> DimensionStatsOutput:
     """
-    Calculates dimension :term:`statistics<Statistics>` for each image
+    Calculates dimension :term:`statistics<Statistics>` for each image.
 
     This function computes various dimensional metrics (e.g., width, height, channels)
     on the images or individual bounding boxes for each image.
@@ -106,10 +106,10 @@ def dimensionstats(
    --------
    Calculating the dimension statistics on the images, whose shape is (C, H, W)
 
-    >>> results = dimensionstats(
+    >>> results = dimensionstats(stats_images)
    >>> print(results.aspect_ratio)
-    [
+    [1. 1. 1.333 1. 0.6665]
    >>> print(results.channels)
-    [
+    [3 3 1 3 1]
    """
    return run_stats(images, bboxes, False, [DimensionStatsProcessor])[0]
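An illustrative sketch of the two behaviours touched in this file: per-box rows when `bboxes` are supplied, and the histogram view inherited from the new `HistogramPlotMixin`. The data is synthetic, not the `stats_images` fixture used in the doctest, and the box format is assumed to be `[x0, y0, x1, y1]`.

```python
import numpy as np
from dataeval.metrics.stats import dimensionstats

rng = np.random.default_rng(0)
images = [rng.random((3, 48, 64)) for _ in range(3)]
bboxes = [np.array([[0, 0, 32, 24], [8, 8, 40, 40]]) for _ in images]

results = dimensionstats(images, bboxes)
print(results.aspect_ratio)  # one entry per bounding box rather than per image
results.plot(log=False)      # histogram view provided by the new HistogramPlotMixin
```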
dataeval/metrics/stats/hashstats.py
CHANGED
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
-
+import warnings
+
+__all__ = []
 
 from dataclasses import dataclass
 from typing import Callable, Iterable
@@ -23,7 +25,7 @@ MAX_FACTOR = 4
 @dataclass(frozen=True)
 class HashStatsOutput(BaseStatsOutput):
     """
-    Output class for :func:`hashstats` stats metric
+    Output class for :func:`hashstats` stats metric.
 
     Attributes
     ----------
@@ -41,7 +43,7 @@ def pchash(image: ArrayLike) -> str:
     """
     Performs a perceptual hash on an image by resizing to a square NxN image
     using the Lanczos algorithm where N is 32x32 or the largest multiple of
-    8 that is smaller than the input image dimensions.
+    8 that is smaller than the input image dimensions. The resampled image
     is compressed using a discrete cosine transform and the lowest frequency
     component is encoded as a bit array of greater or less than median value
     and returned as a hex string.
@@ -54,13 +56,15 @@ def pchash(image: ArrayLike) -> str:
     Returns
     -------
     str
-        The hex string hash of the image using perceptual hashing
+        The hex string hash of the image using perceptual hashing, or empty
+        string if the image is too small to be hashed
     """
     # Verify that the image is at least larger than an 8x8 image
     arr = as_numpy(image)
     min_dim = min(arr.shape[-2:])
     if min_dim < HASH_SIZE + 1:
-
+        warnings.warn(f"Image must be larger than {HASH_SIZE}x{HASH_SIZE} for fuzzy hashing.")
+        return ""
 
     # Calculates the dimensions of the resized square image
     resize_dim = HASH_SIZE * min((min_dim - 1) // HASH_SIZE, MAX_FACTOR)
@@ -92,7 +96,7 @@ def pchash(image: ArrayLike) -> str:
 def xxhash(image: ArrayLike) -> str:
     """
     Performs a fast non-cryptographic hash using the xxhash algorithm
-    (xxhash.com) against the image as a flattened bytearray.
+    (xxhash.com) against the image as a flattened bytearray. The hash
     is returned as a hex string.
 
     Parameters
@@ -122,7 +126,7 @@ def hashstats(
     bboxes: Iterable[ArrayLike] | None = None,
 ) -> HashStatsOutput:
     """
-    Calculates hashes for each image
+    Calculates hashes for each image.
 
     This function computes hashes from the images including exact hashes and perception-based
     hashes. These hash values can be used to determine if images are exact or near matches.
@@ -147,10 +151,10 @@ def hashstats(
    --------
    Calculating the statistics on the images, whose shape is (C, H, W)
 
-    >>> results = hashstats(
+    >>> results = hashstats(stats_images)
    >>> print(results.xxhash)
-    ['
+    ['6274f837b34ed9f0', '256504fdb6e3d2a4', '7dd0c56ca8474fb0', '50956ad4592f5bbc', '5ba2354079d42aa5']
    >>> print(results.pchash)
-    ['
+    ['a666999999666666', 'e666999999266666', 'e666999966663299', 'e666999999266666', '96e91656e91616e9']
    """
    return run_stats(images, bboxes, False, [HashStatsProcessor])[0]
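A sketch of the changed small-image behaviour of `pchash`: instead of failing outright, it now warns and returns an empty string. The arrays below are synthetic, and the uint8 dtype for the larger image is an assumption chosen for safety rather than a documented requirement.

```python
import numpy as np
from dataeval.metrics.stats.hashstats import pchash

tiny = np.zeros((3, 4, 4), dtype=np.uint8)  # smaller than the 8x8 minimum checked above
print(repr(pchash(tiny)))                   # '' plus a UserWarning about the minimum size

rng = np.random.default_rng(0)
large = rng.integers(0, 255, (3, 64, 64), dtype=np.uint8)
print(pchash(large))                        # normal hex digest for a large-enough image
```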
dataeval/metrics/stats/labelstats.py
CHANGED
@@ -1,21 +1,26 @@
 from __future__ import annotations
 
-__all__ = [
+__all__ = []
 
+# import contextlib
 from collections import Counter, defaultdict
 from dataclasses import dataclass
 from typing import Any, Iterable, Mapping, TypeVar
 
+import numpy as np
 from numpy.typing import ArrayLike
 
-from dataeval.interop import
+from dataeval.interop import as_numpy
 from dataeval.output import Output, set_metadata
 
+# with contextlib.suppress(ImportError):
+#     import pandas as pd
+
 
 @dataclass(frozen=True)
 class LabelStatsOutput(Output):
     """
-    Output class for :func:`labelstats` stats metric
+    Output class for :func:`labelstats` stats metric.
 
     Attributes
     ----------
@@ -46,6 +51,47 @@ class LabelStatsOutput(Output):
     class_count: int
     label_count: int
 
+    def to_table(self) -> str:
+        max_char = max(len(key) if isinstance(key, str) else key // 10 + 1 for key in self.label_counts_per_class)
+        max_char = max(max_char, 5)
+        max_label = max(list(self.label_counts_per_class.values()))
+        max_img = max(list(self.image_counts_per_label.values()))
+        max_num = int(np.ceil(np.log10(max(max_label, max_img))))
+        max_num = max(max_num, 11)
+
+        # Display basic counts
+        table_str = f"Class Count: {self.class_count}\n"
+        table_str += f"Label Count: {self.label_count}\n"
+        table_str += f"Average # Labels per Image: {round(np.mean(self.label_counts_per_image), 2)}\n"
+        table_str += "--------------------------------------\n"
+
+        # Display counts per class
+        table_str += f"{'Label':>{max_char}}: Total Count - Image Count\n"
+        for cls in self.label_counts_per_class:
+            table_str += f"{cls:>{max_char}}: {self.label_counts_per_class[cls]:^{max_num}} "
+            table_str += f"- {self.image_counts_per_label[cls]:^{max_num}}\n"
+
+        return table_str
+
+    # def to_dataframe(self) -> pd.DataFrame:
+    #     import pandas as pd
+
+    #     class_list = []
+    #     total_count = []
+    #     image_count = []
+    #     for cls in self.label_counts_per_class:
+    #         class_list.append(cls)
+    #         total_count.append(self.label_counts_per_class[cls])
+    #         image_count.append(self.image_counts_per_label[cls])
+
+    #     return pd.DataFrame(
+    #         {
+    #             "Label": class_list,
+    #             "Total Count": total_count,
+    #             "Image Count": image_count,
+    #         }
+    #     )
+
 
 TKey = TypeVar("TKey", int, str)
 
@@ -57,12 +103,47 @@ def sort(d: Mapping[TKey, Any]) -> dict[TKey, Any]:
     return dict(sorted(d.items(), key=lambda x: x[0]))
 
 
+def _ensure_2d(labels: Iterable[ArrayLike]) -> Iterable[ArrayLike]:
+    if isinstance(labels, np.ndarray):
+        return labels[:, None]
+    else:
+        return [[lbl] for lbl in labels]  # type: ignore
+
+
+def _get_list_depth(lst):
+    if isinstance(lst, list) and lst:
+        return 1 + max(_get_list_depth(item) for item in lst)
+    return 0
+
+
+def _check_labels_dimension(labels: Iterable[ArrayLike]) -> Iterable[ArrayLike]:
+    # Check for nested lists beyond 2 levels
+
+    if isinstance(labels, np.ndarray):
+        if labels.ndim == 1:
+            return _ensure_2d(labels)
+        elif labels.ndim == 2:
+            return labels
+        else:
+            raise ValueError("The label array must not have more than 2 dimensions.")
+    elif isinstance(labels, list):
+        depth = _get_list_depth(labels)
+        if depth == 1:
+            return _ensure_2d(labels)
+        elif depth == 2:
+            return labels
+        else:
+            raise ValueError("The label list must not be empty or have more than 2 levels of nesting.")
+    else:
+        raise TypeError("Labels must be either a NumPy array or a list.")
+
+
 @set_metadata
 def labelstats(
     labels: Iterable[ArrayLike],
 ) -> LabelStatsOutput:
     """
-    Calculates :term:`statistics<Statistics>` for data labels
+    Calculates :term:`statistics<Statistics>` for data labels.
 
     This function computes counting metrics (e.g., total per class, total per image)
     on the labels.
@@ -86,23 +167,25 @@ def labelstats(
 
    >>> stats = labelstats(labels)
    >>> stats.label_counts_per_class
-    {'chicken':
+    {'chicken': 12, 'cow': 5, 'horse': 4, 'pig': 7, 'sheep': 4}
    >>> stats.label_counts_per_image
-    [3,
+    [3, 3, 5, 3, 2, 5, 5, 2, 2, 2]
    >>> stats.image_counts_per_label
-    {'chicken':
+    {'chicken': 8, 'cow': 4, 'horse': 4, 'pig': 7, 'sheep': 4}
    >>> (stats.image_count, stats.class_count, stats.label_count)
-    (10, 5,
+    (10, 5, 32)
    """
    label_counts = Counter()
    image_counts = Counter()
    index_location = defaultdict(list[int])
    label_per_image: list[int] = []
 
-
-
-
+    labels_2d = _check_labels_dimension(labels)
+
+    for i, group in enumerate(labels_2d):
+        group = as_numpy(group)
 
+        # Count occurrences of each label in all sublists
         label_counts.update(group)
 
         # Get the number of labels per image
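A sketch of the two conveniences added here: flat (1-D) label inputs are wrapped to one label per image by `_check_labels_dimension`, and the output gains a plain-text `to_table()` summary. The labels are illustrative, and the example assumes `labelstats` is exported from `dataeval.metrics.stats`.

```python
from dataeval.metrics.stats import labelstats

per_image_labels = [["cat", "dog"], ["dog"], ["cat", "cat", "bird"]]
stats = labelstats(per_image_labels)
print(stats.to_table())  # basic counts, then a per-class "Total Count - Image Count" table

flat_labels = ["cat", "dog", "dog"]  # 1-D input is now treated as one label per image
print(labelstats(flat_labels).label_counts_per_image)  # [1, 1, 1]
```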
dataeval/metrics/stats/pixelstats.py
CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-__all__ = [
+__all__ = []
 
 from dataclasses import dataclass
 from typing import Any, Callable, Iterable
@@ -9,14 +9,14 @@ import numpy as np
 from numpy.typing import ArrayLike, NDArray
 from scipy.stats import entropy, kurtosis, skew
 
-from dataeval.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
+from dataeval.metrics.stats.base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
 from dataeval.output import set_metadata
 
 
 @dataclass(frozen=True)
-class PixelStatsOutput(BaseStatsOutput):
+class PixelStatsOutput(BaseStatsOutput, HistogramPlotMixin):
     """
-    Output class for :func:`pixelstats` stats metric
+    Output class for :func:`pixelstats` stats metric.
 
     Attributes
     ----------
@@ -44,11 +44,13 @@ class PixelStatsOutput(BaseStatsOutput):
     histogram: NDArray[np.uint32]
     entropy: NDArray[np.float16]
 
+    _excluded_keys = ["histogram"]
+
 
 class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
     output_class: type = PixelStatsOutput
     image_function_map: dict[str, Callable[[StatsProcessor[PixelStatsOutput]], Any]] = {
-        "mean": lambda
+        "mean": lambda x: np.mean(x.scaled),
         "std": lambda x: np.std(x.scaled),
         "var": lambda x: np.var(x.scaled),
         "skew": lambda x: np.nan_to_num(skew(x.scaled.ravel())),
@@ -74,7 +76,7 @@ def pixelstats(
     per_channel: bool = False,
 ) -> PixelStatsOutput:
     """
-    Calculates pixel :term:`statistics<Statistics>` for each image
+    Calculates pixel :term:`statistics<Statistics>` for each image.
 
     This function computes various statistical metrics (e.g., mean, standard deviation, entropy)
     on the images as a whole.
@@ -106,15 +108,10 @@ def pixelstats(
    --------
    Calculating the statistics on the images, whose shape is (C, H, W)
 
-    >>> results = pixelstats(
+    >>> results = pixelstats(stats_images)
    >>> print(results.mean)
-    [0.
-    0.292 0.3242 0.3562 0.3884 0.4204 0.4526 0.4846 0.5166 0.549
-    0.581 0.6133 0.6455 0.6772 0.7095 0.7417 0.774 0.8057 0.838
-    0.87 0.9023 0.934 ]
+    [0.2903 0.2108 0.397 0.596 0.743 ]
    >>> print(results.entropy)
-    [
-    0.812 0.9883 0.795 0.9243 0.9243 0.795 0.9907 0.8125 1.028 0.8223
-    1.046 0.8247 1.041 0.8203 1.012 0.812 0.9883 0.795 0.9243 0.9243]
+    [4.99 2.371 1.179 2.406 0.668]
    """
    return run_stats(images, bboxes, per_channel, [PixelStatsProcessor])[0]
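A sketch of the effect of the `HistogramPlotMixin` and `_excluded_keys` additions on `PixelStatsOutput`: `plot()` draws histograms of the scalar statistics while skipping the raw per-image `histogram` field. The input data below is synthetic, standing in for the `stats_images` fixture used in the doctest.

```python
import numpy as np
from dataeval.metrics.stats import pixelstats

rng = np.random.default_rng(0)
stats_images = [rng.random((3, 64, 64)) for _ in range(5)]  # synthetic (C, H, W) images

results = pixelstats(stats_images)
print(results.mean)     # one value per image
results.plot(log=True)  # histograms of mean/std/var/... ; the raw "histogram" field is skipped
```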
|