dataeval 0.81.0__py3-none-any.whl → 0.82.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +1 -1
- dataeval/config.py +68 -11
- dataeval/detectors/drift/__init__.py +2 -2
- dataeval/detectors/drift/_base.py +8 -64
- dataeval/detectors/drift/_mmd.py +12 -38
- dataeval/detectors/drift/_torch.py +7 -7
- dataeval/detectors/drift/_uncertainty.py +6 -5
- dataeval/detectors/drift/updates.py +20 -3
- dataeval/detectors/linters/__init__.py +3 -2
- dataeval/detectors/linters/duplicates.py +14 -46
- dataeval/detectors/linters/outliers.py +25 -159
- dataeval/detectors/ood/__init__.py +1 -1
- dataeval/detectors/ood/ae.py +6 -5
- dataeval/detectors/ood/base.py +2 -2
- dataeval/detectors/ood/metadata_ood_mi.py +4 -6
- dataeval/detectors/ood/mixin.py +3 -4
- dataeval/detectors/ood/vae.py +3 -2
- dataeval/metadata/__init__.py +2 -1
- dataeval/metadata/_distance.py +134 -0
- dataeval/metadata/_ood.py +30 -49
- dataeval/metadata/_utils.py +44 -0
- dataeval/metrics/bias/__init__.py +5 -4
- dataeval/metrics/bias/_balance.py +17 -149
- dataeval/metrics/bias/_coverage.py +4 -106
- dataeval/metrics/bias/_diversity.py +12 -107
- dataeval/metrics/bias/_parity.py +7 -71
- dataeval/metrics/estimators/__init__.py +5 -4
- dataeval/metrics/estimators/_ber.py +2 -20
- dataeval/metrics/estimators/_clusterer.py +1 -61
- dataeval/metrics/estimators/_divergence.py +2 -19
- dataeval/metrics/estimators/_uap.py +2 -16
- dataeval/metrics/stats/__init__.py +15 -12
- dataeval/metrics/stats/_base.py +41 -128
- dataeval/metrics/stats/_boxratiostats.py +13 -13
- dataeval/metrics/stats/_dimensionstats.py +17 -58
- dataeval/metrics/stats/_hashstats.py +19 -35
- dataeval/metrics/stats/_imagestats.py +94 -0
- dataeval/metrics/stats/_labelstats.py +42 -121
- dataeval/metrics/stats/_pixelstats.py +19 -51
- dataeval/metrics/stats/_visualstats.py +19 -51
- dataeval/outputs/__init__.py +57 -0
- dataeval/outputs/_base.py +182 -0
- dataeval/outputs/_bias.py +381 -0
- dataeval/outputs/_drift.py +83 -0
- dataeval/outputs/_estimators.py +114 -0
- dataeval/outputs/_linters.py +186 -0
- dataeval/outputs/_metadata.py +54 -0
- dataeval/{detectors/ood/output.py → outputs/_ood.py} +22 -22
- dataeval/outputs/_stats.py +393 -0
- dataeval/outputs/_utils.py +44 -0
- dataeval/outputs/_workflows.py +364 -0
- dataeval/typing.py +187 -7
- dataeval/utils/_method.py +1 -5
- dataeval/utils/_plot.py +2 -2
- dataeval/utils/data/__init__.py +5 -1
- dataeval/utils/data/_dataset.py +217 -0
- dataeval/utils/data/_embeddings.py +12 -14
- dataeval/utils/data/_images.py +30 -27
- dataeval/utils/data/_metadata.py +28 -11
- dataeval/utils/data/_selection.py +25 -22
- dataeval/utils/data/_split.py +5 -29
- dataeval/utils/data/_targets.py +14 -2
- dataeval/utils/data/datasets/_base.py +5 -5
- dataeval/utils/data/datasets/_cifar10.py +1 -1
- dataeval/utils/data/datasets/_milco.py +1 -1
- dataeval/utils/data/datasets/_mnist.py +1 -1
- dataeval/utils/data/datasets/_ships.py +1 -1
- dataeval/utils/data/{_types.py → datasets/_types.py} +10 -16
- dataeval/utils/data/datasets/_voc.py +1 -1
- dataeval/utils/data/selections/_classfilter.py +4 -5
- dataeval/utils/data/selections/_indices.py +2 -2
- dataeval/utils/data/selections/_limit.py +2 -2
- dataeval/utils/data/selections/_reverse.py +2 -2
- dataeval/utils/data/selections/_shuffle.py +2 -2
- dataeval/utils/torch/_internal.py +5 -5
- dataeval/utils/torch/trainer.py +8 -8
- dataeval/workflows/__init__.py +2 -1
- dataeval/workflows/sufficiency.py +6 -342
- {dataeval-0.81.0.dist-info → dataeval-0.82.1.dist-info}/METADATA +2 -2
- dataeval-0.82.1.dist-info/RECORD +105 -0
- dataeval/_output.py +0 -137
- dataeval/detectors/ood/metadata_ks_compare.py +0 -129
- dataeval/metrics/stats/_datasetstats.py +0 -198
- dataeval-0.81.0.dist-info/RECORD +0 -94
- {dataeval-0.81.0.dist-info → dataeval-0.82.1.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.81.0.dist-info → dataeval-0.82.1.dist-info}/WHEEL +0 -0
dataeval/metrics/stats/_base.py
CHANGED
@@ -4,35 +4,24 @@ __all__ = []
|
|
4
4
|
|
5
5
|
import re
|
6
6
|
import warnings
|
7
|
+
from collections import ChainMap
|
7
8
|
from copy import deepcopy
|
8
9
|
from dataclasses import dataclass
|
9
10
|
from functools import partial
|
10
|
-
from itertools import repeat
|
11
11
|
from multiprocessing import Pool
|
12
|
-
from typing import Any, Callable, Generic, Iterable,
|
12
|
+
from typing import Any, Callable, Generic, Iterable, Sequence, TypeVar, cast
|
13
13
|
|
14
14
|
import numpy as np
|
15
15
|
import tqdm
|
16
16
|
from numpy.typing import NDArray
|
17
17
|
|
18
|
-
from dataeval._output import Output
|
19
18
|
from dataeval.config import get_max_processes
|
20
|
-
from dataeval.
|
21
|
-
from dataeval.
|
19
|
+
from dataeval.outputs._stats import BaseStatsOutput, SourceIndex
|
20
|
+
from dataeval.typing import ArrayLike, Dataset, ObjectDetectionTarget
|
21
|
+
from dataeval.utils._array import to_numpy
|
22
22
|
from dataeval.utils._image import normalize_image_shape, rescale
|
23
|
-
from dataeval.utils._plot import histogram_plot
|
24
23
|
|
25
24
|
DTYPE_REGEX = re.compile(r"NDArray\[np\.(.*?)\]")
|
26
|
-
SOURCE_INDEX = "source_index"
|
27
|
-
BOX_COUNT = "box_count"
|
28
|
-
|
29
|
-
OptionalRange = Optional[Union[int, Iterable[int]]]
|
30
|
-
|
31
|
-
|
32
|
-
def matches(index: int | None, opt_range: OptionalRange) -> bool:
|
33
|
-
if index is None or opt_range is None:
|
34
|
-
return True
|
35
|
-
return index in opt_range if isinstance(opt_range, Iterable) else index == opt_range
|
36
25
|
|
37
26
|
|
38
27
|
def normalize_box_shape(bounding_box: NDArray[Any]) -> NDArray[Any]:
|
@@ -48,87 +37,6 @@ def normalize_box_shape(bounding_box: NDArray[Any]) -> NDArray[Any]:
|
|
48
37
|
return bounding_box
|
49
38
|
|
50
39
|
|
51
|
-
@dataclass
|
52
|
-
class SourceIndex:
|
53
|
-
"""
|
54
|
-
Attributes
|
55
|
-
----------
|
56
|
-
image: int
|
57
|
-
Index of the source image
|
58
|
-
box : int | None
|
59
|
-
Index of the box of the source image
|
60
|
-
channel : int | None
|
61
|
-
Index of the channel of the source image
|
62
|
-
"""
|
63
|
-
|
64
|
-
image: int
|
65
|
-
box: int | None
|
66
|
-
channel: int | None
|
67
|
-
|
68
|
-
|
69
|
-
@dataclass(frozen=True)
|
70
|
-
class BaseStatsOutput(Output):
|
71
|
-
"""
|
72
|
-
Attributes
|
73
|
-
----------
|
74
|
-
source_index : List[SourceIndex]
|
75
|
-
Mapping from statistic to source image, box and channel index
|
76
|
-
box_count : NDArray[np.uint16]
|
77
|
-
"""
|
78
|
-
|
79
|
-
source_index: list[SourceIndex]
|
80
|
-
box_count: NDArray[np.uint16]
|
81
|
-
|
82
|
-
def get_channel_mask(
|
83
|
-
self,
|
84
|
-
channel_index: OptionalRange,
|
85
|
-
channel_count: OptionalRange = None,
|
86
|
-
) -> list[bool]:
|
87
|
-
"""
|
88
|
-
Boolean mask for results filtered to specified channel index and optionally the count
|
89
|
-
of the channels per image.
|
90
|
-
|
91
|
-
Parameters
|
92
|
-
----------
|
93
|
-
channel_index : int | Iterable[int] | None
|
94
|
-
Index or indices of channel(s) to filter for
|
95
|
-
channel_count : int | Iterable[int] | None
|
96
|
-
Optional count(s) of channels to filter for
|
97
|
-
"""
|
98
|
-
mask: list[bool] = []
|
99
|
-
cur_mask: list[bool] = []
|
100
|
-
cur_image = 0
|
101
|
-
cur_max_channel = 0
|
102
|
-
for source_index in list(self.source_index) + [None]:
|
103
|
-
if source_index is None or source_index.image > cur_image:
|
104
|
-
mask.extend(cur_mask if matches(cur_max_channel + 1, channel_count) else [False for _ in cur_mask])
|
105
|
-
if source_index is not None:
|
106
|
-
cur_image = source_index.image
|
107
|
-
cur_max_channel = 0
|
108
|
-
cur_mask.clear()
|
109
|
-
if source_index is not None:
|
110
|
-
cur_mask.append(matches(source_index.channel, channel_index))
|
111
|
-
cur_max_channel = max(cur_max_channel, source_index.channel or 0)
|
112
|
-
return mask
|
113
|
-
|
114
|
-
def __len__(self) -> int:
|
115
|
-
return len(self.source_index)
|
116
|
-
|
117
|
-
|
118
|
-
def _is_plottable(k: str, v: Any, excluded_keys: Iterable[str]) -> bool:
|
119
|
-
return isinstance(v, np.ndarray) and v[v != 0].size > 0 and all(k != x for x in excluded_keys)
|
120
|
-
|
121
|
-
|
122
|
-
class HistogramPlotMixin:
|
123
|
-
_excluded_keys: Iterable[str] = []
|
124
|
-
|
125
|
-
def dict(self) -> dict[str, Any]: ...
|
126
|
-
|
127
|
-
def plot(self, log: bool) -> None:
|
128
|
-
data_dict = {k: v for k, v in self.dict().items() if _is_plottable(k, v, self._excluded_keys)}
|
129
|
-
histogram_plot(data_dict, log)
|
130
|
-
|
131
|
-
|
132
40
|
TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput, covariant=True)
|
133
41
|
|
134
42
|
|
@@ -193,10 +101,9 @@ class StatsProcessor(Generic[TStatsOutput]):
|
|
193
101
|
cls, source: dict[str, Any], source_index: list[SourceIndex], box_count: list[int]
|
194
102
|
) -> TStatsOutput:
|
195
103
|
output = {}
|
196
|
-
for
|
197
|
-
|
198
|
-
|
199
|
-
stat_type: str = cls.output_class.__annotations__[key]
|
104
|
+
attrs = dict(ChainMap(*(getattr(c, "__annotations__", {}) for c in cls.output_class.__mro__)))
|
105
|
+
for key in (key for key in source if key in attrs):
|
106
|
+
stat_type: str = attrs[key]
|
200
107
|
dtype_match = re.match(DTYPE_REGEX, stat_type)
|
201
108
|
if dtype_match is not None:
|
202
109
|
output[key] = np.asarray(source[key], dtype=np.dtype(dtype_match.group(1)))
|
@@ -215,16 +122,20 @@ class StatsProcessorOutput:
|
|
215
122
|
|
216
123
|
def process_stats(
|
217
124
|
i: int,
|
218
|
-
|
125
|
+
dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
|
126
|
+
per_box: bool,
|
219
127
|
per_channel: bool,
|
220
128
|
stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
|
221
129
|
) -> StatsProcessorOutput:
|
222
|
-
|
130
|
+
data = dataset[i]
|
131
|
+
image, target = (to_numpy(cast(ArrayLike, data[0])), data[1]) if isinstance(data, tuple) else (to_numpy(data), None)
|
132
|
+
target = None if not isinstance(target, ObjectDetectionTarget) else target
|
133
|
+
boxes = to_numpy(target.boxes) if target is not None else None
|
223
134
|
results_list: list[dict[str, Any]] = []
|
224
135
|
source_indices: list[SourceIndex] = []
|
225
136
|
box_counts: list[int] = []
|
226
137
|
warnings_list: list[str] = []
|
227
|
-
nboxes = [None] if boxes is None else normalize_box_shape(boxes)
|
138
|
+
nboxes = [None] if boxes is None or not per_box else normalize_box_shape(boxes)
|
228
139
|
for i_b, box in enumerate(nboxes):
|
229
140
|
i_b = None if box is None else i_b
|
230
141
|
processor_list = [p(image, box, per_channel) for p in stats_processor_cls]
|
@@ -232,7 +143,7 @@ def process_stats(
|
|
232
143
|
warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} is out of bounds of {image.shape}.")
|
233
144
|
results_list.append({k: v for p in processor_list for k, v in p.process().items()})
|
234
145
|
if per_channel:
|
235
|
-
source_indices.extend([SourceIndex(i, i_b, c) for c in range(
|
146
|
+
source_indices.extend([SourceIndex(i, i_b, c) for c in range(image.shape[-3])])
|
236
147
|
else:
|
237
148
|
source_indices.append(SourceIndex(i, i_b, None))
|
238
149
|
box_counts.append(0 if boxes is None else len(boxes))
|
@@ -240,16 +151,18 @@ def process_stats(
|
|
240
151
|
|
241
152
|
|
242
153
|
def process_stats_unpack(
|
243
|
-
|
154
|
+
i: int,
|
155
|
+
dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
|
156
|
+
per_box: bool,
|
244
157
|
per_channel: bool,
|
245
158
|
stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
|
246
159
|
) -> StatsProcessorOutput:
|
247
|
-
return process_stats(
|
160
|
+
return process_stats(i, dataset, per_box=per_box, per_channel=per_channel, stats_processor_cls=stats_processor_cls)
|
248
161
|
|
249
162
|
|
250
163
|
def run_stats(
|
251
|
-
|
252
|
-
|
164
|
+
dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
|
165
|
+
per_box: bool,
|
253
166
|
per_channel: bool,
|
254
167
|
stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
|
255
168
|
) -> list[TStatsOutput]:
|
@@ -262,13 +175,11 @@ def run_stats(
|
|
262
175
|
|
263
176
|
Parameters
|
264
177
|
----------
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
as an array-like structure in the format of (X0, Y0, X1, Y1). The length of the bounding boxes
|
271
|
-
iterable should match the length of the input images.
|
178
|
+
dataset : Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]]
|
179
|
+
A dataset of images and targets to compute statistics on.
|
180
|
+
per_box : bool
|
181
|
+
A flag which determines if the statistics should be evaluated on a per-box basis or not.
|
182
|
+
If the dataset does not include bounding boxes, this flag is ignored.
|
272
183
|
per_channel : bool
|
273
184
|
A flag which determines if the stats should be evaluated on a per-channel basis or not.
|
274
185
|
stats_processor_cls : Iterable[type[StatsProcessor]]
|
@@ -276,10 +187,8 @@ def run_stats(
|
|
276
187
|
|
277
188
|
Returns
|
278
189
|
-------
|
279
|
-
|
280
|
-
A
|
281
|
-
The dictionary keys correspond to the names of the statistics, and the values are :term:`NumPy` arrays
|
282
|
-
with the results of the computations.
|
190
|
+
list[TStatsOutput]
|
191
|
+
A list of output classes containing the computed statistics
|
283
192
|
|
284
193
|
Note
|
285
194
|
----
|
@@ -293,20 +202,24 @@ def run_stats(
|
|
293
202
|
results_list: list[dict[str, NDArray[np.float64]]] = []
|
294
203
|
source_index: list[SourceIndex] = []
|
295
204
|
box_count: list[int] = []
|
296
|
-
bbox_iter = repeat(None) if bboxes is None else to_numpy_iter(bboxes)
|
297
205
|
|
298
206
|
warning_list = []
|
299
|
-
total_for_status = len(images) if isinstance(images, Sized) else None
|
300
207
|
stats_processor_cls = stats_processor_cls if isinstance(stats_processor_cls, Iterable) else [stats_processor_cls]
|
301
208
|
|
302
209
|
# TODO: Introduce global controls for CPU job parallelism and GPU configurations
|
303
210
|
with Pool(processes=get_max_processes()) as p:
|
304
211
|
for r in tqdm.tqdm(
|
305
212
|
p.imap(
|
306
|
-
partial(
|
307
|
-
|
213
|
+
partial(
|
214
|
+
process_stats_unpack,
|
215
|
+
dataset=dataset,
|
216
|
+
per_box=per_box,
|
217
|
+
per_channel=per_channel,
|
218
|
+
stats_processor_cls=stats_processor_cls,
|
219
|
+
),
|
220
|
+
range(len(dataset)),
|
308
221
|
),
|
309
|
-
total=
|
222
|
+
total=len(dataset),
|
310
223
|
):
|
311
224
|
results_list.extend(r.results)
|
312
225
|
source_index.extend(r.source_indices)
|
@@ -335,13 +248,13 @@ def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
|
|
335
248
|
if type(a) is not type(b):
|
336
249
|
raise TypeError(f"Types {type(a)} and {type(b)} cannot be added.")
|
337
250
|
|
338
|
-
sum_dict = deepcopy(a.
|
251
|
+
sum_dict = deepcopy(a.data())
|
339
252
|
|
340
253
|
for k in sum_dict:
|
341
254
|
if isinstance(sum_dict[k], list):
|
342
|
-
sum_dict[k].extend(b.
|
255
|
+
sum_dict[k].extend(b.data()[k])
|
343
256
|
else:
|
344
|
-
sum_dict[k] = np.concatenate((sum_dict[k], b.
|
257
|
+
sum_dict[k] = np.concatenate((sum_dict[k], b.data()[k]))
|
345
258
|
|
346
259
|
return type(a)(**sum_dict)
|
347
260
|
|
@@ -8,9 +8,8 @@ from typing import Any, Callable, Generic, TypeVar, cast
|
|
8
8
|
import numpy as np
|
9
9
|
from numpy.typing import NDArray
|
10
10
|
|
11
|
-
from dataeval.
|
12
|
-
from dataeval.
|
13
|
-
from dataeval.metrics.stats._dimensionstats import DimensionStatsOutput
|
11
|
+
from dataeval.outputs._base import set_metadata
|
12
|
+
from dataeval.outputs._stats import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput, DimensionStatsOutput
|
14
13
|
|
15
14
|
TStatOutput = TypeVar("TStatOutput", bound=BaseStatsOutput, contravariant=True)
|
16
15
|
ArraySlice = tuple[int, int]
|
@@ -125,27 +124,28 @@ def boxratiostats(
|
|
125
124
|
|
126
125
|
Examples
|
127
126
|
--------
|
128
|
-
|
127
|
+
Calculate the box ratio statistics using the dimension stats of the images and boxes
|
128
|
+
on a dataset containing 15 targets.
|
129
129
|
|
130
130
|
>>> from dataeval.metrics.stats import dimensionstats
|
131
|
-
>>> imagestats = dimensionstats(
|
132
|
-
>>> boxstats = dimensionstats(
|
131
|
+
>>> imagestats = dimensionstats(dataset, per_box=False)
|
132
|
+
>>> boxstats = dimensionstats(dataset, per_box=True)
|
133
133
|
>>> ratiostats = boxratiostats(boxstats, imagestats)
|
134
134
|
>>> print(ratiostats.aspect_ratio)
|
135
|
-
[ 0.
|
136
|
-
0.
|
135
|
+
[ 0.864 0.588 16. 0.857 1.27 0.438 0.667 3.833 1.95 0.833
|
136
|
+
1. 0.6 0.522 15. 3.834]
|
137
137
|
>>> print(ratiostats.size)
|
138
|
-
[0.
|
139
|
-
0.
|
138
|
+
[0.026 0.01 0.001 0.018 0.023 0.007 0.009 0.034 0.021 0.007 0.001 0.008
|
139
|
+
0.017 0.001 0.008]
|
140
140
|
"""
|
141
141
|
output_cls = type(boxstats)
|
142
142
|
if type(boxstats) is not type(imgstats):
|
143
143
|
raise TypeError("Must provide stats outputs of the same type.")
|
144
144
|
if boxstats.source_index[-1].image != imgstats.source_index[-1].image:
|
145
145
|
raise ValueError("Stats index_map length mismatch. Check if the correct box and image stats were provided.")
|
146
|
-
if
|
146
|
+
if any(src_idx.box is None for src_idx in boxstats.source_index):
|
147
147
|
raise ValueError("Input for boxstats must contain box information.")
|
148
|
-
if any(
|
148
|
+
if any(src_idx.box is not None for src_idx in imgstats.source_index):
|
149
149
|
raise ValueError("Input for imgstats must not contain box information.")
|
150
150
|
boxstats_has_channels = any(si.channel is None for si in boxstats.source_index)
|
151
151
|
imgstats_has_channels = any(si.channel is None for si in imgstats.source_index)
|
@@ -153,7 +153,7 @@ def boxratiostats(
|
|
153
153
|
raise ValueError("Input for boxstats and imgstats must have matching channel information.")
|
154
154
|
|
155
155
|
output_dict = {}
|
156
|
-
for key in boxstats.
|
156
|
+
for key in boxstats.data():
|
157
157
|
output_dict[key] = calculate_ratios(key, boxstats, imgstats)
|
158
158
|
|
159
159
|
return output_cls(**output_dict)
|
@@ -2,59 +2,17 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
__all__ = []
|
4
4
|
|
5
|
-
from
|
6
|
-
from typing import Any, Callable, Iterable
|
5
|
+
from typing import Any, Callable
|
7
6
|
|
8
7
|
import numpy as np
|
9
|
-
from numpy.typing import NDArray
|
10
8
|
|
11
|
-
from dataeval.
|
12
|
-
from dataeval.
|
13
|
-
from dataeval.
|
9
|
+
from dataeval.metrics.stats._base import StatsProcessor, run_stats
|
10
|
+
from dataeval.outputs import DimensionStatsOutput
|
11
|
+
from dataeval.outputs._base import set_metadata
|
12
|
+
from dataeval.typing import ArrayLike, Dataset
|
14
13
|
from dataeval.utils._image import get_bitdepth
|
15
14
|
|
16
15
|
|
17
|
-
@dataclass(frozen=True)
|
18
|
-
class DimensionStatsOutput(BaseStatsOutput, HistogramPlotMixin):
|
19
|
-
"""
|
20
|
-
Output class for :func:`.dimensionstats` stats metric.
|
21
|
-
|
22
|
-
Attributes
|
23
|
-
----------
|
24
|
-
left : NDArray[np.int32]
|
25
|
-
Offsets from the left edge of images in pixels
|
26
|
-
top : NDArray[np.int32]
|
27
|
-
Offsets from the top edge of images in pixels
|
28
|
-
width : NDArray[np.uint32]
|
29
|
-
Width of the images in pixels
|
30
|
-
height : NDArray[np.uint32]
|
31
|
-
Height of the images in pixels
|
32
|
-
channels : NDArray[np.uint8]
|
33
|
-
Channel count of the images in pixels
|
34
|
-
size : NDArray[np.uint32]
|
35
|
-
Size of the images in pixels
|
36
|
-
aspect_ratio : NDArray[np.float16]
|
37
|
-
:term:`Aspect Ratio<Aspect Ratio>` of the images (width/height)
|
38
|
-
depth : NDArray[np.uint8]
|
39
|
-
Color depth of the images in bits
|
40
|
-
center : NDArray[np.uint16]
|
41
|
-
Offset from center in [x,y] coordinates of the images in pixels
|
42
|
-
distance : NDArray[np.float16]
|
43
|
-
Distance in pixels from center
|
44
|
-
"""
|
45
|
-
|
46
|
-
left: NDArray[np.int32]
|
47
|
-
top: NDArray[np.int32]
|
48
|
-
width: NDArray[np.uint32]
|
49
|
-
height: NDArray[np.uint32]
|
50
|
-
channels: NDArray[np.uint8]
|
51
|
-
size: NDArray[np.uint32]
|
52
|
-
aspect_ratio: NDArray[np.float16]
|
53
|
-
depth: NDArray[np.uint8]
|
54
|
-
center: NDArray[np.int16]
|
55
|
-
distance: NDArray[np.float16]
|
56
|
-
|
57
|
-
|
58
16
|
class DimensionStatsProcessor(StatsProcessor[DimensionStatsOutput]):
|
59
17
|
output_class: type = DimensionStatsOutput
|
60
18
|
image_function_map: dict[str, Callable[[StatsProcessor[DimensionStatsOutput]], Any]] = {
|
@@ -76,8 +34,9 @@ class DimensionStatsProcessor(StatsProcessor[DimensionStatsOutput]):
|
|
76
34
|
|
77
35
|
@set_metadata
|
78
36
|
def dimensionstats(
|
79
|
-
|
80
|
-
|
37
|
+
dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
|
38
|
+
*,
|
39
|
+
per_box: bool = False,
|
81
40
|
) -> DimensionStatsOutput:
|
82
41
|
"""
|
83
42
|
Calculates dimension :term:`statistics<Statistics>` for each image.
|
@@ -87,10 +46,10 @@ def dimensionstats(
|
|
87
46
|
|
88
47
|
Parameters
|
89
48
|
----------
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
49
|
+
dataset : Dataset
|
50
|
+
Dataset to perform calculations on.
|
51
|
+
per_box : bool, default False
|
52
|
+
If True, perform calculations on each bounding box.
|
94
53
|
|
95
54
|
Returns
|
96
55
|
-------
|
@@ -105,12 +64,12 @@ def dimensionstats(
|
|
105
64
|
|
106
65
|
Examples
|
107
66
|
--------
|
108
|
-
|
67
|
+
Calculate the dimension statistics of a dataset of 8 images, whose shape is (C, H, W).
|
109
68
|
|
110
|
-
>>> results = dimensionstats(
|
69
|
+
>>> results = dimensionstats(dataset)
|
111
70
|
>>> print(results.aspect_ratio)
|
112
|
-
[1.
|
71
|
+
[1. 1. 1.333 1. 0.667 1. 1. 1. ]
|
113
72
|
>>> print(results.channels)
|
114
|
-
[3 3 1 3 1]
|
73
|
+
[3 3 1 3 1 3 3 3]
|
115
74
|
"""
|
116
|
-
return run_stats(
|
75
|
+
return run_stats(dataset, per_box, False, [DimensionStatsProcessor])[0]
|
@@ -4,17 +4,17 @@ import warnings
|
|
4
4
|
|
5
5
|
__all__ = []
|
6
6
|
|
7
|
-
from
|
8
|
-
from typing import Callable, Iterable
|
7
|
+
from typing import Any, Callable
|
9
8
|
|
10
9
|
import numpy as np
|
11
10
|
import xxhash as xxh
|
12
11
|
from PIL import Image
|
13
12
|
from scipy.fftpack import dct
|
14
13
|
|
15
|
-
from dataeval.
|
16
|
-
from dataeval.
|
17
|
-
from dataeval.
|
14
|
+
from dataeval.metrics.stats._base import StatsProcessor, run_stats
|
15
|
+
from dataeval.outputs import HashStatsOutput
|
16
|
+
from dataeval.outputs._base import set_metadata
|
17
|
+
from dataeval.typing import ArrayLike, Dataset
|
18
18
|
from dataeval.utils._array import as_numpy
|
19
19
|
from dataeval.utils._image import normalize_image_shape, rescale
|
20
20
|
|
@@ -22,23 +22,6 @@ HASH_SIZE = 8
|
|
22
22
|
MAX_FACTOR = 4
|
23
23
|
|
24
24
|
|
25
|
-
@dataclass(frozen=True)
|
26
|
-
class HashStatsOutput(BaseStatsOutput):
|
27
|
-
"""
|
28
|
-
Output class for :func:`.hashstats` stats metric.
|
29
|
-
|
30
|
-
Attributes
|
31
|
-
----------
|
32
|
-
xxhash : List[str]
|
33
|
-
xxHash hash of the images as a hex string
|
34
|
-
pchash : List[str]
|
35
|
-
:term:`Perception-based Hash` of the images as a hex string
|
36
|
-
"""
|
37
|
-
|
38
|
-
xxhash: list[str]
|
39
|
-
pchash: list[str]
|
40
|
-
|
41
|
-
|
42
25
|
def pchash(image: ArrayLike) -> str:
|
43
26
|
"""
|
44
27
|
Performs a perceptual hash on an image by resizing to a square NxN image
|
@@ -122,8 +105,9 @@ class HashStatsProcessor(StatsProcessor[HashStatsOutput]):
|
|
122
105
|
|
123
106
|
@set_metadata
|
124
107
|
def hashstats(
|
125
|
-
|
126
|
-
|
108
|
+
dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
|
109
|
+
*,
|
110
|
+
per_box: bool = False,
|
127
111
|
) -> HashStatsOutput:
|
128
112
|
"""
|
129
113
|
Calculates hashes for each image.
|
@@ -133,10 +117,10 @@ def hashstats(
|
|
133
117
|
|
134
118
|
Parameters
|
135
119
|
----------
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
120
|
+
dataset : Dataset
|
121
|
+
Dataset to perform calculations on.
|
122
|
+
per_box : bool, default False
|
123
|
+
If True, perform calculations on each bounding box.
|
140
124
|
|
141
125
|
Returns
|
142
126
|
-------
|
@@ -149,12 +133,12 @@ def hashstats(
|
|
149
133
|
|
150
134
|
Examples
|
151
135
|
--------
|
152
|
-
|
136
|
+
Calculate the hashes of a dataset of images, whose shape is (C, H, W)
|
153
137
|
|
154
|
-
>>> results = hashstats(
|
155
|
-
>>> print(results.xxhash)
|
156
|
-
['
|
157
|
-
>>> print(results.pchash)
|
158
|
-
['
|
138
|
+
>>> results = hashstats(dataset)
|
139
|
+
>>> print(results.xxhash[:5])
|
140
|
+
['66a93f556577c086', 'd8b686fb405c4105', '7ffdb4990ad44ac6', '42cd4c34c80f6006', 'c5519e36ac1f8839']
|
141
|
+
>>> print(results.pchash[:5])
|
142
|
+
['e666999999266666', 'e666999999266666', 'e666999966666299', 'e666999999266666', '96e91656e91616e9']
|
159
143
|
"""
|
160
|
-
return run_stats(
|
144
|
+
return run_stats(dataset, per_box, False, [HashStatsProcessor])[0]
|
@@ -0,0 +1,94 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
__all__ = []
|
4
|
+
|
5
|
+
from typing import Any, Literal, overload
|
6
|
+
|
7
|
+
from dataeval.metrics.stats._base import run_stats
|
8
|
+
from dataeval.metrics.stats._dimensionstats import DimensionStatsProcessor
|
9
|
+
from dataeval.metrics.stats._pixelstats import PixelStatsProcessor
|
10
|
+
from dataeval.metrics.stats._visualstats import VisualStatsProcessor
|
11
|
+
from dataeval.outputs import ChannelStatsOutput, ImageStatsOutput
|
12
|
+
from dataeval.outputs._base import set_metadata
|
13
|
+
from dataeval.typing import ArrayLike, Dataset
|
14
|
+
|
15
|
+
|
16
|
+
@overload
|
17
|
+
def imagestats(
|
18
|
+
dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
|
19
|
+
*,
|
20
|
+
per_box: bool = False,
|
21
|
+
per_channel: Literal[True],
|
22
|
+
) -> ChannelStatsOutput: ...
|
23
|
+
|
24
|
+
|
25
|
+
@overload
|
26
|
+
def imagestats(
|
27
|
+
dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
|
28
|
+
*,
|
29
|
+
per_box: bool = False,
|
30
|
+
per_channel: Literal[False] = False,
|
31
|
+
) -> ImageStatsOutput: ...
|
32
|
+
|
33
|
+
|
34
|
+
@set_metadata
|
35
|
+
def imagestats(
|
36
|
+
dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
|
37
|
+
*,
|
38
|
+
per_box: bool = False,
|
39
|
+
per_channel: bool = False,
|
40
|
+
) -> ImageStatsOutput | ChannelStatsOutput:
|
41
|
+
"""
|
42
|
+
Calculates various :term:`statistics<Statistics>` for each image.
|
43
|
+
|
44
|
+
This function computes dimension, pixel and visual metrics
|
45
|
+
on the images or individual bounding boxes for each image. If
|
46
|
+
performing calculations per channel, dimension stats are excluded.
|
47
|
+
|
48
|
+
Parameters
|
49
|
+
----------
|
50
|
+
dataset : Dataset
|
51
|
+
Dataset to perform calculations on.
|
52
|
+
per_box : bool, default False
|
53
|
+
If True, perform calculations on each bounding box.
|
54
|
+
per_channel : bool, default False
|
55
|
+
If True, perform calculations on each channel.
|
56
|
+
|
57
|
+
Returns
|
58
|
+
-------
|
59
|
+
ImageStatsOutput or ChannelStatsOutput
|
60
|
+
Output class containing the outputs of various stats functions
|
61
|
+
|
62
|
+
See Also
|
63
|
+
--------
|
64
|
+
dimensionstats, pixelstats, visualstats
|
65
|
+
|
66
|
+
Examples
|
67
|
+
--------
|
68
|
+
Calculate dimension, pixel and visual statistics for a dataset containing 8
|
69
|
+
images.
|
70
|
+
|
71
|
+
>>> stats = imagestats(dataset)
|
72
|
+
>>> print(stats.aspect_ratio)
|
73
|
+
[1. 1. 1.333 1. 0.667 1. 1. 1. ]
|
74
|
+
|
75
|
+
>>> print(stats.sharpness)
|
76
|
+
[20.23 20.23 23.33 20.23 77.06 20.23 20.23 20.23]
|
77
|
+
|
78
|
+
Calculate the pixel and visual stats for a dataset containing 6 3-channel
|
79
|
+
images and 2 1-channel images for a total of 20 channels.
|
80
|
+
|
81
|
+
>>> ch_stats = imagestats(dataset, per_channel=True)
|
82
|
+
>>> print(ch_stats.brightness)
|
83
|
+
[0.027 0.152 0.277 0.127 0.135 0.142 0.259 0.377 0.385 0.392 0.508 0.626
|
84
|
+
0.634 0.642 0.751 0.759 0.767 0.876 0.884 0.892]
|
85
|
+
"""
|
86
|
+
if per_channel:
|
87
|
+
processors = [PixelStatsProcessor, VisualStatsProcessor]
|
88
|
+
output_cls = ChannelStatsOutput
|
89
|
+
else:
|
90
|
+
processors = [DimensionStatsProcessor, PixelStatsProcessor, VisualStatsProcessor]
|
91
|
+
output_cls = ImageStatsOutput
|
92
|
+
|
93
|
+
outputs = run_stats(dataset, per_box, per_channel, processors)
|
94
|
+
return output_cls(**{k: v for d in outputs for k, v in d.data().items()})
|