dataeval 0.76.1__py3-none-any.whl → 0.82.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +3 -3
- dataeval/config.py +77 -0
- dataeval/detectors/__init__.py +1 -1
- dataeval/detectors/drift/__init__.py +6 -6
- dataeval/detectors/drift/{base.py → _base.py} +40 -85
- dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
- dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
- dataeval/detectors/drift/{mmd.py → _mmd.py} +31 -43
- dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
- dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +24 -7
- dataeval/detectors/drift/updates.py +20 -3
- dataeval/detectors/linters/__init__.py +3 -5
- dataeval/detectors/linters/duplicates.py +13 -36
- dataeval/detectors/linters/outliers.py +23 -148
- dataeval/detectors/ood/__init__.py +1 -1
- dataeval/detectors/ood/ae.py +30 -9
- dataeval/detectors/ood/base.py +5 -4
- dataeval/detectors/ood/mixin.py +21 -7
- dataeval/detectors/ood/vae.py +73 -0
- dataeval/metadata/__init__.py +6 -0
- dataeval/metadata/_distance.py +167 -0
- dataeval/metadata/_ood.py +217 -0
- dataeval/metadata/_utils.py +44 -0
- dataeval/metrics/__init__.py +1 -1
- dataeval/metrics/bias/__init__.py +6 -4
- dataeval/metrics/bias/{balance.py → _balance.py} +15 -101
- dataeval/metrics/bias/_coverage.py +98 -0
- dataeval/metrics/bias/{diversity.py → _diversity.py} +18 -111
- dataeval/metrics/bias/{parity.py → _parity.py} +39 -77
- dataeval/metrics/estimators/__init__.py +15 -4
- dataeval/metrics/estimators/{ber.py → _ber.py} +42 -29
- dataeval/metrics/estimators/_clusterer.py +44 -0
- dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -30
- dataeval/metrics/estimators/{uap.py → _uap.py} +4 -18
- dataeval/metrics/stats/__init__.py +16 -13
- dataeval/metrics/stats/{base.py → _base.py} +82 -133
- dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +15 -18
- dataeval/metrics/stats/_dimensionstats.py +75 -0
- dataeval/metrics/stats/{hashstats.py → _hashstats.py} +21 -37
- dataeval/metrics/stats/_imagestats.py +94 -0
- dataeval/metrics/stats/_labelstats.py +131 -0
- dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +19 -50
- dataeval/metrics/stats/{visualstats.py → _visualstats.py} +23 -54
- dataeval/outputs/__init__.py +53 -0
- dataeval/{output.py → outputs/_base.py} +55 -25
- dataeval/outputs/_bias.py +381 -0
- dataeval/outputs/_drift.py +83 -0
- dataeval/outputs/_estimators.py +114 -0
- dataeval/outputs/_linters.py +184 -0
- dataeval/{detectors/ood/output.py → outputs/_ood.py} +22 -22
- dataeval/outputs/_stats.py +387 -0
- dataeval/outputs/_utils.py +44 -0
- dataeval/outputs/_workflows.py +364 -0
- dataeval/typing.py +234 -0
- dataeval/utils/__init__.py +2 -2
- dataeval/utils/_array.py +169 -0
- dataeval/utils/_bin.py +199 -0
- dataeval/utils/_clusterer.py +144 -0
- dataeval/utils/_fast_mst.py +189 -0
- dataeval/utils/{image.py → _image.py} +6 -4
- dataeval/utils/_method.py +14 -0
- dataeval/utils/{shared.py → _mst.py} +3 -65
- dataeval/utils/{plot.py → _plot.py} +6 -6
- dataeval/utils/data/__init__.py +26 -0
- dataeval/utils/data/_dataset.py +217 -0
- dataeval/utils/data/_embeddings.py +104 -0
- dataeval/utils/data/_images.py +68 -0
- dataeval/utils/data/_metadata.py +360 -0
- dataeval/utils/data/_selection.py +126 -0
- dataeval/utils/{dataset/split.py → data/_split.py} +12 -38
- dataeval/utils/data/_targets.py +85 -0
- dataeval/utils/data/collate.py +103 -0
- dataeval/utils/data/datasets/__init__.py +17 -0
- dataeval/utils/data/datasets/_base.py +254 -0
- dataeval/utils/data/datasets/_cifar10.py +134 -0
- dataeval/utils/data/datasets/_fileio.py +168 -0
- dataeval/utils/data/datasets/_milco.py +153 -0
- dataeval/utils/data/datasets/_mixin.py +56 -0
- dataeval/utils/data/datasets/_mnist.py +183 -0
- dataeval/utils/data/datasets/_ships.py +123 -0
- dataeval/utils/data/datasets/_types.py +52 -0
- dataeval/utils/data/datasets/_voc.py +352 -0
- dataeval/utils/data/selections/__init__.py +15 -0
- dataeval/utils/data/selections/_classfilter.py +57 -0
- dataeval/utils/data/selections/_indices.py +26 -0
- dataeval/utils/data/selections/_limit.py +26 -0
- dataeval/utils/data/selections/_reverse.py +18 -0
- dataeval/utils/data/selections/_shuffle.py +29 -0
- dataeval/utils/metadata.py +51 -376
- dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
- dataeval/utils/torch/{internal.py → _internal.py} +21 -51
- dataeval/utils/torch/models.py +43 -2
- dataeval/workflows/__init__.py +2 -1
- dataeval/workflows/sufficiency.py +11 -346
- {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/METADATA +5 -2
- dataeval-0.82.0.dist-info/RECORD +104 -0
- dataeval/detectors/linters/clusterer.py +0 -512
- dataeval/detectors/linters/merged_stats.py +0 -49
- dataeval/detectors/ood/metadata_ks_compare.py +0 -129
- dataeval/detectors/ood/metadata_least_likely.py +0 -119
- dataeval/interop.py +0 -69
- dataeval/metrics/bias/coverage.py +0 -194
- dataeval/metrics/stats/datasetstats.py +0 -202
- dataeval/metrics/stats/dimensionstats.py +0 -115
- dataeval/metrics/stats/labelstats.py +0 -210
- dataeval/utils/dataset/__init__.py +0 -7
- dataeval/utils/dataset/datasets.py +0 -412
- dataeval/utils/dataset/read.py +0 -63
- dataeval-0.76.1.dist-info/RECORD +0 -67
- /dataeval/{log.py → _log.py} +0 -0
- /dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
- {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/WHEEL +0 -0
@@ -4,41 +4,24 @@ import warnings
|
|
4
4
|
|
5
5
|
__all__ = []
|
6
6
|
|
7
|
-
from
|
8
|
-
from typing import Callable, Iterable
|
7
|
+
from typing import Any, Callable
|
9
8
|
|
10
9
|
import numpy as np
|
11
10
|
import xxhash as xxh
|
12
|
-
from numpy.typing import ArrayLike
|
13
11
|
from PIL import Image
|
14
12
|
from scipy.fftpack import dct
|
15
13
|
|
16
|
-
from dataeval.
|
17
|
-
from dataeval.
|
18
|
-
from dataeval.
|
19
|
-
from dataeval.
|
14
|
+
from dataeval.metrics.stats._base import StatsProcessor, run_stats
|
15
|
+
from dataeval.outputs import HashStatsOutput
|
16
|
+
from dataeval.outputs._base import set_metadata
|
17
|
+
from dataeval.typing import ArrayLike, Dataset
|
18
|
+
from dataeval.utils._array import as_numpy
|
19
|
+
from dataeval.utils._image import normalize_image_shape, rescale
|
20
20
|
|
21
21
|
HASH_SIZE = 8
|
22
22
|
MAX_FACTOR = 4
|
23
23
|
|
24
24
|
|
25
|
-
@dataclass(frozen=True)
|
26
|
-
class HashStatsOutput(BaseStatsOutput):
|
27
|
-
"""
|
28
|
-
Output class for :func:`hashstats` stats metric.
|
29
|
-
|
30
|
-
Attributes
|
31
|
-
----------
|
32
|
-
xxhash : List[str]
|
33
|
-
xxHash hash of the images as a hex string
|
34
|
-
pchash : List[str]
|
35
|
-
:term:`Perception-based Hash` of the images as a hex string
|
36
|
-
"""
|
37
|
-
|
38
|
-
xxhash: list[str]
|
39
|
-
pchash: list[str]
|
40
|
-
|
41
|
-
|
42
25
|
def pchash(image: ArrayLike) -> str:
|
43
26
|
"""
|
44
27
|
Performs a perceptual hash on an image by resizing to a square NxN image
|
@@ -122,8 +105,9 @@ class HashStatsProcessor(StatsProcessor[HashStatsOutput]):
|
|
122
105
|
|
123
106
|
@set_metadata
|
124
107
|
def hashstats(
|
125
|
-
|
126
|
-
|
108
|
+
dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
|
109
|
+
*,
|
110
|
+
per_box: bool = False,
|
127
111
|
) -> HashStatsOutput:
|
128
112
|
"""
|
129
113
|
Calculates hashes for each image.
|
@@ -133,10 +117,10 @@ def hashstats(
|
|
133
117
|
|
134
118
|
Parameters
|
135
119
|
----------
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
120
|
+
dataset : Dataset
|
121
|
+
Dataset to perform calculations on.
|
122
|
+
per_box : bool, default False
|
123
|
+
If True, perform calculations on each bounding box.
|
140
124
|
|
141
125
|
Returns
|
142
126
|
-------
|
@@ -149,12 +133,12 @@ def hashstats(
|
|
149
133
|
|
150
134
|
Examples
|
151
135
|
--------
|
152
|
-
|
136
|
+
Calculate the hashes of a dataset of images, whose shape is (C, H, W)
|
153
137
|
|
154
|
-
>>> results = hashstats(
|
155
|
-
>>> print(results.xxhash)
|
156
|
-
['
|
157
|
-
>>> print(results.pchash)
|
158
|
-
['
|
138
|
+
>>> results = hashstats(dataset)
|
139
|
+
>>> print(results.xxhash[:5])
|
140
|
+
['66a93f556577c086', 'd8b686fb405c4105', '7ffdb4990ad44ac6', '42cd4c34c80f6006', 'c5519e36ac1f8839']
|
141
|
+
>>> print(results.pchash[:5])
|
142
|
+
['e666999999266666', 'e666999999266666', 'e666999966666299', 'e666999999266666', '96e91656e91616e9']
|
159
143
|
"""
|
160
|
-
return run_stats(
|
144
|
+
return run_stats(dataset, per_box, False, [HashStatsProcessor])[0]
|
@@ -0,0 +1,94 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
__all__ = []
|
4
|
+
|
5
|
+
from typing import Any, Literal, overload
|
6
|
+
|
7
|
+
from dataeval.metrics.stats._base import run_stats
|
8
|
+
from dataeval.metrics.stats._dimensionstats import DimensionStatsProcessor
|
9
|
+
from dataeval.metrics.stats._pixelstats import PixelStatsProcessor
|
10
|
+
from dataeval.metrics.stats._visualstats import VisualStatsProcessor
|
11
|
+
from dataeval.outputs import ChannelStatsOutput, ImageStatsOutput
|
12
|
+
from dataeval.outputs._base import set_metadata
|
13
|
+
from dataeval.typing import ArrayLike, Dataset
|
14
|
+
|
15
|
+
|
16
|
+
@overload
|
17
|
+
def imagestats(
|
18
|
+
dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
|
19
|
+
*,
|
20
|
+
per_box: bool = False,
|
21
|
+
per_channel: Literal[True],
|
22
|
+
) -> ChannelStatsOutput: ...
|
23
|
+
|
24
|
+
|
25
|
+
@overload
|
26
|
+
def imagestats(
|
27
|
+
dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
|
28
|
+
*,
|
29
|
+
per_box: bool = False,
|
30
|
+
per_channel: Literal[False] = False,
|
31
|
+
) -> ImageStatsOutput: ...
|
32
|
+
|
33
|
+
|
34
|
+
@set_metadata
|
35
|
+
def imagestats(
|
36
|
+
dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
|
37
|
+
*,
|
38
|
+
per_box: bool = False,
|
39
|
+
per_channel: bool = False,
|
40
|
+
) -> ImageStatsOutput | ChannelStatsOutput:
|
41
|
+
"""
|
42
|
+
Calculates various :term:`statistics<Statistics>` for each image.
|
43
|
+
|
44
|
+
This function computes dimension, pixel and visual metrics
|
45
|
+
on the images or individual bounding boxes for each image as
|
46
|
+
well as label statistics if provided.
|
47
|
+
|
48
|
+
Parameters
|
49
|
+
----------
|
50
|
+
dataset : Dataset
|
51
|
+
Dataset to perform calculations on.
|
52
|
+
per_box : bool, default False
|
53
|
+
If True, perform calculations on each bounding box.
|
54
|
+
per_channel : bool, default False
|
55
|
+
If True, perform calculations on each channel.
|
56
|
+
|
57
|
+
Returns
|
58
|
+
-------
|
59
|
+
ImageStatsOutput or ChannelStatsOutput
|
60
|
+
Output class containing the outputs of various stats functions
|
61
|
+
|
62
|
+
See Also
|
63
|
+
--------
|
64
|
+
dimensionstats, labelstats, pixelstats, visualstats, Outliers
|
65
|
+
|
66
|
+
Examples
|
67
|
+
--------
|
68
|
+
Calculate dimension, pixel and visual statistics for a dataset containing 8
|
69
|
+
images.
|
70
|
+
|
71
|
+
>>> stats = imagestats(dataset)
|
72
|
+
>>> print(stats.aspect_ratio)
|
73
|
+
[1. 1. 1.333 1. 0.667 1. 1. 1. ]
|
74
|
+
|
75
|
+
>>> print(stats.sharpness)
|
76
|
+
[20.23 20.23 23.33 20.23 77.06 20.23 20.23 20.23]
|
77
|
+
|
78
|
+
Calculate the pixel and visual stats for a dataset containing 6 3-channel
|
79
|
+
images and 2 1-channel images for a total of 20 channels.
|
80
|
+
|
81
|
+
>>> ch_stats = imagestats(dataset, per_channel=True)
|
82
|
+
>>> print(ch_stats.brightness)
|
83
|
+
[0.027 0.152 0.277 0.127 0.135 0.142 0.259 0.377 0.385 0.392 0.508 0.626
|
84
|
+
0.634 0.642 0.751 0.759 0.767 0.876 0.884 0.892]
|
85
|
+
"""
|
86
|
+
if per_channel:
|
87
|
+
processors = [PixelStatsProcessor, VisualStatsProcessor]
|
88
|
+
output_cls = ChannelStatsOutput
|
89
|
+
else:
|
90
|
+
processors = [DimensionStatsProcessor, PixelStatsProcessor, VisualStatsProcessor]
|
91
|
+
output_cls = ImageStatsOutput
|
92
|
+
|
93
|
+
outputs = run_stats(dataset, per_box, per_channel, processors)
|
94
|
+
return output_cls(**{k: v for d in outputs for k, v in d.dict().items()})
|
@@ -0,0 +1,131 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
__all__ = []
|
4
|
+
|
5
|
+
from collections import Counter, defaultdict
|
6
|
+
from typing import Any, Mapping, TypeVar
|
7
|
+
|
8
|
+
import numpy as np
|
9
|
+
|
10
|
+
from dataeval.outputs import LabelStatsOutput
|
11
|
+
from dataeval.outputs._base import set_metadata
|
12
|
+
from dataeval.typing import AnnotatedDataset, ArrayLike
|
13
|
+
from dataeval.utils._array import as_numpy
|
14
|
+
from dataeval.utils.data._metadata import Metadata
|
15
|
+
|
16
|
+
TValue = TypeVar("TValue")
|
17
|
+
|
18
|
+
|
19
|
+
def _ensure_2d(labels: ArrayLike) -> ArrayLike:
|
20
|
+
if isinstance(labels, np.ndarray):
|
21
|
+
return labels[:, None]
|
22
|
+
else:
|
23
|
+
return [[lbl] for lbl in labels] # type: ignore
|
24
|
+
|
25
|
+
|
26
|
+
def _get_list_depth(lst):
|
27
|
+
if isinstance(lst, list) and lst:
|
28
|
+
return 1 + max(_get_list_depth(item) for item in lst)
|
29
|
+
return 0
|
30
|
+
|
31
|
+
|
32
|
+
def _check_labels_dimension(labels: ArrayLike) -> ArrayLike:
|
33
|
+
# Check for nested lists beyond 2 levels
|
34
|
+
|
35
|
+
if isinstance(labels, np.ndarray):
|
36
|
+
if labels.ndim == 1:
|
37
|
+
return _ensure_2d(labels)
|
38
|
+
elif labels.ndim == 2:
|
39
|
+
return labels
|
40
|
+
else:
|
41
|
+
raise ValueError("The label array must not have more than 2 dimensions.")
|
42
|
+
elif isinstance(labels, list):
|
43
|
+
depth = _get_list_depth(labels)
|
44
|
+
if depth == 1:
|
45
|
+
return _ensure_2d(labels)
|
46
|
+
elif depth == 2:
|
47
|
+
return labels
|
48
|
+
else:
|
49
|
+
raise ValueError("The label list must not be empty or have more than 2 levels of nesting.")
|
50
|
+
else:
|
51
|
+
raise TypeError("Labels must be either a NumPy array or a list.")
|
52
|
+
|
53
|
+
|
54
|
+
def _sort_to_list(d: Mapping[int, TValue]) -> list[TValue]:
|
55
|
+
return [v for _, v in sorted(d.items())]
|
56
|
+
|
57
|
+
|
58
|
+
@set_metadata
|
59
|
+
def labelstats(dataset: Metadata | AnnotatedDataset[Any]) -> LabelStatsOutput:
|
60
|
+
"""
|
61
|
+
Calculates :term:`statistics<Statistics>` for data labels.
|
62
|
+
|
63
|
+
This function computes counting metrics (e.g., total per class, total per image)
|
64
|
+
on the labels.
|
65
|
+
|
66
|
+
Parameters
|
67
|
+
----------
|
68
|
+
dataset : Metadata or ImageClassificationDataset or ObjectDetect
|
69
|
+
|
70
|
+
Returns
|
71
|
+
-------
|
72
|
+
LabelStatsOutput
|
73
|
+
A dataclass containing the computed counting metrics for the labels.
|
74
|
+
|
75
|
+
Examples
|
76
|
+
--------
|
77
|
+
Calculate basic :term:`statistics<Statistics>` on labels for a dataset.
|
78
|
+
|
79
|
+
>>> from dataeval.utils.data import Metadata
|
80
|
+
>>> stats = labelstats(Metadata(dataset))
|
81
|
+
>>> print(stats.to_table())
|
82
|
+
Class Count: 5
|
83
|
+
Label Count: 15
|
84
|
+
Average # Labels per Image: 1.88
|
85
|
+
--------------------------------------
|
86
|
+
Label: Total Count - Image Count
|
87
|
+
horse: 2 - 2
|
88
|
+
cow: 4 - 3
|
89
|
+
sheep: 2 - 2
|
90
|
+
pig: 2 - 2
|
91
|
+
chicken: 5 - 5
|
92
|
+
"""
|
93
|
+
dataset = Metadata(dataset) if isinstance(dataset, AnnotatedDataset) else dataset
|
94
|
+
|
95
|
+
label_counts: Counter[int] = Counter()
|
96
|
+
image_counts: Counter[int] = Counter()
|
97
|
+
index_location = defaultdict(list[int])
|
98
|
+
label_per_image: list[int] = []
|
99
|
+
|
100
|
+
index2label = dict(enumerate(dataset.class_names))
|
101
|
+
labels = [target.labels.tolist() for target in dataset.targets]
|
102
|
+
|
103
|
+
labels_2d = _check_labels_dimension(labels)
|
104
|
+
|
105
|
+
for i, group in enumerate(labels_2d):
|
106
|
+
group = as_numpy(group).tolist()
|
107
|
+
|
108
|
+
# Count occurrences of each label in all sublists
|
109
|
+
label_counts.update(group)
|
110
|
+
|
111
|
+
# Get the number of labels per image
|
112
|
+
label_per_image.append(len(group))
|
113
|
+
|
114
|
+
# Create a set of unique items in the current sublist
|
115
|
+
unique_items: set[int] = set(group)
|
116
|
+
|
117
|
+
# Update image counts and index locations
|
118
|
+
image_counts.update(unique_items)
|
119
|
+
for item in unique_items:
|
120
|
+
index_location[item].append(i)
|
121
|
+
|
122
|
+
return LabelStatsOutput(
|
123
|
+
label_counts_per_class=_sort_to_list(label_counts),
|
124
|
+
label_counts_per_image=label_per_image,
|
125
|
+
image_counts_per_class=_sort_to_list(image_counts),
|
126
|
+
image_indices_per_class=_sort_to_list(index_location),
|
127
|
+
image_count=len(label_per_image),
|
128
|
+
class_count=len(label_counts),
|
129
|
+
label_count=sum(label_counts.values()),
|
130
|
+
class_names=list(index2label.values()),
|
131
|
+
)
|
@@ -2,49 +2,15 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
__all__ = []
|
4
4
|
|
5
|
-
from
|
6
|
-
from typing import Any, Callable, Iterable
|
5
|
+
from typing import Any, Callable
|
7
6
|
|
8
7
|
import numpy as np
|
9
|
-
from numpy.typing import ArrayLike, NDArray
|
10
8
|
from scipy.stats import entropy, kurtosis, skew
|
11
9
|
|
12
|
-
from dataeval.metrics.stats.
|
13
|
-
from dataeval.
|
14
|
-
|
15
|
-
|
16
|
-
@dataclass(frozen=True)
|
17
|
-
class PixelStatsOutput(BaseStatsOutput, HistogramPlotMixin):
|
18
|
-
"""
|
19
|
-
Output class for :func:`pixelstats` stats metric.
|
20
|
-
|
21
|
-
Attributes
|
22
|
-
----------
|
23
|
-
mean : NDArray[np.float16]
|
24
|
-
Mean of the pixel values of the images
|
25
|
-
std : NDArray[np.float16]
|
26
|
-
Standard deviation of the pixel values of the images
|
27
|
-
var : NDArray[np.float16]
|
28
|
-
:term:`Variance` of the pixel values of the images
|
29
|
-
skew : NDArray[np.float16]
|
30
|
-
Skew of the pixel values of the images
|
31
|
-
kurtosis : NDArray[np.float16]
|
32
|
-
Kurtosis of the pixel values of the images
|
33
|
-
histogram : NDArray[np.uint32]
|
34
|
-
Histogram of the pixel values of the images across 256 bins scaled between 0 and 1
|
35
|
-
entropy : NDArray[np.float16]
|
36
|
-
Entropy of the pixel values of the images
|
37
|
-
"""
|
38
|
-
|
39
|
-
mean: NDArray[np.float16]
|
40
|
-
std: NDArray[np.float16]
|
41
|
-
var: NDArray[np.float16]
|
42
|
-
skew: NDArray[np.float16]
|
43
|
-
kurtosis: NDArray[np.float16]
|
44
|
-
histogram: NDArray[np.uint32]
|
45
|
-
entropy: NDArray[np.float16]
|
46
|
-
|
47
|
-
_excluded_keys = ["histogram"]
|
10
|
+
from dataeval.metrics.stats._base import StatsProcessor, run_stats
|
11
|
+
from dataeval.outputs import PixelStatsOutput
|
12
|
+
from dataeval.outputs._base import set_metadata
|
13
|
+
from dataeval.typing import ArrayLike, Dataset
|
48
14
|
|
49
15
|
|
50
16
|
class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
|
@@ -71,8 +37,9 @@ class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
|
|
71
37
|
|
72
38
|
@set_metadata
|
73
39
|
def pixelstats(
|
74
|
-
|
75
|
-
|
40
|
+
dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
|
41
|
+
*,
|
42
|
+
per_box: bool = False,
|
76
43
|
per_channel: bool = False,
|
77
44
|
) -> PixelStatsOutput:
|
78
45
|
"""
|
@@ -83,10 +50,12 @@ def pixelstats(
|
|
83
50
|
|
84
51
|
Parameters
|
85
52
|
----------
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
53
|
+
dataset : Dataset
|
54
|
+
Dataset to perform calculations on.
|
55
|
+
per_box : bool, default False
|
56
|
+
If True, perform calculations on each bounding box.
|
57
|
+
per_channel : bool, default False
|
58
|
+
If True, perform calculations on each channel.
|
90
59
|
|
91
60
|
Returns
|
92
61
|
-------
|
@@ -106,12 +75,12 @@ def pixelstats(
|
|
106
75
|
|
107
76
|
Examples
|
108
77
|
--------
|
109
|
-
|
78
|
+
Calculate the pixel statistics of a dataset of 8 images, whose shape is (C, H, W).
|
110
79
|
|
111
|
-
>>> results = pixelstats(
|
80
|
+
>>> results = pixelstats(dataset)
|
112
81
|
>>> print(results.mean)
|
113
|
-
[0.
|
82
|
+
[0.181 0.132 0.248 0.373 0.464 0.613 0.734 0.854]
|
114
83
|
>>> print(results.entropy)
|
115
|
-
[4.
|
84
|
+
[4.527 1.883 0.811 1.883 0.298 1.883 1.883 1.883]
|
116
85
|
"""
|
117
|
-
return run_stats(
|
86
|
+
return run_stats(dataset, per_box, per_channel, [PixelStatsProcessor])[0]
|
@@ -2,60 +2,26 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
__all__ = []
|
4
4
|
|
5
|
-
from
|
6
|
-
from typing import Any, Callable, Iterable
|
5
|
+
from typing import Any, Callable
|
7
6
|
|
8
7
|
import numpy as np
|
9
|
-
from numpy.typing import ArrayLike, NDArray
|
10
8
|
|
11
|
-
from dataeval.metrics.stats.
|
12
|
-
from dataeval.
|
13
|
-
from dataeval.
|
9
|
+
from dataeval.metrics.stats._base import StatsProcessor, run_stats
|
10
|
+
from dataeval.outputs import VisualStatsOutput
|
11
|
+
from dataeval.outputs._base import set_metadata
|
12
|
+
from dataeval.typing import ArrayLike, Dataset
|
13
|
+
from dataeval.utils._image import edge_filter
|
14
14
|
|
15
15
|
QUARTILES = (0, 25, 50, 75, 100)
|
16
16
|
|
17
17
|
|
18
|
-
@dataclass(frozen=True)
|
19
|
-
class VisualStatsOutput(BaseStatsOutput, HistogramPlotMixin):
|
20
|
-
"""
|
21
|
-
Output class for :func:`visualstats` stats metric.
|
22
|
-
|
23
|
-
Attributes
|
24
|
-
----------
|
25
|
-
brightness : NDArray[np.float16]
|
26
|
-
Brightness of the images
|
27
|
-
contrast : NDArray[np.float16]
|
28
|
-
Image contrast ratio
|
29
|
-
darkness : NDArray[np.float16]
|
30
|
-
Darkness of the images
|
31
|
-
missing : NDArray[np.float16]
|
32
|
-
Percentage of the images with missing pixels
|
33
|
-
sharpness : NDArray[np.float16]
|
34
|
-
Sharpness of the images
|
35
|
-
zeros : NDArray[np.float16]
|
36
|
-
Percentage of the images with zero value pixels
|
37
|
-
percentiles : NDArray[np.float16]
|
38
|
-
Percentiles of the pixel values of the images with quartiles of (0, 25, 50, 75, 100)
|
39
|
-
"""
|
40
|
-
|
41
|
-
brightness: NDArray[np.float16]
|
42
|
-
contrast: NDArray[np.float16]
|
43
|
-
darkness: NDArray[np.float16]
|
44
|
-
missing: NDArray[np.float16]
|
45
|
-
sharpness: NDArray[np.float16]
|
46
|
-
zeros: NDArray[np.float16]
|
47
|
-
percentiles: NDArray[np.float16]
|
48
|
-
|
49
|
-
_excluded_keys = ["percentiles"]
|
50
|
-
|
51
|
-
|
52
18
|
class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
|
53
19
|
output_class: type = VisualStatsOutput
|
54
20
|
image_function_map: dict[str, Callable[[StatsProcessor[VisualStatsOutput]], Any]] = {
|
55
21
|
"brightness": lambda x: x.get("percentiles")[1],
|
56
|
-
"contrast": lambda x:
|
57
|
-
|
58
|
-
),
|
22
|
+
"contrast": lambda x: 0
|
23
|
+
if np.mean(x.get("percentiles")) == 0
|
24
|
+
else (np.max(x.get("percentiles")) - np.min(x.get("percentiles"))) / np.mean(x.get("percentiles")),
|
59
25
|
"darkness": lambda x: x.get("percentiles")[-2],
|
60
26
|
"missing": lambda x: np.count_nonzero(np.isnan(np.sum(x.image, axis=0))) / np.prod(x.shape[-2:]),
|
61
27
|
"sharpness": lambda x: np.std(edge_filter(np.mean(x.image, axis=0))),
|
@@ -78,8 +44,9 @@ class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
|
|
78
44
|
|
79
45
|
@set_metadata
|
80
46
|
def visualstats(
|
81
|
-
|
82
|
-
|
47
|
+
dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
|
48
|
+
*,
|
49
|
+
per_box: bool = False,
|
83
50
|
per_channel: bool = False,
|
84
51
|
) -> VisualStatsOutput:
|
85
52
|
"""
|
@@ -90,10 +57,12 @@ def visualstats(
|
|
90
57
|
|
91
58
|
Parameters
|
92
59
|
----------
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
60
|
+
dataset : Dataset
|
61
|
+
Dataset to perform calculations on.
|
62
|
+
per_box : bool, default False
|
63
|
+
If True, perform calculations on each bounding box.
|
64
|
+
per_channel : bool, default False
|
65
|
+
If True, perform calculations on each channel.
|
97
66
|
|
98
67
|
Returns
|
99
68
|
-------
|
@@ -112,12 +81,12 @@ def visualstats(
|
|
112
81
|
|
113
82
|
Examples
|
114
83
|
--------
|
115
|
-
|
84
|
+
Calculate the visual statistics of a dataset of 8 images, whose shape is (C, H, W).
|
116
85
|
|
117
|
-
>>> results = visualstats(
|
86
|
+
>>> results = visualstats(dataset)
|
118
87
|
>>> print(results.brightness)
|
119
|
-
[0.
|
88
|
+
[0.084 0.13 0.259 0.38 0.508 0.63 0.755 0.88 ]
|
120
89
|
>>> print(results.contrast)
|
121
|
-
[2.04 1.331 1.261 1.279 1.253]
|
90
|
+
[2.04 1.331 1.261 1.279 1.253 1.268 1.265 1.263]
|
122
91
|
"""
|
123
|
-
return run_stats(
|
92
|
+
return run_stats(dataset, per_box, per_channel, [VisualStatsProcessor])[0]
|
@@ -0,0 +1,53 @@
|
|
1
|
+
"""
|
2
|
+
Output classes for DataEval to store function and method outputs
|
3
|
+
as well as runtime metadata for reproducibility and logging.
|
4
|
+
"""
|
5
|
+
|
6
|
+
from ._base import ExecutionMetadata
|
7
|
+
from ._bias import BalanceOutput, CoverageOutput, DiversityOutput, LabelParityOutput, ParityOutput
|
8
|
+
from ._drift import DriftMMDOutput, DriftOutput
|
9
|
+
from ._estimators import BEROutput, ClustererOutput, DivergenceOutput, UAPOutput
|
10
|
+
from ._linters import DuplicatesOutput, OutliersOutput
|
11
|
+
from ._ood import OODOutput, OODScoreOutput
|
12
|
+
from ._stats import (
|
13
|
+
ChannelStatsOutput,
|
14
|
+
DimensionStatsOutput,
|
15
|
+
HashStatsOutput,
|
16
|
+
ImageStatsOutput,
|
17
|
+
LabelStatsOutput,
|
18
|
+
PixelStatsOutput,
|
19
|
+
SourceIndex,
|
20
|
+
VisualStatsOutput,
|
21
|
+
)
|
22
|
+
from ._utils import SplitDatasetOutput, TrainValSplit
|
23
|
+
from ._workflows import SufficiencyOutput
|
24
|
+
|
25
|
+
__all__ = [
|
26
|
+
"BEROutput",
|
27
|
+
"BalanceOutput",
|
28
|
+
"ChannelStatsOutput",
|
29
|
+
"ClustererOutput",
|
30
|
+
"CoverageOutput",
|
31
|
+
"DimensionStatsOutput",
|
32
|
+
"DivergenceOutput",
|
33
|
+
"DiversityOutput",
|
34
|
+
"DriftMMDOutput",
|
35
|
+
"DriftOutput",
|
36
|
+
"DuplicatesOutput",
|
37
|
+
"ExecutionMetadata",
|
38
|
+
"HashStatsOutput",
|
39
|
+
"ImageStatsOutput",
|
40
|
+
"LabelParityOutput",
|
41
|
+
"LabelStatsOutput",
|
42
|
+
"OODOutput",
|
43
|
+
"OODScoreOutput",
|
44
|
+
"OutliersOutput",
|
45
|
+
"ParityOutput",
|
46
|
+
"PixelStatsOutput",
|
47
|
+
"SourceIndex",
|
48
|
+
"SplitDatasetOutput",
|
49
|
+
"SufficiencyOutput",
|
50
|
+
"TrainValSplit",
|
51
|
+
"UAPOutput",
|
52
|
+
"VisualStatsOutput",
|
53
|
+
]
|