dataeval 0.69.4__py3-none-any.whl → 0.70.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +8 -8
- dataeval/_internal/datasets.py +235 -131
- dataeval/_internal/detectors/clusterer.py +2 -0
- dataeval/_internal/detectors/drift/base.py +7 -8
- dataeval/_internal/detectors/drift/mmd.py +4 -4
- dataeval/_internal/detectors/duplicates.py +64 -45
- dataeval/_internal/detectors/merged_stats.py +23 -54
- dataeval/_internal/detectors/ood/ae.py +8 -6
- dataeval/_internal/detectors/ood/aegmm.py +6 -4
- dataeval/_internal/detectors/ood/base.py +12 -7
- dataeval/_internal/detectors/ood/llr.py +6 -4
- dataeval/_internal/detectors/ood/vae.py +5 -3
- dataeval/_internal/detectors/ood/vaegmm.py +6 -4
- dataeval/_internal/detectors/outliers.py +137 -63
- dataeval/_internal/interop.py +11 -7
- dataeval/_internal/metrics/balance.py +13 -11
- dataeval/_internal/metrics/ber.py +5 -3
- dataeval/_internal/metrics/coverage.py +4 -0
- dataeval/_internal/metrics/divergence.py +9 -5
- dataeval/_internal/metrics/diversity.py +14 -12
- dataeval/_internal/metrics/parity.py +32 -22
- dataeval/_internal/metrics/stats/base.py +231 -0
- dataeval/_internal/metrics/stats/boxratiostats.py +159 -0
- dataeval/_internal/metrics/stats/datasetstats.py +99 -0
- dataeval/_internal/metrics/stats/dimensionstats.py +113 -0
- dataeval/_internal/metrics/stats/hashstats.py +75 -0
- dataeval/_internal/metrics/stats/labelstats.py +125 -0
- dataeval/_internal/metrics/stats/pixelstats.py +119 -0
- dataeval/_internal/metrics/stats/visualstats.py +124 -0
- dataeval/_internal/metrics/uap.py +8 -4
- dataeval/_internal/metrics/utils.py +30 -15
- dataeval/_internal/models/pytorch/autoencoder.py +5 -5
- dataeval/_internal/models/tensorflow/pixelcnn.py +1 -4
- dataeval/_internal/output.py +3 -18
- dataeval/_internal/utils.py +11 -16
- dataeval/_internal/workflows/sufficiency.py +152 -151
- dataeval/detectors/__init__.py +4 -0
- dataeval/detectors/drift/__init__.py +8 -3
- dataeval/detectors/drift/kernels/__init__.py +4 -0
- dataeval/detectors/drift/updates/__init__.py +4 -0
- dataeval/detectors/linters/__init__.py +15 -4
- dataeval/detectors/ood/__init__.py +14 -2
- dataeval/metrics/__init__.py +5 -0
- dataeval/metrics/bias/__init__.py +13 -4
- dataeval/metrics/estimators/__init__.py +8 -8
- dataeval/metrics/stats/__init__.py +25 -3
- dataeval/utils/__init__.py +16 -3
- dataeval/utils/tensorflow/__init__.py +11 -0
- dataeval/utils/torch/__init__.py +12 -0
- dataeval/utils/torch/datasets/__init__.py +7 -0
- dataeval/workflows/__init__.py +6 -2
- {dataeval-0.69.4.dist-info → dataeval-0.70.1.dist-info}/METADATA +12 -4
- dataeval-0.70.1.dist-info/RECORD +80 -0
- {dataeval-0.69.4.dist-info → dataeval-0.70.1.dist-info}/WHEEL +1 -1
- dataeval/_internal/flags.py +0 -77
- dataeval/_internal/metrics/stats.py +0 -397
- dataeval/flags/__init__.py +0 -3
- dataeval/tensorflow/__init__.py +0 -3
- dataeval/torch/__init__.py +0 -3
- dataeval-0.69.4.dist-info/RECORD +0 -74
- /dataeval/{tensorflow → utils/tensorflow}/loss/__init__.py +0 -0
- /dataeval/{tensorflow → utils/tensorflow}/models/__init__.py +0 -0
- /dataeval/{tensorflow → utils/tensorflow}/recon/__init__.py +0 -0
- /dataeval/{torch → utils/torch}/models/__init__.py +0 -0
- /dataeval/{torch → utils/torch}/trainer/__init__.py +0 -0
- {dataeval-0.69.4.dist-info → dataeval-0.70.1.dist-info}/LICENSE.txt +0 -0
@@ -0,0 +1,113 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from dataclasses import dataclass
|
4
|
+
from typing import Iterable
|
5
|
+
|
6
|
+
import numpy as np
|
7
|
+
from numpy.typing import ArrayLike, NDArray
|
8
|
+
|
9
|
+
from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
|
10
|
+
from dataeval._internal.metrics.utils import get_bitdepth
|
11
|
+
from dataeval._internal.output import set_metadata
|
12
|
+
|
13
|
+
|
14
|
+
def _box_center(proc):
    """Return the (x, y) midpoint of ``proc.box``, indexed as [x0, y0, x1, y1]."""
    center_x = (proc.box[0] + proc.box[2]) / 2
    center_y = (proc.box[1] + proc.box[3]) / 2
    return center_x, center_y


def _center_distance(proc):
    """Return the Euclidean distance from the box midpoint to (width / 2, height / 2)."""
    center_x, center_y = _box_center(proc)
    return np.sqrt(np.square(center_x - (proc.width / 2)) + np.square(center_y - (proc.height / 2)))


class DimensionStatsProcessor(StatsProcessor):
    """Maps each dimension statistic name to the callable that computes it."""

    # Every callable receives the processor instance; insertion order is preserved.
    image_function_map = {
        "left": lambda proc: proc.box[0],
        "top": lambda proc: proc.box[1],
        "width": lambda proc: proc.shape[-1],
        "height": lambda proc: proc.shape[-2],
        "channels": lambda proc: proc.shape[-3],
        "size": lambda proc: np.prod(proc.shape[-2:]),
        "aspect_ratio": lambda proc: proc.shape[-1] / proc.shape[-2],
        "depth": lambda proc: get_bitdepth(proc.image).depth,
        "center": lambda proc: np.asarray(_box_center(proc)),
        "distance": _center_distance,
    }
|
30
|
+
|
31
|
+
|
32
|
+
@dataclass(frozen=True)
class DimensionStatsOutput(BaseStatsOutput):
    """
    Output class for :func:`dimensionstats` stats metric

    Attributes
    ----------
    left : NDArray[np.int32]
        Offsets from the left edge of images in pixels
    top : NDArray[np.int32]
        Offsets from the top edge of images in pixels
    width : NDArray[np.uint32]
        Width of the images in pixels
    height : NDArray[np.uint32]
        Height of the images in pixels
    channels : NDArray[np.uint8]
        Channel count of the images
    size : NDArray[np.uint32]
        Size of the images in pixels
    aspect_ratio : NDArray[np.float16]
        Aspect ratio of the images (width/height)
    depth : NDArray[np.uint8]
        Color depth of the images in bits
    center : NDArray[np.int16]
        Center of the bounding box as [x,y] coordinates in pixels
    distance : NDArray[np.float16]
        Distance in pixels from center
    """

    left: NDArray[np.int32]
    top: NDArray[np.int32]
    width: NDArray[np.uint32]
    height: NDArray[np.uint32]
    channels: NDArray[np.uint8]
    size: NDArray[np.uint32]
    aspect_ratio: NDArray[np.float16]
    depth: NDArray[np.uint8]
    center: NDArray[np.int16]
    distance: NDArray[np.float16]
|
71
|
+
|
72
|
+
|
73
|
+
@set_metadata("dataeval.metrics")
def dimensionstats(
    images: Iterable[ArrayLike],
    bboxes: Iterable[ArrayLike] | None = None,
) -> DimensionStatsOutput:
    """
    Compute dimension statistics for every image (or bounding box)

    Dimensional metrics such as width, height and channel count are gathered
    for each image, or for each bounding box of each image when boxes are given.

    Parameters
    ----------
    images : Iterable[ArrayLike]
        Images to perform calculations on
    bboxes : Iterable[ArrayLike] or None
        Bounding boxes in `xyxy` format for each image to perform calculations on

    Returns
    -------
    DimensionStatsOutput
        A dictionary-like object containing the computed dimension statistics for each image or bounding
        box. The keys correspond to the names of the statistics (e.g., 'width', 'height'), and the values
        are lists of results for each image or numpy arrays when the results are multi-dimensional.

    See Also
    --------
    pixelstats, visualstats, Outliers

    Examples
    --------
    Calculating the dimension statistics on the images, whose shape is (C, H, W)

    >>> results = dimensionstats(images)
    >>> print(results.aspect_ratio)
    [0.75 0.75 0.75 0.75 0.75 0.75 1.333 0.75 0.75 1. ]
    >>> print(results.channels)
    [1 1 1 1 1 1 3 1 1 3]
    """
    # Dimension statistics are never computed per channel, hence the fixed False.
    return DimensionStatsOutput(**run_stats(images, bboxes, False, DimensionStatsProcessor, DimensionStatsOutput))
|
@@ -0,0 +1,75 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from dataclasses import dataclass
|
4
|
+
from typing import Iterable
|
5
|
+
|
6
|
+
from numpy.typing import ArrayLike
|
7
|
+
|
8
|
+
from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
|
9
|
+
from dataeval._internal.metrics.utils import pchash, xxhash
|
10
|
+
from dataeval._internal.output import set_metadata
|
11
|
+
|
12
|
+
|
13
|
+
class HashStatsProcessor(StatsProcessor):
    """Maps each hash name to the callable that hashes the processor's image."""

    image_function_map = {
        "xxhash": lambda proc: xxhash(proc.image),
        "pchash": lambda proc: pchash(proc.image),
    }
|
18
|
+
|
19
|
+
|
20
|
+
@dataclass(frozen=True)
class HashStatsOutput(BaseStatsOutput):
    """
    Output class for :func:`hashstats` stats metric

    Attributes
    ----------
    xxhash : list[str]
        xxHash hash of each image, as a hex string
    pchash : list[str]
        Perception hash of each image, as a hex string
    """

    xxhash: list[str]
    pchash: list[str]
|
35
|
+
|
36
|
+
|
37
|
+
@set_metadata("dataeval.metrics")
def hashstats(
    images: Iterable[ArrayLike],
    bboxes: Iterable[ArrayLike] | None = None,
) -> HashStatsOutput:
    """
    Calculates hashes for each image

    This function computes hashes from the images including exact hashes and perception-based
    hashes. These hash values can be used to determine if images are exact or near matches.

    Parameters
    ----------
    images : Iterable[ArrayLike]
        Images to hash
    bboxes : Iterable[ArrayLike] or None
        Bounding boxes in `xyxy` format for each image

    Returns
    -------
    HashStatsOutput
        A dictionary-like object containing the computed hashes for each image.

    See Also
    --------
    Duplicates

    Examples
    --------
    Calculating the statistics on the images, whose shape is (C, H, W)

    >>> results = hashstats(images)
    >>> print(results.xxhash)
    ['a72434443d6e7336', 'efc12c2f14581d79', '4a1e03483a27d674', '3a3ecedbcf814226']
    >>> print(results.pchash)
    ['8f25506af46a7c6a', '8000808000008080', '8e71f18e0ef18e0e', 'a956d6a956d6a928']
    """
    # Hashes are computed on whole images only, so per-channel mode is always off.
    output = run_stats(images, bboxes, False, HashStatsProcessor, HashStatsOutput)
    return HashStatsOutput(**output)
|
@@ -0,0 +1,125 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from collections import Counter, defaultdict
|
4
|
+
from dataclasses import dataclass
|
5
|
+
from typing import Any, Iterable, Mapping, TypeVar
|
6
|
+
|
7
|
+
from numpy.typing import ArrayLike
|
8
|
+
|
9
|
+
from dataeval._internal.interop import to_numpy
|
10
|
+
from dataeval._internal.output import OutputMetadata, set_metadata
|
11
|
+
|
12
|
+
|
13
|
+
@dataclass(frozen=True)
class LabelStatsOutput(OutputMetadata):
    """
    Output class for :func:`labelstats` stats metric

    Attributes
    ----------
    label_counts_per_class : dict[str | int, int]
        Maps each label class to the total number of times it occurs
    label_counts_per_image : list[int]
        Number of labels in each image
    image_counts_per_label : dict[str | int, int]
        Maps each label class to the number of images it appears in
    image_indices_per_label : dict[str | int, list[int]]
        Maps each label class to the indices of the images that contain it
    image_count : int
        Total number of images present
    class_count : int
        Total number of classes present
    label_count : int
        Total number of labels present
    """

    label_counts_per_class: dict[str | int, int]
    label_counts_per_image: list[int]
    image_counts_per_label: dict[str | int, int]
    image_indices_per_label: dict[str | int, list[int]]
    image_count: int
    class_count: int
    label_count: int
|
46
|
+
|
47
|
+
|
48
|
+
TKey = TypeVar("TKey", int, str)


def sort(d: Mapping[TKey, Any]) -> dict[TKey, Any]:
    """
    Return a new dict holding the entries of ``d`` ordered by ascending key
    """
    # Keys of a mapping are unique, so ordering the keys alone is equivalent
    # to ordering the (key, value) pairs by their first element.
    return {key: d[key] for key in sorted(d)}
|
56
|
+
|
57
|
+
|
58
|
+
@set_metadata("dataeval.metrics")
def labelstats(
    labels: Iterable[ArrayLike],
) -> LabelStatsOutput:
    """
    Calculates statistics for data labels

    This function computes counting metrics (e.g., total per class, total per image)
    on the labels.

    Parameters
    ----------
    labels : ArrayLike, shape - [label] | [[label]] or (N,M) | (N,)
        Lists or numpy array of labels.
        A set of lists where each list contains all labels per image -
        (e.g. [[label1, label2], [label2], [label1, label3]] or [label1, label2, label1, label3]).
        If a numpy array, N is the number of images, M is the number of labels per image.
        In the flat form each entry is treated as the single label of one image.

    Returns
    -------
    LabelStatsOutput
        A dictionary-like object containing the computed counting metrics for the labels.

    Examples
    --------
    Calculating the statistics on labels for a set of data

    >>> stats = labelstats(labels)
    >>> stats.label_counts_per_class
    {'chicken': 3, 'cow': 8, 'horse': 9, 'pig': 7, 'sheep': 7}
    >>> stats.label_counts_per_image
    [3, 2, 3, 4, 1, 5, 4, 4, 4, 4]
    >>> stats.image_counts_per_label
    {'chicken': 2, 'cow': 6, 'horse': 7, 'pig': 5, 'sheep': 7}
    >>> (stats.image_count, stats.class_count, stats.label_count)
    (10, 5, 34)
    """
    label_counts = Counter()
    image_counts = Counter()
    index_location = defaultdict(list)
    label_per_image: list[int] = []

    for i, group in enumerate(labels):
        group = to_numpy(group)

        # A flat label sequence yields scalar (0-d) entries, which can be
        # neither iterated nor measured with len(); promote each one to a
        # single-label image so the documented flat form works.
        if group.ndim == 0:
            group = group.reshape(1)

        # Count occurrences of each label across all images
        label_counts.update(group)

        # Get the number of labels per image
        label_per_image.append(len(group))

        # Each class counts at most once per image for the image-level tallies
        unique_items = set(group)

        # Update image counts and index locations
        image_counts.update(unique_items)
        for item in unique_items:
            index_location[item].append(i)

    return LabelStatsOutput(
        label_counts_per_class=sort(label_counts),
        label_counts_per_image=label_per_image,
        image_counts_per_label=sort(image_counts),
        image_indices_per_label=sort(index_location),
        image_count=len(label_per_image),
        class_count=len(label_counts),
        label_count=sum(label_counts.values()),
    )
|
@@ -0,0 +1,119 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from dataclasses import dataclass
|
4
|
+
from typing import Iterable
|
5
|
+
|
6
|
+
import numpy as np
|
7
|
+
from numpy.typing import ArrayLike, NDArray
|
8
|
+
from scipy.stats import entropy, kurtosis, skew
|
9
|
+
|
10
|
+
from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
|
11
|
+
from dataeval._internal.output import set_metadata
|
12
|
+
|
13
|
+
|
14
|
+
def _histogram_256(values):
    """Return the counts of ``values`` over 256 equal-width bins spanning (0, 1)."""
    return np.histogram(values, 256, (0, 1))[0]


class PixelStatsProcessor(StatsProcessor):
    """Maps each pixel statistic name to the callable that computes it."""

    # "entropy" reads the "histogram" result back through ``get``.
    cache_keys = ["histogram"]

    # Whole-image statistics over the processor's ``scaled`` values.
    image_function_map = {
        "mean": lambda proc: np.mean(proc.scaled),
        "std": lambda proc: np.std(proc.scaled),
        "var": lambda proc: np.var(proc.scaled),
        "skew": lambda proc: np.nan_to_num(skew(proc.scaled.ravel())),
        "kurtosis": lambda proc: np.nan_to_num(kurtosis(proc.scaled.ravel())),
        "histogram": lambda proc: _histogram_256(proc.scaled),
        "entropy": lambda proc: entropy(proc.get("histogram")),
    }

    # Per-channel statistics: every reduction runs along axis 1 of ``scaled``.
    channel_function_map = {
        "mean": lambda proc: np.mean(proc.scaled, axis=1),
        "std": lambda proc: np.std(proc.scaled, axis=1),
        "var": lambda proc: np.var(proc.scaled, axis=1),
        "skew": lambda proc: np.nan_to_num(skew(proc.scaled, axis=1)),
        "kurtosis": lambda proc: np.nan_to_num(kurtosis(proc.scaled, axis=1)),
        "histogram": lambda proc: np.apply_along_axis(_histogram_256, 1, proc.scaled),
        "entropy": lambda proc: entropy(proc.get("histogram"), axis=1),
    }
|
34
|
+
|
35
|
+
|
36
|
+
@dataclass(frozen=True)
class PixelStatsOutput(BaseStatsOutput):
    """
    Output class for :func:`pixelstats` stats metric

    Attributes
    ----------
    mean : NDArray[np.float16]
        Mean pixel value of each image
    std : NDArray[np.float16]
        Standard deviation of the pixel values of each image
    var : NDArray[np.float16]
        Variance of the pixel values of each image
    skew : NDArray[np.float16]
        Skew of the pixel values of each image
    kurtosis : NDArray[np.float16]
        Kurtosis of the pixel values of each image
    histogram : NDArray[np.uint32]
        Histogram of the pixel values of each image across 256 bins scaled between 0 and 1
    entropy : NDArray[np.float16]
        Entropy of the pixel values of each image
    """

    mean: NDArray[np.float16]
    std: NDArray[np.float16]
    var: NDArray[np.float16]
    skew: NDArray[np.float16]
    kurtosis: NDArray[np.float16]
    histogram: NDArray[np.uint32]
    entropy: NDArray[np.float16]
|
66
|
+
|
67
|
+
|
68
|
+
@set_metadata("dataeval.metrics")
def pixelstats(
    images: Iterable[ArrayLike],
    bboxes: Iterable[ArrayLike] | None = None,
    per_channel: bool = False,
) -> PixelStatsOutput:
    """
    Calculates pixel statistics for each image

    This function computes various statistical metrics (e.g., mean, standard deviation, entropy)
    on the images as a whole.

    Parameters
    ----------
    images : Iterable[ArrayLike]
        Images to perform calculations on
    bboxes : Iterable[ArrayLike] or None
        Bounding boxes in `xyxy` format for each image to perform calculations on
    per_channel : bool, default False
        Forwarded to ``run_stats``; presumably selects the per-channel statistics
        instead of the whole-image statistics (confirm against ``run_stats``)

    Returns
    -------
    PixelStatsOutput
        A dictionary-like object containing the computed statistics for each image. The keys correspond
        to the names of the statistics (e.g., 'mean', 'std'), and the values are lists of results for
        each image or numpy arrays when the results are multi-dimensional.

    See Also
    --------
    dimensionstats, visualstats, Outliers

    Note
    ----
    - All metrics are scaled based on the perceived bit depth (which is derived from the largest pixel value)
      to allow for better comparison between images stored in different formats and different resolutions.

    Examples
    --------
    Calculating the statistics on the images, whose shape is (C, H, W)

    >>> results = pixelstats(images)
    >>> print(results.mean)
    [0.04828 0.562 0.06726 0.09937 0.1315 0.1636 0.1957 0.2278 0.26
    0.292 0.3242 0.3562 0.3884 0.4204 0.4526 0.4846 0.5166 0.549
    0.581 0.6133 0.6455 0.6772 0.7095 0.7417 0.774 0.8057 0.838
    0.87 0.9023 0.934 ]
    >>> print(results.entropy)
    [3.238 3.303 0.8125 1.028 0.8223 1.046 0.8247 1.041 0.8203 1.012
    0.812 0.9883 0.795 0.9243 0.9243 0.795 0.9907 0.8125 1.028 0.8223
    1.046 0.8247 1.041 0.8203 1.012 0.812 0.9883 0.795 0.9243 0.9243]
    """
    output = run_stats(images, bboxes, per_channel, PixelStatsProcessor, PixelStatsOutput)
    return PixelStatsOutput(**output)
|
@@ -0,0 +1,124 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from dataclasses import dataclass
|
4
|
+
from typing import Iterable
|
5
|
+
|
6
|
+
import numpy as np
|
7
|
+
from numpy.typing import ArrayLike, NDArray
|
8
|
+
|
9
|
+
from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
|
10
|
+
from dataeval._internal.metrics.utils import edge_filter
|
11
|
+
from dataeval._internal.output import set_metadata
|
12
|
+
|
13
|
+
QUARTILES = (0, 25, 50, 75, 100)
|
14
|
+
|
15
|
+
|
16
|
+
class VisualStatsProcessor(StatsProcessor):
    """Maps each visual statistic name to the callable that computes it."""

    # Several statistics read the "percentiles" result back through ``get``.
    cache_keys = ["percentiles"]

    # Whole-image statistics. ``x.image`` is reduced over axis 0 (the channel
    # axis) before any per-pixel operation.
    image_function_map = {
        "brightness": lambda x: x.get("percentiles")[-2],
        "blurriness": lambda x: np.std(edge_filter(np.mean(x.image, axis=0))),
        "contrast": lambda x: np.nan_to_num(
            (np.max(x.get("percentiles")) - np.min(x.get("percentiles"))) / np.mean(x.get("percentiles"))
        ),
        "darkness": lambda x: x.get("percentiles")[1],
        # The denominator is the H*W pixel count, so count pixels rather than
        # per-channel elements; otherwise multi-channel images can exceed 1.0.
        # A pixel is missing when any channel is NaN, and zero when every
        # channel is 0. Single-channel results are unchanged.
        "missing": lambda x: np.count_nonzero(np.any(np.isnan(x.image), axis=0)) / np.prod(x.shape[-2:]),
        "zeros": lambda x: np.count_nonzero(np.all(x.image == 0, axis=0)) / np.prod(x.shape[-2:]),
        "percentiles": lambda x: np.nanpercentile(x.scaled, q=QUARTILES),
    }

    # Per-channel statistics: one value (or one row of percentiles) per channel.
    channel_function_map = {
        "brightness": lambda x: x.get("percentiles")[:, -2],
        "blurriness": lambda x: np.std(np.vectorize(edge_filter, signature="(m,n)->(m,n)")(x.image), axis=(1, 2)),
        "contrast": lambda x: np.nan_to_num(
            (np.max(x.get("percentiles"), axis=1) - np.min(x.get("percentiles"), axis=1))
            / np.mean(x.get("percentiles"), axis=1)
        ),
        "darkness": lambda x: x.get("percentiles")[:, 1],
        "missing": lambda x: np.sum(np.isnan(x.image), axis=(1, 2)) / np.prod(x.shape[-2:]),
        "zeros": lambda x: np.count_nonzero(x.image == 0, axis=(1, 2)) / np.prod(x.shape[-2:]),
        "percentiles": lambda x: np.nanpercentile(x.scaled, q=QUARTILES, axis=1).T,
    }
|
41
|
+
|
42
|
+
|
43
|
+
@dataclass(frozen=True)
class VisualStatsOutput(BaseStatsOutput):
    """
    Output class for :func:`visualstats` stats metric

    Attributes
    ----------
    brightness : NDArray[np.float16]
        Brightness of each image
    blurriness : NDArray[np.float16]
        Blurriness of each image
    contrast : NDArray[np.float16]
        Contrast ratio of each image
    darkness : NDArray[np.float16]
        Darkness of each image
    missing : NDArray[np.float16]
        Percentage of the images with missing pixels
    zeros : NDArray[np.float16]
        Percentage of the images with zero value pixels
    percentiles : NDArray[np.float16]
        Percentiles of the pixel values of the images with quartiles of (0, 25, 50, 75, 100)
    """

    brightness: NDArray[np.float16]
    blurriness: NDArray[np.float16]
    contrast: NDArray[np.float16]
    darkness: NDArray[np.float16]
    missing: NDArray[np.float16]
    zeros: NDArray[np.float16]
    percentiles: NDArray[np.float16]
|
73
|
+
|
74
|
+
|
75
|
+
@set_metadata("dataeval.metrics")
def visualstats(
    images: Iterable[ArrayLike],
    bboxes: Iterable[ArrayLike] | None = None,
    per_channel: bool = False,
) -> VisualStatsOutput:
    """
    Calculates visual statistics for each image

    This function computes various visual metrics (e.g., brightness, darkness, contrast, blurriness)
    on the images as a whole.

    Parameters
    ----------
    images : Iterable[ArrayLike]
        Images to perform calculations on
    bboxes : Iterable[ArrayLike] or None
        Bounding boxes in `xyxy` format for each image to perform calculations on
    per_channel : bool, default False
        Forwarded to ``run_stats``; presumably selects the per-channel statistics
        instead of the whole-image statistics (confirm against ``run_stats``)

    Returns
    -------
    VisualStatsOutput
        A dictionary-like object containing the computed visual statistics for each image. The keys correspond
        to the names of the statistics (e.g., 'brightness', 'blurriness'), and the values are lists of results for
        each image or numpy arrays when the results are multi-dimensional.

    See Also
    --------
    dimensionstats, pixelstats, Outliers

    Note
    ----
    - `zeros` and `missing` are presented as a percentage of total pixel counts

    Examples
    --------
    Calculating the statistics on the images, whose shape is (C, H, W)

    >>> results = visualstats(images)
    >>> print(results.brightness)
    [0.0737 0.607 0.0713 0.1046 0.138 0.1713 0.2046 0.2379 0.2712 0.3047
    0.338 0.3713 0.4045 0.438 0.4712 0.5044 0.538 0.5713 0.6045 0.638
    0.6714 0.7046 0.738 0.7715 0.8047 0.838 0.871 0.905 0.938 0.971 ]
    >>> print(results.contrast)
    [2.041 1.332 1.293 1.279 1.272 1.268 1.265 1.263 1.261 1.26 1.259 1.258
    1.258 1.257 1.257 1.256 1.256 1.255 1.255 1.255 1.255 1.254 1.254 1.254
    1.254 1.254 1.254 1.253 1.253 1.253]
    """
    output = run_stats(images, bboxes, per_channel, VisualStatsProcessor, VisualStatsOutput)
    return VisualStatsOutput(**output)
|
@@ -4,18 +4,22 @@ FR Test Statistic based estimate for the upperbound
|
|
4
4
|
average precision using empirical mean precision
|
5
5
|
"""
|
6
6
|
|
7
|
+
from __future__ import annotations
|
8
|
+
|
7
9
|
from dataclasses import dataclass
|
8
10
|
|
9
11
|
from numpy.typing import ArrayLike
|
10
12
|
from sklearn.metrics import average_precision_score
|
11
13
|
|
12
|
-
from dataeval._internal.interop import
|
14
|
+
from dataeval._internal.interop import as_numpy
|
13
15
|
from dataeval._internal.output import OutputMetadata, set_metadata
|
14
16
|
|
15
17
|
|
16
18
|
@dataclass(frozen=True)
|
17
19
|
class UAPOutput(OutputMetadata):
|
18
20
|
"""
|
21
|
+
Output class for :func:`uap` estimator metric
|
22
|
+
|
19
23
|
Attributes
|
20
24
|
----------
|
21
25
|
uap : float
|
@@ -48,8 +52,8 @@ def uap(labels: ArrayLike, scores: ArrayLike) -> UAPOutput:
|
|
48
52
|
ValueError
|
49
53
|
If unique classes M < 2
|
50
54
|
|
51
|
-
|
52
|
-
|
55
|
+
Note
|
56
|
+
----
|
53
57
|
This function calculates the empirical mean precision using the
|
54
58
|
``average_precision_score`` from scikit-learn, weighted by the class distribution.
|
55
59
|
|
@@ -75,5 +79,5 @@ def uap(labels: ArrayLike, scores: ArrayLike) -> UAPOutput:
|
|
75
79
|
UAPOutput(uap=0.7777777777777777)
|
76
80
|
"""
|
77
81
|
|
78
|
-
precision = float(average_precision_score(
|
82
|
+
precision = float(average_precision_score(as_numpy(labels), as_numpy(scores), average="weighted"))
|
79
83
|
return UAPOutput(precision)
|
@@ -1,10 +1,10 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from typing import Any, Callable, Literal,
|
3
|
+
from typing import Any, Callable, Literal, Mapping, NamedTuple
|
4
4
|
|
5
5
|
import numpy as np
|
6
6
|
import xxhash as xxh
|
7
|
-
from numpy.typing import NDArray
|
7
|
+
from numpy.typing import ArrayLike, NDArray
|
8
8
|
from PIL import Image
|
9
9
|
from scipy.fftpack import dct
|
10
10
|
from scipy.signal import convolve2d
|
@@ -14,6 +14,8 @@ from scipy.spatial.distance import pdist, squareform
|
|
14
14
|
from scipy.stats import entropy as sp_entropy
|
15
15
|
from sklearn.neighbors import NearestNeighbors
|
16
16
|
|
17
|
+
from dataeval._internal.interop import to_numpy
|
18
|
+
|
17
19
|
EPSILON = 1e-5
|
18
20
|
EDGE_KERNEL = np.array([[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]], dtype=np.int8)
|
19
21
|
BIT_DEPTH = (1, 8, 12, 16, 32)
|
@@ -89,8 +91,8 @@ def entropy(
|
|
89
91
|
subset_mask: NDArray[np.bool_] | None
|
90
92
|
Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
|
91
93
|
|
92
|
-
|
93
|
-
|
94
|
+
Note
|
95
|
+
----
|
94
96
|
For continuous variables, histogram bins are chosen automatically. See
|
95
97
|
numpy.histogram for details.
|
96
98
|
|
@@ -162,26 +164,26 @@ def infer_categorical(X: NDArray, threshold: float = 0.2) -> NDArray:
|
|
162
164
|
|
163
165
|
|
164
166
|
def preprocess_metadata(
|
165
|
-
class_labels:
|
167
|
+
class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], cat_thresh: float = 0.2
|
166
168
|
) -> tuple[NDArray, list[str], list[bool]]:
|
167
|
-
# convert class_labels and
|
168
|
-
|
169
|
-
"class_label": np.asarray(class_labels, dtype=int),
|
170
|
-
**{k: np.array([d[k] for d in metadata]) for k in metadata[0]},
|
171
|
-
}
|
169
|
+
# convert class_labels and dict of lists to matrix of metadata values
|
170
|
+
preprocessed_metadata = {"class_label": np.asarray(class_labels, dtype=int)}
|
172
171
|
|
173
172
|
# map columns of dict that are not numeric (e.g. string) to numeric values
|
174
173
|
# that mutual information and diversity functions can accommodate. Each
|
175
174
|
# unique string receives a unique integer value.
|
176
|
-
for k, v in
|
175
|
+
for k, v in metadata.items():
|
177
176
|
# if not numeric
|
177
|
+
v = to_numpy(v)
|
178
178
|
if not np.issubdtype(v.dtype, np.number):
|
179
179
|
_, mapped_vals = np.unique(v, return_inverse=True)
|
180
|
-
|
180
|
+
preprocessed_metadata[k] = mapped_vals
|
181
|
+
else:
|
182
|
+
preprocessed_metadata[k] = v
|
181
183
|
|
182
|
-
data = np.stack(list(
|
183
|
-
names = list(
|
184
|
-
is_categorical = [infer_categorical(
|
184
|
+
data = np.stack(list(preprocessed_metadata.values()), axis=-1)
|
185
|
+
names = list(preprocessed_metadata.keys())
|
186
|
+
is_categorical = [infer_categorical(preprocessed_metadata[var], cat_thresh)[0] for var in names]
|
185
187
|
|
186
188
|
return data, names, is_categorical
|
187
189
|
|
@@ -350,6 +352,19 @@ def normalize_image_shape(image: NDArray) -> NDArray:
|
|
350
352
|
raise ValueError("Images must have 2 or more dimensions.")
|
351
353
|
|
352
354
|
|
355
|
+
def normalize_box_shape(bounding_box: NDArray) -> NDArray:
    """
    Normalizes the bounding box shape into (N,4).

    Parameters
    ----------
    bounding_box : NDArray
        A single box as a 1-D array, or a 2-D array holding one box per row

    Returns
    -------
    NDArray
        A 1-D input with a leading axis added; any other accepted input is returned unchanged

    Raises
    ------
    ValueError
        If the array has more than 2 dimensions
    """
    ndim = bounding_box.ndim
    if ndim > 2:
        # The stats functions document boxes as `xyxy`, so describe them that way here.
        raise ValueError("Bounding boxes must have 2 dimensions: (# of boxes in an image, [X0,Y0,X1,Y1]) -> (N,4)")
    if ndim == 1:
        # A lone box becomes a one-row array
        return np.expand_dims(bounding_box, axis=0)
    return bounding_box
|
366
|
+
|
367
|
+
|
353
368
|
def edge_filter(image: NDArray, offset: float = 0.5) -> NDArray:
|
354
369
|
"""
|
355
370
|
Returns the image filtered using a 3x3 edge detection kernel:
|