dataeval 0.74.1__py3-none-any.whl → 0.75.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +33 -10
- dataeval/detectors/__init__.py +2 -2
- dataeval/detectors/drift/__init__.py +14 -12
- dataeval/detectors/drift/base.py +1 -1
- dataeval/detectors/drift/cvm.py +1 -1
- dataeval/detectors/drift/ks.py +1 -1
- dataeval/detectors/drift/mmd.py +6 -5
- dataeval/detectors/drift/torch.py +12 -12
- dataeval/detectors/drift/uncertainty.py +3 -2
- dataeval/detectors/linters/__init__.py +4 -4
- dataeval/detectors/linters/clusterer.py +2 -7
- dataeval/detectors/linters/duplicates.py +6 -10
- dataeval/detectors/linters/outliers.py +4 -2
- dataeval/detectors/ood/__init__.py +3 -10
- dataeval/detectors/ood/{ae_torch.py → ae.py} +6 -4
- dataeval/detectors/ood/base.py +64 -161
- dataeval/detectors/ood/metadata_ks_compare.py +34 -42
- dataeval/detectors/ood/metadata_least_likely.py +3 -3
- dataeval/detectors/ood/metadata_ood_mi.py +6 -5
- dataeval/detectors/ood/mixin.py +146 -0
- dataeval/detectors/ood/output.py +63 -0
- dataeval/interop.py +16 -3
- dataeval/log.py +18 -0
- dataeval/metrics/__init__.py +2 -2
- dataeval/metrics/bias/__init__.py +9 -12
- dataeval/metrics/bias/balance.py +10 -8
- dataeval/metrics/bias/coverage.py +52 -4
- dataeval/metrics/bias/diversity.py +42 -14
- dataeval/metrics/bias/parity.py +15 -12
- dataeval/metrics/estimators/__init__.py +2 -2
- dataeval/metrics/estimators/ber.py +3 -1
- dataeval/metrics/estimators/divergence.py +1 -1
- dataeval/metrics/estimators/uap.py +1 -1
- dataeval/metrics/stats/__init__.py +18 -18
- dataeval/metrics/stats/base.py +4 -4
- dataeval/metrics/stats/boxratiostats.py +8 -9
- dataeval/metrics/stats/datasetstats.py +10 -14
- dataeval/metrics/stats/dimensionstats.py +4 -4
- dataeval/metrics/stats/hashstats.py +12 -8
- dataeval/metrics/stats/labelstats.py +5 -5
- dataeval/metrics/stats/pixelstats.py +4 -9
- dataeval/metrics/stats/visualstats.py +4 -9
- dataeval/output.py +1 -1
- dataeval/utils/__init__.py +4 -13
- dataeval/utils/dataset/__init__.py +7 -0
- dataeval/utils/{torch → dataset}/datasets.py +2 -0
- dataeval/utils/dataset/read.py +63 -0
- dataeval/utils/dataset/split.py +527 -0
- dataeval/utils/image.py +2 -2
- dataeval/utils/metadata.py +310 -5
- dataeval/{metrics/bias/metadata_utils.py → utils/plot.py} +1 -104
- dataeval/utils/torch/__init__.py +2 -17
- dataeval/utils/torch/gmm.py +29 -6
- dataeval/utils/torch/{utils.py → internal.py} +82 -58
- dataeval/utils/torch/models.py +10 -8
- dataeval/utils/torch/trainer.py +6 -85
- dataeval/workflows/__init__.py +2 -5
- dataeval/workflows/sufficiency.py +16 -6
- dataeval-0.75.0.dist-info/METADATA +136 -0
- dataeval-0.75.0.dist-info/RECORD +67 -0
- dataeval/detectors/ood/base_torch.py +0 -109
- dataeval/metrics/bias/metadata_preprocessing.py +0 -285
- dataeval/utils/gmm.py +0 -26
- dataeval/utils/split_dataset.py +0 -492
- dataeval-0.74.1.dist-info/METADATA +0 -120
- dataeval-0.74.1.dist-info/RECORD +0 -65
- {dataeval-0.74.1.dist-info → dataeval-0.75.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.74.1.dist-info → dataeval-0.75.0.dist-info}/WHEEL +0 -0
dataeval/metrics/__init__.py
CHANGED
@@ -3,6 +3,6 @@ Metrics are a way to measure the performance of your models or datasets that
|
|
3
3
|
can then be analyzed in the context of a given problem.
|
4
4
|
"""
|
5
5
|
|
6
|
-
from dataeval.metrics import bias, estimators, stats
|
7
|
-
|
8
6
|
__all__ = ["bias", "estimators", "stats"]
|
7
|
+
|
8
|
+
from dataeval.metrics import bias, estimators, stats
|
dataeval/metrics/bias/__init__.py
CHANGED
@@ -3,22 +3,19 @@ Bias metrics check for skewed or imbalanced datasets and incomplete feature
|
|
3
3
|
representation which may impact model performance.
|
4
4
|
"""
|
5
5
|
|
6
|
-
from dataeval.metrics.bias.balance import BalanceOutput, balance
|
7
|
-
from dataeval.metrics.bias.coverage import CoverageOutput, coverage
|
8
|
-
from dataeval.metrics.bias.diversity import DiversityOutput, diversity
|
9
|
-
from dataeval.metrics.bias.metadata_preprocessing import MetadataOutput, metadata_preprocessing
|
10
|
-
from dataeval.metrics.bias.parity import ParityOutput, label_parity, parity
|
11
|
-
|
12
6
|
__all__ = [
|
7
|
+
"BalanceOutput",
|
8
|
+
"CoverageOutput",
|
9
|
+
"DiversityOutput",
|
10
|
+
"ParityOutput",
|
13
11
|
"balance",
|
14
12
|
"coverage",
|
15
13
|
"diversity",
|
16
14
|
"label_parity",
|
17
15
|
"parity",
|
18
|
-
"metadata_preprocessing",
|
19
|
-
"BalanceOutput",
|
20
|
-
"CoverageOutput",
|
21
|
-
"DiversityOutput",
|
22
|
-
"ParityOutput",
|
23
|
-
"MetadataOutput",
|
24
16
|
]
|
17
|
+
|
18
|
+
from dataeval.metrics.bias.balance import BalanceOutput, balance
|
19
|
+
from dataeval.metrics.bias.coverage import CoverageOutput, coverage
|
20
|
+
from dataeval.metrics.bias.diversity import DiversityOutput, diversity
|
21
|
+
from dataeval.metrics.bias.parity import ParityOutput, label_parity, parity
|
dataeval/metrics/bias/balance.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
__all__ = [
|
3
|
+
__all__ = []
|
4
4
|
|
5
5
|
import contextlib
|
6
6
|
import warnings
|
@@ -12,9 +12,9 @@ import scipy as sp
|
|
12
12
|
from numpy.typing import NDArray
|
13
13
|
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
|
14
14
|
|
15
|
-
from dataeval.metrics.bias.metadata_preprocessing import MetadataOutput
|
16
|
-
from dataeval.metrics.bias.metadata_utils import get_counts, heatmap
|
17
15
|
from dataeval.output import Output, set_metadata
|
16
|
+
from dataeval.utils.metadata import Metadata, get_counts
|
17
|
+
from dataeval.utils.plot import heatmap
|
18
18
|
|
19
19
|
with contextlib.suppress(ImportError):
|
20
20
|
from matplotlib.figure import Figure
|
@@ -119,7 +119,7 @@ def _validate_num_neighbors(num_neighbors: int) -> int:
|
|
119
119
|
|
120
120
|
@set_metadata
|
121
121
|
def balance(
|
122
|
-
metadata: MetadataOutput,
|
122
|
+
metadata: Metadata,
|
123
123
|
num_neighbors: int = 5,
|
124
124
|
) -> BalanceOutput:
|
125
125
|
"""
|
@@ -127,14 +127,16 @@ def balance(
|
|
127
127
|
|
128
128
|
Parameters
|
129
129
|
----------
|
130
|
-
metadata :
|
131
|
-
|
130
|
+
metadata : Metadata
|
131
|
+
Preprocessed metadata from :func:`dataeval.utils.metadata.preprocess`
|
132
|
+
num_neighbors : int, default 5
|
133
|
+
Number of points to consider as neighbors
|
132
134
|
|
133
135
|
Returns
|
134
136
|
-------
|
135
137
|
BalanceOutput
|
136
|
-
(num_factors+1) x (num_factors+1) estimate of mutual information
|
137
|
-
|
138
|
+
(num_factors+1) x (num_factors+1) estimate of mutual information \
|
139
|
+
between num_factors metadata factors and class label. Symmetry is enforced.
|
138
140
|
|
139
141
|
Note
|
140
142
|
----
|
dataeval/metrics/bias/coverage.py
CHANGED
@@ -1,18 +1,17 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
__all__ = [
|
3
|
+
__all__ = []
|
4
4
|
|
5
5
|
import contextlib
|
6
6
|
import math
|
7
7
|
from dataclasses import dataclass
|
8
|
-
from typing import Literal
|
8
|
+
from typing import Any, Literal
|
9
9
|
|
10
10
|
import numpy as np
|
11
11
|
from numpy.typing import ArrayLike, NDArray
|
12
12
|
from scipy.spatial.distance import pdist, squareform
|
13
13
|
|
14
14
|
from dataeval.interop import to_numpy
|
15
|
-
from dataeval.metrics.bias.metadata_utils import coverage_plot
|
16
15
|
from dataeval.output import Output, set_metadata
|
17
16
|
from dataeval.utils.shared import flatten
|
18
17
|
|
@@ -20,6 +19,55 @@ with contextlib.suppress(ImportError):
|
|
20
19
|
from matplotlib.figure import Figure
|
21
20
|
|
22
21
|
|
22
|
+
def _plot(images: NDArray[Any], num_images: int) -> Figure:
|
23
|
+
"""
|
24
|
+
Creates a single plot of all of the provided images
|
25
|
+
|
26
|
+
Parameters
|
27
|
+
----------
|
28
|
+
images : NDArray
|
29
|
+
Array containing only the desired images to plot
|
30
|
+
|
31
|
+
Returns
|
32
|
+
-------
|
33
|
+
matplotlib.figure.Figure
|
34
|
+
Plot of all provided images
|
35
|
+
"""
|
36
|
+
import matplotlib.pyplot as plt
|
37
|
+
|
38
|
+
num_images = min(num_images, len(images))
|
39
|
+
|
40
|
+
if images.ndim == 4:
|
41
|
+
images = np.moveaxis(images, 1, -1)
|
42
|
+
elif images.ndim == 3:
|
43
|
+
images = np.repeat(images[:, :, :, np.newaxis], 3, axis=-1)
|
44
|
+
else:
|
45
|
+
raise ValueError(
|
46
|
+
f"Expected a (N,C,H,W) or a (N, H, W) set of images, but got a {images.ndim}-dimensional set of images."
|
47
|
+
)
|
48
|
+
|
49
|
+
rows = int(np.ceil(num_images / 3))
|
50
|
+
fig, axs = plt.subplots(rows, 3, figsize=(9, 3 * rows))
|
51
|
+
|
52
|
+
if rows == 1:
|
53
|
+
for j in range(3):
|
54
|
+
if j >= len(images):
|
55
|
+
continue
|
56
|
+
axs[j].imshow(images[j])
|
57
|
+
axs[j].axis("off")
|
58
|
+
else:
|
59
|
+
for i in range(rows):
|
60
|
+
for j in range(3):
|
61
|
+
i_j = i * 3 + j
|
62
|
+
if i_j >= len(images):
|
63
|
+
continue
|
64
|
+
axs[i, j].imshow(images[i_j])
|
65
|
+
axs[i, j].axis("off")
|
66
|
+
|
67
|
+
fig.tight_layout()
|
68
|
+
return fig
|
69
|
+
|
70
|
+
|
23
71
|
@dataclass(frozen=True)
|
24
72
|
class CoverageOutput(Output):
|
25
73
|
"""
|
@@ -62,7 +110,7 @@ class CoverageOutput(Output):
|
|
62
110
|
selected_images = images[highest_uncovered_indices]
|
63
111
|
|
64
112
|
# Plot the images
|
65
|
-
fig = coverage_plot(selected_images, top_k)
|
113
|
+
fig = _plot(selected_images, top_k)
|
66
114
|
|
67
115
|
return fig
|
68
116
|
|
dataeval/metrics/bias/diversity.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
__all__ = [
|
3
|
+
__all__ = []
|
4
4
|
|
5
5
|
import contextlib
|
6
6
|
from dataclasses import dataclass
|
@@ -10,15 +10,44 @@ import numpy as np
|
|
10
10
|
import scipy as sp
|
11
11
|
from numpy.typing import ArrayLike, NDArray
|
12
12
|
|
13
|
-
from dataeval.metrics.bias.metadata_preprocessing import MetadataOutput
|
14
|
-
from dataeval.metrics.bias.metadata_utils import diversity_bar_plot, get_counts, heatmap
|
15
13
|
from dataeval.output import Output, set_metadata
|
14
|
+
from dataeval.utils.metadata import Metadata, get_counts
|
15
|
+
from dataeval.utils.plot import heatmap
|
16
16
|
from dataeval.utils.shared import get_method
|
17
17
|
|
18
18
|
with contextlib.suppress(ImportError):
|
19
19
|
from matplotlib.figure import Figure
|
20
20
|
|
21
21
|
|
22
|
+
def _plot(labels: NDArray[Any], bar_heights: NDArray[Any]) -> Figure:
|
23
|
+
"""
|
24
|
+
Plots a formatted bar plot
|
25
|
+
|
26
|
+
Parameters
|
27
|
+
----------
|
28
|
+
labels : NDArray
|
29
|
+
Array containing the labels for each bar
|
30
|
+
bar_heights : NDArray
|
31
|
+
Array containing the values for each bar
|
32
|
+
|
33
|
+
Returns
|
34
|
+
-------
|
35
|
+
matplotlib.figure.Figure
|
36
|
+
Bar plot figure
|
37
|
+
"""
|
38
|
+
import matplotlib.pyplot as plt
|
39
|
+
|
40
|
+
fig, ax = plt.subplots(figsize=(10, 10))
|
41
|
+
|
42
|
+
ax.bar(labels, bar_heights)
|
43
|
+
ax.set_xlabel("Factors")
|
44
|
+
|
45
|
+
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
|
46
|
+
|
47
|
+
fig.tight_layout()
|
48
|
+
return fig
|
49
|
+
|
50
|
+
|
22
51
|
@dataclass(frozen=True)
|
23
52
|
class DiversityOutput(Output):
|
24
53
|
"""
|
@@ -77,8 +106,7 @@ class DiversityOutput(Output):
|
|
77
106
|
else:
|
78
107
|
# Creating label array for heat map axes
|
79
108
|
heat_labels = np.concatenate((["class"], self.factor_names))
|
80
|
-
|
81
|
-
fig = diversity_bar_plot(heat_labels, self.diversity_index)
|
109
|
+
fig = _plot(heat_labels, self.diversity_index)
|
82
110
|
|
83
111
|
return fig
|
84
112
|
|
@@ -165,7 +193,7 @@ def diversity_simpson(
|
|
165
193
|
|
166
194
|
@set_metadata
|
167
195
|
def diversity(
|
168
|
-
metadata: MetadataOutput,
|
196
|
+
metadata: Metadata,
|
169
197
|
method: Literal["simpson", "shannon"] = "simpson",
|
170
198
|
) -> DiversityOutput:
|
171
199
|
"""
|
@@ -179,8 +207,8 @@ def diversity(
|
|
179
207
|
|
180
208
|
Parameters
|
181
209
|
----------
|
182
|
-
metadata :
|
183
|
-
|
210
|
+
metadata : Metadata
|
211
|
+
Preprocessed metadata from :func:`dataeval.utils.metadata.preprocess`
|
184
212
|
|
185
213
|
Note
|
186
214
|
----
|
@@ -199,21 +227,21 @@ def diversity(
|
|
199
227
|
|
200
228
|
>>> div_simp = diversity(metadata, method="simpson")
|
201
229
|
>>> div_simp.diversity_index
|
202
|
-
array([0.
|
230
|
+
array([0.6 , 0.80882353, 1. , 0.8 ])
|
203
231
|
|
204
232
|
>>> div_simp.classwise
|
205
|
-
array([[0.
|
206
|
-
[0.
|
233
|
+
array([[0.5 , 0.8 , 0.8 ],
|
234
|
+
[0.63043478, 0.97560976, 0.52830189]])
|
207
235
|
|
208
236
|
Compute Shannon diversity index of metadata and class labels
|
209
237
|
|
210
238
|
>>> div_shan = diversity(metadata, method="shannon")
|
211
239
|
>>> div_shan.diversity_index
|
212
|
-
array([0.
|
240
|
+
array([0.81127812, 0.9426312 , 1. , 0.91829583])
|
213
241
|
|
214
242
|
>>> div_shan.classwise
|
215
|
-
array([[0.
|
216
|
-
[0.
|
243
|
+
array([[0.68260619, 0.91829583, 0.91829583],
|
244
|
+
[0.81443569, 0.99107606, 0.76420451]])
|
217
245
|
|
218
246
|
See Also
|
219
247
|
--------
|
dataeval/metrics/bias/parity.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
__all__ = [
|
3
|
+
__all__ = []
|
4
4
|
|
5
5
|
import warnings
|
6
6
|
from dataclasses import dataclass
|
@@ -12,8 +12,8 @@ from scipy.stats import chisquare
|
|
12
12
|
from scipy.stats.contingency import chi2_contingency, crosstab
|
13
13
|
|
14
14
|
from dataeval.interop import as_numpy, to_numpy
|
15
|
-
from dataeval.metrics.bias.metadata_preprocessing import MetadataOutput
|
16
15
|
from dataeval.output import Output, set_metadata
|
16
|
+
from dataeval.utils.metadata import Metadata
|
17
17
|
|
18
18
|
TData = TypeVar("TData", np.float64, NDArray[np.float64])
|
19
19
|
|
@@ -167,8 +167,9 @@ def label_parity(
|
|
167
167
|
--------
|
168
168
|
Randomly creating some label distributions using ``np.random.default_rng``
|
169
169
|
|
170
|
-
>>>
|
171
|
-
>>>
|
170
|
+
>>> rng = np.random.default_rng(175)
|
171
|
+
>>> expected_labels = rng.choice([0, 1, 2, 3, 4], (100))
|
172
|
+
>>> observed_labels = rng.choice([2, 3, 0, 4, 1], (100))
|
172
173
|
>>> label_parity(expected_labels, observed_labels)
|
173
174
|
ParityOutput(score=14.007374204742625, p_value=0.0072715574616218, metadata_names=None)
|
174
175
|
"""
|
@@ -205,7 +206,7 @@ def label_parity(
|
|
205
206
|
|
206
207
|
|
207
208
|
@set_metadata
|
208
|
-
def parity(metadata: MetadataOutput) -> ParityOutput[NDArray[np.float64]]:
|
209
|
+
def parity(metadata: Metadata) -> ParityOutput[NDArray[np.float64]]:
|
209
210
|
"""
|
210
211
|
Calculate chi-square statistics to assess the linear relationship between multiple factors
|
211
212
|
and class labels.
|
@@ -216,8 +217,8 @@ def parity(metadata: MetadataOutput) -> ParityOutput[NDArray[np.float64]]:
|
|
216
217
|
|
217
218
|
Parameters
|
218
219
|
----------
|
219
|
-
metadata :
|
220
|
-
|
220
|
+
metadata : Metadata
|
221
|
+
Preprocessed metadata from :func:`dataeval.utils.metadata.preprocess`
|
221
222
|
|
222
223
|
Returns
|
223
224
|
-------
|
@@ -249,16 +250,18 @@ def parity(metadata: MetadataOutput) -> ParityOutput[NDArray[np.float64]]:
|
|
249
250
|
--------
|
250
251
|
Randomly creating some "continuous" and categorical variables using ``np.random.default_rng``
|
251
252
|
|
252
|
-
>>>
|
253
|
+
>>> from dataeval.utils.metadata import preprocess
|
254
|
+
>>> rng = np.random.default_rng(175)
|
255
|
+
>>> labels = rng.choice([0, 1, 2], (100))
|
253
256
|
>>> metadata_dict = [
|
254
257
|
... {
|
255
|
-
... "age": list(
|
256
|
-
... "income": list(
|
257
|
-
... "gender": list(
|
258
|
+
... "age": list(rng.choice([25, 30, 35, 45], (100))),
|
259
|
+
... "income": list(rng.choice([50000, 65000, 80000], (100))),
|
260
|
+
... "gender": list(rng.choice(["M", "F"], (100))),
|
258
261
|
... }
|
259
262
|
... ]
|
260
263
|
>>> continuous_factor_bincounts = {"age": 4, "income": 3}
|
261
|
-
>>> metadata =
|
264
|
+
>>> metadata = preprocess(metadata_dict, labels, continuous_factor_bincounts)
|
262
265
|
>>> parity(metadata)
|
263
266
|
ParityOutput(score=array([7.35731943, 5.46711299, 0.51506212]), p_value=array([0.28906231, 0.24263543, 0.77295762]), metadata_names=['age', 'income', 'gender'])
|
264
267
|
""" # noqa: E501
|
dataeval/metrics/estimators/__init__.py
CHANGED
@@ -2,8 +2,8 @@
|
|
2
2
|
Estimators calculate performance bounds and the statistical distance between datasets.
|
3
3
|
"""
|
4
4
|
|
5
|
+
__all__ = ["ber", "divergence", "uap", "BEROutput", "DivergenceOutput", "UAPOutput"]
|
6
|
+
|
5
7
|
from dataeval.metrics.estimators.ber import BEROutput, ber
|
6
8
|
from dataeval.metrics.estimators.divergence import DivergenceOutput, divergence
|
7
9
|
from dataeval.metrics.estimators.uap import UAPOutput, uap
|
8
|
-
|
9
|
-
__all__ = ["ber", "divergence", "uap", "BEROutput", "DivergenceOutput", "UAPOutput"]
|
dataeval/metrics/estimators/ber.py
CHANGED
@@ -5,11 +5,12 @@ KNN based estimate for the :term:`Bayes error rate<Bayes Error Rate (BER)>`
|
|
5
5
|
|
6
6
|
Learning to Bound the Multi-class Bayes Error (Th. 3 and Th. 4)
|
7
7
|
https://arxiv.org/abs/1811.06419
|
8
|
+
|
8
9
|
"""
|
9
10
|
|
10
11
|
from __future__ import annotations
|
11
12
|
|
12
|
-
__all__ = [
|
13
|
+
__all__ = []
|
13
14
|
|
14
15
|
from dataclasses import dataclass
|
15
16
|
from typing import Literal
|
@@ -38,6 +39,7 @@ class BEROutput(Output):
|
|
38
39
|
"""
|
39
40
|
|
40
41
|
ber: float
|
42
|
+
|
41
43
|
ber_lower: float
|
42
44
|
|
43
45
|
|
dataeval/metrics/stats/__init__.py
CHANGED
@@ -3,6 +3,24 @@ Statistics metrics calculate a variety of image properties and pixel statistics
|
|
3
3
|
and label statistics against the images and labels of a dataset.
|
4
4
|
"""
|
5
5
|
|
6
|
+
__all__ = [
|
7
|
+
"ChannelStatsOutput",
|
8
|
+
"DatasetStatsOutput",
|
9
|
+
"DimensionStatsOutput",
|
10
|
+
"HashStatsOutput",
|
11
|
+
"LabelStatsOutput",
|
12
|
+
"PixelStatsOutput",
|
13
|
+
"VisualStatsOutput",
|
14
|
+
"boxratiostats",
|
15
|
+
"channelstats",
|
16
|
+
"datasetstats",
|
17
|
+
"dimensionstats",
|
18
|
+
"hashstats",
|
19
|
+
"labelstats",
|
20
|
+
"pixelstats",
|
21
|
+
"visualstats",
|
22
|
+
]
|
23
|
+
|
6
24
|
from dataeval.metrics.stats.boxratiostats import boxratiostats
|
7
25
|
from dataeval.metrics.stats.datasetstats import (
|
8
26
|
ChannelStatsOutput,
|
@@ -15,21 +33,3 @@ from dataeval.metrics.stats.hashstats import HashStatsOutput, hashstats
|
|
15
33
|
from dataeval.metrics.stats.labelstats import LabelStatsOutput, labelstats
|
16
34
|
from dataeval.metrics.stats.pixelstats import PixelStatsOutput, pixelstats
|
17
35
|
from dataeval.metrics.stats.visualstats import VisualStatsOutput, visualstats
|
18
|
-
|
19
|
-
__all__ = [
|
20
|
-
"boxratiostats",
|
21
|
-
"channelstats",
|
22
|
-
"datasetstats",
|
23
|
-
"dimensionstats",
|
24
|
-
"hashstats",
|
25
|
-
"labelstats",
|
26
|
-
"pixelstats",
|
27
|
-
"visualstats",
|
28
|
-
"ChannelStatsOutput",
|
29
|
-
"DatasetStatsOutput",
|
30
|
-
"DimensionStatsOutput",
|
31
|
-
"HashStatsOutput",
|
32
|
-
"LabelStatsOutput",
|
33
|
-
"PixelStatsOutput",
|
34
|
-
"VisualStatsOutput",
|
35
|
-
]
|
dataeval/metrics/stats/base.py
CHANGED
@@ -193,7 +193,7 @@ class StatsProcessorOutput(NamedTuple):
|
|
193
193
|
results: list[dict[str, Any]]
|
194
194
|
source_indices: list[SourceIndex]
|
195
195
|
box_counts: list[int]
|
196
|
-
warnings_list: list[
|
196
|
+
warnings_list: list[str]
|
197
197
|
|
198
198
|
|
199
199
|
def process_stats(
|
@@ -206,13 +206,13 @@ def process_stats(
|
|
206
206
|
results_list: list[dict[str, Any]] = []
|
207
207
|
source_indices: list[SourceIndex] = []
|
208
208
|
box_counts: list[int] = []
|
209
|
-
warnings_list: list[
|
209
|
+
warnings_list: list[str] = []
|
210
210
|
nboxes = [None] if boxes is None else normalize_box_shape(boxes)
|
211
211
|
for i_b, box in enumerate(nboxes):
|
212
212
|
i_b = None if box is None else i_b
|
213
213
|
processor_list = [p(image, box, per_channel) for p in stats_processor_cls]
|
214
214
|
if any(not p._is_valid_slice for p in processor_list) and i_b is not None and box is not None:
|
215
|
-
warnings_list.append(
|
215
|
+
warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} is out of bounds of {image.shape}.")
|
216
216
|
results_list.append({k: v for p in processor_list for k, v in p.process().items()})
|
217
217
|
if per_channel:
|
218
218
|
source_indices.extend([SourceIndex(i, i_b, c) for c in range(image_boxes[0].shape[-3])])
|
@@ -302,7 +302,7 @@ def run_stats(
|
|
302
302
|
|
303
303
|
# warnings are not emitted while in multiprocessing pools so we emit after gathering all warnings
|
304
304
|
for w in warning_list:
|
305
|
-
warnings.warn(
|
305
|
+
warnings.warn(w, UserWarning)
|
306
306
|
|
307
307
|
output = {}
|
308
308
|
for results in results_list:
|
dataeval/metrics/stats/boxratiostats.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
__all__ = [
|
3
|
+
__all__ = []
|
4
4
|
|
5
5
|
import copy
|
6
6
|
from typing import Any, Callable, Generic, TypeVar, cast
|
@@ -130,17 +130,16 @@ def boxratiostats(
|
|
130
130
|
--------
|
131
131
|
Calculating the box ratio statistics using the dimension stats of the boxes and images
|
132
132
|
|
133
|
-
>>>
|
134
|
-
>>>
|
133
|
+
>>> from dataeval.metrics.stats import dimensionstats
|
134
|
+
>>> imagestats = dimensionstats(stats_images)
|
135
|
+
>>> boxstats = dimensionstats(stats_images, bboxes)
|
135
136
|
>>> ratiostats = boxratiostats(boxstats, imagestats)
|
136
137
|
>>> print(ratiostats.aspect_ratio)
|
137
|
-
[
|
138
|
-
0.
|
139
|
-
0.69596354 20. 5.11197917 2.33333333 0.75 0.70019531]
|
138
|
+
[ 0.86376953 0.58837891 16. 0.85714286 1.26959707 0.43772894
|
139
|
+
0.66650391 3.83296703 1.95018315]
|
140
140
|
>>> print(ratiostats.size)
|
141
|
-
[0.
|
142
|
-
0.
|
143
|
-
0.02246094 0.0012207 0.01123047 0.00911458 0.02636719 0.06835938]
|
141
|
+
[0.0255127 0.01037598 0.00097656 0.01822917 0.02327474 0.00683594
|
142
|
+
0.00915527 0.03369141 0.02115885]
|
144
143
|
"""
|
145
144
|
output_cls = type(boxstats)
|
146
145
|
if type(boxstats) is not type(imgstats):
|
dataeval/metrics/stats/datasetstats.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
__all__ = [
|
3
|
+
__all__ = []
|
4
4
|
|
5
5
|
from dataclasses import dataclass
|
6
6
|
from typing import Any, Iterable
|
@@ -25,7 +25,7 @@ class DatasetStatsOutput(Output):
|
|
25
25
|
|
26
26
|
This class represents the outputs of various stats functions against a single
|
27
27
|
dataset, such that each index across all stat outputs are representative of
|
28
|
-
the same source image.
|
28
|
+
the same source image. Modifying or mixing outputs will result in inaccurate
|
29
29
|
outlier calculations if not created correctly.
|
30
30
|
|
31
31
|
Attributes
|
@@ -60,7 +60,7 @@ class ChannelStatsOutput(Output):
|
|
60
60
|
|
61
61
|
This class represents the outputs of various per-channel stats functions against
|
62
62
|
a single dataset, such that each index across all stat outputs are representative
|
63
|
-
of the same source image.
|
63
|
+
of the same source image. Modifying or mixing outputs will result in inaccurate
|
64
64
|
outlier calculations if not created correctly.
|
65
65
|
|
66
66
|
Attributes
|
@@ -119,13 +119,11 @@ def datasetstats(
|
|
119
119
|
--------
|
120
120
|
Calculating the dimension, pixel and visual stats for a dataset with bounding boxes
|
121
121
|
|
122
|
-
>>> stats = datasetstats(
|
122
|
+
>>> stats = datasetstats(stats_images, bboxes)
|
123
123
|
>>> print(stats.dimensionstats.aspect_ratio)
|
124
|
-
[ 0.864 0.5884 16. 1.143 1.692 0.5835 0.6665 2.555 1.3
|
125
|
-
|
126
|
-
|
127
|
-
[1.744 1.946 0.1164 0.0635 0.0633 0.06274 0.0429 0.0317 0.0317
|
128
|
-
0.02576 0.02081 0.02171 0.01915 0.01767 0.01799 0.01595 0.01433 0.01478]
|
124
|
+
[ 0.864 0.5884 16. 1.143 1.692 0.5835 0.6665 2.555 1.3 ]
|
125
|
+
>>> print(stats.visualstats.sharpness)
|
126
|
+
[4.04 4.434 0.2778 4.957 5.145 5.22 4.957 3.076 2.855 ]
|
129
127
|
"""
|
130
128
|
outputs = run_stats(images, bboxes, False, [DimensionStatsProcessor, PixelStatsProcessor, VisualStatsProcessor])
|
131
129
|
return DatasetStatsOutput(*outputs, labelstats=labelstats(labels) if labels else None) # type: ignore
|
@@ -162,12 +160,10 @@ def channelstats(
|
|
162
160
|
--------
|
163
161
|
Calculating the per-channel pixel and visual stats for a dataset
|
164
162
|
|
165
|
-
>>> stats = channelstats(
|
163
|
+
>>> stats = channelstats(stats_images)
|
166
164
|
>>> print(stats.visualstats.darkness)
|
167
|
-
[0.
|
168
|
-
0.
|
169
|
-
0.6045 0.611 0.617 0.7046 0.711 0.7173 0.8047 0.811 0.8174
|
170
|
-
0.905 0.911 0.917 ]
|
165
|
+
[0.1499 0.3499 0.55 0.2094 0.2219 0.2344 0.4194 0.6094 0.622 0.6343
|
166
|
+
0.8154]
|
171
167
|
"""
|
172
168
|
outputs = run_stats(images, bboxes, True, [PixelStatsProcessor, VisualStatsProcessor])
|
173
169
|
return ChannelStatsOutput(*outputs) # type: ignore
|
dataeval/metrics/stats/dimensionstats.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
__all__ = [
|
3
|
+
__all__ = []
|
4
4
|
|
5
5
|
from dataclasses import dataclass
|
6
6
|
from typing import Any, Callable, Iterable
|
@@ -106,10 +106,10 @@ def dimensionstats(
|
|
106
106
|
--------
|
107
107
|
Calculating the dimension statistics on the images, whose shape is (C, H, W)
|
108
108
|
|
109
|
-
>>> results = dimensionstats(
|
109
|
+
>>> results = dimensionstats(stats_images)
|
110
110
|
>>> print(results.aspect_ratio)
|
111
|
-
[
|
111
|
+
[1. 1. 1.333 1. 0.6665]
|
112
112
|
>>> print(results.channels)
|
113
|
-
[
|
113
|
+
[3 3 1 3 1]
|
114
114
|
"""
|
115
115
|
return run_stats(images, bboxes, False, [DimensionStatsProcessor])[0]
|
dataeval/metrics/stats/hashstats.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
|
3
|
+
import warnings
|
4
|
+
|
5
|
+
__all__ = []
|
4
6
|
|
5
7
|
from dataclasses import dataclass
|
6
8
|
from typing import Callable, Iterable
|
@@ -41,7 +43,7 @@ def pchash(image: ArrayLike) -> str:
|
|
41
43
|
"""
|
42
44
|
Performs a perceptual hash on an image by resizing to a square NxN image
|
43
45
|
using the Lanczos algorithm where N is 32x32 or the largest multiple of
|
44
|
-
8 that is smaller than the input image dimensions.
|
46
|
+
8 that is smaller than the input image dimensions. The resampled image
|
45
47
|
is compressed using a discrete cosine transform and the lowest frequency
|
46
48
|
component is encoded as a bit array of greater or less than median value
|
47
49
|
and returned as a hex string.
|
@@ -54,13 +56,15 @@ def pchash(image: ArrayLike) -> str:
|
|
54
56
|
Returns
|
55
57
|
-------
|
56
58
|
str
|
57
|
-
The hex string hash of the image using perceptual hashing
|
59
|
+
The hex string hash of the image using perceptual hashing, or empty
|
60
|
+
string if the image is too small to be hashed
|
58
61
|
"""
|
59
62
|
# Verify that the image is at least larger than an 8x8 image
|
60
63
|
arr = as_numpy(image)
|
61
64
|
min_dim = min(arr.shape[-2:])
|
62
65
|
if min_dim < HASH_SIZE + 1:
|
63
|
-
|
66
|
+
warnings.warn(f"Image must be larger than {HASH_SIZE}x{HASH_SIZE} for fuzzy hashing.")
|
67
|
+
return ""
|
64
68
|
|
65
69
|
# Calculates the dimensions of the resized square image
|
66
70
|
resize_dim = HASH_SIZE * min((min_dim - 1) // HASH_SIZE, MAX_FACTOR)
|
@@ -92,7 +96,7 @@ def pchash(image: ArrayLike) -> str:
|
|
92
96
|
def xxhash(image: ArrayLike) -> str:
|
93
97
|
"""
|
94
98
|
Performs a fast non-cryptographic hash using the xxhash algorithm
|
95
|
-
(xxhash.com) against the image as a flattened bytearray.
|
99
|
+
(xxhash.com) against the image as a flattened bytearray. The hash
|
96
100
|
is returned as a hex string.
|
97
101
|
|
98
102
|
Parameters
|
@@ -147,10 +151,10 @@ def hashstats(
|
|
147
151
|
--------
|
148
152
|
Calculating the statistics on the images, whose shape is (C, H, W)
|
149
153
|
|
150
|
-
>>> results = hashstats(
|
154
|
+
>>> results = hashstats(stats_images)
|
151
155
|
>>> print(results.xxhash)
|
152
|
-
['
|
156
|
+
['6274f837b34ed9f0', '256504fdb6e3d2a4', '7dd0c56ca8474fb0', '50956ad4592f5bbc', '5ba2354079d42aa5']
|
153
157
|
>>> print(results.pchash)
|
154
|
-
['
|
158
|
+
['a666999999666666', 'e666999999266666', 'e666999966663299', 'e666999999266666', '96e91656e91616e9']
|
155
159
|
"""
|
156
160
|
return run_stats(images, bboxes, False, [HashStatsProcessor])[0]
|