dataeval 0.82.0__py3-none-any.whl → 0.83.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +7 -2
- dataeval/config.py +78 -11
- dataeval/detectors/drift/_mmd.py +9 -9
- dataeval/detectors/drift/_torch.py +7 -7
- dataeval/detectors/drift/_uncertainty.py +4 -4
- dataeval/detectors/linters/duplicates.py +3 -3
- dataeval/detectors/linters/outliers.py +3 -3
- dataeval/detectors/ood/ae.py +5 -4
- dataeval/detectors/ood/base.py +2 -2
- dataeval/detectors/ood/mixin.py +1 -1
- dataeval/detectors/ood/vae.py +2 -1
- dataeval/metadata/__init__.py +2 -2
- dataeval/metadata/_distance.py +11 -44
- dataeval/metadata/_ood.py +152 -33
- dataeval/metrics/bias/_balance.py +9 -5
- dataeval/metrics/bias/_diversity.py +3 -0
- dataeval/metrics/bias/_parity.py +2 -0
- dataeval/metrics/estimators/_ber.py +2 -1
- dataeval/metrics/stats/_base.py +20 -21
- dataeval/metrics/stats/_boxratiostats.py +1 -1
- dataeval/metrics/stats/_dimensionstats.py +2 -2
- dataeval/metrics/stats/_hashstats.py +2 -2
- dataeval/metrics/stats/_imagestats.py +8 -8
- dataeval/metrics/stats/_pixelstats.py +2 -2
- dataeval/metrics/stats/_visualstats.py +2 -2
- dataeval/outputs/__init__.py +5 -0
- dataeval/outputs/_base.py +50 -21
- dataeval/outputs/_bias.py +1 -1
- dataeval/outputs/_linters.py +4 -2
- dataeval/outputs/_metadata.py +61 -0
- dataeval/outputs/_stats.py +12 -6
- dataeval/typing.py +40 -9
- dataeval/utils/_mst.py +1 -2
- dataeval/utils/data/_embeddings.py +23 -19
- dataeval/utils/data/_metadata.py +16 -7
- dataeval/utils/data/_selection.py +22 -15
- dataeval/utils/data/_split.py +3 -2
- dataeval/utils/data/datasets/_base.py +4 -2
- dataeval/utils/data/datasets/_cifar10.py +17 -9
- dataeval/utils/data/datasets/_milco.py +18 -12
- dataeval/utils/data/datasets/_mnist.py +24 -8
- dataeval/utils/data/datasets/_ships.py +18 -8
- dataeval/utils/data/datasets/_types.py +1 -5
- dataeval/utils/data/datasets/_voc.py +47 -24
- dataeval/utils/data/selections/__init__.py +2 -0
- dataeval/utils/data/selections/_classfilter.py +5 -3
- dataeval/utils/data/selections/_prioritize.py +296 -0
- dataeval/utils/data/selections/_shuffle.py +13 -4
- dataeval/utils/torch/_gmm.py +3 -2
- dataeval/utils/torch/_internal.py +5 -5
- dataeval/utils/torch/trainer.py +8 -8
- {dataeval-0.82.0.dist-info → dataeval-0.83.0.dist-info}/METADATA +4 -4
- dataeval-0.83.0.dist-info/RECORD +105 -0
- dataeval/detectors/ood/metadata_ood_mi.py +0 -93
- dataeval-0.82.0.dist-info/RECORD +0 -104
- {dataeval-0.82.0.dist-info → dataeval-0.83.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.82.0.dist-info → dataeval-0.83.0.dist-info}/WHEEL +0 -0
dataeval/metadata/_ood.py
CHANGED
@@ -6,13 +6,44 @@ import warnings
|
|
6
6
|
|
7
7
|
import numpy as np
|
8
8
|
from numpy.typing import NDArray
|
9
|
+
from sklearn.feature_selection import mutual_info_classif
|
9
10
|
|
11
|
+
from dataeval.config import get_seed
|
10
12
|
from dataeval.metadata._utils import _compare_keys, _validate_factors_and_data
|
11
|
-
from dataeval.outputs import OODOutput
|
13
|
+
from dataeval.outputs import MostDeviatedFactorsOutput, OODOutput, OODPredictorOutput
|
14
|
+
from dataeval.outputs._base import set_metadata
|
12
15
|
from dataeval.utils.data import Metadata
|
13
16
|
|
14
17
|
|
15
|
-
def
|
18
|
+
def _combine_discrete_continuous(metadata: Metadata) -> tuple[list[str], NDArray[np.float64]]:
    """Combines the discrete and continuous data of a :class:`Metadata` object

    Parameters
    ----------
    metadata : Metadata
        Object exposing ``discrete_factor_names``, ``discrete_data``,
        ``continuous_factor_names`` and ``continuous_data``.

    Returns
    -------
    tuple[list[str], NDArray[np.float64]]
        The combined list of factor names (discrete first, then continuous) and the
        column-wise stacked discrete and continuous data cast to ``float64``.
        When neither kind of factor is present, the list is empty and the array is
        an empty 1-D ``float64`` array.

    Raises
    ------
    ValueError
        If discrete and continuous data do not have the same number of samples.

    Note
    ----
    Discrete and continuous data must have the same number of samples
    """
    names: list[str] = []
    data: list[NDArray[np.int64 | np.float64]] = []

    # A factor group only contributes when it has both names and actual values
    if metadata.discrete_factor_names and metadata.discrete_data.size != 0:
        names.extend(metadata.discrete_factor_names)
        data.append(metadata.discrete_data)

    if metadata.continuous_factor_names and metadata.continuous_data.size != 0:
        names.extend(metadata.continuous_factor_names)
        data.append(metadata.continuous_data)

    if not data:
        return names, np.array([], dtype=np.float64)

    # Enforce the documented precondition up front so the caller gets a readable
    # message instead of a low-level shape error from the stacking call below
    if len(data) == 2 and len(data[0]) != len(data[1]):
        raise ValueError(
            "Discrete and continuous data must have the same number of samples, "
            f"got {len(data[0])} and {len(data[1])} respectively."
        )

    # Stack factors side by side: (S, Fd) and (S, Fc) -> (S, Fd + Fc)
    return names, np.hstack(data).astype(np.float64, copy=False)
|
42
|
+
|
43
|
+
|
44
|
+
def _combine_metadata(
|
45
|
+
metadata_1: Metadata, metadata_2: Metadata
|
46
|
+
) -> tuple[list[str], list[NDArray[np.float64 | np.int64]], list[NDArray[np.int64 | np.float64]]]:
|
16
47
|
"""
|
17
48
|
Combines the factor names and data arrays of metadata_1 and metadata_2 when the names
|
18
49
|
match exactly and data has the same number of columns (factors).
|
@@ -41,8 +72,8 @@ def _combine_metadata(metadata_1: Metadata, metadata_2: Metadata) -> tuple[list[
|
|
41
72
|
If the length of keys do not match the length of the data
|
42
73
|
"""
|
43
74
|
factor_names: list[str] = []
|
44
|
-
m1_data: list[NDArray] = []
|
45
|
-
m2_data: list[NDArray] = []
|
75
|
+
m1_data: list[NDArray[np.int64 | np.float64]] = []
|
76
|
+
m2_data: list[NDArray[np.int64 | np.float64]] = []
|
46
77
|
|
47
78
|
# Both metadata must have the same number of factors (cols), but not necessarily samples (row)
|
48
79
|
if metadata_1.total_num_factors != metadata_2.total_num_factors:
|
@@ -119,36 +150,38 @@ def _calc_median_deviations(reference: NDArray, test: NDArray) -> NDArray:
|
|
119
150
|
return np.abs(np.where(test_dev >= 0, test_dev / pscale, test_dev / nscale)) # (S_t, F)
|
120
151
|
|
121
152
|
|
122
|
-
|
123
|
-
|
124
|
-
|
153
|
+
@set_metadata
|
154
|
+
def find_most_deviated_factors(
|
155
|
+
metadata_ref: Metadata,
|
156
|
+
metadata_tst: Metadata,
|
125
157
|
ood: OODOutput,
|
126
|
-
) ->
|
158
|
+
) -> MostDeviatedFactorsOutput:
|
127
159
|
"""
|
128
|
-
|
160
|
+
Determine greatest deviation in metadata features per out of distribution sample in test metadata.
|
129
161
|
|
130
162
|
Parameters
|
131
163
|
----------
|
132
|
-
|
164
|
+
metadata_ref : Metadata
|
133
165
|
A reference set of Metadata containing factor names and samples
|
134
166
|
with discrete and/or continuous values per factor
|
135
|
-
|
167
|
+
metadata_tst : Metadata
|
136
168
|
The set of Metadata that is tested against the reference metadata.
|
137
169
|
This set must have the same number of features but does not require the same number of samples.
|
138
170
|
ood : OODOutput
|
139
|
-
A class output by
|
171
|
+
A class output by DataEval's OOD functions that contains which examples are OOD.
|
140
172
|
|
141
173
|
Returns
|
142
174
|
-------
|
143
|
-
|
144
|
-
An
|
175
|
+
MostDeviatedFactorsOutput
|
176
|
+
An output class containing the factor name and deviation of the highest metadata deviations for each
|
177
|
+
OOD example in the test metadata.
|
145
178
|
|
146
179
|
Notes
|
147
180
|
-----
|
148
181
|
1. Both :class:`.Metadata` inputs must have discrete and continuous data in the shape (samples, factors)
|
149
182
|
and have equivalent factor names and lengths
|
150
183
|
2. The flag at index `i` in :attr:`.OODOutput.is_ood` must correspond
|
151
|
-
directly to sample `i` of `
|
184
|
+
directly to sample `i` of `metadata_tst` being out-of-distribution from `metadata_ref`
|
152
185
|
|
153
186
|
Examples
|
154
187
|
--------
|
@@ -158,50 +191,49 @@ def most_deviated_factors(
|
|
158
191
|
All samples are out-of-distribution
|
159
192
|
|
160
193
|
>>> is_ood = OODOutput(np.array([True, True, True]), np.array([]), np.array([]))
|
161
|
-
>>>
|
162
|
-
[('time', 2.0), ('time', 2.592), ('time', 3.51)]
|
194
|
+
>>> find_most_deviated_factors(metadata1, metadata2, is_ood)
|
195
|
+
MostDeviatedFactorsOutput([('time', 2.0), ('time', 2.592), ('time', 3.51)])
|
163
196
|
|
164
|
-
|
197
|
+
No samples are out-of-distribution
|
165
198
|
|
166
199
|
>>> is_ood = OODOutput(np.array([False, False, False]), np.array([]), np.array([]))
|
167
|
-
>>>
|
168
|
-
[]
|
200
|
+
>>> find_most_deviated_factors(metadata1, metadata2, is_ood)
|
201
|
+
MostDeviatedFactorsOutput([])
|
169
202
|
"""
|
170
203
|
|
171
204
|
ood_mask: NDArray[np.bool] = ood.is_ood
|
172
205
|
|
173
206
|
# No metadata correlated with out of distribution data
|
174
207
|
if not any(ood_mask):
|
175
|
-
return []
|
208
|
+
return MostDeviatedFactorsOutput([])
|
176
209
|
|
177
210
|
# Combines reference and test factor names and data if exists and match exactly
|
178
211
|
# shape -> (samples, factors)
|
179
212
|
factor_names, md_1, md_2 = _combine_metadata(
|
180
|
-
metadata_1=
|
181
|
-
metadata_2=
|
213
|
+
metadata_1=metadata_ref,
|
214
|
+
metadata_2=metadata_tst,
|
182
215
|
)
|
183
216
|
|
184
217
|
# Stack discrete and continuous factors as separate factors. Must have equal sample counts
|
185
|
-
|
186
|
-
|
218
|
+
ref_data = np.hstack(md_1) if md_1 else np.array([]) # (S, Fd + Fc)
|
219
|
+
tst_data = np.hstack(md_2) if md_2 else np.array([]) # (S, Fd + Fc)
|
187
220
|
|
188
|
-
if len(
|
221
|
+
if len(ref_data) < 3:
|
189
222
|
warnings.warn(
|
190
|
-
f"At least 3 reference metadata samples are needed, got {len(
|
223
|
+
f"At least 3 reference metadata samples are needed, got {len(ref_data)}",
|
191
224
|
UserWarning,
|
192
225
|
)
|
193
|
-
return []
|
226
|
+
return MostDeviatedFactorsOutput([])
|
194
227
|
|
195
|
-
if len(
|
228
|
+
if len(tst_data) != len(ood_mask):
|
196
229
|
raise ValueError(
|
197
|
-
f"ood and test metadata must have the same length, "
|
198
|
-
f"got {len(ood_mask)} and {len(metadata_tst)} respectively."
|
230
|
+
f"ood and test metadata must have the same length, got {len(ood_mask)} and {len(tst_data)} respectively."
|
199
231
|
)
|
200
232
|
|
201
233
|
# Calculates deviations of all samples in m2_data
|
202
234
|
# from the median values of the corresponding index in m1_data
|
203
235
|
# Guaranteed for inputs to not be empty
|
204
|
-
deviations = _calc_median_deviations(
|
236
|
+
deviations = _calc_median_deviations(ref_data, tst_data)
|
205
237
|
|
206
238
|
# Get most impactful factor deviation of each sample for ood samples only
|
207
239
|
deviation = np.max(deviations, axis=1)[ood_mask].astype(np.float16)
|
@@ -214,4 +246,91 @@ def most_deviated_factors(
|
|
214
246
|
|
215
247
|
# List of tuples matching the factor name with its deviation
|
216
248
|
|
217
|
-
return [(factor, dev) for factor, dev in zip(most_ood_factors, deviation)]
|
249
|
+
return MostDeviatedFactorsOutput([(factor, dev) for factor, dev in zip(most_ood_factors, deviation)])
|
250
|
+
|
251
|
+
|
252
|
+
# Nats-to-bits conversion factor: 1 / ln(2) rounded to six decimal places.
_NATS2BITS = 1.442695
"""
_NATS2BITS is the reciprocal of natural log of 2. If you have an information/entropy-type quantity measured in nats,
which is what many library functions return, multiply it by _NATS2BITS to get it in bits.
"""
|
257
|
+
|
258
|
+
|
259
|
+
def find_ood_predictors(
    metadata: Metadata,
    ood: OODOutput,
) -> OODPredictorOutput:
    """Computes mutual information between a set of metadata features and per sample out-of-distribution flags.

    Given a set of metadata features per sample and a corresponding OODOutput that indicates whether a sample was
    determined to be out of distribution, this function calculates the mutual information between each factor and being
    out of distribution. In other words, it finds which metadata factors most likely correlate to an
    out of distribution sample.

    Note
    ----
    A high mutual information between a factor and ood samples is an indication of correlation, but not causation.
    Additional analysis should be done to determine how to handle factors with a high mutual information.


    Parameters
    ----------
    metadata : Metadata
        A set of arrays of values, indexed by metadata feature names, with one value per data example per feature.
    ood : OODOutput
        A class output by DataEval's OOD functions that contains which examples are OOD.

    Returns
    -------
    OODPredictorOutput
        A dictionary with keys corresponding to metadata feature names, and values indicating the strength of
        association between each named feature and the OOD flag, as mutual information measured in bits.
        When no sample is flagged as OOD, every factor maps to 0.0.

    Raises
    ------
    ValueError
        If at least one sample is flagged as OOD and the number of OOD flags differs from the number of
        metadata samples.

    Examples
    --------
    >>> from dataeval.outputs import OODOutput

    All samples are out-of-distribution

    >>> is_ood = OODOutput(np.array([True, True, True]), np.array([]), np.array([]))
    >>> find_ood_predictors(metadata1, is_ood)
    OODPredictorOutput({'time': 8.008566032557951e-17, 'altitude': 8.008566032557951e-17})

    No out-of-distribution samples

    >>> is_ood = OODOutput(np.array([False, False, False]), np.array([]), np.array([]))
    >>> find_ood_predictors(metadata1, is_ood)
    OODPredictorOutput({'time': 0.0, 'altitude': 0.0})
    """

    ood_mask: NDArray[np.bool_] = ood.is_ood

    factors, data = _combine_discrete_continuous(metadata)  # (F, ), (S, F) => F = Fd + Fc

    # Discrete columns lead the combined array only when the discrete block actually
    # made it into `data`; names without values must not be counted as discrete columns
    has_discrete = bool(metadata.discrete_factor_names) and metadata.discrete_data.size != 0
    discrete_features_count = len(metadata.discrete_factor_names) if has_discrete else 0

    # No metadata correlated with out of distribution data, return 0.0 for all factors
    if not np.any(ood_mask):
        return OODPredictorOutput(dict.fromkeys(factors, 0.0))

    if len(data) != len(ood_mask):
        raise ValueError(
            f"ood and metadata must have the same length, got {len(ood_mask)} and {len(data)} respectively."
        )

    # Standardize each factor over all samples. A constant factor has zero spread
    # (and a single sample has an undefined one); dividing by it would inject NaN/inf
    # into the estimator input, so such columns are only centered.
    spread = np.std(data, axis=0, ddof=1)
    spread = np.where(spread > 0, spread, 1.0)
    scaled_data = (data - np.mean(data, axis=0)) / spread  # (S, F)

    # Boolean mask over the columns of `scaled_data`: True for the leading discrete factors
    discrete_features = np.arange(len(factors)) < discrete_features_count

    mutual_info_values = (
        mutual_info_classif(
            X=scaled_data,
            y=ood_mask,
            discrete_features=discrete_features,  # type: ignore -> sklearn issue - NDArray[bool] not of accepted type Union[ArrayLike, 'auto']
            random_state=get_seed(),
        )
        * _NATS2BITS
    )

    return OODPredictorOutput(dict(zip(factors, mutual_info_values)))
|
@@ -8,6 +8,7 @@ import numpy as np
|
|
8
8
|
import scipy as sp
|
9
9
|
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
|
10
10
|
|
11
|
+
from dataeval.config import EPSILON, get_seed
|
11
12
|
from dataeval.outputs import BalanceOutput
|
12
13
|
from dataeval.outputs._base import set_metadata
|
13
14
|
from dataeval.utils._bin import get_counts
|
@@ -91,6 +92,9 @@ def balance(
|
|
91
92
|
sklearn.feature_selection.mutual_info_regression
|
92
93
|
sklearn.metrics.mutual_info_score
|
93
94
|
"""
|
95
|
+
if not metadata.discrete_factor_names and not metadata.continuous_factor_names:
|
96
|
+
raise ValueError("No factors found in provided metadata.")
|
97
|
+
|
94
98
|
num_neighbors = _validate_num_neighbors(num_neighbors)
|
95
99
|
|
96
100
|
num_factors = metadata.total_num_factors
|
@@ -110,7 +114,7 @@ def balance(
|
|
110
114
|
data[:, idx],
|
111
115
|
discrete_features=is_discrete, # type: ignore
|
112
116
|
n_neighbors=num_neighbors,
|
113
|
-
random_state=
|
117
|
+
random_state=get_seed(),
|
114
118
|
)
|
115
119
|
else:
|
116
120
|
mi[idx, :] = mutual_info_classif(
|
@@ -118,13 +122,13 @@ def balance(
|
|
118
122
|
data[:, idx],
|
119
123
|
discrete_features=is_discrete, # type: ignore
|
120
124
|
n_neighbors=num_neighbors,
|
121
|
-
random_state=
|
125
|
+
random_state=get_seed(),
|
122
126
|
)
|
123
127
|
|
124
128
|
# Normalization via entropy
|
125
129
|
bin_cnts = get_counts(discretized_data)
|
126
130
|
ent_factor = sp.stats.entropy(bin_cnts, axis=0)
|
127
|
-
norm_factor = 0.5 * np.add.outer(ent_factor, ent_factor) +
|
131
|
+
norm_factor = 0.5 * np.add.outer(ent_factor, ent_factor) + EPSILON
|
128
132
|
|
129
133
|
# in principle MI should be symmetric, but it is not in practice.
|
130
134
|
nmi = 0.5 * (mi + mi.T) / norm_factor
|
@@ -147,13 +151,13 @@ def balance(
|
|
147
151
|
tgt_bin[:, idx],
|
148
152
|
discrete_features=is_discrete, # type: ignore
|
149
153
|
n_neighbors=num_neighbors,
|
150
|
-
random_state=
|
154
|
+
random_state=get_seed(),
|
151
155
|
)
|
152
156
|
|
153
157
|
# Classwise normalization via entropy
|
154
158
|
classwise_bin_cnts = get_counts(tgt_bin)
|
155
159
|
ent_tgt_bin = sp.stats.entropy(classwise_bin_cnts, axis=0)
|
156
|
-
norm_factor = 0.5 * np.add.outer(ent_tgt_bin, ent_factor) +
|
160
|
+
norm_factor = 0.5 * np.add.outer(ent_tgt_bin, ent_factor) + EPSILON
|
157
161
|
classwise = classwise_mi / norm_factor
|
158
162
|
|
159
163
|
# Grabbing factor names for plotting function
|
@@ -158,6 +158,9 @@ def diversity(
|
|
158
158
|
--------
|
159
159
|
scipy.stats.entropy
|
160
160
|
"""
|
161
|
+
if not metadata.discrete_factor_names and not metadata.continuous_factor_names:
|
162
|
+
raise ValueError("No factors found in provided metadata.")
|
163
|
+
|
161
164
|
diversity_fn = get_method(_DIVERSITY_FN_MAP, method)
|
162
165
|
discretized_data = np.hstack((metadata.class_labels[:, np.newaxis], metadata.discrete_data))
|
163
166
|
cnts = get_counts(discretized_data)
|
dataeval/metrics/bias/_parity.py
CHANGED
@@ -241,6 +241,8 @@ def parity(metadata: Metadata) -> ParityOutput:
|
|
241
241
|
>>> parity(metadata)
|
242
242
|
ParityOutput(score=array([7.357, 5.467, 0.515]), p_value=array([0.289, 0.243, 0.773]), factor_names=['age', 'income', 'gender'], insufficient_data={'age': {3: {'artist': 4}, 4: {'artist': 4, 'teacher': 3}}, 'income': {1: {'artist': 3}}})
|
243
243
|
""" # noqa: E501
|
244
|
+
if not metadata.discrete_factor_names and not metadata.continuous_factor_names:
|
245
|
+
raise ValueError("No factors found in provided metadata.")
|
244
246
|
|
245
247
|
chi_scores = np.zeros(metadata.discrete_data.shape[1])
|
246
248
|
p_values = np.zeros_like(chi_scores)
|
@@ -19,6 +19,7 @@ from numpy.typing import NDArray
|
|
19
19
|
from scipy.sparse import coo_matrix
|
20
20
|
from scipy.stats import mode
|
21
21
|
|
22
|
+
from dataeval.config import EPSILON
|
22
23
|
from dataeval.outputs import BEROutput
|
23
24
|
from dataeval.outputs._base import set_metadata
|
24
25
|
from dataeval.typing import ArrayLike
|
@@ -82,7 +83,7 @@ def ber_knn(images: NDArray[np.float64], labels: NDArray[np.int_], k: int) -> tu
|
|
82
83
|
|
83
84
|
def knn_lowerbound(value: float, classes: int, k: int) -> float:
|
84
85
|
"""Several cases for computing the BER lower bound"""
|
85
|
-
if value <=
|
86
|
+
if value <= EPSILON:
|
86
87
|
return 0.0
|
87
88
|
|
88
89
|
if classes == 2 and k != 1:
|
dataeval/metrics/stats/_base.py
CHANGED
@@ -9,7 +9,7 @@ from copy import deepcopy
|
|
9
9
|
from dataclasses import dataclass
|
10
10
|
from functools import partial
|
11
11
|
from multiprocessing import Pool
|
12
|
-
from typing import Any, Callable, Generic, Iterable, Sequence, TypeVar
|
12
|
+
from typing import Any, Callable, Generic, Iterable, Sequence, TypeVar
|
13
13
|
|
14
14
|
import numpy as np
|
15
15
|
import tqdm
|
@@ -17,7 +17,7 @@ from numpy.typing import NDArray
|
|
17
17
|
|
18
18
|
from dataeval.config import get_max_processes
|
19
19
|
from dataeval.outputs._stats import BaseStatsOutput, SourceIndex
|
20
|
-
from dataeval.typing import ArrayLike, Dataset, ObjectDetectionTarget
|
20
|
+
from dataeval.typing import Array, ArrayLike, Dataset, ObjectDetectionTarget
|
21
21
|
from dataeval.utils._array import to_numpy
|
22
22
|
from dataeval.utils._image import normalize_image_shape, rescale
|
23
23
|
|
@@ -122,22 +122,19 @@ class StatsProcessorOutput:
|
|
122
122
|
|
123
123
|
def process_stats(
|
124
124
|
i: int,
|
125
|
-
|
125
|
+
image: ArrayLike,
|
126
|
+
target: Any,
|
126
127
|
per_box: bool,
|
127
128
|
per_channel: bool,
|
128
129
|
stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
|
129
130
|
) -> StatsProcessorOutput:
|
130
|
-
|
131
|
-
|
132
|
-
target = None if not isinstance(target, ObjectDetectionTarget) else target
|
133
|
-
boxes = to_numpy(target.boxes) if target is not None else None
|
131
|
+
image = to_numpy(image)
|
132
|
+
boxes = to_numpy(target.boxes) if isinstance(target, ObjectDetectionTarget) else None
|
134
133
|
results_list: list[dict[str, Any]] = []
|
135
134
|
source_indices: list[SourceIndex] = []
|
136
135
|
box_counts: list[int] = []
|
137
136
|
warnings_list: list[str] = []
|
138
|
-
|
139
|
-
for i_b, box in enumerate(nboxes):
|
140
|
-
i_b = None if box is None else i_b
|
137
|
+
for i_b, box in [(None, None)] if boxes is None else enumerate(normalize_box_shape(boxes)):
|
141
138
|
processor_list = [p(image, box, per_channel) for p in stats_processor_cls]
|
142
139
|
if any(not p._is_valid_slice for p in processor_list) and i_b is not None and box is not None:
|
143
140
|
warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} is out of bounds of {image.shape}.")
|
@@ -151,17 +148,16 @@ def process_stats(
|
|
151
148
|
|
152
149
|
|
153
150
|
def process_stats_unpack(
|
154
|
-
|
155
|
-
dataset: Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
|
151
|
+
args: tuple[int, ArrayLike, Any],
|
156
152
|
per_box: bool,
|
157
153
|
per_channel: bool,
|
158
154
|
stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
|
159
155
|
) -> StatsProcessorOutput:
|
160
|
-
return process_stats(
|
156
|
+
return process_stats(*args, per_box=per_box, per_channel=per_channel, stats_processor_cls=stats_processor_cls)
|
161
157
|
|
162
158
|
|
163
159
|
def run_stats(
|
164
|
-
dataset: Dataset[
|
160
|
+
dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
|
165
161
|
per_box: bool,
|
166
162
|
per_channel: bool,
|
167
163
|
stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
|
@@ -175,7 +171,7 @@ def run_stats(
|
|
175
171
|
|
176
172
|
Parameters
|
177
173
|
----------
|
178
|
-
data : Dataset[
|
174
|
+
data : Dataset[Array] | Dataset[tuple[Array, Any, Any]]
|
179
175
|
A dataset of images and targets to compute statistics on.
|
180
176
|
per_box : bool
|
181
177
|
A flag which determines if the statistics should be evaluated on a per-box basis or not.
|
@@ -206,18 +202,21 @@ def run_stats(
|
|
206
202
|
warning_list = []
|
207
203
|
stats_processor_cls = stats_processor_cls if isinstance(stats_processor_cls, Iterable) else [stats_processor_cls]
|
208
204
|
|
209
|
-
|
205
|
+
def _enumerate(dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]], per_box: bool):
|
206
|
+
for i in range(len(dataset)):
|
207
|
+
d = dataset[i]
|
208
|
+
yield i, d[0] if isinstance(d, tuple) else d, d[1] if isinstance(d, tuple) and per_box else None
|
209
|
+
|
210
210
|
with Pool(processes=get_max_processes()) as p:
|
211
211
|
for r in tqdm.tqdm(
|
212
212
|
p.imap(
|
213
213
|
partial(
|
214
214
|
process_stats_unpack,
|
215
|
-
dataset=dataset,
|
216
215
|
per_box=per_box,
|
217
216
|
per_channel=per_channel,
|
218
217
|
stats_processor_cls=stats_processor_cls,
|
219
218
|
),
|
220
|
-
|
219
|
+
_enumerate(dataset, per_box),
|
221
220
|
),
|
222
221
|
total=len(dataset),
|
223
222
|
):
|
@@ -248,13 +247,13 @@ def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
|
|
248
247
|
if type(a) is not type(b):
|
249
248
|
raise TypeError(f"Types {type(a)} and {type(b)} cannot be added.")
|
250
249
|
|
251
|
-
sum_dict = deepcopy(a.
|
250
|
+
sum_dict = deepcopy(a.data())
|
252
251
|
|
253
252
|
for k in sum_dict:
|
254
253
|
if isinstance(sum_dict[k], list):
|
255
|
-
sum_dict[k].extend(b.
|
254
|
+
sum_dict[k].extend(b.data()[k])
|
256
255
|
else:
|
257
|
-
sum_dict[k] = np.concatenate((sum_dict[k], b.
|
256
|
+
sum_dict[k] = np.concatenate((sum_dict[k], b.data()[k]))
|
258
257
|
|
259
258
|
return type(a)(**sum_dict)
|
260
259
|
|
@@ -153,7 +153,7 @@ def boxratiostats(
|
|
153
153
|
raise ValueError("Input for boxstats and imgstats must have matching channel information.")
|
154
154
|
|
155
155
|
output_dict = {}
|
156
|
-
for key in boxstats.
|
156
|
+
for key in boxstats.data():
|
157
157
|
output_dict[key] = calculate_ratios(key, boxstats, imgstats)
|
158
158
|
|
159
159
|
return output_cls(**output_dict)
|
@@ -9,7 +9,7 @@ import numpy as np
|
|
9
9
|
from dataeval.metrics.stats._base import StatsProcessor, run_stats
|
10
10
|
from dataeval.outputs import DimensionStatsOutput
|
11
11
|
from dataeval.outputs._base import set_metadata
|
12
|
-
from dataeval.typing import
|
12
|
+
from dataeval.typing import Array, Dataset
|
13
13
|
from dataeval.utils._image import get_bitdepth
|
14
14
|
|
15
15
|
|
@@ -34,7 +34,7 @@ class DimensionStatsProcessor(StatsProcessor[DimensionStatsOutput]):
|
|
34
34
|
|
35
35
|
@set_metadata
|
36
36
|
def dimensionstats(
|
37
|
-
dataset: Dataset[
|
37
|
+
dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
|
38
38
|
*,
|
39
39
|
per_box: bool = False,
|
40
40
|
) -> DimensionStatsOutput:
|
@@ -14,7 +14,7 @@ from scipy.fftpack import dct
|
|
14
14
|
from dataeval.metrics.stats._base import StatsProcessor, run_stats
|
15
15
|
from dataeval.outputs import HashStatsOutput
|
16
16
|
from dataeval.outputs._base import set_metadata
|
17
|
-
from dataeval.typing import ArrayLike, Dataset
|
17
|
+
from dataeval.typing import Array, ArrayLike, Dataset
|
18
18
|
from dataeval.utils._array import as_numpy
|
19
19
|
from dataeval.utils._image import normalize_image_shape, rescale
|
20
20
|
|
@@ -105,7 +105,7 @@ class HashStatsProcessor(StatsProcessor[HashStatsOutput]):
|
|
105
105
|
|
106
106
|
@set_metadata
|
107
107
|
def hashstats(
|
108
|
-
dataset: Dataset[
|
108
|
+
dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
|
109
109
|
*,
|
110
110
|
per_box: bool = False,
|
111
111
|
) -> HashStatsOutput:
|
@@ -10,12 +10,12 @@ from dataeval.metrics.stats._pixelstats import PixelStatsProcessor
|
|
10
10
|
from dataeval.metrics.stats._visualstats import VisualStatsProcessor
|
11
11
|
from dataeval.outputs import ChannelStatsOutput, ImageStatsOutput
|
12
12
|
from dataeval.outputs._base import set_metadata
|
13
|
-
from dataeval.typing import
|
13
|
+
from dataeval.typing import Array, Dataset
|
14
14
|
|
15
15
|
|
16
16
|
@overload
|
17
17
|
def imagestats(
|
18
|
-
dataset: Dataset[
|
18
|
+
dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
|
19
19
|
*,
|
20
20
|
per_box: bool = False,
|
21
21
|
per_channel: Literal[True],
|
@@ -24,7 +24,7 @@ def imagestats(
|
|
24
24
|
|
25
25
|
@overload
|
26
26
|
def imagestats(
|
27
|
-
dataset: Dataset[
|
27
|
+
dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
|
28
28
|
*,
|
29
29
|
per_box: bool = False,
|
30
30
|
per_channel: Literal[False] = False,
|
@@ -33,7 +33,7 @@ def imagestats(
|
|
33
33
|
|
34
34
|
@set_metadata
|
35
35
|
def imagestats(
|
36
|
-
dataset: Dataset[
|
36
|
+
dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
|
37
37
|
*,
|
38
38
|
per_box: bool = False,
|
39
39
|
per_channel: bool = False,
|
@@ -42,8 +42,8 @@ def imagestats(
|
|
42
42
|
Calculates various :term:`statistics<Statistics>` for each image.
|
43
43
|
|
44
44
|
This function computes dimension, pixel and visual metrics
|
45
|
-
on the images or individual bounding boxes for each image
|
46
|
-
|
45
|
+
on the images or individual bounding boxes for each image. If
|
46
|
+
performing calculations per channel dimension stats are excluded.
|
47
47
|
|
48
48
|
Parameters
|
49
49
|
----------
|
@@ -61,7 +61,7 @@ def imagestats(
|
|
61
61
|
|
62
62
|
See Also
|
63
63
|
--------
|
64
|
-
dimensionstats,
|
64
|
+
dimensionstats, pixelstats, visualstats
|
65
65
|
|
66
66
|
Examples
|
67
67
|
--------
|
@@ -91,4 +91,4 @@ def imagestats(
|
|
91
91
|
output_cls = ImageStatsOutput
|
92
92
|
|
93
93
|
outputs = run_stats(dataset, per_box, per_channel, processors)
|
94
|
-
return output_cls(**{k: v for d in outputs for k, v in d.
|
94
|
+
return output_cls(**{k: v for d in outputs for k, v in d.data().items()})
|
@@ -10,7 +10,7 @@ from scipy.stats import entropy, kurtosis, skew
|
|
10
10
|
from dataeval.metrics.stats._base import StatsProcessor, run_stats
|
11
11
|
from dataeval.outputs import PixelStatsOutput
|
12
12
|
from dataeval.outputs._base import set_metadata
|
13
|
-
from dataeval.typing import
|
13
|
+
from dataeval.typing import Array, Dataset
|
14
14
|
|
15
15
|
|
16
16
|
class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
|
@@ -37,7 +37,7 @@ class PixelStatsProcessor(StatsProcessor[PixelStatsOutput]):
|
|
37
37
|
|
38
38
|
@set_metadata
|
39
39
|
def pixelstats(
|
40
|
-
dataset: Dataset[
|
40
|
+
dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
|
41
41
|
*,
|
42
42
|
per_box: bool = False,
|
43
43
|
per_channel: bool = False,
|
@@ -9,7 +9,7 @@ import numpy as np
|
|
9
9
|
from dataeval.metrics.stats._base import StatsProcessor, run_stats
|
10
10
|
from dataeval.outputs import VisualStatsOutput
|
11
11
|
from dataeval.outputs._base import set_metadata
|
12
|
-
from dataeval.typing import
|
12
|
+
from dataeval.typing import Array, Dataset
|
13
13
|
from dataeval.utils._image import edge_filter
|
14
14
|
|
15
15
|
QUARTILES = (0, 25, 50, 75, 100)
|
@@ -44,7 +44,7 @@ class VisualStatsProcessor(StatsProcessor[VisualStatsOutput]):
|
|
44
44
|
|
45
45
|
@set_metadata
|
46
46
|
def visualstats(
|
47
|
-
dataset: Dataset[
|
47
|
+
dataset: Dataset[Array] | Dataset[tuple[Array, Any, Any]],
|
48
48
|
*,
|
49
49
|
per_box: bool = False,
|
50
50
|
per_channel: bool = False,
|
dataeval/outputs/__init__.py
CHANGED
@@ -8,6 +8,7 @@ from ._bias import BalanceOutput, CoverageOutput, DiversityOutput, LabelParityOu
|
|
8
8
|
from ._drift import DriftMMDOutput, DriftOutput
|
9
9
|
from ._estimators import BEROutput, ClustererOutput, DivergenceOutput, UAPOutput
|
10
10
|
from ._linters import DuplicatesOutput, OutliersOutput
|
11
|
+
from ._metadata import MetadataDistanceOutput, MetadataDistanceValues, MostDeviatedFactorsOutput, OODPredictorOutput
|
11
12
|
from ._ood import OODOutput, OODScoreOutput
|
12
13
|
from ._stats import (
|
13
14
|
ChannelStatsOutput,
|
@@ -39,7 +40,11 @@ __all__ = [
|
|
39
40
|
"ImageStatsOutput",
|
40
41
|
"LabelParityOutput",
|
41
42
|
"LabelStatsOutput",
|
43
|
+
"MetadataDistanceOutput",
|
44
|
+
"MetadataDistanceValues",
|
45
|
+
"MostDeviatedFactorsOutput",
|
42
46
|
"OODOutput",
|
47
|
+
"OODPredictorOutput",
|
43
48
|
"OODScoreOutput",
|
44
49
|
"OutliersOutput",
|
45
50
|
"ParityOutput",
|