dataeval 0.64.0__py3-none-any.whl → 0.66.0__py3-none-any.whl
This diff compares the content of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- dataeval/__init__.py +13 -9
- dataeval/_internal/detectors/clusterer.py +63 -49
- dataeval/_internal/detectors/drift/base.py +248 -51
- dataeval/_internal/detectors/drift/cvm.py +28 -26
- dataeval/_internal/detectors/drift/ks.py +31 -28
- dataeval/_internal/detectors/drift/mmd.py +62 -42
- dataeval/_internal/detectors/drift/torch.py +69 -60
- dataeval/_internal/detectors/drift/uncertainty.py +32 -32
- dataeval/_internal/detectors/duplicates.py +67 -31
- dataeval/_internal/detectors/ood/ae.py +15 -29
- dataeval/_internal/detectors/ood/aegmm.py +33 -27
- dataeval/_internal/detectors/ood/base.py +86 -47
- dataeval/_internal/detectors/ood/llr.py +34 -31
- dataeval/_internal/detectors/ood/vae.py +32 -31
- dataeval/_internal/detectors/ood/vaegmm.py +34 -28
- dataeval/_internal/detectors/{linter.py → outliers.py} +60 -38
- dataeval/_internal/flags.py +44 -21
- dataeval/_internal/interop.py +5 -3
- dataeval/_internal/metrics/balance.py +42 -5
- dataeval/_internal/metrics/ber.py +11 -8
- dataeval/_internal/metrics/coverage.py +15 -8
- dataeval/_internal/metrics/divergence.py +41 -7
- dataeval/_internal/metrics/diversity.py +57 -19
- dataeval/_internal/metrics/parity.py +141 -66
- dataeval/_internal/metrics/stats.py +330 -313
- dataeval/_internal/metrics/uap.py +33 -4
- dataeval/_internal/metrics/utils.py +79 -40
- dataeval/_internal/models/pytorch/autoencoder.py +127 -22
- dataeval/_internal/models/tensorflow/autoencoder.py +33 -30
- dataeval/_internal/models/tensorflow/gmm.py +4 -2
- dataeval/_internal/models/tensorflow/losses.py +17 -13
- dataeval/_internal/models/tensorflow/pixelcnn.py +19 -18
- dataeval/_internal/models/tensorflow/trainer.py +10 -7
- dataeval/_internal/models/tensorflow/utils.py +23 -20
- dataeval/_internal/output.py +85 -0
- dataeval/_internal/utils.py +5 -3
- dataeval/_internal/workflows/sufficiency.py +122 -121
- dataeval/detectors/__init__.py +6 -25
- dataeval/detectors/drift/__init__.py +16 -0
- dataeval/detectors/drift/kernels/__init__.py +6 -0
- dataeval/detectors/drift/updates/__init__.py +3 -0
- dataeval/detectors/linters/__init__.py +5 -0
- dataeval/detectors/ood/__init__.py +11 -0
- dataeval/flags/__init__.py +2 -2
- dataeval/metrics/__init__.py +2 -26
- dataeval/metrics/bias/__init__.py +14 -0
- dataeval/metrics/estimators/__init__.py +9 -0
- dataeval/metrics/stats/__init__.py +6 -0
- dataeval/tensorflow/__init__.py +3 -0
- dataeval/tensorflow/loss/__init__.py +3 -0
- dataeval/tensorflow/models/__init__.py +5 -0
- dataeval/tensorflow/recon/__init__.py +3 -0
- dataeval/torch/__init__.py +3 -0
- dataeval/{models/torch → torch/models}/__init__.py +1 -2
- dataeval/torch/trainer/__init__.py +3 -0
- dataeval/utils/__init__.py +3 -6
- dataeval/workflows/__init__.py +2 -4
- {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/METADATA +1 -1
- dataeval-0.66.0.dist-info/RECORD +72 -0
- dataeval/_internal/metrics/base.py +0 -10
- dataeval/models/__init__.py +0 -15
- dataeval/models/tensorflow/__init__.py +0 -6
- dataeval-0.64.0.dist-info/RECORD +0 -60
- {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/WHEEL +0 -0
dataeval/_internal/detectors/ood/llr.py
CHANGED
@@ -6,15 +6,17 @@ Original code Copyright (c) 2023 Seldon Technologies Ltd
 Licensed under Apache Software License (Apache 2.0)
 """
 
+from __future__ import annotations
+
 from functools import partial
-from typing import Callable
+from typing import Callable
 
 import keras
 import numpy as np
 import tensorflow as tf
 from keras.layers import Input
 from keras.models import Model
-from numpy.typing import ArrayLike
+from numpy.typing import ArrayLike, NDArray
 
 from dataeval._internal.detectors.ood.base import OODBase, OODScore
 from dataeval._internal.interop import to_numpy
@@ -24,8 +26,8 @@ from dataeval._internal.models.tensorflow.utils import predict_batch
 
 
 def build_model(
-    dist: PixelCNN, input_shape:
-) ->
+    dist: PixelCNN, input_shape: tuple | None = None, filepath: str | None = None
+) -> tuple[keras.Model, PixelCNN]:
     """
     Create keras.Model from TF distribution.
 
@@ -52,7 +54,7 @@ def build_model(
 
 
 def mutate_categorical(
-    X:
+    X: NDArray,
     rate: float,
     seed: int = 0,
     feature_range: tuple = (0, 255),
@@ -87,28 +89,29 @@ def mutate_categorical(
 
 
 class OOD_LLR(OODBase):
+    """
+    Likelihood Ratios based outlier detector.
+
+    Parameters
+    ----------
+    model : PixelCNN
+        Generative distribution model.
+    model_background : Optional[PixelCNN], default None
+        Optional model for the background. Only needed if it is different from `model`.
+    log_prob : Optional[Callable], default None
+        Function used to evaluate log probabilities under the model
+        if the model does not have a `log_prob` function.
+    sequential : bool, default False
+        Whether the data is sequential. Used to create targets during training.
+    """
+
     def __init__(
         self,
         model: PixelCNN,
-        model_background:
-        log_prob:
+        model_background: PixelCNN | None = None,
+        log_prob: Callable | None = None,
         sequential: bool = False,
     ) -> None:
-        """
-        Likelihood Ratios based outlier detector.
-
-        Parameters
-        ----------
-        model : PixelCNN
-            Generative distribution model.
-        model_background : Optional[PixelCNN], default None
-            Optional model for the background. Only needed if it is different from `model`.
-        log_prob : Optional[Callable], default None
-            Function used to evaluate log probabilities under the model
-            if the model does not have a `log_prob` function.
-        sequential : bool, default False
-            Whether the data is sequential. Used to create targets during training.
-        """
         self.dist_s = model
         self.dist_b = (
             model.copy()
@@ -123,13 +126,13 @@ class OOD_LLR(OODBase):
 
         self._ref_score: OODScore
         self._threshold_perc: float
-        self._data_info:
+        self._data_info: tuple[tuple, type] | None = None
 
     def fit(
         self,
         x_ref: ArrayLike,
         threshold_perc: float = 100.0,
-        loss_fn:
+        loss_fn: Callable | None = None,
         optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
         epochs: int = 20,
         batch_size: int = 64,
@@ -144,10 +147,10 @@ class OOD_LLR(OODBase):
         Parameters
         ----------
         x_ref : ArrayLike
-            Training
+            Training data.
         threshold_perc : float, default 100.0
             Percentage of reference data that is normal.
-        loss_fn :
+        loss_fn : Callable | None, default None
             Loss function used for training.
         optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
             Optimizer used for training.
@@ -221,10 +224,10 @@ class OOD_LLR(OODBase):
     def _logp(
         self,
        dist,
-        X:
+        X: NDArray,
        return_per_feature: bool = False,
        batch_size: int = int(1e10),
-    ) ->
+    ) -> NDArray:
        """
        Compute log probability of a batch of instances under the generative model.
        """
@@ -235,10 +238,10 @@ class OOD_LLR(OODBase):
    def _logp_alt(
        self,
        model: keras.Model,
-        X:
+        X: NDArray,
        return_per_feature: bool = False,
        batch_size: int = int(1e10),
-    ) ->
+    ) -> NDArray:
        """
        Compute log probability of a batch of instances with the user defined log_prob function.
        """
@@ -254,7 +257,7 @@ class OOD_LLR(OODBase):
        axis = tuple(np.arange(len(logp.shape))[1:])
        return np.mean(logp, axis=axis)
 
-    def _llr(self, X:
+    def _llr(self, X: NDArray, return_per_feature: bool, batch_size: int = int(1e10)) -> NDArray:
        """
        Compute likelihood ratios.
 
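For reference, a minimal sketch of constructing and fitting the detector with the signatures shown above; the internal import path is inferred from the file layout, and `pixelcnn_model` and `train_images` are illustrative placeholders, not objects defined in this diff.

from dataeval._internal.detectors.ood.llr import OOD_LLR

# pixelcnn_model: placeholder for an already-built PixelCNN distribution model
detector = OOD_LLR(model=pixelcnn_model)  # model_background and log_prob default to None
# train_images: placeholder reference dataset; only arguments shown in the diff are used
detector.fit(train_images, threshold_perc=95.0, epochs=20, batch_size=64)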
dataeval/_internal/detectors/ood/vae.py
CHANGED
@@ -6,10 +6,13 @@ Original code Copyright (c) 2023 Seldon Technologies Ltd
 Licensed under Apache Software License (Apache 2.0)
 """
 
+from __future__ import annotations
+
 from typing import Callable
 
 import keras
 import numpy as np
+import tensorflow as tf
 from numpy.typing import ArrayLike
 
 from dataeval._internal.detectors.ood.base import OODBase, OODScore
@@ -20,17 +23,33 @@ from dataeval._internal.models.tensorflow.utils import predict_batch
 
 
 class OOD_VAE(OODBase):
+    """
+    VAE based outlier detector.
+
+    Parameters
+    ----------
+    model : VAE
+        A VAE model.
+    samples : int, default 10
+        Number of samples sampled to evaluate each instance.
+
+    Examples
+    --------
+    Instantiate an OOD detector metric with a generic dataset - batch of images with shape (3,25,25)
+
+    >>> metric = OOD_VAE(create_model(VAE, dataset[0].shape))
+
+    Adjusting fit parameters,
+    including setting the fit threshold at 85% for a training set with about 15% out-of-distribution
+
+    >>> metric.fit(dataset, threshold_perc=85, batch_size=128, verbose=False)
+
+    Detect out of distribution samples at the 'feature' level
+
+    >>> result = metric.predict(dataset, ood_type="feature")
+    """
+
     def __init__(self, model: VAE, samples: int = 10) -> None:
-        """
-        VAE based outlier detector.
-
-        Parameters
-        ----------
-        model : VAE
-            A VAE model.
-        samples : int, default 10
-            Number of samples sampled to evaluate each instance.
-        """
         super().__init__(model)
         self.samples = samples
 
@@ -38,32 +57,14 @@ class OOD_VAE(OODBase):
         self,
         x_ref: ArrayLike,
         threshold_perc: float = 100.0,
-        loss_fn: Callable =
+        loss_fn: Callable[..., tf.Tensor] | None = None,
         optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
         epochs: int = 20,
         batch_size: int = 64,
         verbose: bool = True,
     ) -> None:
-
-
-
-        Parameters
-        ----------
-        x_ref : ArrayLike
-            Training batch.
-        threshold_perc : float, default 100.0
-            Percentage of reference data that is normal.
-        loss_fn : Callable, default Elbo(0.05)
-            Loss function used for training.
-        optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
-            Optimizer used for training.
-        epochs : int, default 20
-            Number of training epochs.
-        batch_size : int, default 64
-            Batch size used for training.
-        verbose : bool, default True
-            Whether to print training progress.
-        """
+        if loss_fn is None:
+            loss_fn = Elbo(0.05)
         super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
 
     def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
dataeval/_internal/detectors/ood/vaegmm.py
CHANGED
@@ -6,10 +6,13 @@ Original code Copyright (c) 2023 Seldon Technologies Ltd
 Licensed under Apache Software License (Apache 2.0)
 """
 
+from __future__ import annotations
+
 from typing import Callable
 
 import keras
 import numpy as np
+import tensorflow as tf
 from numpy.typing import ArrayLike
 
 from dataeval._internal.detectors.ood.base import OODGMMBase, OODScore
@@ -21,17 +24,18 @@ from dataeval._internal.models.tensorflow.utils import predict_batch
 
 
 class OOD_VAEGMM(OODGMMBase):
-
-
-        VAE with Gaussian Mixture Model based outlier detector.
+    """
+    VAE with Gaussian Mixture Model based outlier detector.
 
-
-
-
-
-
-
-
+    Parameters
+    ----------
+    model : VAEGMM
+        A VAEGMM model.
+    samples
+        Number of samples sampled to evaluate each instance.
+    """
+
+    def __init__(self, model: VAEGMM, samples: int = 10) -> None:
         super().__init__(model)
         self.samples = samples
 
@@ -39,35 +43,37 @@ class OOD_VAEGMM(OODGMMBase):
         self,
         x_ref: ArrayLike,
         threshold_perc: float = 100.0,
-        loss_fn: Callable =
+        loss_fn: Callable[..., tf.Tensor] | None = None,
         optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
         epochs: int = 20,
         batch_size: int = 64,
         verbose: bool = True,
     ) -> None:
+        if loss_fn is None:
+            loss_fn = LossGMM(elbo=Elbo(0.05))
+        super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
+
+    def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
         """
-
+        Compute the out-of-distribution (OOD) score for a given dataset.
 
         Parameters
         ----------
         X : ArrayLike
-
-
-
-
-            Loss function used for training.
-        optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
-            Optimizer used for training.
-        epochs : int, default 20
-            Number of training epochs.
-        batch_size : int, default 64
-            Batch size used for training.
-        verbose : bool, default True
-            Whether to print training progress.
-        """
-        super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
+            Input data to score.
+        batch_size : int, default 1e10
+            Number of instances to process in each batch.
+            Use a smaller batch size if your dataset is large or if you encounter memory issues.
 
-
+        Returns
+        -------
+        OODScore
+            An object containing the instance-level OOD score.
+
+        Note
+        ----
+        This model does not produce a feature level score like the OOD_AE or OOD_VAE models.
+        """
         self._validate(X := to_numpy(X))
 
         # draw samples from latent space
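A minimal sketch of the new default-loss behavior shown in the hunk above; the import path is inferred from the file layout, and `vaegmm_model` and `train_images` are illustrative placeholders.

from dataeval._internal.detectors.ood.vaegmm import OOD_VAEGMM

detector = OOD_VAEGMM(vaegmm_model, samples=10)  # vaegmm_model: placeholder VAEGMM instance
detector.fit(train_images, threshold_perc=95.0)  # loss_fn=None now falls back to LossGMM(elbo=Elbo(0.05))
score = detector.score(train_images)             # instance-level OODScore only; no feature-level score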
dataeval/_internal/detectors/{linter.py → outliers.py}
CHANGED
@@ -1,15 +1,32 @@
-from
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Iterable, Literal
 
 import numpy as np
-from numpy.typing import ArrayLike
+from numpy.typing import ArrayLike, NDArray
+
+from dataeval._internal.flags import ImageStat, to_distinct, verify_supported
+from dataeval._internal.metrics.stats import StatsOutput, imagestats
+from dataeval._internal.output import OutputMetadata, set_metadata
+
+
+@dataclass(frozen=True)
+class OutliersOutput(OutputMetadata):
+    """
+    Attributes
+    ----------
+    issues : Dict[int, Dict[str, float]]
+        Dictionary containing the indices of outliers and a dictionary showing
+        the issues and calculated values for the given index.
+    """
 
-
-from dataeval._internal.metrics.stats import ImageStats
+    issues: dict[int, dict[str, float]]
 
 
 def _get_outlier_mask(
-    values:
-) ->
+    values: NDArray, method: Literal["zscore", "modzscore", "iqr"], threshold: float | None
+) -> NDArray:
     if method == "zscore":
         threshold = threshold if threshold else 3.0
         std = np.std(values)
@@ -18,7 +35,7 @@ def _get_outlier_mask(
     elif method == "modzscore":
         threshold = threshold if threshold else 3.5
         abs_diff = np.abs(values - np.median(values))
-        med_abs_diff = np.median(abs_diff)
+        med_abs_diff = np.median(abs_diff) if np.median(abs_diff) != 0 else np.mean(abs_diff)
         mod_z_score = 0.6745 * abs_diff / med_abs_diff
         return mod_z_score > threshold
     elif method == "iqr":
@@ -30,14 +47,15 @@ def _get_outlier_mask(
     raise ValueError("Outlier method must be 'zscore' 'modzscore' or 'iqr'.")
 
 
-class
+class Outliers:
     r"""
     Calculates statistical outliers of a dataset using various statistical tests applied to each image
 
     Parameters
     ----------
-    flags :
+    flags : ImageStat, default ImageStat.ALL_PROPERTIES | ImageStat.ALL_VISUALS
         Metric(s) to calculate for each image - calculates all metrics if None
+        Only supports ImageStat.ALL_STATS
     outlier_method : ["modzscore" | "zscore" | "iqr"], optional - default "modzscore"
         Statistical method used to identify outliers
     outlier_threshold : float, optional - default None
@@ -46,8 +64,8 @@ class Linter:
 
     Attributes
    ----------
-    stats :
-
+    stats : Dict[str, Any]
+        Dictionary to hold the value of each metric for each image
 
     See Also
    --------
@@ -75,42 +93,40 @@ class Linter:
 
     Examples
    --------
-    Initialize the
+    Initialize the Outliers class:
 
-    >>>
+    >>> outliers = Outliers()
 
     Specifying specific metrics to analyze:
 
-    >>>
+    >>> outliers = Outliers(flags=ImageStat.SIZE | ImageStat.ALL_VISUALS)
 
     Specifying an outlier method:
 
-    >>>
+    >>> outliers = Outliers(outlier_method="iqr")
 
     Specifying an outlier method and threshold:
 
-    >>>
+    >>> outliers = Outliers(outlier_method="zscore", outlier_threshold=2.5)
     """
 
     def __init__(
         self,
-        flags:
+        flags: ImageStat = ImageStat.ALL_PROPERTIES | ImageStat.ALL_VISUALS,
         outlier_method: Literal["zscore", "modzscore", "iqr"] = "modzscore",
-        outlier_threshold:
+        outlier_threshold: float | None = None,
     ):
-        flags
-        self.
+        verify_supported(flags, ImageStat.ALL_STATS)
+        self.flags = flags
         self.outlier_method: Literal["zscore", "modzscore", "iqr"] = outlier_method
         self.outlier_threshold = outlier_threshold
 
     def _get_outliers(self) -> dict:
         flagged_images = {}
-
-
-
-
-
-            if values.ndim == 1 and np.std(values) != 0:
+        stats_dict = self.stats.dict()
+        supported = to_distinct(ImageStat.ALL_STATS)
+        for stat, values in stats_dict.items():
+            if stat in supported.values() and values.ndim == 1 and np.std(values) != 0:
                 mask = _get_outlier_mask(values, self.outlier_method, self.outlier_threshold)
                 indices = np.flatnonzero(mask)
                 for i, value in zip(indices, values[mask]):
@@ -118,30 +134,36 @@ class Linter:
 
         return dict(sorted(flagged_images.items()))
 
-
+    @set_metadata("dataeval.detectors", ["flags", "outlier_method", "outlier_threshold"])
+    def evaluate(self, data: Iterable[ArrayLike] | StatsOutput) -> OutliersOutput:
         """
         Returns indices of outliers with the issues identified for each
 
         Parameters
        ----------
-
-        A dataset in an ArrayLike format
-        Function expects the data to have 3 dimensions, CxHxW.
+        data : Iterable[ArrayLike], shape - (C, H, W) | StatsOutput
+            A dataset of images in an ArrayLike format or the output from an imagestats metric analysis
 
         Returns
        -------
-
-
+        OutliersOutput
+            Output class containing the indices of outliers and a dictionary showing
            the issues and calculated values for the given index.
 
         Example
        -------
         Evaluate the dataset:
 
-        >>>
-        {18: {'brightness': 0.78}, 25: {'brightness': 0.98}}
+        >>> outliers.evaluate(images)
+        OutliersOutput(issues={18: {'brightness': 0.78}, 25: {'brightness': 0.98}})
         """
-
-
-
-
+        if isinstance(data, StatsOutput):
+            flags = set(to_distinct(self.flags).values())
+            stats = set(data.dict())
+            missing = flags - stats
+            if missing:
+                raise ValueError(f"StatsOutput is missing {missing} from the required stats: {flags}.")
+            self.stats = data
+        else:
+            self.stats = imagestats(data, self.flags)
+        return OutliersOutput(self._get_outliers())
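A minimal sketch of the two input paths accepted by the new `evaluate` signature above; the import paths are inferred from the file layout in this diff, and `images` is an illustrative placeholder for an iterable of (C, H, W) arrays.

from dataeval._internal.detectors.outliers import Outliers
from dataeval._internal.flags import ImageStat
from dataeval._internal.metrics.stats import imagestats

outliers = Outliers()  # defaults to ImageStat.ALL_PROPERTIES | ImageStat.ALL_VISUALS

# Path 1: raw images - evaluate() computes imagestats internally with the configured flags
result = outliers.evaluate(images)

# Path 2: a precomputed StatsOutput - it must contain every stat the configured flags
# require, otherwise evaluate() raises ValueError
stats = imagestats(images, ImageStat.ALL_PROPERTIES | ImageStat.ALL_VISUALS)
result = outliers.evaluate(stats)
print(result.issues)  # {image_index: {stat_name: value}, ...}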
dataeval/_internal/flags.py
CHANGED
@@ -1,37 +1,33 @@
-from
-from typing import Union
+from __future__ import annotations
 
+from enum import IntFlag, auto
+from functools import reduce
+from typing import Iterable, TypeVar, cast
 
-
-    def __get__(self, _, cls):
-        return ~cls(0)
+TFlag = TypeVar("TFlag", bound=IntFlag)
 
 
-class
+class ImageStat(IntFlag):
+    """
+    Flags for calculating image and channel statistics
+    """
+
+    # HASHES
     XXHASH = auto()
     PCHASH = auto()
-
-
-
-class ImageProperty(Flag):
+    # PROPERTIES
     WIDTH = auto()
     HEIGHT = auto()
     SIZE = auto()
     ASPECT_RATIO = auto()
     CHANNELS = auto()
     DEPTH = auto()
-
-
-
-class ImageVisuals(Flag):
+    # VISUALS
     BRIGHTNESS = auto()
     BLURRINESS = auto()
     MISSING = auto()
     ZERO = auto()
-
-
-
-class ImageStatistics(Flag):
+    # PIXEL STATS
     MEAN = auto()
     STD = auto()
     VAR = auto()
@@ -40,8 +36,35 @@ class ImageStatistics(Flag):
     ENTROPY = auto()
     PERCENTILES = auto()
     HISTOGRAM = auto()
-
+    # JOINT FLAGS
+    ALL_HASHES = XXHASH | PCHASH
+    ALL_PROPERTIES = WIDTH | HEIGHT | SIZE | ASPECT_RATIO | CHANNELS | DEPTH
+    ALL_VISUALS = BRIGHTNESS | BLURRINESS | MISSING | ZERO
+    ALL_PIXELSTATS = MEAN | STD | VAR | SKEW | KURTOSIS | ENTROPY | PERCENTILES | HISTOGRAM
+    ALL_STATS = ALL_PROPERTIES | ALL_VISUALS | ALL_PIXELSTATS
+    ALL = ALL_HASHES | ALL_STATS
+
+
+def is_distinct(flag: IntFlag) -> bool:
+    return (flag & (flag - 1) == 0) and flag != 0
+
+
+def to_distinct(flag: TFlag) -> dict[TFlag, str]:
+    """
+    Returns a distinct set of all flags set on the input flag and their names
+
+    NOTE: this is supported natively in Python 3.11, but for earlier versions we need
+    to use a combination of list comprehension and bit fiddling to determine distinct
+    flag values from joint aliases.
+    """
+    if isinstance(flag, Iterable):  # >= py311
+        return {f: f.name.lower() for f in flag if f.name}
+    else:  # < py311
+        return {f: f.name.lower() for f in list(flag.__class__) if f & flag and is_distinct(f) and f.name}
 
 
-
-
+def verify_supported(flag: TFlag, flags: TFlag | Iterable[TFlag]):
+    supported = flags if isinstance(flags, flag.__class__) else cast(TFlag, reduce(lambda a, b: a | b, flags))  # type: ignore
+    unsupported = flag & ~supported
+    if unsupported:
+        raise ValueError(f"Unsupported flags {unsupported} called. Only {supported} flags are supported.")
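A minimal sketch of how the new helpers behave, inferred directly from the code shown in the hunk above rather than from separate documentation:

from dataeval._internal.flags import ImageStat, to_distinct, verify_supported

# to_distinct expands a joint alias into its single-bit members, keyed to their lowercase names
to_distinct(ImageStat.ALL_HASHES)  # roughly {XXHASH: 'xxhash', PCHASH: 'pchash'}

# verify_supported raises when a flag falls outside the allowed set
verify_supported(ImageStat.BRIGHTNESS, ImageStat.ALL_STATS)  # ok - BRIGHTNESS is part of ALL_VISUALS
verify_supported(ImageStat.XXHASH, ImageStat.ALL_STATS)      # raises ValueError - hashes are not stats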
dataeval/_internal/interop.py
CHANGED
@@ -1,8 +1,10 @@
+from __future__ import annotations
+
 from importlib import import_module
-from typing import Iterable
+from typing import Iterable
 
 import numpy as np
-from numpy.typing import ArrayLike
+from numpy.typing import ArrayLike, NDArray
 
 module_cache = {}
 
@@ -20,7 +22,7 @@ def try_import(module_name):
     return module
 
 
-def to_numpy(array:
+def to_numpy(array: ArrayLike | None) -> NDArray:
     if array is None:
         return np.ndarray([])
 
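A minimal sketch of the updated `to_numpy` signature above; the nested-list input is just an illustrative ArrayLike value.

from dataeval._internal.interop import to_numpy

arr = to_numpy([[0, 1], [2, 3]])  # returns an NDArray
empty = to_numpy(None)            # per the diff, None maps to an empty np.ndarray([])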