dataeval 0.65.0__py3-none-any.whl → 0.66.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +13 -9
- dataeval/_internal/detectors/clusterer.py +24 -22
- dataeval/_internal/detectors/drift/base.py +206 -26
- dataeval/_internal/detectors/drift/cvm.py +25 -23
- dataeval/_internal/detectors/drift/ks.py +28 -25
- dataeval/_internal/detectors/drift/mmd.py +30 -29
- dataeval/_internal/detectors/drift/torch.py +66 -58
- dataeval/_internal/detectors/drift/uncertainty.py +28 -28
- dataeval/_internal/detectors/duplicates.py +28 -18
- dataeval/_internal/detectors/ood/ae.py +15 -29
- dataeval/_internal/detectors/ood/aegmm.py +33 -27
- dataeval/_internal/detectors/ood/base.py +61 -43
- dataeval/_internal/detectors/ood/llr.py +27 -24
- dataeval/_internal/detectors/ood/vae.py +32 -31
- dataeval/_internal/detectors/ood/vaegmm.py +34 -28
- dataeval/_internal/detectors/{linter.py → outliers.py} +33 -27
- dataeval/_internal/flags.py +5 -3
- dataeval/_internal/interop.py +4 -2
- dataeval/_internal/metrics/balance.py +33 -4
- dataeval/_internal/metrics/ber.py +6 -4
- dataeval/_internal/metrics/diversity.py +45 -12
- dataeval/_internal/metrics/parity.py +114 -26
- dataeval/_internal/metrics/stats.py +154 -16
- dataeval/_internal/metrics/uap.py +28 -2
- dataeval/_internal/metrics/utils.py +20 -18
- dataeval/_internal/models/pytorch/autoencoder.py +127 -22
- dataeval/_internal/models/tensorflow/autoencoder.py +33 -30
- dataeval/_internal/models/tensorflow/gmm.py +4 -2
- dataeval/_internal/models/tensorflow/losses.py +15 -11
- dataeval/_internal/models/tensorflow/pixelcnn.py +19 -18
- dataeval/_internal/models/tensorflow/trainer.py +8 -6
- dataeval/_internal/models/tensorflow/utils.py +21 -19
- dataeval/_internal/output.py +13 -10
- dataeval/_internal/utils.py +5 -3
- dataeval/_internal/workflows/sufficiency.py +42 -30
- dataeval/detectors/__init__.py +6 -25
- dataeval/detectors/drift/__init__.py +16 -0
- dataeval/detectors/drift/kernels/__init__.py +6 -0
- dataeval/detectors/drift/updates/__init__.py +3 -0
- dataeval/detectors/linters/__init__.py +5 -0
- dataeval/detectors/ood/__init__.py +11 -0
- dataeval/metrics/__init__.py +2 -26
- dataeval/metrics/bias/__init__.py +14 -0
- dataeval/metrics/estimators/__init__.py +9 -0
- dataeval/metrics/stats/__init__.py +6 -0
- dataeval/tensorflow/__init__.py +3 -0
- dataeval/tensorflow/loss/__init__.py +3 -0
- dataeval/tensorflow/models/__init__.py +5 -0
- dataeval/tensorflow/recon/__init__.py +3 -0
- dataeval/torch/__init__.py +3 -0
- dataeval/{models/torch → torch/models}/__init__.py +1 -2
- dataeval/torch/trainer/__init__.py +3 -0
- dataeval/utils/__init__.py +3 -6
- dataeval/workflows/__init__.py +2 -4
- {dataeval-0.65.0.dist-info → dataeval-0.66.0.dist-info}/METADATA +1 -1
- dataeval-0.66.0.dist-info/RECORD +72 -0
- dataeval/models/__init__.py +0 -15
- dataeval/models/tensorflow/__init__.py +0 -6
- dataeval-0.65.0.dist-info/RECORD +0 -60
- {dataeval-0.65.0.dist-info → dataeval-0.66.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.65.0.dist-info → dataeval-0.66.0.dist-info}/WHEEL +0 -0
@@ -1,12 +1,13 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
1
3
|
from dataclasses import dataclass
|
2
|
-
from typing import
|
4
|
+
from typing import Iterable
|
3
5
|
|
4
6
|
from numpy.typing import ArrayLike
|
5
7
|
|
6
|
-
from dataeval._internal.
|
8
|
+
from dataeval._internal.flags import ImageStat
|
9
|
+
from dataeval._internal.metrics.stats import StatsOutput, imagestats
|
7
10
|
from dataeval._internal.output import OutputMetadata, set_metadata
|
8
|
-
from dataeval.flags import ImageStat
|
9
|
-
from dataeval.metrics import imagestats
|
10
11
|
|
11
12
|
|
12
13
|
@dataclass(frozen=True)
|
@@ -20,8 +21,8 @@ class DuplicatesOutput(OutputMetadata):
|
|
20
21
|
Indices of images that are near matches
|
21
22
|
"""
|
22
23
|
|
23
|
-
exact:
|
24
|
-
near:
|
24
|
+
exact: list[list[int]]
|
25
|
+
near: list[list[int]]
|
25
26
|
|
26
27
|
|
27
28
|
class Duplicates:
|
@@ -34,6 +35,11 @@ class Duplicates:
|
|
34
35
|
stats : StatsOutput
|
35
36
|
Output class of stats
|
36
37
|
|
38
|
+
Parameters
|
39
|
+
----------
|
40
|
+
only_exact : bool, default False
|
41
|
+
Only inspect the dataset for exact image matches
|
42
|
+
|
37
43
|
Example
|
38
44
|
-------
|
39
45
|
Initialize the Duplicates class:
|
@@ -41,12 +47,11 @@ class Duplicates:
|
|
41
47
|
>>> dups = Duplicates()
|
42
48
|
"""
|
43
49
|
|
44
|
-
def __init__(self,
|
50
|
+
def __init__(self, only_exact: bool = False):
|
45
51
|
self.stats: StatsOutput
|
46
|
-
self.
|
47
|
-
self.find_near = find_near
|
52
|
+
self.only_exact = only_exact
|
48
53
|
|
49
|
-
def _get_duplicates(self) ->
|
54
|
+
def _get_duplicates(self) -> dict[str, list[list[int]]]:
|
50
55
|
stats_dict = self.stats.dict()
|
51
56
|
if "xxhash" in stats_dict:
|
52
57
|
exact = {}
|
@@ -56,7 +61,7 @@ class Duplicates:
|
|
56
61
|
else:
|
57
62
|
exact = []
|
58
63
|
|
59
|
-
if "pchash" in stats_dict:
|
64
|
+
if "pchash" in stats_dict and not self.only_exact:
|
60
65
|
near = {}
|
61
66
|
for i, value in enumerate(stats_dict["pchash"]):
|
62
67
|
near.setdefault(value, []).append(i)
|
@@ -69,15 +74,15 @@ class Duplicates:
|
|
69
74
|
"near": sorted(near),
|
70
75
|
}
|
71
76
|
|
72
|
-
@set_metadata("dataeval.detectors", ["
|
73
|
-
def evaluate(self,
|
77
|
+
@set_metadata("dataeval.detectors", ["only_exact"])
|
78
|
+
def evaluate(self, data: Iterable[ArrayLike] | StatsOutput) -> DuplicatesOutput:
|
74
79
|
"""
|
75
80
|
Returns duplicate image indices for both exact matches and near matches
|
76
81
|
|
77
82
|
Parameters
|
78
83
|
----------
|
79
|
-
|
80
|
-
A
|
84
|
+
data : Iterable[ArrayLike], shape - (N, C, H, W) | StatsOutput
|
85
|
+
A dataset of images in an ArrayLike format or the output from an imagestats metric analysis
|
81
86
|
|
82
87
|
Returns
|
83
88
|
-------
|
@@ -93,7 +98,12 @@ class Duplicates:
|
|
93
98
|
>>> dups.evaluate(images)
|
94
99
|
DuplicatesOutput(exact=[[3, 20], [16, 37]], near=[[3, 20, 22], [12, 18], [13, 36], [14, 31], [17, 27], [19, 38, 47]])
|
95
100
|
""" # noqa: E501
|
96
|
-
|
97
|
-
|
98
|
-
|
101
|
+
if isinstance(data, StatsOutput):
|
102
|
+
if not data.xxhash:
|
103
|
+
raise ValueError("StatsOutput must include xxhash information of the images.")
|
104
|
+
if not self.only_exact and not data.pchash:
|
105
|
+
raise ValueError("StatsOutput must include pchash information of the images for near matches.")
|
106
|
+
self.stats = data
|
107
|
+
else:
|
108
|
+
self.stats = imagestats(data, ImageStat.XXHASH | (ImageStat(0) if self.only_exact else ImageStat.PCHASH))
|
99
109
|
return DuplicatesOutput(**self._get_duplicates())
|
@@ -6,10 +6,13 @@ Original code Copyright (c) 2023 Seldon Technologies Ltd
|
|
6
6
|
Licensed under Apache Software License (Apache 2.0)
|
7
7
|
"""
|
8
8
|
|
9
|
+
from __future__ import annotations
|
10
|
+
|
9
11
|
from typing import Callable
|
10
12
|
|
11
13
|
import keras
|
12
14
|
import numpy as np
|
15
|
+
import tensorflow as tf
|
13
16
|
from numpy.typing import ArrayLike
|
14
17
|
|
15
18
|
from dataeval._internal.detectors.ood.base import OODBase, OODScore
|
@@ -19,47 +22,30 @@ from dataeval._internal.models.tensorflow.utils import predict_batch
|
|
19
22
|
|
20
23
|
|
21
24
|
class OOD_AE(OODBase):
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
+
"""
|
26
|
+
Autoencoder based out-of-distribution detector.
|
27
|
+
|
28
|
+
Parameters
|
29
|
+
----------
|
30
|
+
model : AE
|
31
|
+
An Autoencoder model.
|
32
|
+
"""
|
25
33
|
|
26
|
-
|
27
|
-
----------
|
28
|
-
model : AE
|
29
|
-
An Autoencoder model.
|
30
|
-
"""
|
34
|
+
def __init__(self, model: AE) -> None:
|
31
35
|
super().__init__(model)
|
32
36
|
|
33
37
|
def fit(
|
34
38
|
self,
|
35
39
|
x_ref: ArrayLike,
|
36
40
|
threshold_perc: float = 100.0,
|
37
|
-
loss_fn: Callable =
|
41
|
+
loss_fn: Callable[..., tf.Tensor] | None = None,
|
38
42
|
optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
|
39
43
|
epochs: int = 20,
|
40
44
|
batch_size: int = 64,
|
41
45
|
verbose: bool = True,
|
42
46
|
) -> None:
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
Parameters
|
47
|
-
----------
|
48
|
-
x_ref : ArrayLike
|
49
|
-
Training batch.
|
50
|
-
threshold_perc : float, default 100.0
|
51
|
-
Percentage of reference data that is normal.
|
52
|
-
loss_fn : Callable, default keras.losses.MeanSquaredError()
|
53
|
-
Loss function used for training.
|
54
|
-
optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
|
55
|
-
Optimizer used for training.
|
56
|
-
epochs : int, default 20
|
57
|
-
Number of training epochs.
|
58
|
-
batch_size : int, default 64
|
59
|
-
Batch size used for training.
|
60
|
-
verbose : bool, default True
|
61
|
-
Whether to print training progress.
|
62
|
-
"""
|
47
|
+
if loss_fn is None:
|
48
|
+
loss_fn = keras.losses.MeanSquaredError()
|
63
49
|
super().fit(to_numpy(x_ref), threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
|
64
50
|
|
65
51
|
def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
|
@@ -6,9 +6,12 @@ Original code Copyright (c) 2023 Seldon Technologies Ltd
|
|
6
6
|
Licensed under Apache Software License (Apache 2.0)
|
7
7
|
"""
|
8
8
|
|
9
|
+
from __future__ import annotations
|
10
|
+
|
9
11
|
from typing import Callable
|
10
12
|
|
11
13
|
import keras
|
14
|
+
import tensorflow as tf
|
12
15
|
from numpy.typing import ArrayLike
|
13
16
|
|
14
17
|
from dataeval._internal.detectors.ood.base import OODGMMBase, OODScore
|
@@ -20,50 +23,53 @@ from dataeval._internal.models.tensorflow.utils import predict_batch
|
|
20
23
|
|
21
24
|
|
22
25
|
class OOD_AEGMM(OODGMMBase):
|
23
|
-
|
24
|
-
|
25
|
-
AE with Gaussian Mixture Model based outlier detector.
|
26
|
+
"""
|
27
|
+
AE with Gaussian Mixture Model based outlier detector.
|
26
28
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
29
|
+
Parameters
|
30
|
+
----------
|
31
|
+
model : AEGMM
|
32
|
+
An AEGMM model.
|
33
|
+
"""
|
34
|
+
|
35
|
+
def __init__(self, model: AEGMM) -> None:
|
32
36
|
super().__init__(model)
|
33
37
|
|
34
38
|
def fit(
|
35
39
|
self,
|
36
40
|
x_ref: ArrayLike,
|
37
41
|
threshold_perc: float = 100.0,
|
38
|
-
loss_fn: Callable =
|
42
|
+
loss_fn: Callable[..., tf.Tensor] | None = None,
|
39
43
|
optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
|
40
44
|
epochs: int = 20,
|
41
45
|
batch_size: int = 64,
|
42
46
|
verbose: bool = True,
|
43
47
|
) -> None:
|
48
|
+
if loss_fn is None:
|
49
|
+
loss_fn = LossGMM()
|
50
|
+
super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
|
51
|
+
|
52
|
+
def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
|
44
53
|
"""
|
45
|
-
|
54
|
+
Compute the out-of-distribution (OOD) score for a given dataset.
|
46
55
|
|
47
56
|
Parameters
|
48
57
|
----------
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
Loss function used for training.
|
55
|
-
optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
|
56
|
-
Optimizer used for training.
|
57
|
-
epochs : int, default 20
|
58
|
-
Number of training epochs.
|
59
|
-
batch_size : int, default 64
|
60
|
-
Batch size used for training.
|
61
|
-
verbose : bool, default True
|
62
|
-
Whether to print training progress.
|
63
|
-
"""
|
64
|
-
super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
|
58
|
+
X : ArrayLike
|
59
|
+
Input data to score.
|
60
|
+
batch_size : int, default 1e10
|
61
|
+
Number of instances to process in each batch.
|
62
|
+
Use a smaller batch size if your dataset is large or if you encounter memory issues.
|
65
63
|
|
66
|
-
|
64
|
+
Returns
|
65
|
+
-------
|
66
|
+
OODScore
|
67
|
+
An object containing the instance-level OOD score.
|
68
|
+
|
69
|
+
Note
|
70
|
+
----
|
71
|
+
This model does not produce a feature level score like the OOD_AE or OOD_VAE models.
|
72
|
+
"""
|
67
73
|
self._validate(X := to_numpy(X))
|
68
74
|
_, z, _ = predict_batch(X, self.model, batch_size=batch_size)
|
69
75
|
energy, _ = gmm_energy(z, self.gmm_params, return_mean=False)
|
@@ -6,9 +6,11 @@ Original code Copyright (c) 2023 Seldon Technologies Ltd
|
|
6
6
|
Licensed under Apache Software License (Apache 2.0)
|
7
7
|
"""
|
8
8
|
|
9
|
+
from __future__ import annotations
|
10
|
+
|
9
11
|
from abc import ABC, abstractmethod
|
10
12
|
from dataclasses import dataclass
|
11
|
-
from typing import Callable,
|
13
|
+
from typing import Callable, Literal, NamedTuple, cast
|
12
14
|
|
13
15
|
import keras
|
14
16
|
import numpy as np
|
@@ -26,17 +28,17 @@ class OODOutput(OutputMetadata):
|
|
26
28
|
"""
|
27
29
|
Attributes
|
28
30
|
----------
|
29
|
-
is_ood : NDArray
|
31
|
+
is_ood : NDArray
|
30
32
|
Array of images that are detected as out of distribution
|
31
|
-
instance_score : NDArray
|
33
|
+
instance_score : NDArray
|
32
34
|
Instance score of the evaluated dataset
|
33
|
-
feature_score :
|
35
|
+
feature_score : NDArray | None
|
34
36
|
Feature score, if available, of the evaluated dataset
|
35
37
|
"""
|
36
38
|
|
37
39
|
is_ood: NDArray[np.bool_]
|
38
40
|
instance_score: NDArray[np.float32]
|
39
|
-
feature_score:
|
41
|
+
feature_score: NDArray[np.float32] | None
|
40
42
|
|
41
43
|
|
42
44
|
class OODScore(NamedTuple):
|
@@ -45,16 +47,28 @@ class OODScore(NamedTuple):
|
|
45
47
|
|
46
48
|
Parameters
|
47
49
|
----------
|
48
|
-
instance_score : NDArray
|
50
|
+
instance_score : NDArray
|
49
51
|
Instance score of the evaluated dataset.
|
50
|
-
feature_score :
|
52
|
+
feature_score : NDArray | None, default None
|
51
53
|
Feature score, if available, of the evaluated dataset.
|
52
54
|
"""
|
53
55
|
|
54
56
|
instance_score: NDArray[np.float32]
|
55
|
-
feature_score:
|
57
|
+
feature_score: NDArray[np.float32] | None = None
|
56
58
|
|
57
59
|
def get(self, ood_type: Literal["instance", "feature"]) -> NDArray:
|
60
|
+
"""
|
61
|
+
Returns either the instance or feature score
|
62
|
+
|
63
|
+
Parameters
|
64
|
+
----------
|
65
|
+
ood_type : "instance" | "feature"
|
66
|
+
|
67
|
+
Returns
|
68
|
+
-------
|
69
|
+
NDArray
|
70
|
+
Either the instance or feature score based on input selection
|
71
|
+
"""
|
58
72
|
return self.instance_score if ood_type == "instance" or self.feature_score is None else self.feature_score
|
59
73
|
|
60
74
|
|
@@ -64,12 +78,12 @@ class OODBase(ABC):
|
|
64
78
|
|
65
79
|
self._ref_score: OODScore
|
66
80
|
self._threshold_perc: float
|
67
|
-
self._data_info:
|
81
|
+
self._data_info: tuple[tuple, type] | None = None
|
68
82
|
|
69
83
|
if not isinstance(model, keras.Model):
|
70
84
|
raise TypeError("Model should be of type 'keras.Model'.")
|
71
85
|
|
72
|
-
def _get_data_info(self, X: NDArray) ->
|
86
|
+
def _get_data_info(self, X: NDArray) -> tuple[tuple, type]:
|
73
87
|
if not isinstance(X, np.ndarray):
|
74
88
|
raise TypeError("Dataset should of type: `NDArray`.")
|
75
89
|
return X.shape[1:], X.dtype.type
|
@@ -80,7 +94,7 @@ class OODBase(ABC):
|
|
80
94
|
raise RuntimeError(f"Expect data of type: {self._data_info[1]} and shape: {self._data_info[0]}. \
|
81
95
|
Provided data is type: {check_data_info[1]} and shape: {check_data_info[0]}.")
|
82
96
|
|
83
|
-
def _validate_state(self, X: NDArray, additional_attrs:
|
97
|
+
def _validate_state(self, X: NDArray, additional_attrs: list[str] | None = None) -> None:
|
84
98
|
attrs = ["_data_info", "_threshold_perc", "_ref_score"]
|
85
99
|
attrs = attrs if additional_attrs is None else attrs + additional_attrs
|
86
100
|
if not all(hasattr(self, attr) for attr in attrs) or any(getattr(self, attr) for attr in attrs) is None:
|
@@ -90,18 +104,20 @@ class OODBase(ABC):
|
|
90
104
|
@abstractmethod
|
91
105
|
def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
|
92
106
|
"""
|
93
|
-
Compute
|
107
|
+
Compute the out-of-distribution (OOD) scores for a given dataset.
|
94
108
|
|
95
109
|
Parameters
|
96
110
|
----------
|
97
111
|
X : ArrayLike
|
98
|
-
|
99
|
-
batch_size : int, default
|
100
|
-
|
112
|
+
Input data to score.
|
113
|
+
batch_size : int, default 1e10
|
114
|
+
Number of instances to process in each batch.
|
115
|
+
Use a smaller batch size if your dataset is large or if you encounter memory issues.
|
101
116
|
|
102
117
|
Returns
|
103
118
|
-------
|
104
|
-
|
119
|
+
OODScore
|
120
|
+
An object containing the instance-level and feature-level OOD scores.
|
105
121
|
"""
|
106
122
|
|
107
123
|
def _threshold_score(self, ood_type: Literal["feature", "instance"] = "instance") -> np.floating:
|
@@ -110,33 +126,34 @@ class OODBase(ABC):
|
|
110
126
|
def fit(
|
111
127
|
self,
|
112
128
|
x_ref: ArrayLike,
|
113
|
-
threshold_perc: float,
|
114
|
-
loss_fn: Callable,
|
115
|
-
optimizer: keras.optimizers.Optimizer,
|
116
|
-
epochs: int,
|
117
|
-
batch_size: int,
|
118
|
-
verbose: bool,
|
129
|
+
threshold_perc: float = 100.0,
|
130
|
+
loss_fn: Callable[..., tf.Tensor] | None = None,
|
131
|
+
optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
|
132
|
+
epochs: int = 20,
|
133
|
+
batch_size: int = 64,
|
134
|
+
verbose: bool = True,
|
119
135
|
) -> None:
|
120
136
|
"""
|
121
137
|
Train the model and infer the threshold value.
|
122
138
|
|
123
139
|
Parameters
|
124
140
|
----------
|
125
|
-
x_ref
|
126
|
-
Training
|
127
|
-
threshold_perc : float
|
141
|
+
x_ref : ArrayLike
|
142
|
+
Training data.
|
143
|
+
threshold_perc : float, default 100.0
|
128
144
|
Percentage of reference data that is normal.
|
129
|
-
loss_fn : Callable
|
145
|
+
loss_fn : Callable | None, default None
|
130
146
|
Loss function used for training.
|
131
|
-
optimizer : keras.optimizers.
|
147
|
+
optimizer : Optimizer, default keras.optimizers.Adam
|
132
148
|
Optimizer used for training.
|
133
|
-
epochs : int
|
149
|
+
epochs : int, default 20
|
134
150
|
Number of training epochs.
|
135
|
-
batch_size : int
|
151
|
+
batch_size : int, default 64
|
136
152
|
Batch size used for training.
|
137
|
-
verbose : bool
|
153
|
+
verbose : bool, default True
|
138
154
|
Whether to print training progress.
|
139
155
|
"""
|
156
|
+
|
140
157
|
# Train the model
|
141
158
|
trainer(
|
142
159
|
model=self.model,
|
@@ -165,15 +182,16 @@ class OODBase(ABC):
|
|
165
182
|
Parameters
|
166
183
|
----------
|
167
184
|
X : ArrayLike
|
168
|
-
|
169
|
-
batch_size : int, default
|
170
|
-
|
171
|
-
ood_type :
|
185
|
+
Input data for out-of-distribution prediction.
|
186
|
+
batch_size : int, default 1e10
|
187
|
+
Number of instances to process in each batch.
|
188
|
+
ood_type : "feature" | "instance", default "instance"
|
172
189
|
Predict out-of-distribution at the 'feature' or 'instance' level.
|
173
190
|
|
174
191
|
Returns
|
175
192
|
-------
|
176
|
-
Dictionary containing the outlier predictions
|
193
|
+
Dictionary containing the outlier predictions for the selected level,
|
194
|
+
and the OOD scores for the data including both 'instance' and 'feature' (if present) level scores.
|
177
195
|
"""
|
178
196
|
self._validate_state(X := to_numpy(X))
|
179
197
|
# compute outlier scores
|
@@ -187,7 +205,7 @@ class OODGMMBase(OODBase):
|
|
187
205
|
super().__init__(model)
|
188
206
|
self.gmm_params: GaussianMixtureModelParams
|
189
207
|
|
190
|
-
def _validate_state(self, X: NDArray, additional_attrs:
|
208
|
+
def _validate_state(self, X: NDArray, additional_attrs: list[str] | None = None) -> None:
|
191
209
|
if additional_attrs is None:
|
192
210
|
additional_attrs = ["gmm_params"]
|
193
211
|
super()._validate_state(X, additional_attrs)
|
@@ -195,12 +213,12 @@ class OODGMMBase(OODBase):
|
|
195
213
|
def fit(
|
196
214
|
self,
|
197
215
|
x_ref: ArrayLike,
|
198
|
-
threshold_perc: float,
|
199
|
-
loss_fn: Callable[
|
200
|
-
optimizer: keras.optimizers.Optimizer,
|
201
|
-
epochs: int,
|
202
|
-
batch_size: int,
|
203
|
-
verbose: bool,
|
216
|
+
threshold_perc: float = 100.0,
|
217
|
+
loss_fn: Callable[..., tf.Tensor] | None = None,
|
218
|
+
optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
|
219
|
+
epochs: int = 20,
|
220
|
+
batch_size: int = 64,
|
221
|
+
verbose: bool = True,
|
204
222
|
) -> None:
|
205
223
|
# Train the model
|
206
224
|
trainer(
|
@@ -214,7 +232,7 @@ class OODGMMBase(OODBase):
|
|
214
232
|
)
|
215
233
|
|
216
234
|
# Calculate the GMM parameters
|
217
|
-
_, z, gamma = cast(
|
235
|
+
_, z, gamma = cast(tuple[tf.Tensor, tf.Tensor, tf.Tensor], self.model(x_ref))
|
218
236
|
self.gmm_params = gmm_params(z, gamma)
|
219
237
|
|
220
238
|
# Infer the threshold values
|
@@ -6,8 +6,10 @@ Original code Copyright (c) 2023 Seldon Technologies Ltd
|
|
6
6
|
Licensed under Apache Software License (Apache 2.0)
|
7
7
|
"""
|
8
8
|
|
9
|
+
from __future__ import annotations
|
10
|
+
|
9
11
|
from functools import partial
|
10
|
-
from typing import Callable
|
12
|
+
from typing import Callable
|
11
13
|
|
12
14
|
import keras
|
13
15
|
import numpy as np
|
@@ -24,8 +26,8 @@ from dataeval._internal.models.tensorflow.utils import predict_batch
|
|
24
26
|
|
25
27
|
|
26
28
|
def build_model(
|
27
|
-
dist: PixelCNN, input_shape:
|
28
|
-
) ->
|
29
|
+
dist: PixelCNN, input_shape: tuple | None = None, filepath: str | None = None
|
30
|
+
) -> tuple[keras.Model, PixelCNN]:
|
29
31
|
"""
|
30
32
|
Create keras.Model from TF distribution.
|
31
33
|
|
@@ -87,28 +89,29 @@ def mutate_categorical(
|
|
87
89
|
|
88
90
|
|
89
91
|
class OOD_LLR(OODBase):
|
92
|
+
"""
|
93
|
+
Likelihood Ratios based outlier detector.
|
94
|
+
|
95
|
+
Parameters
|
96
|
+
----------
|
97
|
+
model : PixelCNN
|
98
|
+
Generative distribution model.
|
99
|
+
model_background : Optional[PixelCNN], default None
|
100
|
+
Optional model for the background. Only needed if it is different from `model`.
|
101
|
+
log_prob : Optional[Callable], default None
|
102
|
+
Function used to evaluate log probabilities under the model
|
103
|
+
if the model does not have a `log_prob` function.
|
104
|
+
sequential : bool, default False
|
105
|
+
Whether the data is sequential. Used to create targets during training.
|
106
|
+
"""
|
107
|
+
|
90
108
|
def __init__(
|
91
109
|
self,
|
92
110
|
model: PixelCNN,
|
93
|
-
model_background:
|
94
|
-
log_prob:
|
111
|
+
model_background: PixelCNN | None = None,
|
112
|
+
log_prob: Callable | None = None,
|
95
113
|
sequential: bool = False,
|
96
114
|
) -> None:
|
97
|
-
"""
|
98
|
-
Likelihood Ratios based outlier detector.
|
99
|
-
|
100
|
-
Parameters
|
101
|
-
----------
|
102
|
-
model : PixelCNN
|
103
|
-
Generative distribution model.
|
104
|
-
model_background : Optional[PixelCNN], default None
|
105
|
-
Optional model for the background. Only needed if it is different from `model`.
|
106
|
-
log_prob : Optional[Callable], default None
|
107
|
-
Function used to evaluate log probabilities under the model
|
108
|
-
if the model does not have a `log_prob` function.
|
109
|
-
sequential : bool, default False
|
110
|
-
Whether the data is sequential. Used to create targets during training.
|
111
|
-
"""
|
112
115
|
self.dist_s = model
|
113
116
|
self.dist_b = (
|
114
117
|
model.copy()
|
@@ -123,13 +126,13 @@ class OOD_LLR(OODBase):
|
|
123
126
|
|
124
127
|
self._ref_score: OODScore
|
125
128
|
self._threshold_perc: float
|
126
|
-
self._data_info:
|
129
|
+
self._data_info: tuple[tuple, type] | None = None
|
127
130
|
|
128
131
|
def fit(
|
129
132
|
self,
|
130
133
|
x_ref: ArrayLike,
|
131
134
|
threshold_perc: float = 100.0,
|
132
|
-
loss_fn:
|
135
|
+
loss_fn: Callable | None = None,
|
133
136
|
optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
|
134
137
|
epochs: int = 20,
|
135
138
|
batch_size: int = 64,
|
@@ -144,10 +147,10 @@ class OOD_LLR(OODBase):
|
|
144
147
|
Parameters
|
145
148
|
----------
|
146
149
|
x_ref : ArrayLike
|
147
|
-
Training
|
150
|
+
Training data.
|
148
151
|
threshold_perc : float, default 100.0
|
149
152
|
Percentage of reference data that is normal.
|
150
|
-
loss_fn :
|
153
|
+
loss_fn : Callable | None, default None
|
151
154
|
Loss function used for training.
|
152
155
|
optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
|
153
156
|
Optimizer used for training.
|
@@ -6,10 +6,13 @@ Original code Copyright (c) 2023 Seldon Technologies Ltd
|
|
6
6
|
Licensed under Apache Software License (Apache 2.0)
|
7
7
|
"""
|
8
8
|
|
9
|
+
from __future__ import annotations
|
10
|
+
|
9
11
|
from typing import Callable
|
10
12
|
|
11
13
|
import keras
|
12
14
|
import numpy as np
|
15
|
+
import tensorflow as tf
|
13
16
|
from numpy.typing import ArrayLike
|
14
17
|
|
15
18
|
from dataeval._internal.detectors.ood.base import OODBase, OODScore
|
@@ -20,17 +23,33 @@ from dataeval._internal.models.tensorflow.utils import predict_batch
|
|
20
23
|
|
21
24
|
|
22
25
|
class OOD_VAE(OODBase):
|
26
|
+
"""
|
27
|
+
VAE based outlier detector.
|
28
|
+
|
29
|
+
Parameters
|
30
|
+
----------
|
31
|
+
model : VAE
|
32
|
+
A VAE model.
|
33
|
+
samples : int, default 10
|
34
|
+
Number of samples sampled to evaluate each instance.
|
35
|
+
|
36
|
+
Examples
|
37
|
+
--------
|
38
|
+
Instantiate an OOD detector metric with a generic dataset - batch of images with shape (3,25,25)
|
39
|
+
|
40
|
+
>>> metric = OOD_VAE(create_model(VAE, dataset[0].shape))
|
41
|
+
|
42
|
+
Adjusting fit parameters,
|
43
|
+
including setting the fit threshold at 85% for a training set with about 15% out-of-distribution
|
44
|
+
|
45
|
+
>>> metric.fit(dataset, threshold_perc=85, batch_size=128, verbose=False)
|
46
|
+
|
47
|
+
Detect out of distribution samples at the 'feature' level
|
48
|
+
|
49
|
+
>>> result = metric.predict(dataset, ood_type="feature")
|
50
|
+
"""
|
51
|
+
|
23
52
|
def __init__(self, model: VAE, samples: int = 10) -> None:
|
24
|
-
"""
|
25
|
-
VAE based outlier detector.
|
26
|
-
|
27
|
-
Parameters
|
28
|
-
----------
|
29
|
-
model : VAE
|
30
|
-
A VAE model.
|
31
|
-
samples : int, default 10
|
32
|
-
Number of samples sampled to evaluate each instance.
|
33
|
-
"""
|
34
53
|
super().__init__(model)
|
35
54
|
self.samples = samples
|
36
55
|
|
@@ -38,32 +57,14 @@ class OOD_VAE(OODBase):
|
|
38
57
|
self,
|
39
58
|
x_ref: ArrayLike,
|
40
59
|
threshold_perc: float = 100.0,
|
41
|
-
loss_fn: Callable =
|
60
|
+
loss_fn: Callable[..., tf.Tensor] | None = None,
|
42
61
|
optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
|
43
62
|
epochs: int = 20,
|
44
63
|
batch_size: int = 64,
|
45
64
|
verbose: bool = True,
|
46
65
|
) -> None:
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
Parameters
|
51
|
-
----------
|
52
|
-
x_ref : ArrayLike
|
53
|
-
Training batch.
|
54
|
-
threshold_perc : float, default 100.0
|
55
|
-
Percentage of reference data that is normal.
|
56
|
-
loss_fn : Callable, default Elbo(0.05)
|
57
|
-
Loss function used for training.
|
58
|
-
optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
|
59
|
-
Optimizer used for training.
|
60
|
-
epochs : int, default 20
|
61
|
-
Number of training epochs.
|
62
|
-
batch_size : int, default 64
|
63
|
-
Batch size used for training.
|
64
|
-
verbose : bool, default True
|
65
|
-
Whether to print training progress.
|
66
|
-
"""
|
66
|
+
if loss_fn is None:
|
67
|
+
loss_fn = Elbo(0.05)
|
67
68
|
super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
|
68
69
|
|
69
70
|
def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
|