dataeval 0.74.0__py3-none-any.whl → 0.74.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +23 -10
- dataeval/detectors/__init__.py +2 -10
- dataeval/detectors/drift/base.py +3 -3
- dataeval/detectors/drift/mmd.py +1 -1
- dataeval/detectors/linters/clusterer.py +3 -3
- dataeval/detectors/linters/duplicates.py +4 -4
- dataeval/detectors/linters/outliers.py +4 -4
- dataeval/detectors/ood/__init__.py +5 -12
- dataeval/detectors/ood/base.py +5 -5
- dataeval/detectors/ood/metadata_ks_compare.py +12 -13
- dataeval/interop.py +15 -3
- dataeval/logging.py +16 -0
- dataeval/metrics/bias/balance.py +3 -3
- dataeval/metrics/bias/coverage.py +3 -3
- dataeval/metrics/bias/diversity.py +3 -3
- dataeval/metrics/bias/metadata_preprocessing.py +3 -3
- dataeval/metrics/bias/parity.py +4 -4
- dataeval/metrics/estimators/ber.py +3 -3
- dataeval/metrics/estimators/divergence.py +3 -3
- dataeval/metrics/estimators/uap.py +3 -3
- dataeval/metrics/stats/base.py +2 -2
- dataeval/metrics/stats/boxratiostats.py +1 -1
- dataeval/metrics/stats/datasetstats.py +6 -6
- dataeval/metrics/stats/dimensionstats.py +1 -1
- dataeval/metrics/stats/hashstats.py +1 -1
- dataeval/metrics/stats/labelstats.py +3 -3
- dataeval/metrics/stats/pixelstats.py +1 -1
- dataeval/metrics/stats/visualstats.py +1 -1
- dataeval/output.py +81 -57
- dataeval/utils/__init__.py +1 -7
- dataeval/utils/split_dataset.py +306 -279
- dataeval/workflows/sufficiency.py +4 -4
- {dataeval-0.74.0.dist-info → dataeval-0.74.2.dist-info}/METADATA +3 -8
- dataeval-0.74.2.dist-info/RECORD +66 -0
- dataeval/detectors/ood/ae.py +0 -76
- dataeval/detectors/ood/aegmm.py +0 -67
- dataeval/detectors/ood/base_tf.py +0 -109
- dataeval/detectors/ood/llr.py +0 -302
- dataeval/detectors/ood/vae.py +0 -98
- dataeval/detectors/ood/vaegmm.py +0 -76
- dataeval/utils/lazy.py +0 -26
- dataeval/utils/tensorflow/__init__.py +0 -19
- dataeval/utils/tensorflow/_internal/gmm.py +0 -103
- dataeval/utils/tensorflow/_internal/loss.py +0 -121
- dataeval/utils/tensorflow/_internal/models.py +0 -1394
- dataeval/utils/tensorflow/_internal/trainer.py +0 -114
- dataeval/utils/tensorflow/_internal/utils.py +0 -256
- dataeval/utils/tensorflow/loss/__init__.py +0 -11
- dataeval-0.74.0.dist-info/RECORD +0 -79
- {dataeval-0.74.0.dist-info → dataeval-0.74.2.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.74.0.dist-info → dataeval-0.74.2.dist-info}/WHEEL +0 -0
@@ -16,11 +16,11 @@ from scipy.optimize import basinhopping
|
|
16
16
|
from torch.utils.data import Dataset
|
17
17
|
|
18
18
|
from dataeval.interop import as_numpy
|
19
|
-
from dataeval.output import OutputMetadata, set_metadata
|
19
|
+
from dataeval.output import Output, set_metadata
|
20
20
|
|
21
21
|
|
22
22
|
@dataclass(frozen=True)
|
23
|
-
class SufficiencyOutput(OutputMetadata):
|
23
|
+
class SufficiencyOutput(Output):
|
24
24
|
"""
|
25
25
|
Output class for :class:`Sufficiency` workflow
|
26
26
|
|
@@ -47,7 +47,7 @@ class SufficiencyOutput(OutputMetadata):
|
|
47
47
|
if c != c_v:
|
48
48
|
raise ValueError(f"{m} does not contain the expected number ({c}) of data points.")
|
49
49
|
|
50
|
-
@set_metadata()
|
50
|
+
@set_metadata
|
51
51
|
def project(
|
52
52
|
self,
|
53
53
|
projection: int | Iterable[int],
|
@@ -484,7 +484,7 @@ class Sufficiency(Generic[T]):
|
|
484
484
|
def eval_kwargs(self, value: Mapping[str, Any] | None) -> None:
|
485
485
|
self._eval_kwargs = {} if value is None else value
|
486
486
|
|
487
|
-
@set_metadata(["runs", "substeps"])
|
487
|
+
@set_metadata(state=["runs", "substeps"])
|
488
488
|
def evaluate(self, eval_at: int | Iterable[int] | None = None, niter: int = 1000) -> SufficiencyOutput:
|
489
489
|
"""
|
490
490
|
Creates data indices, trains models, and returns plotting data
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: dataeval
|
3
|
-
Version: 0.74.0
|
3
|
+
Version: 0.74.2
|
4
4
|
Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
|
5
5
|
Home-page: https://dataeval.ai/
|
6
6
|
License: MIT
|
@@ -21,17 +21,12 @@ Classifier: Programming Language :: Python :: 3.12
|
|
21
21
|
Classifier: Programming Language :: Python :: 3 :: Only
|
22
22
|
Classifier: Topic :: Scientific/Engineering
|
23
23
|
Provides-Extra: all
|
24
|
-
Provides-Extra: tensorflow
|
25
24
|
Provides-Extra: torch
|
26
|
-
Requires-Dist:
|
27
|
-
Requires-Dist:
|
28
|
-
Requires-Dist: numpy (>1.24.3)
|
25
|
+
Requires-Dist: matplotlib ; extra == "all"
|
26
|
+
Requires-Dist: numpy (>=1.24.3)
|
29
27
|
Requires-Dist: pillow (>=10.3.0)
|
30
28
|
Requires-Dist: scikit-learn (>=1.5.0)
|
31
29
|
Requires-Dist: scipy (>=1.10)
|
32
|
-
Requires-Dist: tensorflow (>=2.16,<2.18) ; extra == "tensorflow" or extra == "all"
|
33
|
-
Requires-Dist: tensorflow_probability (>=0.24,<0.25) ; extra == "tensorflow" or extra == "all"
|
34
|
-
Requires-Dist: tf-keras (>=2.16,<2.18) ; extra == "tensorflow" or extra == "all"
|
35
30
|
Requires-Dist: torch (>=2.2.0) ; extra == "torch" or extra == "all"
|
36
31
|
Requires-Dist: torchvision (>=0.17.0) ; extra == "torch" or extra == "all"
|
37
32
|
Requires-Dist: tqdm
|
@@ -0,0 +1,66 @@
|
|
1
|
+
dataeval/__init__.py,sha256=w_On8sJ5o_f8PboMo6LLErdFSqDAQ1Jg_e0mcp-5FRU,959
|
2
|
+
dataeval/detectors/__init__.py,sha256=Y-0bbyWyuMvZU80bCx6WPt3IV_r2hu9ymzpA8uzMqoI,206
|
3
|
+
dataeval/detectors/drift/__init__.py,sha256=BSXm21y7cAawHep-ZldCJ5HOvzYjPzYGKGrmoEs3i0E,737
|
4
|
+
dataeval/detectors/drift/base.py,sha256=QDGHMu1WADD-38MEIOwjQMEQM3DE7B0yFHO3hsMbV-E,14481
|
5
|
+
dataeval/detectors/drift/cvm.py,sha256=kc59w2_wtxFGNnLcaJRvX5v_38gPXiebSGNiFVdunEQ,4142
|
6
|
+
dataeval/detectors/drift/ks.py,sha256=gcpe1WIQeNeZdLYkdMZCFLXUp1bHMQUxwJE6-RLVOXs,4229
|
7
|
+
dataeval/detectors/drift/mmd.py,sha256=C0FX5v9ZJzmKNYEcYUaC7sDtMpJ2dZpwikNDu-AEWiI,7584
|
8
|
+
dataeval/detectors/drift/torch.py,sha256=igEQ2DV9JmcpTdUKCOHBi5LxtoNeCAslJS2Ldulg1hw,7585
|
9
|
+
dataeval/detectors/drift/uncertainty.py,sha256=Xz2yzJjtJfw1vLag234jwRvaa_HK36nMajGx8bQaNRs,5322
|
10
|
+
dataeval/detectors/drift/updates.py,sha256=UJ0z5hlunRi7twnkLABfdJG3tT2EqX4y9IGx8_USYvo,1780
|
11
|
+
dataeval/detectors/linters/__init__.py,sha256=BvpaB1RUpkEhhXk3Mqi5NYoOcJKZRFSBOJCmQOIfYRU,483
|
12
|
+
dataeval/detectors/linters/clusterer.py,sha256=hK-ak02GaxwWuufesZMKDsvoE5fMdXO7UWsLiK8hfY0,21008
|
13
|
+
dataeval/detectors/linters/duplicates.py,sha256=2bmPTFqoefeiAQV9y4CGlHV_mJNrysJSEFLXLd2DO4I,5661
|
14
|
+
dataeval/detectors/linters/merged_stats.py,sha256=X-bDTwjyR8RuVmzxLaHZmQ5nI3oOWvsqVlitdSncapk,1355
|
15
|
+
dataeval/detectors/linters/outliers.py,sha256=X48bzTfTr1LqC6WKVKBRfvpjcQRgmb93cNLT7Oipe3M,10113
|
16
|
+
dataeval/detectors/ood/__init__.py,sha256=-D4Fq-ysFylNNMqjHG1ALbB9qBCm_UinkCAgsK9HGg0,408
|
17
|
+
dataeval/detectors/ood/ae_torch.py,sha256=pO9w5221bXR9lEBkE7oakXeE7PXUUR--xcTpmHvOCSk,2142
|
18
|
+
dataeval/detectors/ood/base.py,sha256=UzcDbXl8Gv43VFzjrOegTnKSIoEYmfDP7fAySeWyWPw,6955
|
19
|
+
dataeval/detectors/ood/base_torch.py,sha256=yFbSfQsBMwZeVf8mrixmkZYBGChhV5oAHtkgzWnMzsA,3405
|
20
|
+
dataeval/detectors/ood/metadata_ks_compare.py,sha256=LNDNWGEDKTW8_-djgmK53sn9EZzzXq1Sgwc47k0QI-Y,5380
|
21
|
+
dataeval/detectors/ood/metadata_least_likely.py,sha256=nxMCXUOjOfWHDTGT2SLE7OYBCydRq8zHLd8t17k7hMM,5193
|
22
|
+
dataeval/detectors/ood/metadata_ood_mi.py,sha256=KLay2BmgHrStBV92VpIs_B1yEfQKllsMTgzOQEng01I,4065
|
23
|
+
dataeval/interop.py,sha256=5lACbR7bZYGCagiwbXzAWvWeHRj8kWBmsTC9oEjFh78,2249
|
24
|
+
dataeval/logging.py,sha256=uGxXPqGpn5guQjuHtm25rzILaz7nCQUsy2o7tFo91OI,343
|
25
|
+
dataeval/metrics/__init__.py,sha256=fPBNLd-T6mCErZBBJrxWmXIL0jCk7fNUYIcNEBkMa80,238
|
26
|
+
dataeval/metrics/bias/__init__.py,sha256=dYiPHenS8J7pgRMMW2jNkTBmTbPoYTxT04fZu9PFats,747
|
27
|
+
dataeval/metrics/bias/balance.py,sha256=_TZEe17AT-qOvPp-QFrQfTqNwh8uVVCYjC4Sv6JBx9o,9118
|
28
|
+
dataeval/metrics/bias/coverage.py,sha256=o65_IgrWSlGnYeYZFABjwKaxq09uqyy5esHJM67PJ-k,4528
|
29
|
+
dataeval/metrics/bias/diversity.py,sha256=WL1NbZiRrv0SIq97FY3womZNCSl_EBMVlBWQZAUtjk8,7701
|
30
|
+
dataeval/metrics/bias/metadata_preprocessing.py,sha256=ekUFiirkmaHDiH7nJjkNpiUQD7OolAPhHorjLxpXv_Y,12248
|
31
|
+
dataeval/metrics/bias/metadata_utils.py,sha256=HmTjlRRTdM9566oKUDDdVMJ8luss4DYykFOiS2FQzhM,6558
|
32
|
+
dataeval/metrics/bias/parity.py,sha256=hnA7qQH4Uy3tl_krluZ9BPD5zYjjagUxZt2fEiIa2yE,12745
|
33
|
+
dataeval/metrics/estimators/__init__.py,sha256=O6ocxJq8XDkfJWwXeJnnnzbOyRnFPKF4kTIVTTZYOA8,380
|
34
|
+
dataeval/metrics/estimators/ber.py,sha256=fs3_e9pgu7I50QIALWtF2aidkBZhTCKVE2pA7PyB5Go,5019
|
35
|
+
dataeval/metrics/estimators/divergence.py,sha256=r_SKSurf1TdI5E1ivENqDnz8cQ3_sxVGKAqmF9cqcT4,4275
|
36
|
+
dataeval/metrics/estimators/uap.py,sha256=Aw5ReoWNK73Tq96r__qN_-cvHrELauqtDX3Af_QxX4s,2157
|
37
|
+
dataeval/metrics/stats/__init__.py,sha256=igLRaAt1nX6yRwC4xI0zNPBADi3u7EsSxWP3OZ8AqcU,1086
|
38
|
+
dataeval/metrics/stats/base.py,sha256=_C05KUAuDrfX3N-19o25V3vmXr0-45A5fc57cXyV8qs,12161
|
39
|
+
dataeval/metrics/stats/boxratiostats.py,sha256=bZunY-b8Y2IQqHlTusQN77ujLOHftogEQIARDpdVv6A,6463
|
40
|
+
dataeval/metrics/stats/datasetstats.py,sha256=rZUDiciHwEpnXmkI8-uJNiYwUuTL9ssZMKMx73hVX-Y,6219
|
41
|
+
dataeval/metrics/stats/dimensionstats.py,sha256=xITgQF_oomb6Ty_dJcbT3ARGGNp4QRcYSgnkjB4f-YE,4054
|
42
|
+
dataeval/metrics/stats/hashstats.py,sha256=vxw_K74EJM9CZy-EV617vdrysFO8nEspVWqIYsIHC-c,4958
|
43
|
+
dataeval/metrics/stats/labelstats.py,sha256=K0hJTphMe7htSjyss8GPtKDiHepTuU60_hX0xRA-uAg,4096
|
44
|
+
dataeval/metrics/stats/pixelstats.py,sha256=2zr9i3GLNx1i_SCtbfdtZNxXBEc_9wCe4qDpmXLVbKY,4576
|
45
|
+
dataeval/metrics/stats/visualstats.py,sha256=vLIC4sMo796axWl-4e4RzT33ll-_6ki54Dirn3V-EL8,4948
|
46
|
+
dataeval/output.py,sha256=hR5TJ67f7FgrZO9Du46aw-jvRpMjOimSgJSau4ZNK44,3565
|
47
|
+
dataeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
48
|
+
dataeval/utils/__init__.py,sha256=z7HxSijjycey-rGdQkgVOdpvT0oO2pKAuT4uYyxYGMs,555
|
49
|
+
dataeval/utils/gmm.py,sha256=YuLsJKsVWgH_wHr1u_hSRH5Yeexdj8exht8h99L7bLo,561
|
50
|
+
dataeval/utils/image.py,sha256=KgC_1nW__nGN5q6bVZNvG4U_qIBdjcPATz9qe8f2XuA,1928
|
51
|
+
dataeval/utils/metadata.py,sha256=0A--iru0zEmi044mKz5P35q69KrI30yoiRSlvs7TSdQ,9418
|
52
|
+
dataeval/utils/shared.py,sha256=xvF3VLfyheVwJtdtDrneOobkKf7t-JTmf_w91FWXmqo,3616
|
53
|
+
dataeval/utils/split_dataset.py,sha256=KYIl2ueLN0BeBoEvbUP5FdwVcMYW_l-ES1nQf_zKpQA,18776
|
54
|
+
dataeval/utils/torch/__init__.py,sha256=lpkqfgyARUxgrV94cZESQv8PIP2p-UnwItZ_wIr0XzQ,675
|
55
|
+
dataeval/utils/torch/blocks.py,sha256=HVhBTMMD5NA4qheMUgyol1KWiKZDIuc8k5j4RcMKmhk,1466
|
56
|
+
dataeval/utils/torch/datasets.py,sha256=10elNgLuH_FDX_CHE3y2Z215JN4-PQovQm5brcIJOeM,15021
|
57
|
+
dataeval/utils/torch/gmm.py,sha256=VbLlUQohwToApT493_tjQBWy2UM5R-3ppS9Dp-eP7BA,3240
|
58
|
+
dataeval/utils/torch/models.py,sha256=sdGeo7a8vshCTGA4lYyVxxb_aDWUlxdtIVxrddS-_ls,8542
|
59
|
+
dataeval/utils/torch/trainer.py,sha256=8BEXr6xtk-CHJTcNxOBnWgkFWfJUAiBy28cEdBhLMRU,7883
|
60
|
+
dataeval/utils/torch/utils.py,sha256=nWRcT6z6DbFVrL1RyxCOX3DPoCrv9G0B-VI_9LdGCQQ,5784
|
61
|
+
dataeval/workflows/__init__.py,sha256=ef1MiVL5IuhlDXXbwsiAfafhnr7tD3TXF9GRusy9_O8,290
|
62
|
+
dataeval/workflows/sufficiency.py,sha256=v9AV3BZT0NW-zD2VNIL_5aWspvoscrxRIUKcUdpy7HI,18540
|
63
|
+
dataeval-0.74.2.dist-info/LICENSE.txt,sha256=Kpzcfobf1HlqafF-EX6dQLw9TlJiaJzfgvLQFukyXYw,1060
|
64
|
+
dataeval-0.74.2.dist-info/METADATA,sha256=Rcnn55cRPZ2JZ1jn8YamuVDxmQVDKEItK4oqZyAYkHM,4298
|
65
|
+
dataeval-0.74.2.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
66
|
+
dataeval-0.74.2.dist-info/RECORD,,
|
dataeval/detectors/ood/ae.py
DELETED
@@ -1,76 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Source code derived from Alibi-Detect 0.11.4
|
3
|
-
https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
|
4
|
-
|
5
|
-
Original code Copyright (c) 2023 Seldon Technologies Ltd
|
6
|
-
Licensed under Apache Software License (Apache 2.0)
|
7
|
-
"""
|
8
|
-
|
9
|
-
from __future__ import annotations
|
10
|
-
|
11
|
-
__all__ = ["OOD_AE"]
|
12
|
-
|
13
|
-
from typing import TYPE_CHECKING, Callable
|
14
|
-
|
15
|
-
import numpy as np
|
16
|
-
from numpy.typing import ArrayLike
|
17
|
-
|
18
|
-
from dataeval.detectors.ood.base import OODScoreOutput
|
19
|
-
from dataeval.detectors.ood.base_tf import OODBase
|
20
|
-
from dataeval.interop import as_numpy
|
21
|
-
from dataeval.utils.lazy import lazyload
|
22
|
-
from dataeval.utils.tensorflow._internal.utils import predict_batch
|
23
|
-
|
24
|
-
if TYPE_CHECKING:
|
25
|
-
import tensorflow as tf
|
26
|
-
import tf_keras as keras
|
27
|
-
|
28
|
-
import dataeval.utils.tensorflow._internal.models as tf_models
|
29
|
-
else:
|
30
|
-
tf = lazyload("tensorflow")
|
31
|
-
keras = lazyload("tf_keras")
|
32
|
-
tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
|
33
|
-
|
34
|
-
|
35
|
-
class OOD_AE(OODBase):
|
36
|
-
"""
|
37
|
-
Autoencoder-based :term:`out of distribution<Out-of-distribution (OOD)>` detector.
|
38
|
-
|
39
|
-
Parameters
|
40
|
-
----------
|
41
|
-
model : AE
|
42
|
-
An :term:`autoencoder<Autoencoder>` model.
|
43
|
-
"""
|
44
|
-
|
45
|
-
def __init__(self, model: tf_models.AE) -> None:
|
46
|
-
super().__init__(model)
|
47
|
-
|
48
|
-
def fit(
|
49
|
-
self,
|
50
|
-
x_ref: ArrayLike,
|
51
|
-
threshold_perc: float = 100.0,
|
52
|
-
loss_fn: Callable[..., tf.Tensor] | None = None,
|
53
|
-
optimizer: keras.optimizers.Optimizer | None = None,
|
54
|
-
epochs: int = 20,
|
55
|
-
batch_size: int = 64,
|
56
|
-
verbose: bool = True,
|
57
|
-
) -> None:
|
58
|
-
if loss_fn is None:
|
59
|
-
loss_fn = keras.losses.MeanSquaredError()
|
60
|
-
super().fit(as_numpy(x_ref), threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
|
61
|
-
|
62
|
-
def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
|
63
|
-
self._validate(X := as_numpy(X))
|
64
|
-
|
65
|
-
# reconstruct instances
|
66
|
-
X_recon = predict_batch(X, self.model, batch_size=batch_size)
|
67
|
-
|
68
|
-
# compute feature and instance level scores
|
69
|
-
fscore = np.power(X - X_recon, 2)
|
70
|
-
fscore_flat = fscore.reshape(fscore.shape[0], -1).copy()
|
71
|
-
n_score_features = int(np.ceil(fscore_flat.shape[1]))
|
72
|
-
sorted_fscore = np.sort(fscore_flat, axis=1)
|
73
|
-
sorted_fscore_perc = sorted_fscore[:, -n_score_features:]
|
74
|
-
iscore = np.mean(sorted_fscore_perc, axis=1)
|
75
|
-
|
76
|
-
return OODScoreOutput(iscore, fscore)
|
dataeval/detectors/ood/aegmm.py
DELETED
@@ -1,67 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Source code derived from Alibi-Detect 0.11.4
|
3
|
-
https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
|
4
|
-
|
5
|
-
Original code Copyright (c) 2023 Seldon Technologies Ltd
|
6
|
-
Licensed under Apache Software License (Apache 2.0)
|
7
|
-
"""
|
8
|
-
|
9
|
-
from __future__ import annotations
|
10
|
-
|
11
|
-
__all__ = ["OOD_AEGMM"]
|
12
|
-
|
13
|
-
from typing import TYPE_CHECKING, Callable
|
14
|
-
|
15
|
-
from numpy.typing import ArrayLike
|
16
|
-
|
17
|
-
from dataeval.detectors.ood.base import OODScoreOutput
|
18
|
-
from dataeval.detectors.ood.base_tf import OODBaseGMM
|
19
|
-
from dataeval.interop import to_numpy
|
20
|
-
from dataeval.utils.lazy import lazyload
|
21
|
-
from dataeval.utils.tensorflow._internal.gmm import gmm_energy
|
22
|
-
from dataeval.utils.tensorflow._internal.loss import LossGMM
|
23
|
-
from dataeval.utils.tensorflow._internal.utils import predict_batch
|
24
|
-
|
25
|
-
if TYPE_CHECKING:
|
26
|
-
import tensorflow as tf
|
27
|
-
import tf_keras as keras
|
28
|
-
|
29
|
-
import dataeval.utils.tensorflow._internal.models as tf_models
|
30
|
-
else:
|
31
|
-
tf = lazyload("tensorflow")
|
32
|
-
keras = lazyload("tf_keras")
|
33
|
-
tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
|
34
|
-
|
35
|
-
|
36
|
-
class OOD_AEGMM(OODBaseGMM):
|
37
|
-
"""
|
38
|
-
AE with Gaussian Mixture Model based outlier detector.
|
39
|
-
|
40
|
-
Parameters
|
41
|
-
----------
|
42
|
-
model : AEGMM
|
43
|
-
An AEGMM model.
|
44
|
-
"""
|
45
|
-
|
46
|
-
def __init__(self, model: tf_models.AEGMM) -> None:
|
47
|
-
super().__init__(model)
|
48
|
-
|
49
|
-
def fit(
|
50
|
-
self,
|
51
|
-
x_ref: ArrayLike,
|
52
|
-
threshold_perc: float = 100.0,
|
53
|
-
loss_fn: Callable[..., tf.Tensor] | None = None,
|
54
|
-
optimizer: keras.optimizers.Optimizer | None = None,
|
55
|
-
epochs: int = 20,
|
56
|
-
batch_size: int = 64,
|
57
|
-
verbose: bool = True,
|
58
|
-
) -> None:
|
59
|
-
if loss_fn is None:
|
60
|
-
loss_fn = LossGMM()
|
61
|
-
super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
|
62
|
-
|
63
|
-
def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
|
64
|
-
self._validate(X := to_numpy(X))
|
65
|
-
_, z, _ = predict_batch(X, self.model, batch_size=batch_size)
|
66
|
-
energy, _ = gmm_energy(z, self._gmm_params, return_mean=False)
|
67
|
-
return OODScoreOutput(energy.numpy()) # type: ignore
|
dataeval/detectors/ood/base_tf.py
DELETED
@@ -1,109 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Source code derived from Alibi-Detect 0.11.4
|
3
|
-
https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
|
4
|
-
|
5
|
-
Original code Copyright (c) 2023 Seldon Technologies Ltd
|
6
|
-
Licensed under Apache Software License (Apache 2.0)
|
7
|
-
"""
|
8
|
-
|
9
|
-
from __future__ import annotations
|
10
|
-
|
11
|
-
from typing import TYPE_CHECKING, Callable, cast
|
12
|
-
|
13
|
-
from numpy.typing import ArrayLike
|
14
|
-
|
15
|
-
from dataeval.detectors.ood.base import OODBaseMixin, OODFitMixin, OODGMMMixin
|
16
|
-
from dataeval.interop import to_numpy
|
17
|
-
from dataeval.utils.lazy import lazyload
|
18
|
-
from dataeval.utils.tensorflow._internal.gmm import gmm_params
|
19
|
-
from dataeval.utils.tensorflow._internal.trainer import trainer
|
20
|
-
|
21
|
-
if TYPE_CHECKING:
|
22
|
-
import tensorflow as tf
|
23
|
-
import tf_keras as keras
|
24
|
-
else:
|
25
|
-
tf = lazyload("tensorflow")
|
26
|
-
keras = lazyload("tf_keras")
|
27
|
-
|
28
|
-
|
29
|
-
class OODBase(OODBaseMixin[keras.Model], OODFitMixin[Callable[..., tf.Tensor], keras.optimizers.Optimizer]):
|
30
|
-
def __init__(self, model: keras.Model) -> None:
|
31
|
-
super().__init__(model)
|
32
|
-
|
33
|
-
def fit(
|
34
|
-
self,
|
35
|
-
x_ref: ArrayLike,
|
36
|
-
threshold_perc: float,
|
37
|
-
loss_fn: Callable[..., tf.Tensor] | None,
|
38
|
-
optimizer: keras.optimizers.Optimizer | None,
|
39
|
-
epochs: int,
|
40
|
-
batch_size: int,
|
41
|
-
verbose: bool,
|
42
|
-
) -> None:
|
43
|
-
"""
|
44
|
-
Train the model and infer the threshold value.
|
45
|
-
|
46
|
-
Parameters
|
47
|
-
----------
|
48
|
-
x_ref : ArrayLike
|
49
|
-
Training data.
|
50
|
-
threshold_perc : float, default 100.0
|
51
|
-
Percentage of reference data that is normal.
|
52
|
-
loss_fn : Callable | None, default None
|
53
|
-
Loss function used for training.
|
54
|
-
optimizer : Optimizer, default keras.optimizers.Adam
|
55
|
-
Optimizer used for training.
|
56
|
-
epochs : int, default 20
|
57
|
-
Number of training epochs.
|
58
|
-
batch_size : int, default 64
|
59
|
-
Batch size used for training.
|
60
|
-
verbose : bool, default True
|
61
|
-
Whether to print training progress.
|
62
|
-
"""
|
63
|
-
|
64
|
-
# Train the model
|
65
|
-
trainer(
|
66
|
-
model=self.model,
|
67
|
-
loss_fn=loss_fn,
|
68
|
-
x_train=to_numpy(x_ref),
|
69
|
-
y_train=None,
|
70
|
-
optimizer=optimizer,
|
71
|
-
epochs=epochs,
|
72
|
-
batch_size=batch_size,
|
73
|
-
verbose=verbose,
|
74
|
-
)
|
75
|
-
|
76
|
-
# Infer the threshold values
|
77
|
-
self._ref_score = self.score(x_ref, batch_size)
|
78
|
-
self._threshold_perc = threshold_perc
|
79
|
-
|
80
|
-
|
81
|
-
class OODBaseGMM(OODBase, OODGMMMixin[tf.Tensor]):
|
82
|
-
def fit(
|
83
|
-
self,
|
84
|
-
x_ref: ArrayLike,
|
85
|
-
threshold_perc: float,
|
86
|
-
loss_fn: Callable[..., tf.Tensor] | None,
|
87
|
-
optimizer: keras.optimizers.Optimizer | None,
|
88
|
-
epochs: int,
|
89
|
-
batch_size: int,
|
90
|
-
verbose: bool,
|
91
|
-
) -> None:
|
92
|
-
# Train the model
|
93
|
-
trainer(
|
94
|
-
model=self.model,
|
95
|
-
loss_fn=loss_fn,
|
96
|
-
x_train=to_numpy(x_ref),
|
97
|
-
optimizer=optimizer,
|
98
|
-
epochs=epochs,
|
99
|
-
batch_size=batch_size,
|
100
|
-
verbose=verbose,
|
101
|
-
)
|
102
|
-
|
103
|
-
# Calculate the GMM parameters
|
104
|
-
_, z, gamma = cast(tuple[tf.Tensor, tf.Tensor, tf.Tensor], self.model(x_ref))
|
105
|
-
self._gmm_params = gmm_params(z, gamma)
|
106
|
-
|
107
|
-
# Infer the threshold values
|
108
|
-
self._ref_score = self.score(x_ref, batch_size)
|
109
|
-
self._threshold_perc = threshold_perc
|
dataeval/detectors/ood/llr.py
DELETED
@@ -1,302 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Source code derived from Alibi-Detect 0.11.4
|
3
|
-
https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
|
4
|
-
|
5
|
-
Original code Copyright (c) 2023 Seldon Technologies Ltd
|
6
|
-
Licensed under Apache Software License (Apache 2.0)
|
7
|
-
"""
|
8
|
-
|
9
|
-
from __future__ import annotations
|
10
|
-
|
11
|
-
__all__ = ["OOD_LLR"]
|
12
|
-
|
13
|
-
from functools import partial
|
14
|
-
from typing import TYPE_CHECKING, Callable
|
15
|
-
|
16
|
-
import numpy as np
|
17
|
-
from numpy.typing import ArrayLike, NDArray
|
18
|
-
|
19
|
-
from dataeval.detectors.ood.base import OODBaseMixin, OODScoreOutput
|
20
|
-
from dataeval.interop import to_numpy
|
21
|
-
from dataeval.utils.lazy import lazyload
|
22
|
-
from dataeval.utils.tensorflow._internal.trainer import trainer
|
23
|
-
from dataeval.utils.tensorflow._internal.utils import predict_batch
|
24
|
-
|
25
|
-
if TYPE_CHECKING:
|
26
|
-
import tensorflow as tf
|
27
|
-
import tf_keras as keras
|
28
|
-
|
29
|
-
import dataeval.utils.tensorflow._internal.models as tf_models
|
30
|
-
else:
|
31
|
-
tf = lazyload("tensorflow")
|
32
|
-
keras = lazyload("tf_keras")
|
33
|
-
tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
|
34
|
-
|
35
|
-
|
36
|
-
def _build_model(
|
37
|
-
dist: tf_models.PixelCNN, input_shape: tuple | None = None, filepath: str | None = None
|
38
|
-
) -> tuple[keras.Model, tf_models.PixelCNN]:
|
39
|
-
"""
|
40
|
-
Create keras.Model from TF distribution.
|
41
|
-
|
42
|
-
Parameters
|
43
|
-
----------
|
44
|
-
dist
|
45
|
-
:term:`TensorFlow` distribution.
|
46
|
-
input_shape
|
47
|
-
Input shape of the model.
|
48
|
-
filepath
|
49
|
-
File to load model weights from.
|
50
|
-
|
51
|
-
Returns
|
52
|
-
-------
|
53
|
-
TensorFlow model.
|
54
|
-
"""
|
55
|
-
x_in = keras.layers.Input(shape=input_shape)
|
56
|
-
log_prob = dist.log_prob(x_in)
|
57
|
-
model = keras.models.Model(inputs=x_in, outputs=log_prob)
|
58
|
-
model.add_loss(-tf.reduce_mean(log_prob))
|
59
|
-
if isinstance(filepath, str):
|
60
|
-
model.load_weights(filepath)
|
61
|
-
return model, dist
|
62
|
-
|
63
|
-
|
64
|
-
def _mutate_categorical(
|
65
|
-
X: NDArray,
|
66
|
-
rate: float,
|
67
|
-
seed: int = 0,
|
68
|
-
feature_range: tuple[int, int] = (0, 255),
|
69
|
-
) -> tf.Tensor:
|
70
|
-
"""
|
71
|
-
Randomly change integer feature values to values within a set range
|
72
|
-
with a specified permutation rate.
|
73
|
-
|
74
|
-
Parameters
|
75
|
-
----------
|
76
|
-
X
|
77
|
-
Batch of data to be perturbed.
|
78
|
-
rate
|
79
|
-
Permutation rate (between 0 and 1).
|
80
|
-
seed
|
81
|
-
Random seed.
|
82
|
-
feature_range
|
83
|
-
Min and max range for perturbed features.
|
84
|
-
|
85
|
-
Returns
|
86
|
-
-------
|
87
|
-
Array with perturbed data.
|
88
|
-
"""
|
89
|
-
frange = (feature_range[0] + 1, feature_range[1] + 1)
|
90
|
-
shape = X.shape
|
91
|
-
n_samples = np.prod(shape)
|
92
|
-
mask = tf.random.categorical(tf.math.log([[1.0 - rate, rate]]), n_samples, seed=seed, dtype=tf.int32)
|
93
|
-
mask = tf.reshape(mask, shape)
|
94
|
-
possible_mutations = tf.random.uniform(shape, minval=frange[0], maxval=frange[1], dtype=tf.int32, seed=seed + 1)
|
95
|
-
X = tf.math.floormod(tf.cast(X, tf.int32) + mask * possible_mutations, frange[1]) # type: ignore py38
|
96
|
-
return tf.cast(X, tf.float32) # type: ignore
|
97
|
-
|
98
|
-
|
99
|
-
class OOD_LLR(OODBaseMixin[tf_models.PixelCNN]):
|
100
|
-
"""
|
101
|
-
Likelihood Ratios based outlier detector.
|
102
|
-
|
103
|
-
Parameters
|
104
|
-
----------
|
105
|
-
model : PixelCNN
|
106
|
-
Generative distribution model.
|
107
|
-
model_background : Optional[PixelCNN], default None
|
108
|
-
Optional model for the background. Only needed if it is different from `model`.
|
109
|
-
log_prob : Optional[Callable], default None
|
110
|
-
Function used to evaluate log probabilities under the model
|
111
|
-
if the model does not have a `log_prob` function.
|
112
|
-
sequential : bool, default False
|
113
|
-
Whether the data is sequential. Used to create targets during training.
|
114
|
-
"""
|
115
|
-
|
116
|
-
def __init__(
|
117
|
-
self,
|
118
|
-
model: tf_models.PixelCNN,
|
119
|
-
model_background: tf_models.PixelCNN | None = None,
|
120
|
-
log_prob: Callable | None = None,
|
121
|
-
sequential: bool = False,
|
122
|
-
) -> None:
|
123
|
-
self.dist_s: tf_models.PixelCNN = model
|
124
|
-
self.dist_b: tf_models.PixelCNN = (
|
125
|
-
model.copy()
|
126
|
-
if hasattr(model, "copy")
|
127
|
-
else keras.models.clone_model(model)
|
128
|
-
if model_background is None
|
129
|
-
else model_background
|
130
|
-
)
|
131
|
-
self.has_log_prob: bool = hasattr(model, "log_prob")
|
132
|
-
self.sequential: bool = sequential
|
133
|
-
self.log_prob: Callable | None = log_prob
|
134
|
-
|
135
|
-
self._ref_score: OODScoreOutput
|
136
|
-
self._threshold_perc: float
|
137
|
-
self._data_info: tuple[tuple, type] | None = None
|
138
|
-
|
139
|
-
def fit(
|
140
|
-
self,
|
141
|
-
x_ref: ArrayLike,
|
142
|
-
threshold_perc: float = 100.0,
|
143
|
-
loss_fn: Callable | None = None,
|
144
|
-
optimizer: keras.optimizers.Optimizer | None = None,
|
145
|
-
epochs: int = 20,
|
146
|
-
batch_size: int = 64,
|
147
|
-
verbose: bool = True,
|
148
|
-
mutate_fn: Callable = _mutate_categorical,
|
149
|
-
mutate_fn_kwargs: dict[str, float | int | tuple[int, int]] = {
|
150
|
-
"rate": 0.2,
|
151
|
-
"seed": 0,
|
152
|
-
"feature_range": (0, 255),
|
153
|
-
},
|
154
|
-
mutate_batch_size: int = int(1e10),
|
155
|
-
) -> None:
|
156
|
-
"""
|
157
|
-
Train semantic and background generative models.
|
158
|
-
|
159
|
-
Parameters
|
160
|
-
----------
|
161
|
-
x_ref : ArrayLike
|
162
|
-
Training data.
|
163
|
-
threshold_perc : float, default 100.0
|
164
|
-
Percentage of reference data that is normal.
|
165
|
-
loss_fn : Callable | None, default None
|
166
|
-
Loss function used for training.
|
167
|
-
optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
|
168
|
-
Optimizer used for training.
|
169
|
-
epochs : int, default 20
|
170
|
-
Number of training epochs.
|
171
|
-
batch_size : int, default 64
|
172
|
-
Batch size used for training.
|
173
|
-
verbose : bool, default True
|
174
|
-
Whether to print training progress.
|
175
|
-
mutate_fn : Callable, default mutate_categorical
|
176
|
-
Mutation function used to generate the background dataset.
|
177
|
-
mutate_fn_kwargs : dict, default {"rate": 0.2, "seed": 0, "feature_range": (0, 255)}
|
178
|
-
Kwargs for the mutation function used to generate the background dataset.
|
179
|
-
Default values set for an image dataset.
|
180
|
-
mutate_batch_size: int, default int(1e10)
|
181
|
-
Batch size used to generate the mutations for the background dataset.
|
182
|
-
"""
|
183
|
-
x_ref = to_numpy(x_ref)
|
184
|
-
input_shape = x_ref.shape[1:]
|
185
|
-
optimizer = keras.optimizers.Adam() if optimizer is None else optimizer
|
186
|
-
# Separate into two separate optimizers, one for semantic model and one for background model
|
187
|
-
optimizer_s = optimizer
|
188
|
-
optimizer_b = optimizer.__class__.from_config(optimizer.get_config())
|
189
|
-
|
190
|
-
# training arguments
|
191
|
-
kwargs = {
|
192
|
-
"epochs": epochs,
|
193
|
-
"batch_size": batch_size,
|
194
|
-
"verbose": verbose,
|
195
|
-
}
|
196
|
-
|
197
|
-
# create background data
|
198
|
-
mutate_fn = partial(mutate_fn, **mutate_fn_kwargs)
|
199
|
-
X_back = predict_batch(x_ref, mutate_fn, batch_size=mutate_batch_size, dtype=x_ref.dtype) # type: ignore
|
200
|
-
|
201
|
-
# prepare sequential data
|
202
|
-
if self.sequential and not self.has_log_prob:
|
203
|
-
y, y_back = x_ref[:, 1:], X_back[:, 1:] # type: ignore
|
204
|
-
X, X_back = x_ref[:, :-1], X_back[:, :-1] # type: ignore
|
205
|
-
else:
|
206
|
-
X = x_ref
|
207
|
-
y, y_back = None, None
|
208
|
-
|
209
|
-
# check if model needs to be built
|
210
|
-
use_build = self.has_log_prob and not isinstance(self.dist_s, keras.Model)
|
211
|
-
|
212
|
-
if use_build:
|
213
|
-
# build and train semantic model
|
214
|
-
self.model_s: keras.Model = _build_model(self.dist_s, input_shape)[0]
|
215
|
-
self.model_s.compile(optimizer=optimizer_s)
|
216
|
-
self.model_s.fit(X, **kwargs)
|
217
|
-
# build and train background model
|
218
|
-
self.model_b: keras.Model = _build_model(self.dist_b, input_shape)[0]
|
219
|
-
self.model_b.compile(optimizer=optimizer_b)
|
220
|
-
self.model_b.fit(X_back, **kwargs)
|
221
|
-
else:
|
222
|
-
# train semantic model
|
223
|
-
args = [self.dist_s, X]
|
224
|
-
kwargs.update({"y_train": y, "loss_fn": loss_fn, "optimizer": optimizer_s})
|
225
|
-
trainer(*args, **kwargs)
|
226
|
-
|
227
|
-
# train background model
|
228
|
-
args = [self.dist_b, X_back]
|
229
|
-
kwargs.update({"y_train": y_back, "loss_fn": loss_fn, "optimizer": optimizer_b})
|
230
|
-
trainer(*args, **kwargs)
|
231
|
-
|
232
|
-
self._datainfo = self._get_data_info(x_ref)
|
233
|
-
self._ref_score = self.score(x_ref, batch_size=batch_size)
|
234
|
-
self._threshold_perc = threshold_perc
|
235
|
-
|
236
|
-
def _logp(
|
237
|
-
self,
|
238
|
-
dist,
|
239
|
-
X: NDArray,
|
240
|
-
return_per_feature: bool = False,
|
241
|
-
batch_size: int = int(1e10),
|
242
|
-
) -> NDArray:
|
243
|
-
"""
|
244
|
-
Compute log probability of a batch of instances under the :term:`generative model<Generative Model>`.
|
245
|
-
"""
|
246
|
-
logp_fn = partial(dist.log_prob, return_per_feature=return_per_feature)
|
247
|
-
# TODO: TBD: can this be any of the other types from predict_batch? i.e. tf.Tensor or tuple
|
248
|
-
return predict_batch(X, logp_fn, batch_size=batch_size) # type: ignore[return-value]
|
249
|
-
|
250
|
-
def _logp_alt(
|
251
|
-
self,
|
252
|
-
model: keras.Model,
|
253
|
-
X: NDArray,
|
254
|
-
return_per_feature: bool = False,
|
255
|
-
batch_size: int = int(1e10),
|
256
|
-
) -> NDArray:
|
257
|
-
"""
|
258
|
-
Compute log probability of a batch of instances with the user defined log_prob function.
|
259
|
-
"""
|
260
|
-
if self.sequential:
|
261
|
-
y, X = X[:, 1:], X[:, :-1]
|
262
|
-
else:
|
263
|
-
y = X.copy()
|
264
|
-
y_preds = predict_batch(X, model, batch_size=batch_size)
|
265
|
-
logp = self.log_prob(y, y_preds).numpy() # type: ignore
|
266
|
-
if return_per_feature:
|
267
|
-
return logp
|
268
|
-
else:
|
269
|
-
axis = tuple(np.arange(len(logp.shape))[1:])
|
270
|
-
return np.mean(logp, axis=axis)
|
271
|
-
|
272
|
-
def _llr(self, X: NDArray, return_per_feature: bool, batch_size: int = int(1e10)) -> NDArray:
|
273
|
-
"""
|
274
|
-
Compute likelihood ratios.
|
275
|
-
|
276
|
-
Parameters
|
277
|
-
----------
|
278
|
-
X
|
279
|
-
Batch of instances.
|
280
|
-
return_per_feature
|
281
|
-
Return likelihood ratio per feature.
|
282
|
-
batch_size
|
283
|
-
Batch size for the :term:`generative model<Generative Model>` evaluations.
|
284
|
-
|
285
|
-
Returns
|
286
|
-
-------
|
287
|
-
Likelihood ratios.
|
288
|
-
"""
|
289
|
-
logp_fn = self._logp if not isinstance(self.log_prob, Callable) else self._logp_alt # type: ignore
|
290
|
-
logp_s = logp_fn(self.dist_s, X, return_per_feature=return_per_feature, batch_size=batch_size)
|
291
|
-
logp_b = logp_fn(self.dist_b, X, return_per_feature=return_per_feature, batch_size=batch_size)
|
292
|
-
return logp_s - logp_b
|
293
|
-
|
294
|
-
def _score(
|
295
|
-
self,
|
296
|
-
X: ArrayLike,
|
297
|
-
batch_size: int = int(1e10),
|
298
|
-
) -> OODScoreOutput:
|
299
|
-
self._validate(X := to_numpy(X))
|
300
|
-
fscore = -self._llr(X, True, batch_size=batch_size)
|
301
|
-
iscore = -self._llr(X, False, batch_size=batch_size)
|
302
|
-
return OODScoreOutput(iscore, fscore)
|