dataeval 0.74.0__py3-none-any.whl → 0.74.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. dataeval/__init__.py +23 -10
  2. dataeval/detectors/__init__.py +2 -10
  3. dataeval/detectors/drift/base.py +3 -3
  4. dataeval/detectors/drift/mmd.py +1 -1
  5. dataeval/detectors/linters/clusterer.py +3 -3
  6. dataeval/detectors/linters/duplicates.py +4 -4
  7. dataeval/detectors/linters/outliers.py +4 -4
  8. dataeval/detectors/ood/__init__.py +5 -12
  9. dataeval/detectors/ood/base.py +5 -5
  10. dataeval/detectors/ood/metadata_ks_compare.py +12 -13
  11. dataeval/interop.py +15 -3
  12. dataeval/logging.py +16 -0
  13. dataeval/metrics/bias/balance.py +3 -3
  14. dataeval/metrics/bias/coverage.py +3 -3
  15. dataeval/metrics/bias/diversity.py +3 -3
  16. dataeval/metrics/bias/metadata_preprocessing.py +3 -3
  17. dataeval/metrics/bias/parity.py +4 -4
  18. dataeval/metrics/estimators/ber.py +3 -3
  19. dataeval/metrics/estimators/divergence.py +3 -3
  20. dataeval/metrics/estimators/uap.py +3 -3
  21. dataeval/metrics/stats/base.py +2 -2
  22. dataeval/metrics/stats/boxratiostats.py +1 -1
  23. dataeval/metrics/stats/datasetstats.py +6 -6
  24. dataeval/metrics/stats/dimensionstats.py +1 -1
  25. dataeval/metrics/stats/hashstats.py +1 -1
  26. dataeval/metrics/stats/labelstats.py +3 -3
  27. dataeval/metrics/stats/pixelstats.py +1 -1
  28. dataeval/metrics/stats/visualstats.py +1 -1
  29. dataeval/output.py +81 -57
  30. dataeval/utils/__init__.py +1 -7
  31. dataeval/utils/split_dataset.py +306 -279
  32. dataeval/workflows/sufficiency.py +4 -4
  33. {dataeval-0.74.0.dist-info → dataeval-0.74.2.dist-info}/METADATA +3 -8
  34. dataeval-0.74.2.dist-info/RECORD +66 -0
  35. dataeval/detectors/ood/ae.py +0 -76
  36. dataeval/detectors/ood/aegmm.py +0 -67
  37. dataeval/detectors/ood/base_tf.py +0 -109
  38. dataeval/detectors/ood/llr.py +0 -302
  39. dataeval/detectors/ood/vae.py +0 -98
  40. dataeval/detectors/ood/vaegmm.py +0 -76
  41. dataeval/utils/lazy.py +0 -26
  42. dataeval/utils/tensorflow/__init__.py +0 -19
  43. dataeval/utils/tensorflow/_internal/gmm.py +0 -103
  44. dataeval/utils/tensorflow/_internal/loss.py +0 -121
  45. dataeval/utils/tensorflow/_internal/models.py +0 -1394
  46. dataeval/utils/tensorflow/_internal/trainer.py +0 -114
  47. dataeval/utils/tensorflow/_internal/utils.py +0 -256
  48. dataeval/utils/tensorflow/loss/__init__.py +0 -11
  49. dataeval-0.74.0.dist-info/RECORD +0 -79
  50. {dataeval-0.74.0.dist-info → dataeval-0.74.2.dist-info}/LICENSE.txt +0 -0
  51. {dataeval-0.74.0.dist-info → dataeval-0.74.2.dist-info}/WHEEL +0 -0
@@ -16,11 +16,11 @@ from scipy.optimize import basinhopping
16
16
  from torch.utils.data import Dataset
17
17
 
18
18
  from dataeval.interop import as_numpy
19
- from dataeval.output import OutputMetadata, set_metadata
19
+ from dataeval.output import Output, set_metadata
20
20
 
21
21
 
22
22
  @dataclass(frozen=True)
23
- class SufficiencyOutput(OutputMetadata):
23
+ class SufficiencyOutput(Output):
24
24
  """
25
25
  Output class for :class:`Sufficiency` workflow
26
26
 
@@ -47,7 +47,7 @@ class SufficiencyOutput(OutputMetadata):
47
47
  if c != c_v:
48
48
  raise ValueError(f"{m} does not contain the expected number ({c}) of data points.")
49
49
 
50
- @set_metadata()
50
+ @set_metadata
51
51
  def project(
52
52
  self,
53
53
  projection: int | Iterable[int],
@@ -484,7 +484,7 @@ class Sufficiency(Generic[T]):
484
484
  def eval_kwargs(self, value: Mapping[str, Any] | None) -> None:
485
485
  self._eval_kwargs = {} if value is None else value
486
486
 
487
- @set_metadata(["runs", "substeps"])
487
+ @set_metadata(state=["runs", "substeps"])
488
488
  def evaluate(self, eval_at: int | Iterable[int] | None = None, niter: int = 1000) -> SufficiencyOutput:
489
489
  """
490
490
  Creates data indices, trains models, and returns plotting data
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dataeval
3
- Version: 0.74.0
3
+ Version: 0.74.2
4
4
  Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
5
5
  Home-page: https://dataeval.ai/
6
6
  License: MIT
@@ -21,17 +21,12 @@ Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Programming Language :: Python :: 3 :: Only
22
22
  Classifier: Topic :: Scientific/Engineering
23
23
  Provides-Extra: all
24
- Provides-Extra: tensorflow
25
24
  Provides-Extra: torch
26
- Requires-Dist: markupsafe (<3.0.2) ; extra == "tensorflow" or extra == "all"
27
- Requires-Dist: matplotlib ; extra == "torch" or extra == "all"
28
- Requires-Dist: numpy (>1.24.3)
25
+ Requires-Dist: matplotlib ; extra == "all"
26
+ Requires-Dist: numpy (>=1.24.3)
29
27
  Requires-Dist: pillow (>=10.3.0)
30
28
  Requires-Dist: scikit-learn (>=1.5.0)
31
29
  Requires-Dist: scipy (>=1.10)
32
- Requires-Dist: tensorflow (>=2.16,<2.18) ; extra == "tensorflow" or extra == "all"
33
- Requires-Dist: tensorflow_probability (>=0.24,<0.25) ; extra == "tensorflow" or extra == "all"
34
- Requires-Dist: tf-keras (>=2.16,<2.18) ; extra == "tensorflow" or extra == "all"
35
30
  Requires-Dist: torch (>=2.2.0) ; extra == "torch" or extra == "all"
36
31
  Requires-Dist: torchvision (>=0.17.0) ; extra == "torch" or extra == "all"
37
32
  Requires-Dist: tqdm
@@ -0,0 +1,66 @@
1
+ dataeval/__init__.py,sha256=w_On8sJ5o_f8PboMo6LLErdFSqDAQ1Jg_e0mcp-5FRU,959
2
+ dataeval/detectors/__init__.py,sha256=Y-0bbyWyuMvZU80bCx6WPt3IV_r2hu9ymzpA8uzMqoI,206
3
+ dataeval/detectors/drift/__init__.py,sha256=BSXm21y7cAawHep-ZldCJ5HOvzYjPzYGKGrmoEs3i0E,737
4
+ dataeval/detectors/drift/base.py,sha256=QDGHMu1WADD-38MEIOwjQMEQM3DE7B0yFHO3hsMbV-E,14481
5
+ dataeval/detectors/drift/cvm.py,sha256=kc59w2_wtxFGNnLcaJRvX5v_38gPXiebSGNiFVdunEQ,4142
6
+ dataeval/detectors/drift/ks.py,sha256=gcpe1WIQeNeZdLYkdMZCFLXUp1bHMQUxwJE6-RLVOXs,4229
7
+ dataeval/detectors/drift/mmd.py,sha256=C0FX5v9ZJzmKNYEcYUaC7sDtMpJ2dZpwikNDu-AEWiI,7584
8
+ dataeval/detectors/drift/torch.py,sha256=igEQ2DV9JmcpTdUKCOHBi5LxtoNeCAslJS2Ldulg1hw,7585
9
+ dataeval/detectors/drift/uncertainty.py,sha256=Xz2yzJjtJfw1vLag234jwRvaa_HK36nMajGx8bQaNRs,5322
10
+ dataeval/detectors/drift/updates.py,sha256=UJ0z5hlunRi7twnkLABfdJG3tT2EqX4y9IGx8_USYvo,1780
11
+ dataeval/detectors/linters/__init__.py,sha256=BvpaB1RUpkEhhXk3Mqi5NYoOcJKZRFSBOJCmQOIfYRU,483
12
+ dataeval/detectors/linters/clusterer.py,sha256=hK-ak02GaxwWuufesZMKDsvoE5fMdXO7UWsLiK8hfY0,21008
13
+ dataeval/detectors/linters/duplicates.py,sha256=2bmPTFqoefeiAQV9y4CGlHV_mJNrysJSEFLXLd2DO4I,5661
14
+ dataeval/detectors/linters/merged_stats.py,sha256=X-bDTwjyR8RuVmzxLaHZmQ5nI3oOWvsqVlitdSncapk,1355
15
+ dataeval/detectors/linters/outliers.py,sha256=X48bzTfTr1LqC6WKVKBRfvpjcQRgmb93cNLT7Oipe3M,10113
16
+ dataeval/detectors/ood/__init__.py,sha256=-D4Fq-ysFylNNMqjHG1ALbB9qBCm_UinkCAgsK9HGg0,408
17
+ dataeval/detectors/ood/ae_torch.py,sha256=pO9w5221bXR9lEBkE7oakXeE7PXUUR--xcTpmHvOCSk,2142
18
+ dataeval/detectors/ood/base.py,sha256=UzcDbXl8Gv43VFzjrOegTnKSIoEYmfDP7fAySeWyWPw,6955
19
+ dataeval/detectors/ood/base_torch.py,sha256=yFbSfQsBMwZeVf8mrixmkZYBGChhV5oAHtkgzWnMzsA,3405
20
+ dataeval/detectors/ood/metadata_ks_compare.py,sha256=LNDNWGEDKTW8_-djgmK53sn9EZzzXq1Sgwc47k0QI-Y,5380
21
+ dataeval/detectors/ood/metadata_least_likely.py,sha256=nxMCXUOjOfWHDTGT2SLE7OYBCydRq8zHLd8t17k7hMM,5193
22
+ dataeval/detectors/ood/metadata_ood_mi.py,sha256=KLay2BmgHrStBV92VpIs_B1yEfQKllsMTgzOQEng01I,4065
23
+ dataeval/interop.py,sha256=5lACbR7bZYGCagiwbXzAWvWeHRj8kWBmsTC9oEjFh78,2249
24
+ dataeval/logging.py,sha256=uGxXPqGpn5guQjuHtm25rzILaz7nCQUsy2o7tFo91OI,343
25
+ dataeval/metrics/__init__.py,sha256=fPBNLd-T6mCErZBBJrxWmXIL0jCk7fNUYIcNEBkMa80,238
26
+ dataeval/metrics/bias/__init__.py,sha256=dYiPHenS8J7pgRMMW2jNkTBmTbPoYTxT04fZu9PFats,747
27
+ dataeval/metrics/bias/balance.py,sha256=_TZEe17AT-qOvPp-QFrQfTqNwh8uVVCYjC4Sv6JBx9o,9118
28
+ dataeval/metrics/bias/coverage.py,sha256=o65_IgrWSlGnYeYZFABjwKaxq09uqyy5esHJM67PJ-k,4528
29
+ dataeval/metrics/bias/diversity.py,sha256=WL1NbZiRrv0SIq97FY3womZNCSl_EBMVlBWQZAUtjk8,7701
30
+ dataeval/metrics/bias/metadata_preprocessing.py,sha256=ekUFiirkmaHDiH7nJjkNpiUQD7OolAPhHorjLxpXv_Y,12248
31
+ dataeval/metrics/bias/metadata_utils.py,sha256=HmTjlRRTdM9566oKUDDdVMJ8luss4DYykFOiS2FQzhM,6558
32
+ dataeval/metrics/bias/parity.py,sha256=hnA7qQH4Uy3tl_krluZ9BPD5zYjjagUxZt2fEiIa2yE,12745
33
+ dataeval/metrics/estimators/__init__.py,sha256=O6ocxJq8XDkfJWwXeJnnnzbOyRnFPKF4kTIVTTZYOA8,380
34
+ dataeval/metrics/estimators/ber.py,sha256=fs3_e9pgu7I50QIALWtF2aidkBZhTCKVE2pA7PyB5Go,5019
35
+ dataeval/metrics/estimators/divergence.py,sha256=r_SKSurf1TdI5E1ivENqDnz8cQ3_sxVGKAqmF9cqcT4,4275
36
+ dataeval/metrics/estimators/uap.py,sha256=Aw5ReoWNK73Tq96r__qN_-cvHrELauqtDX3Af_QxX4s,2157
37
+ dataeval/metrics/stats/__init__.py,sha256=igLRaAt1nX6yRwC4xI0zNPBADi3u7EsSxWP3OZ8AqcU,1086
38
+ dataeval/metrics/stats/base.py,sha256=_C05KUAuDrfX3N-19o25V3vmXr0-45A5fc57cXyV8qs,12161
39
+ dataeval/metrics/stats/boxratiostats.py,sha256=bZunY-b8Y2IQqHlTusQN77ujLOHftogEQIARDpdVv6A,6463
40
+ dataeval/metrics/stats/datasetstats.py,sha256=rZUDiciHwEpnXmkI8-uJNiYwUuTL9ssZMKMx73hVX-Y,6219
41
+ dataeval/metrics/stats/dimensionstats.py,sha256=xITgQF_oomb6Ty_dJcbT3ARGGNp4QRcYSgnkjB4f-YE,4054
42
+ dataeval/metrics/stats/hashstats.py,sha256=vxw_K74EJM9CZy-EV617vdrysFO8nEspVWqIYsIHC-c,4958
43
+ dataeval/metrics/stats/labelstats.py,sha256=K0hJTphMe7htSjyss8GPtKDiHepTuU60_hX0xRA-uAg,4096
44
+ dataeval/metrics/stats/pixelstats.py,sha256=2zr9i3GLNx1i_SCtbfdtZNxXBEc_9wCe4qDpmXLVbKY,4576
45
+ dataeval/metrics/stats/visualstats.py,sha256=vLIC4sMo796axWl-4e4RzT33ll-_6ki54Dirn3V-EL8,4948
46
+ dataeval/output.py,sha256=hR5TJ67f7FgrZO9Du46aw-jvRpMjOimSgJSau4ZNK44,3565
47
+ dataeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
48
+ dataeval/utils/__init__.py,sha256=z7HxSijjycey-rGdQkgVOdpvT0oO2pKAuT4uYyxYGMs,555
49
+ dataeval/utils/gmm.py,sha256=YuLsJKsVWgH_wHr1u_hSRH5Yeexdj8exht8h99L7bLo,561
50
+ dataeval/utils/image.py,sha256=KgC_1nW__nGN5q6bVZNvG4U_qIBdjcPATz9qe8f2XuA,1928
51
+ dataeval/utils/metadata.py,sha256=0A--iru0zEmi044mKz5P35q69KrI30yoiRSlvs7TSdQ,9418
52
+ dataeval/utils/shared.py,sha256=xvF3VLfyheVwJtdtDrneOobkKf7t-JTmf_w91FWXmqo,3616
53
+ dataeval/utils/split_dataset.py,sha256=KYIl2ueLN0BeBoEvbUP5FdwVcMYW_l-ES1nQf_zKpQA,18776
54
+ dataeval/utils/torch/__init__.py,sha256=lpkqfgyARUxgrV94cZESQv8PIP2p-UnwItZ_wIr0XzQ,675
55
+ dataeval/utils/torch/blocks.py,sha256=HVhBTMMD5NA4qheMUgyol1KWiKZDIuc8k5j4RcMKmhk,1466
56
+ dataeval/utils/torch/datasets.py,sha256=10elNgLuH_FDX_CHE3y2Z215JN4-PQovQm5brcIJOeM,15021
57
+ dataeval/utils/torch/gmm.py,sha256=VbLlUQohwToApT493_tjQBWy2UM5R-3ppS9Dp-eP7BA,3240
58
+ dataeval/utils/torch/models.py,sha256=sdGeo7a8vshCTGA4lYyVxxb_aDWUlxdtIVxrddS-_ls,8542
59
+ dataeval/utils/torch/trainer.py,sha256=8BEXr6xtk-CHJTcNxOBnWgkFWfJUAiBy28cEdBhLMRU,7883
60
+ dataeval/utils/torch/utils.py,sha256=nWRcT6z6DbFVrL1RyxCOX3DPoCrv9G0B-VI_9LdGCQQ,5784
61
+ dataeval/workflows/__init__.py,sha256=ef1MiVL5IuhlDXXbwsiAfafhnr7tD3TXF9GRusy9_O8,290
62
+ dataeval/workflows/sufficiency.py,sha256=v9AV3BZT0NW-zD2VNIL_5aWspvoscrxRIUKcUdpy7HI,18540
63
+ dataeval-0.74.2.dist-info/LICENSE.txt,sha256=Kpzcfobf1HlqafF-EX6dQLw9TlJiaJzfgvLQFukyXYw,1060
64
+ dataeval-0.74.2.dist-info/METADATA,sha256=Rcnn55cRPZ2JZ1jn8YamuVDxmQVDKEItK4oqZyAYkHM,4298
65
+ dataeval-0.74.2.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
66
+ dataeval-0.74.2.dist-info/RECORD,,
@@ -1,76 +0,0 @@
1
- """
2
- Source code derived from Alibi-Detect 0.11.4
3
- https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
4
-
5
- Original code Copyright (c) 2023 Seldon Technologies Ltd
6
- Licensed under Apache Software License (Apache 2.0)
7
- """
8
-
9
- from __future__ import annotations
10
-
11
- __all__ = ["OOD_AE"]
12
-
13
- from typing import TYPE_CHECKING, Callable
14
-
15
- import numpy as np
16
- from numpy.typing import ArrayLike
17
-
18
- from dataeval.detectors.ood.base import OODScoreOutput
19
- from dataeval.detectors.ood.base_tf import OODBase
20
- from dataeval.interop import as_numpy
21
- from dataeval.utils.lazy import lazyload
22
- from dataeval.utils.tensorflow._internal.utils import predict_batch
23
-
24
- if TYPE_CHECKING:
25
- import tensorflow as tf
26
- import tf_keras as keras
27
-
28
- import dataeval.utils.tensorflow._internal.models as tf_models
29
- else:
30
- tf = lazyload("tensorflow")
31
- keras = lazyload("tf_keras")
32
- tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
33
-
34
-
35
- class OOD_AE(OODBase):
36
- """
37
- Autoencoder-based :term:`out of distribution<Out-of-distribution (OOD)>` detector.
38
-
39
- Parameters
40
- ----------
41
- model : AE
42
- An :term:`autoencoder<Autoencoder>` model.
43
- """
44
-
45
- def __init__(self, model: tf_models.AE) -> None:
46
- super().__init__(model)
47
-
48
- def fit(
49
- self,
50
- x_ref: ArrayLike,
51
- threshold_perc: float = 100.0,
52
- loss_fn: Callable[..., tf.Tensor] | None = None,
53
- optimizer: keras.optimizers.Optimizer | None = None,
54
- epochs: int = 20,
55
- batch_size: int = 64,
56
- verbose: bool = True,
57
- ) -> None:
58
- if loss_fn is None:
59
- loss_fn = keras.losses.MeanSquaredError()
60
- super().fit(as_numpy(x_ref), threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
61
-
62
- def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
63
- self._validate(X := as_numpy(X))
64
-
65
- # reconstruct instances
66
- X_recon = predict_batch(X, self.model, batch_size=batch_size)
67
-
68
- # compute feature and instance level scores
69
- fscore = np.power(X - X_recon, 2)
70
- fscore_flat = fscore.reshape(fscore.shape[0], -1).copy()
71
- n_score_features = int(np.ceil(fscore_flat.shape[1]))
72
- sorted_fscore = np.sort(fscore_flat, axis=1)
73
- sorted_fscore_perc = sorted_fscore[:, -n_score_features:]
74
- iscore = np.mean(sorted_fscore_perc, axis=1)
75
-
76
- return OODScoreOutput(iscore, fscore)
@@ -1,67 +0,0 @@
1
- """
2
- Source code derived from Alibi-Detect 0.11.4
3
- https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
4
-
5
- Original code Copyright (c) 2023 Seldon Technologies Ltd
6
- Licensed under Apache Software License (Apache 2.0)
7
- """
8
-
9
- from __future__ import annotations
10
-
11
- __all__ = ["OOD_AEGMM"]
12
-
13
- from typing import TYPE_CHECKING, Callable
14
-
15
- from numpy.typing import ArrayLike
16
-
17
- from dataeval.detectors.ood.base import OODScoreOutput
18
- from dataeval.detectors.ood.base_tf import OODBaseGMM
19
- from dataeval.interop import to_numpy
20
- from dataeval.utils.lazy import lazyload
21
- from dataeval.utils.tensorflow._internal.gmm import gmm_energy
22
- from dataeval.utils.tensorflow._internal.loss import LossGMM
23
- from dataeval.utils.tensorflow._internal.utils import predict_batch
24
-
25
- if TYPE_CHECKING:
26
- import tensorflow as tf
27
- import tf_keras as keras
28
-
29
- import dataeval.utils.tensorflow._internal.models as tf_models
30
- else:
31
- tf = lazyload("tensorflow")
32
- keras = lazyload("tf_keras")
33
- tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
34
-
35
-
36
- class OOD_AEGMM(OODBaseGMM):
37
- """
38
- AE with Gaussian Mixture Model based outlier detector.
39
-
40
- Parameters
41
- ----------
42
- model : AEGMM
43
- An AEGMM model.
44
- """
45
-
46
- def __init__(self, model: tf_models.AEGMM) -> None:
47
- super().__init__(model)
48
-
49
- def fit(
50
- self,
51
- x_ref: ArrayLike,
52
- threshold_perc: float = 100.0,
53
- loss_fn: Callable[..., tf.Tensor] | None = None,
54
- optimizer: keras.optimizers.Optimizer | None = None,
55
- epochs: int = 20,
56
- batch_size: int = 64,
57
- verbose: bool = True,
58
- ) -> None:
59
- if loss_fn is None:
60
- loss_fn = LossGMM()
61
- super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
62
-
63
- def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
64
- self._validate(X := to_numpy(X))
65
- _, z, _ = predict_batch(X, self.model, batch_size=batch_size)
66
- energy, _ = gmm_energy(z, self._gmm_params, return_mean=False)
67
- return OODScoreOutput(energy.numpy()) # type: ignore
@@ -1,109 +0,0 @@
1
- """
2
- Source code derived from Alibi-Detect 0.11.4
3
- https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
4
-
5
- Original code Copyright (c) 2023 Seldon Technologies Ltd
6
- Licensed under Apache Software License (Apache 2.0)
7
- """
8
-
9
- from __future__ import annotations
10
-
11
- from typing import TYPE_CHECKING, Callable, cast
12
-
13
- from numpy.typing import ArrayLike
14
-
15
- from dataeval.detectors.ood.base import OODBaseMixin, OODFitMixin, OODGMMMixin
16
- from dataeval.interop import to_numpy
17
- from dataeval.utils.lazy import lazyload
18
- from dataeval.utils.tensorflow._internal.gmm import gmm_params
19
- from dataeval.utils.tensorflow._internal.trainer import trainer
20
-
21
- if TYPE_CHECKING:
22
- import tensorflow as tf
23
- import tf_keras as keras
24
- else:
25
- tf = lazyload("tensorflow")
26
- keras = lazyload("tf_keras")
27
-
28
-
29
- class OODBase(OODBaseMixin[keras.Model], OODFitMixin[Callable[..., tf.Tensor], keras.optimizers.Optimizer]):
30
- def __init__(self, model: keras.Model) -> None:
31
- super().__init__(model)
32
-
33
- def fit(
34
- self,
35
- x_ref: ArrayLike,
36
- threshold_perc: float,
37
- loss_fn: Callable[..., tf.Tensor] | None,
38
- optimizer: keras.optimizers.Optimizer | None,
39
- epochs: int,
40
- batch_size: int,
41
- verbose: bool,
42
- ) -> None:
43
- """
44
- Train the model and infer the threshold value.
45
-
46
- Parameters
47
- ----------
48
- x_ref : ArrayLike
49
- Training data.
50
- threshold_perc : float, default 100.0
51
- Percentage of reference data that is normal.
52
- loss_fn : Callable | None, default None
53
- Loss function used for training.
54
- optimizer : Optimizer, default keras.optimizers.Adam
55
- Optimizer used for training.
56
- epochs : int, default 20
57
- Number of training epochs.
58
- batch_size : int, default 64
59
- Batch size used for training.
60
- verbose : bool, default True
61
- Whether to print training progress.
62
- """
63
-
64
- # Train the model
65
- trainer(
66
- model=self.model,
67
- loss_fn=loss_fn,
68
- x_train=to_numpy(x_ref),
69
- y_train=None,
70
- optimizer=optimizer,
71
- epochs=epochs,
72
- batch_size=batch_size,
73
- verbose=verbose,
74
- )
75
-
76
- # Infer the threshold values
77
- self._ref_score = self.score(x_ref, batch_size)
78
- self._threshold_perc = threshold_perc
79
-
80
-
81
- class OODBaseGMM(OODBase, OODGMMMixin[tf.Tensor]):
82
- def fit(
83
- self,
84
- x_ref: ArrayLike,
85
- threshold_perc: float,
86
- loss_fn: Callable[..., tf.Tensor] | None,
87
- optimizer: keras.optimizers.Optimizer | None,
88
- epochs: int,
89
- batch_size: int,
90
- verbose: bool,
91
- ) -> None:
92
- # Train the model
93
- trainer(
94
- model=self.model,
95
- loss_fn=loss_fn,
96
- x_train=to_numpy(x_ref),
97
- optimizer=optimizer,
98
- epochs=epochs,
99
- batch_size=batch_size,
100
- verbose=verbose,
101
- )
102
-
103
- # Calculate the GMM parameters
104
- _, z, gamma = cast(tuple[tf.Tensor, tf.Tensor, tf.Tensor], self.model(x_ref))
105
- self._gmm_params = gmm_params(z, gamma)
106
-
107
- # Infer the threshold values
108
- self._ref_score = self.score(x_ref, batch_size)
109
- self._threshold_perc = threshold_perc
@@ -1,302 +0,0 @@
1
- """
2
- Source code derived from Alibi-Detect 0.11.4
3
- https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
4
-
5
- Original code Copyright (c) 2023 Seldon Technologies Ltd
6
- Licensed under Apache Software License (Apache 2.0)
7
- """
8
-
9
- from __future__ import annotations
10
-
11
- __all__ = ["OOD_LLR"]
12
-
13
- from functools import partial
14
- from typing import TYPE_CHECKING, Callable
15
-
16
- import numpy as np
17
- from numpy.typing import ArrayLike, NDArray
18
-
19
- from dataeval.detectors.ood.base import OODBaseMixin, OODScoreOutput
20
- from dataeval.interop import to_numpy
21
- from dataeval.utils.lazy import lazyload
22
- from dataeval.utils.tensorflow._internal.trainer import trainer
23
- from dataeval.utils.tensorflow._internal.utils import predict_batch
24
-
25
- if TYPE_CHECKING:
26
- import tensorflow as tf
27
- import tf_keras as keras
28
-
29
- import dataeval.utils.tensorflow._internal.models as tf_models
30
- else:
31
- tf = lazyload("tensorflow")
32
- keras = lazyload("tf_keras")
33
- tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
34
-
35
-
36
- def _build_model(
37
- dist: tf_models.PixelCNN, input_shape: tuple | None = None, filepath: str | None = None
38
- ) -> tuple[keras.Model, tf_models.PixelCNN]:
39
- """
40
- Create keras.Model from TF distribution.
41
-
42
- Parameters
43
- ----------
44
- dist
45
- :term:`TensorFlow` distribution.
46
- input_shape
47
- Input shape of the model.
48
- filepath
49
- File to load model weights from.
50
-
51
- Returns
52
- -------
53
- TensorFlow model.
54
- """
55
- x_in = keras.layers.Input(shape=input_shape)
56
- log_prob = dist.log_prob(x_in)
57
- model = keras.models.Model(inputs=x_in, outputs=log_prob)
58
- model.add_loss(-tf.reduce_mean(log_prob))
59
- if isinstance(filepath, str):
60
- model.load_weights(filepath)
61
- return model, dist
62
-
63
-
64
- def _mutate_categorical(
65
- X: NDArray,
66
- rate: float,
67
- seed: int = 0,
68
- feature_range: tuple[int, int] = (0, 255),
69
- ) -> tf.Tensor:
70
- """
71
- Randomly change integer feature values to values within a set range
72
- with a specified permutation rate.
73
-
74
- Parameters
75
- ----------
76
- X
77
- Batch of data to be perturbed.
78
- rate
79
- Permutation rate (between 0 and 1).
80
- seed
81
- Random seed.
82
- feature_range
83
- Min and max range for perturbed features.
84
-
85
- Returns
86
- -------
87
- Array with perturbed data.
88
- """
89
- frange = (feature_range[0] + 1, feature_range[1] + 1)
90
- shape = X.shape
91
- n_samples = np.prod(shape)
92
- mask = tf.random.categorical(tf.math.log([[1.0 - rate, rate]]), n_samples, seed=seed, dtype=tf.int32)
93
- mask = tf.reshape(mask, shape)
94
- possible_mutations = tf.random.uniform(shape, minval=frange[0], maxval=frange[1], dtype=tf.int32, seed=seed + 1)
95
- X = tf.math.floormod(tf.cast(X, tf.int32) + mask * possible_mutations, frange[1]) # type: ignore py38
96
- return tf.cast(X, tf.float32) # type: ignore
97
-
98
-
99
- class OOD_LLR(OODBaseMixin[tf_models.PixelCNN]):
100
- """
101
- Likelihood Ratios based outlier detector.
102
-
103
- Parameters
104
- ----------
105
- model : PixelCNN
106
- Generative distribution model.
107
- model_background : Optional[PixelCNN], default None
108
- Optional model for the background. Only needed if it is different from `model`.
109
- log_prob : Optional[Callable], default None
110
- Function used to evaluate log probabilities under the model
111
- if the model does not have a `log_prob` function.
112
- sequential : bool, default False
113
- Whether the data is sequential. Used to create targets during training.
114
- """
115
-
116
- def __init__(
117
- self,
118
- model: tf_models.PixelCNN,
119
- model_background: tf_models.PixelCNN | None = None,
120
- log_prob: Callable | None = None,
121
- sequential: bool = False,
122
- ) -> None:
123
- self.dist_s: tf_models.PixelCNN = model
124
- self.dist_b: tf_models.PixelCNN = (
125
- model.copy()
126
- if hasattr(model, "copy")
127
- else keras.models.clone_model(model)
128
- if model_background is None
129
- else model_background
130
- )
131
- self.has_log_prob: bool = hasattr(model, "log_prob")
132
- self.sequential: bool = sequential
133
- self.log_prob: Callable | None = log_prob
134
-
135
- self._ref_score: OODScoreOutput
136
- self._threshold_perc: float
137
- self._data_info: tuple[tuple, type] | None = None
138
-
139
- def fit(
140
- self,
141
- x_ref: ArrayLike,
142
- threshold_perc: float = 100.0,
143
- loss_fn: Callable | None = None,
144
- optimizer: keras.optimizers.Optimizer | None = None,
145
- epochs: int = 20,
146
- batch_size: int = 64,
147
- verbose: bool = True,
148
- mutate_fn: Callable = _mutate_categorical,
149
- mutate_fn_kwargs: dict[str, float | int | tuple[int, int]] = {
150
- "rate": 0.2,
151
- "seed": 0,
152
- "feature_range": (0, 255),
153
- },
154
- mutate_batch_size: int = int(1e10),
155
- ) -> None:
156
- """
157
- Train semantic and background generative models.
158
-
159
- Parameters
160
- ----------
161
- x_ref : ArrayLike
162
- Training data.
163
- threshold_perc : float, default 100.0
164
- Percentage of reference data that is normal.
165
- loss_fn : Callable | None, default None
166
- Loss function used for training.
167
- optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
168
- Optimizer used for training.
169
- epochs : int, default 20
170
- Number of training epochs.
171
- batch_size : int, default 64
172
- Batch size used for training.
173
- verbose : bool, default True
174
- Whether to print training progress.
175
- mutate_fn : Callable, default mutate_categorical
176
- Mutation function used to generate the background dataset.
177
- mutate_fn_kwargs : dict, default {"rate": 0.2, "seed": 0, "feature_range": (0, 255)}
178
- Kwargs for the mutation function used to generate the background dataset.
179
- Default values set for an image dataset.
180
- mutate_batch_size: int, default int(1e10)
181
- Batch size used to generate the mutations for the background dataset.
182
- """
183
- x_ref = to_numpy(x_ref)
184
- input_shape = x_ref.shape[1:]
185
- optimizer = keras.optimizers.Adam() if optimizer is None else optimizer
186
- # Separate into two separate optimizers, one for semantic model and one for background model
187
- optimizer_s = optimizer
188
- optimizer_b = optimizer.__class__.from_config(optimizer.get_config())
189
-
190
- # training arguments
191
- kwargs = {
192
- "epochs": epochs,
193
- "batch_size": batch_size,
194
- "verbose": verbose,
195
- }
196
-
197
- # create background data
198
- mutate_fn = partial(mutate_fn, **mutate_fn_kwargs)
199
- X_back = predict_batch(x_ref, mutate_fn, batch_size=mutate_batch_size, dtype=x_ref.dtype) # type: ignore
200
-
201
- # prepare sequential data
202
- if self.sequential and not self.has_log_prob:
203
- y, y_back = x_ref[:, 1:], X_back[:, 1:] # type: ignore
204
- X, X_back = x_ref[:, :-1], X_back[:, :-1] # type: ignore
205
- else:
206
- X = x_ref
207
- y, y_back = None, None
208
-
209
- # check if model needs to be built
210
- use_build = self.has_log_prob and not isinstance(self.dist_s, keras.Model)
211
-
212
- if use_build:
213
- # build and train semantic model
214
- self.model_s: keras.Model = _build_model(self.dist_s, input_shape)[0]
215
- self.model_s.compile(optimizer=optimizer_s)
216
- self.model_s.fit(X, **kwargs)
217
- # build and train background model
218
- self.model_b: keras.Model = _build_model(self.dist_b, input_shape)[0]
219
- self.model_b.compile(optimizer=optimizer_b)
220
- self.model_b.fit(X_back, **kwargs)
221
- else:
222
- # train semantic model
223
- args = [self.dist_s, X]
224
- kwargs.update({"y_train": y, "loss_fn": loss_fn, "optimizer": optimizer_s})
225
- trainer(*args, **kwargs)
226
-
227
- # train background model
228
- args = [self.dist_b, X_back]
229
- kwargs.update({"y_train": y_back, "loss_fn": loss_fn, "optimizer": optimizer_b})
230
- trainer(*args, **kwargs)
231
-
232
- self._datainfo = self._get_data_info(x_ref)
233
- self._ref_score = self.score(x_ref, batch_size=batch_size)
234
- self._threshold_perc = threshold_perc
235
-
236
- def _logp(
237
- self,
238
- dist,
239
- X: NDArray,
240
- return_per_feature: bool = False,
241
- batch_size: int = int(1e10),
242
- ) -> NDArray:
243
- """
244
- Compute log probability of a batch of instances under the :term:`generative model<Generative Model>`.
245
- """
246
- logp_fn = partial(dist.log_prob, return_per_feature=return_per_feature)
247
- # TODO: TBD: can this be any of the other types from predict_batch? i.e. tf.Tensor or tuple
248
- return predict_batch(X, logp_fn, batch_size=batch_size) # type: ignore[return-value]
249
-
250
- def _logp_alt(
251
- self,
252
- model: keras.Model,
253
- X: NDArray,
254
- return_per_feature: bool = False,
255
- batch_size: int = int(1e10),
256
- ) -> NDArray:
257
- """
258
- Compute log probability of a batch of instances with the user defined log_prob function.
259
- """
260
- if self.sequential:
261
- y, X = X[:, 1:], X[:, :-1]
262
- else:
263
- y = X.copy()
264
- y_preds = predict_batch(X, model, batch_size=batch_size)
265
- logp = self.log_prob(y, y_preds).numpy() # type: ignore
266
- if return_per_feature:
267
- return logp
268
- else:
269
- axis = tuple(np.arange(len(logp.shape))[1:])
270
- return np.mean(logp, axis=axis)
271
-
272
- def _llr(self, X: NDArray, return_per_feature: bool, batch_size: int = int(1e10)) -> NDArray:
273
- """
274
- Compute likelihood ratios.
275
-
276
- Parameters
277
- ----------
278
- X
279
- Batch of instances.
280
- return_per_feature
281
- Return likelihood ratio per feature.
282
- batch_size
283
- Batch size for the :term:`generative model<Generative Model>` evaluations.
284
-
285
- Returns
286
- -------
287
- Likelihood ratios.
288
- """
289
- logp_fn = self._logp if not isinstance(self.log_prob, Callable) else self._logp_alt # type: ignore
290
- logp_s = logp_fn(self.dist_s, X, return_per_feature=return_per_feature, batch_size=batch_size)
291
- logp_b = logp_fn(self.dist_b, X, return_per_feature=return_per_feature, batch_size=batch_size)
292
- return logp_s - logp_b
293
-
294
- def _score(
295
- self,
296
- X: ArrayLike,
297
- batch_size: int = int(1e10),
298
- ) -> OODScoreOutput:
299
- self._validate(X := to_numpy(X))
300
- fscore = -self._llr(X, True, batch_size=batch_size)
301
- iscore = -self._llr(X, False, batch_size=batch_size)
302
- return OODScoreOutput(iscore, fscore)