dataeval 0.65.0__py3-none-any.whl → 0.66.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. dataeval/__init__.py +13 -9
  2. dataeval/_internal/detectors/clusterer.py +24 -22
  3. dataeval/_internal/detectors/drift/base.py +206 -26
  4. dataeval/_internal/detectors/drift/cvm.py +25 -23
  5. dataeval/_internal/detectors/drift/ks.py +28 -25
  6. dataeval/_internal/detectors/drift/mmd.py +30 -29
  7. dataeval/_internal/detectors/drift/torch.py +66 -58
  8. dataeval/_internal/detectors/drift/uncertainty.py +28 -28
  9. dataeval/_internal/detectors/duplicates.py +28 -18
  10. dataeval/_internal/detectors/ood/ae.py +15 -29
  11. dataeval/_internal/detectors/ood/aegmm.py +33 -27
  12. dataeval/_internal/detectors/ood/base.py +61 -43
  13. dataeval/_internal/detectors/ood/llr.py +27 -24
  14. dataeval/_internal/detectors/ood/vae.py +32 -31
  15. dataeval/_internal/detectors/ood/vaegmm.py +34 -28
  16. dataeval/_internal/detectors/{linter.py → outliers.py} +33 -27
  17. dataeval/_internal/flags.py +5 -3
  18. dataeval/_internal/interop.py +4 -2
  19. dataeval/_internal/metrics/balance.py +33 -4
  20. dataeval/_internal/metrics/ber.py +6 -4
  21. dataeval/_internal/metrics/diversity.py +45 -12
  22. dataeval/_internal/metrics/parity.py +114 -26
  23. dataeval/_internal/metrics/stats.py +154 -16
  24. dataeval/_internal/metrics/uap.py +28 -2
  25. dataeval/_internal/metrics/utils.py +20 -18
  26. dataeval/_internal/models/pytorch/autoencoder.py +127 -22
  27. dataeval/_internal/models/tensorflow/autoencoder.py +33 -30
  28. dataeval/_internal/models/tensorflow/gmm.py +4 -2
  29. dataeval/_internal/models/tensorflow/losses.py +15 -11
  30. dataeval/_internal/models/tensorflow/pixelcnn.py +19 -18
  31. dataeval/_internal/models/tensorflow/trainer.py +8 -6
  32. dataeval/_internal/models/tensorflow/utils.py +21 -19
  33. dataeval/_internal/output.py +13 -10
  34. dataeval/_internal/utils.py +5 -3
  35. dataeval/_internal/workflows/sufficiency.py +42 -30
  36. dataeval/detectors/__init__.py +6 -25
  37. dataeval/detectors/drift/__init__.py +16 -0
  38. dataeval/detectors/drift/kernels/__init__.py +6 -0
  39. dataeval/detectors/drift/updates/__init__.py +3 -0
  40. dataeval/detectors/linters/__init__.py +5 -0
  41. dataeval/detectors/ood/__init__.py +11 -0
  42. dataeval/metrics/__init__.py +2 -26
  43. dataeval/metrics/bias/__init__.py +14 -0
  44. dataeval/metrics/estimators/__init__.py +9 -0
  45. dataeval/metrics/stats/__init__.py +6 -0
  46. dataeval/tensorflow/__init__.py +3 -0
  47. dataeval/tensorflow/loss/__init__.py +3 -0
  48. dataeval/tensorflow/models/__init__.py +5 -0
  49. dataeval/tensorflow/recon/__init__.py +3 -0
  50. dataeval/torch/__init__.py +3 -0
  51. dataeval/{models/torch → torch/models}/__init__.py +1 -2
  52. dataeval/torch/trainer/__init__.py +3 -0
  53. dataeval/utils/__init__.py +3 -6
  54. dataeval/workflows/__init__.py +2 -4
  55. {dataeval-0.65.0.dist-info → dataeval-0.66.0.dist-info}/METADATA +1 -1
  56. dataeval-0.66.0.dist-info/RECORD +72 -0
  57. dataeval/models/__init__.py +0 -15
  58. dataeval/models/tensorflow/__init__.py +0 -6
  59. dataeval-0.65.0.dist-info/RECORD +0 -60
  60. {dataeval-0.65.0.dist-info → dataeval-0.66.0.dist-info}/LICENSE.txt +0 -0
  61. {dataeval-0.65.0.dist-info → dataeval-0.66.0.dist-info}/WHEEL +0 -0
@@ -1,12 +1,13 @@
1
+ from __future__ import annotations
2
+
1
3
  from dataclasses import dataclass
2
- from typing import Dict, Iterable, List
4
+ from typing import Iterable
3
5
 
4
6
  from numpy.typing import ArrayLike
5
7
 
6
- from dataeval._internal.metrics.stats import StatsOutput
8
+ from dataeval._internal.flags import ImageStat
9
+ from dataeval._internal.metrics.stats import StatsOutput, imagestats
7
10
  from dataeval._internal.output import OutputMetadata, set_metadata
8
- from dataeval.flags import ImageStat
9
- from dataeval.metrics import imagestats
10
11
 
11
12
 
12
13
  @dataclass(frozen=True)
@@ -20,8 +21,8 @@ class DuplicatesOutput(OutputMetadata):
20
21
  Indices of images that are near matches
21
22
  """
22
23
 
23
- exact: List[List[int]]
24
- near: List[List[int]]
24
+ exact: list[list[int]]
25
+ near: list[list[int]]
25
26
 
26
27
 
27
28
  class Duplicates:
@@ -34,6 +35,11 @@ class Duplicates:
34
35
  stats : StatsOutput
35
36
  Output class of stats
36
37
 
38
+ Parameters
39
+ ----------
40
+ only_exact : bool, default False
41
+ Only inspect the dataset for exact image matches
42
+
37
43
  Example
38
44
  -------
39
45
  Initialize the Duplicates class:
@@ -41,12 +47,11 @@ class Duplicates:
41
47
  >>> dups = Duplicates()
42
48
  """
43
49
 
44
- def __init__(self, find_exact: bool = True, find_near: bool = True):
50
+ def __init__(self, only_exact: bool = False):
45
51
  self.stats: StatsOutput
46
- self.find_exact = find_exact
47
- self.find_near = find_near
52
+ self.only_exact = only_exact
48
53
 
49
- def _get_duplicates(self) -> Dict[str, List[List[int]]]:
54
+ def _get_duplicates(self) -> dict[str, list[list[int]]]:
50
55
  stats_dict = self.stats.dict()
51
56
  if "xxhash" in stats_dict:
52
57
  exact = {}
@@ -56,7 +61,7 @@ class Duplicates:
56
61
  else:
57
62
  exact = []
58
63
 
59
- if "pchash" in stats_dict:
64
+ if "pchash" in stats_dict and not self.only_exact:
60
65
  near = {}
61
66
  for i, value in enumerate(stats_dict["pchash"]):
62
67
  near.setdefault(value, []).append(i)
@@ -69,15 +74,15 @@ class Duplicates:
69
74
  "near": sorted(near),
70
75
  }
71
76
 
72
- @set_metadata("dataeval.detectors", ["find_exact", "find_near"])
73
- def evaluate(self, images: Iterable[ArrayLike]) -> DuplicatesOutput:
77
+ @set_metadata("dataeval.detectors", ["only_exact"])
78
+ def evaluate(self, data: Iterable[ArrayLike] | StatsOutput) -> DuplicatesOutput:
74
79
  """
75
80
  Returns duplicate image indices for both exact matches and near matches
76
81
 
77
82
  Parameters
78
83
  ----------
79
- images : Iterable[ArrayLike], shape - (N, C, H, W)
80
- A set of images in an ArrayLike format
84
+ data : Iterable[ArrayLike], shape - (N, C, H, W) | StatsOutput
85
+ A dataset of images in an ArrayLike format or the output from an imagestats metric analysis
81
86
 
82
87
  Returns
83
88
  -------
@@ -93,7 +98,12 @@ class Duplicates:
93
98
  >>> dups.evaluate(images)
94
99
  DuplicatesOutput(exact=[[3, 20], [16, 37]], near=[[3, 20, 22], [12, 18], [13, 36], [14, 31], [17, 27], [19, 38, 47]])
95
100
  """ # noqa: E501
96
- flag_exact = ImageStat.XXHASH if self.find_exact else ImageStat(0)
97
- flag_near = ImageStat.PCHASH if self.find_near else ImageStat(0)
98
- self.stats = imagestats(images, flag_exact | flag_near)
101
+ if isinstance(data, StatsOutput):
102
+ if not data.xxhash:
103
+ raise ValueError("StatsOutput must include xxhash information of the images.")
104
+ if not self.only_exact and not data.pchash:
105
+ raise ValueError("StatsOutput must include pchash information of the images for near matches.")
106
+ self.stats = data
107
+ else:
108
+ self.stats = imagestats(data, ImageStat.XXHASH | (ImageStat(0) if self.only_exact else ImageStat.PCHASH))
99
109
  return DuplicatesOutput(**self._get_duplicates())
@@ -6,10 +6,13 @@ Original code Copyright (c) 2023 Seldon Technologies Ltd
6
6
  Licensed under Apache Software License (Apache 2.0)
7
7
  """
8
8
 
9
+ from __future__ import annotations
10
+
9
11
  from typing import Callable
10
12
 
11
13
  import keras
12
14
  import numpy as np
15
+ import tensorflow as tf
13
16
  from numpy.typing import ArrayLike
14
17
 
15
18
  from dataeval._internal.detectors.ood.base import OODBase, OODScore
@@ -19,47 +22,30 @@ from dataeval._internal.models.tensorflow.utils import predict_batch
19
22
 
20
23
 
21
24
  class OOD_AE(OODBase):
22
- def __init__(self, model: AE) -> None:
23
- """
24
- Autoencoder based out-of-distribution detector.
25
+ """
26
+ Autoencoder based out-of-distribution detector.
27
+
28
+ Parameters
29
+ ----------
30
+ model : AE
31
+ An Autoencoder model.
32
+ """
25
33
 
26
- Parameters
27
- ----------
28
- model : AE
29
- An Autoencoder model.
30
- """
34
+ def __init__(self, model: AE) -> None:
31
35
  super().__init__(model)
32
36
 
33
37
  def fit(
34
38
  self,
35
39
  x_ref: ArrayLike,
36
40
  threshold_perc: float = 100.0,
37
- loss_fn: Callable = keras.losses.MeanSquaredError(),
41
+ loss_fn: Callable[..., tf.Tensor] | None = None,
38
42
  optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
39
43
  epochs: int = 20,
40
44
  batch_size: int = 64,
41
45
  verbose: bool = True,
42
46
  ) -> None:
43
- """
44
- Train the AE model with recommended loss function and optimizer.
45
-
46
- Parameters
47
- ----------
48
- x_ref : ArrayLike
49
- Training batch.
50
- threshold_perc : float, default 100.0
51
- Percentage of reference data that is normal.
52
- loss_fn : Callable, default keras.losses.MeanSquaredError()
53
- Loss function used for training.
54
- optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
55
- Optimizer used for training.
56
- epochs : int, default 20
57
- Number of training epochs.
58
- batch_size : int, default 64
59
- Batch size used for training.
60
- verbose : bool, default True
61
- Whether to print training progress.
62
- """
47
+ if loss_fn is None:
48
+ loss_fn = keras.losses.MeanSquaredError()
63
49
  super().fit(to_numpy(x_ref), threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
64
50
 
65
51
  def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
@@ -6,9 +6,12 @@ Original code Copyright (c) 2023 Seldon Technologies Ltd
6
6
  Licensed under Apache Software License (Apache 2.0)
7
7
  """
8
8
 
9
+ from __future__ import annotations
10
+
9
11
  from typing import Callable
10
12
 
11
13
  import keras
14
+ import tensorflow as tf
12
15
  from numpy.typing import ArrayLike
13
16
 
14
17
  from dataeval._internal.detectors.ood.base import OODGMMBase, OODScore
@@ -20,50 +23,53 @@ from dataeval._internal.models.tensorflow.utils import predict_batch
20
23
 
21
24
 
22
25
  class OOD_AEGMM(OODGMMBase):
23
- def __init__(self, model: AEGMM) -> None:
24
- """
25
- AE with Gaussian Mixture Model based outlier detector.
26
+ """
27
+ AE with Gaussian Mixture Model based outlier detector.
26
28
 
27
- Parameters
28
- ----------
29
- model : AEGMM
30
- An AEGMM model.
31
- """
29
+ Parameters
30
+ ----------
31
+ model : AEGMM
32
+ An AEGMM model.
33
+ """
34
+
35
+ def __init__(self, model: AEGMM) -> None:
32
36
  super().__init__(model)
33
37
 
34
38
  def fit(
35
39
  self,
36
40
  x_ref: ArrayLike,
37
41
  threshold_perc: float = 100.0,
38
- loss_fn: Callable = LossGMM(),
42
+ loss_fn: Callable[..., tf.Tensor] | None = None,
39
43
  optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
40
44
  epochs: int = 20,
41
45
  batch_size: int = 64,
42
46
  verbose: bool = True,
43
47
  ) -> None:
48
+ if loss_fn is None:
49
+ loss_fn = LossGMM()
50
+ super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
51
+
52
+ def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
44
53
  """
45
- Train the AEGMM model with recommended loss function and optimizer.
54
+ Compute the out-of-distribution (OOD) score for a given dataset.
46
55
 
47
56
  Parameters
48
57
  ----------
49
- x_ref : ArrayLike
50
- Training batch.
51
- threshold_perc : float, default 100.0
52
- Percentage of reference data that is normal.
53
- loss_fn : Callable, default LossGMM()
54
- Loss function used for training.
55
- optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
56
- Optimizer used for training.
57
- epochs : int, default 20
58
- Number of training epochs.
59
- batch_size : int, default 64
60
- Batch size used for training.
61
- verbose : bool, default True
62
- Whether to print training progress.
63
- """
64
- super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
58
+ X : ArrayLike
59
+ Input data to score.
60
+ batch_size : int, default 1e10
61
+ Number of instances to process in each batch.
62
+ Use a smaller batch size if your dataset is large or if you encounter memory issues.
65
63
 
66
- def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
64
+ Returns
65
+ -------
66
+ OODScore
67
+ An object containing the instance-level OOD score.
68
+
69
+ Note
70
+ ----
71
+ This model does not produce a feature level score like the OOD_AE or OOD_VAE models.
72
+ """
67
73
  self._validate(X := to_numpy(X))
68
74
  _, z, _ = predict_batch(X, self.model, batch_size=batch_size)
69
75
  energy, _ = gmm_energy(z, self.gmm_params, return_mean=False)
@@ -6,9 +6,11 @@ Original code Copyright (c) 2023 Seldon Technologies Ltd
6
6
  Licensed under Apache Software License (Apache 2.0)
7
7
  """
8
8
 
9
+ from __future__ import annotations
10
+
9
11
  from abc import ABC, abstractmethod
10
12
  from dataclasses import dataclass
11
- from typing import Callable, List, Literal, NamedTuple, Optional, Tuple, cast
13
+ from typing import Callable, Literal, NamedTuple, cast
12
14
 
13
15
  import keras
14
16
  import numpy as np
@@ -26,17 +28,17 @@ class OODOutput(OutputMetadata):
26
28
  """
27
29
  Attributes
28
30
  ----------
29
- is_ood : NDArray[np.bool_]
31
+ is_ood : NDArray
30
32
  Array of images that are detected as out of distribution
31
- instance_score : NDArray[np.float32]
33
+ instance_score : NDArray
32
34
  Instance score of the evaluated dataset
33
- feature_score : Optional[NDArray[np.float32]]
35
+ feature_score : NDArray | None
34
36
  Feature score, if available, of the evaluated dataset
35
37
  """
36
38
 
37
39
  is_ood: NDArray[np.bool_]
38
40
  instance_score: NDArray[np.float32]
39
- feature_score: Optional[NDArray[np.float32]]
41
+ feature_score: NDArray[np.float32] | None
40
42
 
41
43
 
42
44
  class OODScore(NamedTuple):
@@ -45,16 +47,28 @@ class OODScore(NamedTuple):
45
47
 
46
48
  Parameters
47
49
  ----------
48
- instance_score : NDArray[np.float32]
50
+ instance_score : NDArray
49
51
  Instance score of the evaluated dataset.
50
- feature_score : Optional[NDArray[np.float32]], default None
52
+ feature_score : NDArray | None, default None
51
53
  Feature score, if available, of the evaluated dataset.
52
54
  """
53
55
 
54
56
  instance_score: NDArray[np.float32]
55
- feature_score: Optional[NDArray[np.float32]] = None
57
+ feature_score: NDArray[np.float32] | None = None
56
58
 
57
59
  def get(self, ood_type: Literal["instance", "feature"]) -> NDArray:
60
+ """
61
+ Returns either the instance or feature score
62
+
63
+ Parameters
64
+ ----------
65
+ ood_type : "instance" | "feature"
66
+
67
+ Returns
68
+ -------
69
+ NDArray
70
+ Either the instance or feature score based on input selection
71
+ """
58
72
  return self.instance_score if ood_type == "instance" or self.feature_score is None else self.feature_score
59
73
 
60
74
 
@@ -64,12 +78,12 @@ class OODBase(ABC):
64
78
 
65
79
  self._ref_score: OODScore
66
80
  self._threshold_perc: float
67
- self._data_info: Optional[Tuple[tuple, type]] = None
81
+ self._data_info: tuple[tuple, type] | None = None
68
82
 
69
83
  if not isinstance(model, keras.Model):
70
84
  raise TypeError("Model should be of type 'keras.Model'.")
71
85
 
72
- def _get_data_info(self, X: NDArray) -> Tuple[tuple, type]:
86
+ def _get_data_info(self, X: NDArray) -> tuple[tuple, type]:
73
87
  if not isinstance(X, np.ndarray):
74
88
  raise TypeError("Dataset should of type: `NDArray`.")
75
89
  return X.shape[1:], X.dtype.type
@@ -80,7 +94,7 @@ class OODBase(ABC):
80
94
  raise RuntimeError(f"Expect data of type: {self._data_info[1]} and shape: {self._data_info[0]}. \
81
95
  Provided data is type: {check_data_info[1]} and shape: {check_data_info[0]}.")
82
96
 
83
- def _validate_state(self, X: NDArray, additional_attrs: Optional[List[str]] = None) -> None:
97
+ def _validate_state(self, X: NDArray, additional_attrs: list[str] | None = None) -> None:
84
98
  attrs = ["_data_info", "_threshold_perc", "_ref_score"]
85
99
  attrs = attrs if additional_attrs is None else attrs + additional_attrs
86
100
  if not all(hasattr(self, attr) for attr in attrs) or any(getattr(self, attr) for attr in attrs) is None:
@@ -90,18 +104,20 @@ class OODBase(ABC):
90
104
  @abstractmethod
91
105
  def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
92
106
  """
93
- Compute instance and (optionally) feature level outlier scores.
107
+ Compute the out-of-distribution (OOD) scores for a given dataset.
94
108
 
95
109
  Parameters
96
110
  ----------
97
111
  X : ArrayLike
98
- Batch of instances.
99
- batch_size : int, default int(1e10)
100
- Batch size used when making predictions with the autoencoder.
112
+ Input data to score.
113
+ batch_size : int, default 1e10
114
+ Number of instances to process in each batch.
115
+ Use a smaller batch size if your dataset is large or if you encounter memory issues.
101
116
 
102
117
  Returns
103
118
  -------
104
- Instance and feature level outlier scores.
119
+ OODScore
120
+ An object containing the instance-level and feature-level OOD scores.
105
121
  """
106
122
 
107
123
  def _threshold_score(self, ood_type: Literal["feature", "instance"] = "instance") -> np.floating:
@@ -110,33 +126,34 @@ class OODBase(ABC):
110
126
  def fit(
111
127
  self,
112
128
  x_ref: ArrayLike,
113
- threshold_perc: float,
114
- loss_fn: Callable,
115
- optimizer: keras.optimizers.Optimizer,
116
- epochs: int,
117
- batch_size: int,
118
- verbose: bool,
129
+ threshold_perc: float = 100.0,
130
+ loss_fn: Callable[..., tf.Tensor] | None = None,
131
+ optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
132
+ epochs: int = 20,
133
+ batch_size: int = 64,
134
+ verbose: bool = True,
119
135
  ) -> None:
120
136
  """
121
137
  Train the model and infer the threshold value.
122
138
 
123
139
  Parameters
124
140
  ----------
125
- x_ref: : ArrayLike
126
- Training batch.
127
- threshold_perc : float
141
+ x_ref : ArrayLike
142
+ Training data.
143
+ threshold_perc : float, default 100.0
128
144
  Percentage of reference data that is normal.
129
- loss_fn : Callable
145
+ loss_fn : Callable | None, default None
130
146
  Loss function used for training.
131
- optimizer : keras.optimizers.Optimizer
147
+ optimizer : Optimizer, default keras.optimizers.Adam
132
148
  Optimizer used for training.
133
- epochs : int
149
+ epochs : int, default 20
134
150
  Number of training epochs.
135
- batch_size : int
151
+ batch_size : int, default 64
136
152
  Batch size used for training.
137
- verbose : bool
153
+ verbose : bool, default True
138
154
  Whether to print training progress.
139
155
  """
156
+
140
157
  # Train the model
141
158
  trainer(
142
159
  model=self.model,
@@ -165,15 +182,16 @@ class OODBase(ABC):
165
182
  Parameters
166
183
  ----------
167
184
  X : ArrayLike
168
- Batch of instances.
169
- batch_size : int, default int(1e10)
170
- Batch size used when making predictions with the autoencoder.
171
- ood_type : Literal["feature", "instance"], default "instance"
185
+ Input data for out-of-distribution prediction.
186
+ batch_size : int, default 1e10
187
+ Number of instances to process in each batch.
188
+ ood_type : "feature" | "instance", default "instance"
172
189
  Predict out-of-distribution at the 'feature' or 'instance' level.
173
190
 
174
191
  Returns
175
192
  -------
176
- Dictionary containing the outlier predictions and both feature and instance level outlier scores.
193
+ Dictionary containing the outlier predictions for the selected level,
194
+ and the OOD scores for the data including both 'instance' and 'feature' (if present) level scores.
177
195
  """
178
196
  self._validate_state(X := to_numpy(X))
179
197
  # compute outlier scores
@@ -187,7 +205,7 @@ class OODGMMBase(OODBase):
187
205
  super().__init__(model)
188
206
  self.gmm_params: GaussianMixtureModelParams
189
207
 
190
- def _validate_state(self, X: NDArray, additional_attrs: Optional[List[str]] = None) -> None:
208
+ def _validate_state(self, X: NDArray, additional_attrs: list[str] | None = None) -> None:
191
209
  if additional_attrs is None:
192
210
  additional_attrs = ["gmm_params"]
193
211
  super()._validate_state(X, additional_attrs)
@@ -195,12 +213,12 @@ class OODGMMBase(OODBase):
195
213
  def fit(
196
214
  self,
197
215
  x_ref: ArrayLike,
198
- threshold_perc: float,
199
- loss_fn: Callable[[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor], tf.Tensor],
200
- optimizer: keras.optimizers.Optimizer,
201
- epochs: int,
202
- batch_size: int,
203
- verbose: bool,
216
+ threshold_perc: float = 100.0,
217
+ loss_fn: Callable[..., tf.Tensor] | None = None,
218
+ optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
219
+ epochs: int = 20,
220
+ batch_size: int = 64,
221
+ verbose: bool = True,
204
222
  ) -> None:
205
223
  # Train the model
206
224
  trainer(
@@ -214,7 +232,7 @@ class OODGMMBase(OODBase):
214
232
  )
215
233
 
216
234
  # Calculate the GMM parameters
217
- _, z, gamma = cast(Tuple[tf.Tensor, tf.Tensor, tf.Tensor], self.model(x_ref))
235
+ _, z, gamma = cast(tuple[tf.Tensor, tf.Tensor, tf.Tensor], self.model(x_ref))
218
236
  self.gmm_params = gmm_params(z, gamma)
219
237
 
220
238
  # Infer the threshold values
@@ -6,8 +6,10 @@ Original code Copyright (c) 2023 Seldon Technologies Ltd
6
6
  Licensed under Apache Software License (Apache 2.0)
7
7
  """
8
8
 
9
+ from __future__ import annotations
10
+
9
11
  from functools import partial
10
- from typing import Callable, Optional, Tuple
12
+ from typing import Callable
11
13
 
12
14
  import keras
13
15
  import numpy as np
@@ -24,8 +26,8 @@ from dataeval._internal.models.tensorflow.utils import predict_batch
24
26
 
25
27
 
26
28
  def build_model(
27
- dist: PixelCNN, input_shape: Optional[tuple] = None, filepath: Optional[str] = None
28
- ) -> Tuple[keras.Model, PixelCNN]:
29
+ dist: PixelCNN, input_shape: tuple | None = None, filepath: str | None = None
30
+ ) -> tuple[keras.Model, PixelCNN]:
29
31
  """
30
32
  Create keras.Model from TF distribution.
31
33
 
@@ -87,28 +89,29 @@ def mutate_categorical(
87
89
 
88
90
 
89
91
  class OOD_LLR(OODBase):
92
+ """
93
+ Likelihood Ratios based outlier detector.
94
+
95
+ Parameters
96
+ ----------
97
+ model : PixelCNN
98
+ Generative distribution model.
99
+ model_background : Optional[PixelCNN], default None
100
+ Optional model for the background. Only needed if it is different from `model`.
101
+ log_prob : Optional[Callable], default None
102
+ Function used to evaluate log probabilities under the model
103
+ if the model does not have a `log_prob` function.
104
+ sequential : bool, default False
105
+ Whether the data is sequential. Used to create targets during training.
106
+ """
107
+
90
108
  def __init__(
91
109
  self,
92
110
  model: PixelCNN,
93
- model_background: Optional[PixelCNN] = None,
94
- log_prob: Optional[Callable] = None,
111
+ model_background: PixelCNN | None = None,
112
+ log_prob: Callable | None = None,
95
113
  sequential: bool = False,
96
114
  ) -> None:
97
- """
98
- Likelihood Ratios based outlier detector.
99
-
100
- Parameters
101
- ----------
102
- model : PixelCNN
103
- Generative distribution model.
104
- model_background : Optional[PixelCNN], default None
105
- Optional model for the background. Only needed if it is different from `model`.
106
- log_prob : Optional[Callable], default None
107
- Function used to evaluate log probabilities under the model
108
- if the model does not have a `log_prob` function.
109
- sequential : bool, default False
110
- Whether the data is sequential. Used to create targets during training.
111
- """
112
115
  self.dist_s = model
113
116
  self.dist_b = (
114
117
  model.copy()
@@ -123,13 +126,13 @@ class OOD_LLR(OODBase):
123
126
 
124
127
  self._ref_score: OODScore
125
128
  self._threshold_perc: float
126
- self._data_info: Optional[Tuple[tuple, type]] = None
129
+ self._data_info: tuple[tuple, type] | None = None
127
130
 
128
131
  def fit(
129
132
  self,
130
133
  x_ref: ArrayLike,
131
134
  threshold_perc: float = 100.0,
132
- loss_fn: Optional[Callable] = None,
135
+ loss_fn: Callable | None = None,
133
136
  optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
134
137
  epochs: int = 20,
135
138
  batch_size: int = 64,
@@ -144,10 +147,10 @@ class OOD_LLR(OODBase):
144
147
  Parameters
145
148
  ----------
146
149
  x_ref : ArrayLike
147
- Training batch.
150
+ Training data.
148
151
  threshold_perc : float, default 100.0
149
152
  Percentage of reference data that is normal.
150
- loss_fn : Optional[Callable], default None
153
+ loss_fn : Callable | None, default None
151
154
  Loss function used for training.
152
155
  optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
153
156
  Optimizer used for training.
@@ -6,10 +6,13 @@ Original code Copyright (c) 2023 Seldon Technologies Ltd
6
6
  Licensed under Apache Software License (Apache 2.0)
7
7
  """
8
8
 
9
+ from __future__ import annotations
10
+
9
11
  from typing import Callable
10
12
 
11
13
  import keras
12
14
  import numpy as np
15
+ import tensorflow as tf
13
16
  from numpy.typing import ArrayLike
14
17
 
15
18
  from dataeval._internal.detectors.ood.base import OODBase, OODScore
@@ -20,17 +23,33 @@ from dataeval._internal.models.tensorflow.utils import predict_batch
20
23
 
21
24
 
22
25
  class OOD_VAE(OODBase):
26
+ """
27
+ VAE based outlier detector.
28
+
29
+ Parameters
30
+ ----------
31
+ model : VAE
32
+ A VAE model.
33
+ samples : int, default 10
34
+ Number of samples sampled to evaluate each instance.
35
+
36
+ Examples
37
+ --------
38
+ Instantiate an OOD detector metric with a generic dataset - batch of images with shape (3,25,25)
39
+
40
+ >>> metric = OOD_VAE(create_model(VAE, dataset[0].shape))
41
+
42
+ Adjusting fit parameters,
43
+ including setting the fit threshold at 85% for a training set with about 15% out-of-distribution
44
+
45
+ >>> metric.fit(dataset, threshold_perc=85, batch_size=128, verbose=False)
46
+
47
+ Detect out of distribution samples at the 'feature' level
48
+
49
+ >>> result = metric.predict(dataset, ood_type="feature")
50
+ """
51
+
23
52
  def __init__(self, model: VAE, samples: int = 10) -> None:
24
- """
25
- VAE based outlier detector.
26
-
27
- Parameters
28
- ----------
29
- model : VAE
30
- A VAE model.
31
- samples : int, default 10
32
- Number of samples sampled to evaluate each instance.
33
- """
34
53
  super().__init__(model)
35
54
  self.samples = samples
36
55
 
@@ -38,32 +57,14 @@ class OOD_VAE(OODBase):
38
57
  self,
39
58
  x_ref: ArrayLike,
40
59
  threshold_perc: float = 100.0,
41
- loss_fn: Callable = Elbo(0.05),
60
+ loss_fn: Callable[..., tf.Tensor] | None = None,
42
61
  optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
43
62
  epochs: int = 20,
44
63
  batch_size: int = 64,
45
64
  verbose: bool = True,
46
65
  ) -> None:
47
- """
48
- Train the VAE model.
49
-
50
- Parameters
51
- ----------
52
- x_ref : ArrayLike
53
- Training batch.
54
- threshold_perc : float, default 100.0
55
- Percentage of reference data that is normal.
56
- loss_fn : Callable, default Elbo(0.05)
57
- Loss function used for training.
58
- optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
59
- Optimizer used for training.
60
- epochs : int, default 20
61
- Number of training epochs.
62
- batch_size : int, default 64
63
- Batch size used for training.
64
- verbose : bool, default True
65
- Whether to print training progress.
66
- """
66
+ if loss_fn is None:
67
+ loss_fn = Elbo(0.05)
67
68
  super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
68
69
 
69
70
  def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore: