dataeval 0.72.1__py3-none-any.whl → 0.72.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. dataeval/__init__.py +4 -4
  2. dataeval/detectors/__init__.py +4 -3
  3. dataeval/detectors/drift/__init__.py +9 -10
  4. dataeval/{_internal/detectors → detectors}/drift/base.py +39 -91
  5. dataeval/{_internal/detectors → detectors}/drift/cvm.py +4 -3
  6. dataeval/{_internal/detectors → detectors}/drift/ks.py +4 -3
  7. dataeval/{_internal/detectors → detectors}/drift/mmd.py +23 -25
  8. dataeval/{_internal/detectors → detectors}/drift/torch.py +13 -11
  9. dataeval/{_internal/detectors → detectors}/drift/uncertainty.py +7 -5
  10. dataeval/detectors/drift/updates.py +61 -0
  11. dataeval/detectors/linters/__init__.py +3 -3
  12. dataeval/{_internal/detectors → detectors/linters}/clusterer.py +41 -39
  13. dataeval/{_internal/detectors → detectors/linters}/duplicates.py +19 -9
  14. dataeval/{_internal/detectors → detectors/linters}/merged_stats.py +3 -1
  15. dataeval/{_internal/detectors → detectors/linters}/outliers.py +14 -21
  16. dataeval/detectors/ood/__init__.py +6 -6
  17. dataeval/{_internal/detectors → detectors}/ood/ae.py +7 -7
  18. dataeval/{_internal/detectors → detectors}/ood/aegmm.py +9 -29
  19. dataeval/{_internal/detectors → detectors}/ood/base.py +24 -18
  20. dataeval/{_internal/detectors → detectors}/ood/llr.py +24 -20
  21. dataeval/detectors/ood/metadata_ks_compare.py +99 -0
  22. dataeval/detectors/ood/metadata_least_likely.py +119 -0
  23. dataeval/detectors/ood/metadata_ood_mi.py +92 -0
  24. dataeval/{_internal/detectors → detectors}/ood/vae.py +10 -12
  25. dataeval/{_internal/detectors → detectors}/ood/vaegmm.py +10 -32
  26. dataeval/{_internal/interop.py → interop.py} +12 -7
  27. dataeval/metrics/__init__.py +1 -1
  28. dataeval/metrics/bias/__init__.py +4 -4
  29. dataeval/{_internal/metrics → metrics/bias}/balance.py +75 -9
  30. dataeval/{_internal/metrics → metrics/bias}/coverage.py +6 -4
  31. dataeval/{_internal/metrics → metrics/bias}/diversity.py +48 -14
  32. dataeval/metrics/bias/metadata.py +275 -0
  33. dataeval/{_internal/metrics → metrics/bias}/parity.py +12 -10
  34. dataeval/metrics/estimators/__init__.py +3 -3
  35. dataeval/{_internal/metrics → metrics/estimators}/ber.py +25 -22
  36. dataeval/{_internal/metrics → metrics/estimators}/divergence.py +11 -12
  37. dataeval/{_internal/metrics → metrics/estimators}/uap.py +5 -3
  38. dataeval/metrics/stats/__init__.py +7 -7
  39. dataeval/{_internal/metrics → metrics}/stats/base.py +59 -35
  40. dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py +18 -14
  41. dataeval/{_internal/metrics → metrics}/stats/datasetstats.py +18 -16
  42. dataeval/{_internal/metrics → metrics}/stats/dimensionstats.py +9 -7
  43. dataeval/metrics/stats/hashstats.py +156 -0
  44. dataeval/{_internal/metrics → metrics}/stats/labelstats.py +5 -3
  45. dataeval/{_internal/metrics → metrics}/stats/pixelstats.py +9 -8
  46. dataeval/{_internal/metrics → metrics}/stats/visualstats.py +10 -9
  47. dataeval/{_internal/output.py → output.py} +26 -6
  48. dataeval/utils/__init__.py +7 -3
  49. dataeval/utils/image.py +71 -0
  50. dataeval/utils/shared.py +151 -0
  51. dataeval/{_internal → utils}/split_dataset.py +98 -33
  52. dataeval/utils/tensorflow/__init__.py +7 -6
  53. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/autoencoder.py +60 -64
  54. dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py} +9 -8
  55. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/pixelcnn.py +16 -20
  56. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py +3 -1
  57. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py +17 -17
  58. dataeval/utils/tensorflow/loss/__init__.py +6 -2
  59. dataeval/utils/torch/__init__.py +7 -3
  60. dataeval/{_internal/models/pytorch → utils/torch}/blocks.py +19 -14
  61. dataeval/{_internal → utils/torch}/datasets.py +48 -42
  62. dataeval/utils/torch/models.py +138 -0
  63. dataeval/{_internal/models/pytorch/autoencoder.py → utils/torch/trainer.py} +7 -136
  64. dataeval/{_internal → utils/torch}/utils.py +3 -1
  65. dataeval/workflows/__init__.py +1 -1
  66. dataeval/{_internal/workflows → workflows}/sufficiency.py +39 -34
  67. {dataeval-0.72.1.dist-info → dataeval-0.72.2.dist-info}/METADATA +2 -1
  68. dataeval-0.72.2.dist-info/RECORD +72 -0
  69. dataeval/_internal/detectors/__init__.py +0 -0
  70. dataeval/_internal/detectors/drift/__init__.py +0 -0
  71. dataeval/_internal/detectors/ood/__init__.py +0 -0
  72. dataeval/_internal/metrics/__init__.py +0 -0
  73. dataeval/_internal/metrics/stats/hashstats.py +0 -75
  74. dataeval/_internal/metrics/utils.py +0 -447
  75. dataeval/_internal/models/__init__.py +0 -0
  76. dataeval/_internal/models/pytorch/__init__.py +0 -0
  77. dataeval/_internal/models/pytorch/utils.py +0 -67
  78. dataeval/_internal/models/tensorflow/__init__.py +0 -0
  79. dataeval/_internal/workflows/__init__.py +0 -0
  80. dataeval/detectors/drift/kernels/__init__.py +0 -10
  81. dataeval/detectors/drift/updates/__init__.py +0 -8
  82. dataeval/utils/tensorflow/models/__init__.py +0 -9
  83. dataeval/utils/tensorflow/recon/__init__.py +0 -3
  84. dataeval/utils/torch/datasets/__init__.py +0 -12
  85. dataeval/utils/torch/models/__init__.py +0 -11
  86. dataeval/utils/torch/trainer/__init__.py +0 -7
  87. dataeval-0.72.1.dist-info/RECORD +0 -81
  88. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/gmm.py +0 -0
  89. {dataeval-0.72.1.dist-info → dataeval-0.72.2.dist-info}/LICENSE.txt +0 -0
  90. {dataeval-0.72.1.dist-info → dataeval-0.72.2.dist-info}/WHEEL +0 -0
dataeval/{_internal → utils}/split_dataset.py
@@ -1,20 +1,26 @@
 from __future__ import annotations

+__all__ = ["split_dataset"]
+
 import warnings
+from typing import Any

 import numpy as np
+from numpy.typing import NDArray
 from sklearn.cluster import KMeans
 from sklearn.metrics import silhouette_score
 from sklearn.model_selection import GroupKFold, KFold, StratifiedGroupKFold, StratifiedKFold
 from sklearn.utils.multiclass import type_of_target


-def check_args(num_folds: int = 1, test_frac: float | None = None, val_frac: float | None = None):
-    """Check input arguments to ensure unambiguous splitting arguments are passed.
+def validate_test_val(num_folds: int, test_frac: float | None, val_frac: float | None) -> tuple[float, float]:
+    """Check input fractions to ensure unambiguous splitting arguments are passed return calculated
+    test and validation fractions.
+

     Parameters
     ----------
-    num_folds : int, default 1
+    num_folds : int
         number of [train, val] cross-validation folds to generate
     test_frac : float, optional
         If specified, also generate a test set containing (test_frac*100)% of the data
@@ -36,19 +42,23 @@ def check_args(num_folds: int = 1, test_frac: float | None = None, val_frac: flo

     Returns
     -------
-    None
+    tuple[float, float]
+        Tuple of the validated and calculated values as appropriate for test and validation fractions
     """
     if (num_folds > 1) and (val_frac is not None):
         raise ValueError("If specifying val_frac, num_folds must be None or 1")
     if (num_folds == 1) and (val_frac is None):
-        raise UnboundLocalError("If num_folds is None or 1, must assign a value to val_frac")
+        raise ValueError("If num_folds is None or 1, must assign a value to val_frac")
     t_frac = 0.0 if test_frac is None else test_frac
     v_frac = 1.0 / num_folds * (1.0 - t_frac) if val_frac is None else val_frac * (1.0 - t_frac)
     if (t_frac + v_frac) >= 1.0:
         raise ValueError(f"val_frac + test_frac must be less that 1.0, currently {v_frac+t_frac}")
+    return t_frac, v_frac


-def check_labels(labels: list | np.ndarray, total_partitions: int):
+def check_labels(
+    labels: list[int] | NDArray[np.int_], total_partitions: int
+) -> tuple[NDArray[np.int_], NDArray[np.int_]]:
     """Check to make sure there are more input data than the total number of partitions requested
     Also converts labels to a numpy array, if it isn't already

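The returned fractions are easiest to see with concrete numbers. A quick sketch against the signature above (validate_test_val is a module-internal helper, so the direct import is for illustration only):

from dataeval.utils.split_dataset import validate_test_val

# num_folds=1 with explicit fractions: val_frac is scaled by the data
# remaining after the test holdout is removed
validate_test_val(num_folds=1, test_frac=0.2, val_frac=0.25)  # -> (0.2, 0.2)

# num_folds=5 without val_frac: validation defaults to one fold of the remainder
validate_test_val(num_folds=5, test_frac=0.2, val_frac=None)  # -> (0.2, 0.16)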
@@ -88,7 +98,7 @@ def check_labels(labels: list | np.ndarray, total_partitions: int):
     return index, labels


-def check_stratifiable(labels: np.ndarray, total_partitions: int):
+def check_stratifiable(labels: NDArray[np.int_], total_partitions: int) -> bool:
     """
     Very basic check to see if dataset can be stratified by class label. This is not a
     comprehensive test, as factors such as grouping also affect the ability to stratify by label
@@ -124,7 +134,7 @@ def check_stratifiable(labels: np.ndarray, total_partitions: int):
     return stratifiable


-def check_groups(group_ids: np.ndarray, num_partitions: int):
+def check_groups(group_ids: NDArray[np.int_], num_partitions: int) -> bool:
     """
     Warns user if the number of unique group_ids is incompatible with a grouped partition containing
     num_folds folds. If this is the case, returns groups=None, which tells the partitioner not to
@@ -162,7 +172,7 @@ def check_groups(group_ids: np.ndarray, num_partitions: int):
     return groupable


-def bin_kmeans(array: np.ndarray):
+def bin_kmeans(array: NDArray[Any]) -> NDArray[np.int_]:
     """
     Find bins of continuous data by iteratively applying k-means clustering, and keeping the
     clustering with the highest silhouette score.
@@ -182,18 +192,18 @@ def bin_kmeans(array: np.ndarray):
         best_score = 0.60
     else:
         best_score = 0.50
-    bin_index = np.zeros(len(array))
+    bin_index = np.zeros(len(array), dtype=np.int_)
     for k in range(2, 20):
         clusterer = KMeans(n_clusters=k)
         cluster_labels = clusterer.fit_predict(array)
         score = silhouette_score(array, cluster_labels, sample_size=25_000)
         if score > best_score:
             best_score = score
-            bin_index = cluster_labels
+            bin_index = cluster_labels.astype(np.int_)
     return bin_index


-def angle2xy(angles: np.ndarray):
+def angle2xy(angles: NDArray[Any]) -> NDArray[Any]:
     """
     Converts angle measurements to xy coordinates on the unit circle. Needed for binning angle data.

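The dtype changes in bin_kmeans fix a subtle fallback: np.zeros defaults to float64, so when no clustering beat the threshold score the returned bin index was a float array. A one-line illustration:

import numpy as np

np.zeros(3).dtype                 # dtype('float64'): the old fallback value
np.zeros(3, dtype=np.int_).dtype  # platform integer, matching NDArray[np.int_]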
@@ -213,7 +223,7 @@ def angle2xy(angles: np.ndarray):
     return xy


-def get_group_ids(metadata: dict, groupnames: list, num_samples: int):
+def get_group_ids(metadata: dict[str, Any], group_names: list[str], num_samples: int) -> NDArray[np.int_]:
     """Returns individual group numbers based on a subset of metadata defined by groupnames

     Parameters
@@ -235,7 +245,7 @@ def get_group_ids(metadata: dict, groupnames: list, num_samples: int):
     group_ids: np.ndarray
         group identifiers from metadata
     """
-    features2group = {k: np.array(v) for k, v in metadata.items() if k in groupnames}
+    features2group = {k: np.array(v) for k, v in metadata.items() if k in group_names}
     if not features2group:
         return np.zeros(num_samples, dtype=int)
     for name, feature in features2group.items():
@@ -252,8 +262,12 @@ def get_group_ids(metadata: dict, groupnames: list, num_samples: int):


 def make_splits(
-    index: np.ndarray, labels: np.ndarray, n_folds: int, groups: np.ndarray | None = None, stratified: bool = False
-):
+    index: NDArray[np.int_],
+    labels: NDArray[np.int_],
+    n_folds: int,
+    groups: NDArray[np.int_] | None = None,
+    stratified: bool = False,
+) -> list[dict[str, NDArray[np.int_]]]:
     """Split data into n_folds partitions of training and validation data.

     Parameters
@@ -290,9 +304,59 @@ def make_splits(
     return split_defs


+def find_best_split(
+    labels: NDArray[np.int_], split_defs: list[dict[str, NDArray[np.int_]]], stratified: bool, eval_frac: float
+) -> tuple[NDArray[np.int_], NDArray[np.int_]]:
+    """Finds the split that most closely satisfies a criterion determined by the arguments passed.
+    If stratified is True, returns the split whose class balance most closely resembles the overall
+    class balance. If false, returns the split with the size closest to the desired eval_frac
+
+    Parameters
+    ----------
+    labels : np.ndarray
+        Labels upon which splits are (optionally) stratified
+    split_defs : list[dict]
+        List of dictionaries, which specifying train index, validation index, and the ratio of
+        validation to all data.
+    stratified: bool
+        If True, maintain dataset class balance within each train/val split
+    eval_frac: float
+        Desired fraction of the dataset sequestered for evaluation
+
+    Returns
+    -------
+    train_index : np.ndarray
+        indices of data partitioned for training
+    eval_index : np.ndarray
+        indices of data partitioned for evaluation
+    """
+
+    def class_freq_diff(split):
+        train_labels = labels[split["train"]]
+        _, train_counts = np.unique(train_labels, return_counts=True)
+        train_freq = train_counts / train_counts.sum()
+        return np.square(train_freq - class_freq).sum()
+
+    if stratified:
+        _, class_counts = np.unique(labels, return_counts=True)
+        class_freq = class_counts / class_counts.sum()
+        best_split = min(split_defs, key=class_freq_diff)
+        return best_split["train"], best_split["eval"]
+    elif eval_frac <= 2 / 3:
+        best_split = min(split_defs, key=lambda x: abs(eval_frac - x["eval_frac"]))  # type: ignore
+        return best_split["train"], best_split["eval"]
+    else:
+        best_split = min(split_defs, key=lambda x: abs(eval_frac - (1 - x["eval_frac"])))  # type: ignore
+        return best_split["eval"], best_split["train"]
+
+
 def single_split(
-    index: np.ndarray, labels: np.ndarray, eval_frac: float, groups: np.ndarray | None = None, stratified: bool = False
-):
+    index: NDArray[np.int_],
+    labels: NDArray[np.int_],
+    eval_frac: float,
+    groups: NDArray[np.int_] | None = None,
+    stratified: bool = False,
+) -> tuple[NDArray[np.int_], NDArray[np.int_]]:
     """Handles the special case where only 1 partition of the data is desired (such as when
     generating the test holdout split). In this case, the desired fraction of the data to be
     partitioned into the test data must be specified, and a single [train, eval] pair are returned.
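A toy sketch of the selection rule in the new find_best_split (the candidate split_defs below are fabricated for illustration; real candidates come from make_splits, and the import path mirrors the moved module):

import numpy as np
from dataeval.utils.split_dataset import find_best_split

labels = np.array([0, 0, 0, 1, 1, 1])
split_defs = [
    {"train": np.array([0, 1, 2, 3]), "eval": np.array([4, 5]), "eval_frac": 1 / 3},
    {"train": np.array([0, 1, 2]), "eval": np.array([3, 4, 5]), "eval_frac": 1 / 2},
]

# Non-stratified with eval_frac=0.3: |0.3 - 1/3| < |0.3 - 1/2|, so the first
# candidate wins and its (train, eval) indices are returned in that order.
train_idx, eval_idx = find_best_split(labels, split_defs, stratified=False, eval_frac=0.3)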
@@ -317,27 +381,28 @@ def single_split(
     eval_index : np.ndarray
         indices of data partitioned for evaluation
     """
-    if eval_frac <= 2 / 3:
+    if groups is not None:
+        n_unique_groups = np.unique(groups).shape[0]
+        _, label_counts = np.unique(labels, return_counts=True)
+        n_folds = min(n_unique_groups, label_counts.min())
+    elif eval_frac <= 2 / 3:
         n_folds = max(2, int(round(1 / (eval_frac + 1e-6))))
-        split_candidates = make_splits(index, labels, n_folds, groups, stratified)
-        best_split = min(split_candidates, key=lambda x: abs(eval_frac - x["eval_frac"]))
-        return best_split["train"], best_split["eval"]
     else:
-        n_folds = max(2, int(round(1 / (1 - eval_frac + 1e-6))))
-        split_candidates = make_splits(index, labels, n_folds, groups, stratified)
-        best_split = min(split_candidates, key=lambda x: abs(eval_frac - (1 - x["eval_frac"])))
-        return best_split["eval"], best_split["train"]
+        n_folds = max(2, int(round(1 / (1 - eval_frac - 1e-6))))
+    split_candidates = make_splits(index, labels, n_folds, groups, stratified)
+    best_train, best_eval = find_best_split(labels, split_candidates, stratified, eval_frac)
+    return best_train, best_eval


 def split_dataset(
-    labels: list | np.ndarray,
+    labels: list[int] | NDArray[np.int_],
     num_folds: int = 1,
     stratify: bool = False,
-    split_on: list | None = None,
-    metadata: dict | None = None,
+    split_on: list[str] | None = None,
+    metadata: dict[str, Any] | None = None,
     test_frac: float | None = None,
     val_frac: float | None = None,
-):
+) -> dict[str, dict[str, NDArray[np.int_]] | NDArray[np.int_]]:
     """Top level splitting function. Returns a dict with each key-value pair containing
     train and validation indices. Indices for a test holdout may also be optionally included

@@ -386,7 +451,7 @@ def split_dataset(
     }
     """

-    check_args(num_folds, test_frac, val_frac)
+    test_frac, val_frac = validate_test_val(num_folds, test_frac, val_frac)
     total_partitions = num_folds + 1 if test_frac else num_folds
     index, labels = check_labels(labels, total_partitions)
     stratify &= check_stratifiable(labels, total_partitions)
@@ -399,7 +464,7 @@ def split_dataset(
             groups = None
     else:
         groups = None
-    split_defs = {}
+    split_defs: dict[str, dict[str, NDArray[np.int_]] | NDArray[np.int_]] = {}
     if test_frac:
         tv_idx, test_idx = single_split(index, labels, test_frac, groups, stratify)
         tv_labels = labels[tv_idx]
@@ -410,7 +475,7 @@ def split_dataset(
         tv_labels = labels
         tv_groups = groups
     if num_folds == 1:
-        train_idx, val_idx = single_split(tv_idx, tv_labels, val_frac, tv_groups, stratify)  # type: ignore
+        train_idx, val_idx = single_split(tv_idx, tv_labels, val_frac, tv_groups, stratify)
         split_defs["fold_0"] = {"train": tv_idx[train_idx].squeeze(), "val": tv_idx[val_idx].squeeze()}
     else:
         tv_splits = make_splits(tv_idx, tv_labels, num_folds, tv_groups, stratify)
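Putting the split_dataset changes together, a minimal usage sketch of the relocated function (import path per the file move above; the toy labels, and the assumption that the holdout lands under a "test" key, are illustrative):

import numpy as np
from dataeval.utils.split_dataset import split_dataset

labels = np.random.randint(0, 3, size=300)
splits = split_dataset(labels, num_folds=4, stratify=True, test_frac=0.2)

test_idx = splits["test"]  # hypothetical key name for the holdout indices
for fold in (k for k in splits if k.startswith("fold_")):
    train_idx, val_idx = splits[fold]["train"], splits[fold]["val"]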
dataeval/utils/tensorflow/__init__.py
@@ -2,17 +2,18 @@
 TensorFlow models are used in :term:`out of distribution<Out-of-distribution (OOD)>` detectors in the
 :mod:`dataeval.detectors.ood` module.

-DataEval provides both basic default models through the utility :func:`dataeval.utils.tensorflow.models.create_model`
-as well as constructors which allow for customization of the encoder, decoder and any other applicable
-layers used by the model.
+DataEval provides basic default models through the utility :func:`dataeval.utils.tensorflow.create_model`.
 """

 from dataeval import _IS_TENSORFLOW_AVAILABLE

-from . import loss, models, recon
-
 __all__ = []


 if _IS_TENSORFLOW_AVAILABLE:
-    __all__ = ["loss", "models", "recon"]
+    import dataeval.utils.tensorflow.loss as loss
+    from dataeval.utils.tensorflow._internal.utils import create_model
+
+    __all__ = ["create_model", "loss"]
+
+del _IS_TENSORFLOW_AVAILABLE
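The net effect for callers is a flatter import surface. A hedged before/after sketch (requires the TensorFlow extras to be installed; the create_model arguments are assumptions, since its signature is not shown in this diff):

# 0.72.1
# from dataeval.utils.tensorflow.models import create_model

# 0.72.2
from dataeval.utils.tensorflow import create_model, loss

model = create_model("AE", input_shape=(32, 32, 3))  # argument names and values assumed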
dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/autoencoder.py
@@ -6,14 +6,14 @@ Original code Copyright (c) 2023 Seldon Technologies Ltd
 Licensed under Apache Software License (Apache 2.0)
 """

-# pyright: reportIncompatibleMethodOverride=false
-
 from __future__ import annotations

-from typing import Callable, cast
+from typing import cast

 import tensorflow as tf
 import tf_keras as keras
+from tensorflow.python.module.module import Module  # noqa
+from tf_keras import Sequential
 from tf_keras.layers import (
     Dense,
     Flatten,
@@ -103,7 +103,7 @@ class Sampling(Layer):


 class EncoderAE(Layer):
-    def __init__(self, encoder_net: keras.Model) -> None:
+    def __init__(self, encoder_net: Sequential) -> None:
         """
         Encoder of AE.

@@ -115,14 +115,14 @@ class EncoderAE(Layer):
             Name of encoder.
         """
         super().__init__(name="encoder_ae")
-        self.encoder_net = encoder_net
+        self.encoder_net: Sequential = encoder_net

     def call(self, x: tf.Tensor) -> tf.Tensor:
         return cast(tf.Tensor, self.encoder_net(x))


 class EncoderVAE(Layer):
-    def __init__(self, encoder_net: keras.Model, latent_dim: int) -> None:
+    def __init__(self, encoder_net: Sequential, latent_dim: int) -> None:
         """
         Encoder of VAE.

@@ -136,23 +136,23 @@ class EncoderVAE(Layer):
             Name of encoder.
         """
         super().__init__(name="encoder_vae")
-        self.encoder_net = encoder_net
-        self.fc_mean = Dense(latent_dim, activation=None)
-        self.fc_log_var = Dense(latent_dim, activation=None)
-        self.sampling = Sampling()
+        self.encoder_net: Sequential = encoder_net
+        self._fc_mean = Dense(latent_dim, activation=None)
+        self._fc_log_var = Dense(latent_dim, activation=None)
+        self._sampling = Sampling()

     def call(self, x: tf.Tensor) -> tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
         x = cast(tf.Tensor, self.encoder_net(x))
         if len(x.shape) > 2:
             x = cast(tf.Tensor, Flatten()(x))
-        z_mean = cast(tf.Tensor, self.fc_mean(x))
-        z_log_var = cast(tf.Tensor, self.fc_log_var(x))
-        z = cast(tf.Tensor, self.sampling((z_mean, z_log_var)))
+        z_mean = cast(tf.Tensor, self._fc_mean(x))
+        z_log_var = cast(tf.Tensor, self._fc_log_var(x))
+        z = cast(tf.Tensor, self._sampling((z_mean, z_log_var)))
         return z_mean, z_log_var, z


 class Decoder(Layer):
-    def __init__(self, decoder_net: keras.Model) -> None:
+    def __init__(self, decoder_net: Sequential) -> None:
         """
         Decoder of AE and VAE.

@@ -164,10 +164,10 @@ class Decoder(Layer):
             Name of decoder.
         """
         super().__init__(name="decoder")
-        self.decoder_net = decoder_net
+        self.decoder_net: Sequential = decoder_net

-    def call(self, x: tf.Tensor) -> tf.Tensor:
-        return cast(tf.Tensor, self.decoder_net(x))
+    def call(self, inputs: tf.Tensor) -> tf.Tensor:
+        return cast(tf.Tensor, self.decoder_net(inputs))


 class AE(keras.Model):
@@ -176,19 +176,19 @@ class AE(keras.Model):

     Parameters
     ----------
-    encoder_net : keras.Model
+    encoder_net : Sequential
         Layers for the encoder wrapped in a keras.Sequential class.
-    decoder_net : keras.Model
+    decoder_net : Sequential
         Layers for the decoder wrapped in a keras.Sequential class.
     """

-    def __init__(self, encoder_net: keras.Model, decoder_net: keras.Model) -> None:
+    def __init__(self, encoder_net: Sequential, decoder_net: Sequential) -> None:
         super().__init__(name="ae")
-        self.encoder = EncoderAE(encoder_net)
-        self.decoder = Decoder(decoder_net)
+        self.encoder: Layer = EncoderAE(encoder_net)
+        self.decoder: Layer = Decoder(decoder_net)

-    def call(self, x: tf.Tensor) -> tf.Tensor:
-        z = cast(tf.Tensor, self.encoder(x))
+    def call(self, inputs: tf.Tensor, training: bool | None = None, mask: tf.Tensor | None = None) -> tf.Tensor:
+        z = cast(tf.Tensor, self.encoder(inputs))
         x_recon = cast(tf.Tensor, self.decoder(z))
         return x_recon

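With the constructors above now typed against tf_keras Sequential rather than arbitrary keras.Model instances, building an AE looks like the following sketch (layer sizes are arbitrary; AE is the class defined in this module):

import tensorflow as tf
from tf_keras import Sequential
from tf_keras.layers import Dense, InputLayer

encoder_net = Sequential([InputLayer(input_shape=(64,)), Dense(32, activation="relu"), Dense(8)])
decoder_net = Sequential([InputLayer(input_shape=(8,)), Dense(32, activation="relu"), Dense(64)])

ae = AE(encoder_net, decoder_net)
x_recon = ae(tf.random.normal((16, 64)))  # forward pass returns the reconstruction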
@@ -199,9 +199,9 @@ class VAE(keras.Model):

     Parameters
     ----------
-    encoder_net : keras.Model
+    encoder_net : Sequential
         Layers for the encoder wrapped in a keras.Sequential class.
-    decoder_net : keras.Model
+    decoder_net : Sequential
         Layers for the decoder wrapped in a keras.Sequential class.
     latent_dim : int
         Dimensionality of the :term:`latent space<Latent Space>`.
@@ -209,15 +209,15 @@ class VAE(keras.Model):
         Beta parameter for KL-divergence loss term.
     """

-    def __init__(self, encoder_net: keras.Model, decoder_net: keras.Model, latent_dim: int, beta: float = 1.0) -> None:
+    def __init__(self, encoder_net: Sequential, decoder_net: Sequential, latent_dim: int, beta: float = 1.0) -> None:
         super().__init__(name="vae_model")
-        self.encoder = EncoderVAE(encoder_net, latent_dim)
-        self.decoder = Decoder(decoder_net)
-        self.beta = beta
-        self.latent_dim = latent_dim
+        self.encoder: Layer = EncoderVAE(encoder_net, latent_dim)
+        self.decoder: Layer = Decoder(decoder_net)
+        self.beta: float = beta
+        self.latent_dim: int = latent_dim

-    def call(self, x: tf.Tensor) -> tf.Tensor:
-        z_mean, z_log_var, z = cast(tuple[tf.Tensor, tf.Tensor, tf.Tensor], self.encoder(x))
+    def call(self, inputs: tf.Tensor, training: bool | None = None, mask: tf.Tensor | None = None) -> tf.Tensor:
+        z_mean, z_log_var, z = cast(tuple[tf.Tensor, tf.Tensor, tf.Tensor], self.encoder(inputs))
         x_recon = self.decoder(z)
         # add KL divergence loss term
         kl_loss = -0.5 * tf.reduce_mean(z_log_var - tf.square(z_mean) - tf.exp(z_log_var) + 1)
@@ -231,37 +231,35 @@ class AEGMM(keras.Model):

     Parameters
     ----------
-    encoder_net : keras.Model
+    encoder_net : Sequential
         Layers for the encoder wrapped in a keras.Sequential class.
-    decoder_net : keras.Model
+    decoder_net : Sequential
         Layers for the decoder wrapped in a keras.Sequential class.
-    gmm_density_net : keras.Model
+    gmm_density_net : Sequential
         Layers for the GMM network wrapped in a keras.Sequential class.
     n_gmm : int
         Number of components in GMM.
-    recon_features : Callable, default eucl_cosim_features
-        Function to extract features from the reconstructed instance by the decoder.
     """

     def __init__(
         self,
-        encoder_net: keras.Model,
-        decoder_net: keras.Model,
-        gmm_density_net: keras.Model,
+        encoder_net: Sequential,
+        decoder_net: Sequential,
+        gmm_density_net: Sequential,
         n_gmm: int,
-        recon_features: Callable = eucl_cosim_features,
     ) -> None:
         super().__init__("aegmm")
         self.encoder = encoder_net
         self.decoder = decoder_net
         self.gmm_density = gmm_density_net
         self.n_gmm = n_gmm
-        self.recon_features = recon_features

-    def call(self, x: tf.Tensor) -> tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
-        enc = self.encoder(x)
+    def call(
+        self, inputs: tf.Tensor, training: bool | None = None, mask: tf.Tensor | None = None
+    ) -> tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
+        enc = self.encoder(inputs)
         x_recon = cast(tf.Tensor, self.decoder(enc))
-        recon_features = self.recon_features(x, x_recon)
+        recon_features = eucl_cosim_features(inputs, x_recon)
         z = cast(tf.Tensor, tf.concat([enc, recon_features], -1))
         gamma = cast(tf.Tensor, self.gmm_density(z))
         return x_recon, z, gamma
@@ -273,45 +271,43 @@ class VAEGMM(keras.Model):

     Parameters
     ----------
-    encoder_net : keras.Model
+    encoder_net : Sequential
         Layers for the encoder wrapped in a keras.Sequential class.
-    decoder_net : keras.Model
+    decoder_net : Sequential
         Layers for the decoder wrapped in a keras.Sequential class.
-    gmm_density_net : keras.Model
+    gmm_density_net : Sequential
         Layers for the GMM network wrapped in a keras.Sequential class.
     n_gmm : int
         Number of components in GMM.
     latent_dim : int
         Dimensionality of the :term:`latent space<Latent Space>`.
-    recon_features : Callable, default eucl_cosim_features
-        Function to extract features from the reconstructed instance by the decoder.
     beta : float, default 1.0
         Beta parameter for KL-divergence loss term.
     """

     def __init__(
         self,
-        encoder_net: keras.Model,
-        decoder_net: keras.Model,
-        gmm_density_net: keras.Model,
+        encoder_net: Sequential,
+        decoder_net: Sequential,
+        gmm_density_net: Sequential,
         n_gmm: int,
         latent_dim: int,
-        recon_features: Callable = eucl_cosim_features,
         beta: float = 1.0,
     ) -> None:
         super().__init__(name="vaegmm")
-        self.encoder = EncoderVAE(encoder_net, latent_dim)
-        self.decoder = decoder_net
-        self.gmm_density = gmm_density_net
-        self.n_gmm = n_gmm
-        self.latent_dim = latent_dim
-        self.recon_features = recon_features
+        self.encoder: Sequential = EncoderVAE(encoder_net, latent_dim)
+        self.decoder: Sequential = decoder_net
+        self.gmm_density: Sequential = gmm_density_net
+        self.n_gmm: int = n_gmm
+        self.latent_dim: int = latent_dim
         self.beta = beta

-    def call(self, x: tf.Tensor) -> tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
-        enc_mean, enc_log_var, enc = cast(tuple[tf.Tensor, tf.Tensor, tf.Tensor], self.encoder(x))
+    def call(
+        self, inputs: tf.Tensor, training: bool | None = None, mask: tf.Tensor | None = None
+    ) -> tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
+        enc_mean, enc_log_var, enc = cast(tuple[tf.Tensor, tf.Tensor, tf.Tensor], self.encoder(inputs))
         x_recon = cast(tf.Tensor, self.decoder(enc))
-        recon_features = self.recon_features(x, x_recon)
+        recon_features = eucl_cosim_features(inputs, x_recon)
         z = cast(tf.Tensor, tf.concat([enc, recon_features], -1))
         gamma = cast(tf.Tensor, self.gmm_density(z))
         # add KL divergence loss term
dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py}
@@ -10,6 +10,7 @@ from __future__ import annotations

 from typing import Literal, cast

+import numpy as np
 import tensorflow as tf
 from numpy.typing import NDArray
 from tensorflow_probability.python.distributions.mvn_diag import MultivariateNormalDiag
@@ -17,7 +18,7 @@ from tensorflow_probability.python.distributions.mvn_tril import MultivariateNor
 from tensorflow_probability.python.stats import covariance
 from tf_keras.layers import Flatten

-from dataeval._internal.models.tensorflow.gmm import gmm_energy, gmm_params
+from dataeval.utils.tensorflow._internal.gmm import gmm_energy, gmm_params


 class Elbo:
@@ -39,26 +40,26 @@ class Elbo:
     def __init__(
         self,
         cov_type: Literal["cov_full", "cov_diag"] | float = 1.0,
-        x: tf.Tensor | NDArray | None = None,
+        x: tf.Tensor | NDArray[np.float32] | None = None,
     ):
         if isinstance(cov_type, float):
-            self.cov = ("sim", cov_type)
+            self._cov = ("sim", cov_type)
         elif cov_type in ["cov_full", "cov_diag"]:
-            x_np: NDArray = x.numpy() if tf.is_tensor(x) else x  # type: ignore
+            x_np: NDArray[np.float32] = x.numpy().astype(np.float32) if tf.is_tensor(x) else x  # type: ignore
             cov = covariance(x_np.reshape(x_np.shape[0], -1))  # type: ignore py38
             if cov_type == "cov_diag":  # infer standard deviation from covariance matrix
                 cov = tf.math.sqrt(tf.linalg.diag_part(cov))
-            self.cov = (cov_type, cov)
+            self._cov = (cov_type, cov)
         else:
             raise ValueError("Only cov_full, cov_diag or sim value should be specified.")

     def __call__(self, y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor:
         y_pred_flat = cast(tf.Tensor, Flatten()(y_pred))

-        if self.cov[0] == "cov_full":
-            y_mn = MultivariateNormalTriL(y_pred_flat, scale_tril=tf.linalg.cholesky(self.cov[1]))
+        if self._cov[0] == "cov_full":
+            y_mn = MultivariateNormalTriL(y_pred_flat, scale_tril=tf.linalg.cholesky(self._cov[1]))
         else:  # cov_diag and sim
-            cov_diag = self.cov[1] if self.cov[0] == "cov_diag" else self.cov[1] * tf.ones(y_pred_flat.shape[-1])
+            cov_diag = self._cov[1] if self._cov[0] == "cov_diag" else self._cov[1] * tf.ones(y_pred_flat.shape[-1])
             y_mn = MultivariateNormalDiag(y_pred_flat, scale_diag=cov_diag)

         loss = -tf.reduce_mean(y_mn.log_prob(Flatten()(y_true)))
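For context, a usage sketch of the Elbo loss with its renamed private attribute (the data below is synthetic; Elbo is the class defined in this module):

import numpy as np
import tensorflow as tf

x_train = np.random.rand(128, 8).astype(np.float32)

elbo_sim = Elbo(0.05)                    # "sim" mode: fixed scalar variance
elbo_diag = Elbo("cov_diag", x=x_train)  # per-feature std inferred from x

y_true = tf.constant(x_train)
y_pred = tf.constant(x_train + 0.01)
loss_value = elbo_sim(y_true, y_pred)    # scalar negative mean log-probability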