dataeval 0.72.1__py3-none-any.whl → 0.72.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +4 -4
- dataeval/detectors/__init__.py +4 -3
- dataeval/detectors/drift/__init__.py +9 -10
- dataeval/{_internal/detectors → detectors}/drift/base.py +39 -91
- dataeval/{_internal/detectors → detectors}/drift/cvm.py +4 -3
- dataeval/{_internal/detectors → detectors}/drift/ks.py +4 -3
- dataeval/{_internal/detectors → detectors}/drift/mmd.py +23 -25
- dataeval/{_internal/detectors → detectors}/drift/torch.py +13 -11
- dataeval/{_internal/detectors → detectors}/drift/uncertainty.py +7 -5
- dataeval/detectors/drift/updates.py +61 -0
- dataeval/detectors/linters/__init__.py +3 -3
- dataeval/{_internal/detectors → detectors/linters}/clusterer.py +41 -39
- dataeval/{_internal/detectors → detectors/linters}/duplicates.py +19 -9
- dataeval/{_internal/detectors → detectors/linters}/merged_stats.py +3 -1
- dataeval/{_internal/detectors → detectors/linters}/outliers.py +14 -21
- dataeval/detectors/ood/__init__.py +6 -6
- dataeval/{_internal/detectors → detectors}/ood/ae.py +7 -7
- dataeval/{_internal/detectors → detectors}/ood/aegmm.py +9 -29
- dataeval/{_internal/detectors → detectors}/ood/base.py +24 -18
- dataeval/{_internal/detectors → detectors}/ood/llr.py +24 -20
- dataeval/detectors/ood/metadata_ks_compare.py +99 -0
- dataeval/detectors/ood/metadata_least_likely.py +119 -0
- dataeval/detectors/ood/metadata_ood_mi.py +92 -0
- dataeval/{_internal/detectors → detectors}/ood/vae.py +10 -12
- dataeval/{_internal/detectors → detectors}/ood/vaegmm.py +10 -32
- dataeval/{_internal/interop.py → interop.py} +12 -7
- dataeval/metrics/__init__.py +1 -1
- dataeval/metrics/bias/__init__.py +4 -4
- dataeval/{_internal/metrics → metrics/bias}/balance.py +75 -9
- dataeval/{_internal/metrics → metrics/bias}/coverage.py +6 -4
- dataeval/{_internal/metrics → metrics/bias}/diversity.py +48 -14
- dataeval/metrics/bias/metadata.py +275 -0
- dataeval/{_internal/metrics → metrics/bias}/parity.py +12 -10
- dataeval/metrics/estimators/__init__.py +3 -3
- dataeval/{_internal/metrics → metrics/estimators}/ber.py +25 -22
- dataeval/{_internal/metrics → metrics/estimators}/divergence.py +11 -12
- dataeval/{_internal/metrics → metrics/estimators}/uap.py +5 -3
- dataeval/metrics/stats/__init__.py +7 -7
- dataeval/{_internal/metrics → metrics}/stats/base.py +59 -35
- dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py +18 -14
- dataeval/{_internal/metrics → metrics}/stats/datasetstats.py +18 -16
- dataeval/{_internal/metrics → metrics}/stats/dimensionstats.py +9 -7
- dataeval/metrics/stats/hashstats.py +156 -0
- dataeval/{_internal/metrics → metrics}/stats/labelstats.py +5 -3
- dataeval/{_internal/metrics → metrics}/stats/pixelstats.py +9 -8
- dataeval/{_internal/metrics → metrics}/stats/visualstats.py +10 -9
- dataeval/{_internal/output.py → output.py} +26 -6
- dataeval/utils/__init__.py +7 -3
- dataeval/utils/image.py +71 -0
- dataeval/utils/shared.py +151 -0
- dataeval/{_internal → utils}/split_dataset.py +98 -33
- dataeval/utils/tensorflow/__init__.py +7 -6
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/autoencoder.py +60 -64
- dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py} +9 -8
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/pixelcnn.py +16 -20
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py +3 -1
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py +17 -17
- dataeval/utils/tensorflow/loss/__init__.py +6 -2
- dataeval/utils/torch/__init__.py +7 -3
- dataeval/{_internal/models/pytorch → utils/torch}/blocks.py +19 -14
- dataeval/{_internal → utils/torch}/datasets.py +48 -42
- dataeval/utils/torch/models.py +138 -0
- dataeval/{_internal/models/pytorch/autoencoder.py → utils/torch/trainer.py} +7 -136
- dataeval/{_internal → utils/torch}/utils.py +3 -1
- dataeval/workflows/__init__.py +1 -1
- dataeval/{_internal/workflows → workflows}/sufficiency.py +39 -34
- {dataeval-0.72.1.dist-info → dataeval-0.72.2.dist-info}/METADATA +2 -1
- dataeval-0.72.2.dist-info/RECORD +72 -0
- dataeval/_internal/detectors/__init__.py +0 -0
- dataeval/_internal/detectors/drift/__init__.py +0 -0
- dataeval/_internal/detectors/ood/__init__.py +0 -0
- dataeval/_internal/metrics/__init__.py +0 -0
- dataeval/_internal/metrics/stats/hashstats.py +0 -75
- dataeval/_internal/metrics/utils.py +0 -447
- dataeval/_internal/models/__init__.py +0 -0
- dataeval/_internal/models/pytorch/__init__.py +0 -0
- dataeval/_internal/models/pytorch/utils.py +0 -67
- dataeval/_internal/models/tensorflow/__init__.py +0 -0
- dataeval/_internal/workflows/__init__.py +0 -0
- dataeval/detectors/drift/kernels/__init__.py +0 -10
- dataeval/detectors/drift/updates/__init__.py +0 -8
- dataeval/utils/tensorflow/models/__init__.py +0 -9
- dataeval/utils/tensorflow/recon/__init__.py +0 -3
- dataeval/utils/torch/datasets/__init__.py +0 -12
- dataeval/utils/torch/models/__init__.py +0 -11
- dataeval/utils/torch/trainer/__init__.py +0 -7
- dataeval-0.72.1.dist-info/RECORD +0 -81
- /dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/gmm.py +0 -0
- {dataeval-0.72.1.dist-info → dataeval-0.72.2.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.72.1.dist-info → dataeval-0.72.2.dist-info}/WHEEL +0 -0
dataeval/{_internal → utils}/split_dataset.py

@@ -1,20 +1,26 @@
 from __future__ import annotations
 
+__all__ = ["split_dataset"]
+
 import warnings
+from typing import Any
 
 import numpy as np
+from numpy.typing import NDArray
 from sklearn.cluster import KMeans
 from sklearn.metrics import silhouette_score
 from sklearn.model_selection import GroupKFold, KFold, StratifiedGroupKFold, StratifiedKFold
 from sklearn.utils.multiclass import type_of_target
 
 
-def check_args(num_folds: int = 1, test_frac: float | None = None, val_frac: float | None = None):
-    """Check input …
+def validate_test_val(num_folds: int, test_frac: float | None, val_frac: float | None) -> tuple[float, float]:
+    """Check input fractions to ensure unambiguous splitting arguments are passed return calculated
+    test and validation fractions.
+
 
     Parameters
     ----------
-    num_folds : int
+    num_folds : int
         number of [train, val] cross-validation folds to generate
     test_frac : float, optional
         If specified, also generate a test set containing (test_frac*100)% of the data
@@ -36,19 +42,23 @@ def check_args(num_folds: int = 1, test_frac: float | None = None, val_frac: float | None = None):
 
     Returns
     -------
-
+    tuple[float, float]
+        Tuple of the validated and calculated values as appropriate for test and validation fractions
     """
     if (num_folds > 1) and (val_frac is not None):
         raise ValueError("If specifying val_frac, num_folds must be None or 1")
     if (num_folds == 1) and (val_frac is None):
-        raise …
+        raise ValueError("If num_folds is None or 1, must assign a value to val_frac")
     t_frac = 0.0 if test_frac is None else test_frac
     v_frac = 1.0 / num_folds * (1.0 - t_frac) if val_frac is None else val_frac * (1.0 - t_frac)
     if (t_frac + v_frac) >= 1.0:
         raise ValueError(f"val_frac + test_frac must be less that 1.0, currently {v_frac+t_frac}")
+    return t_frac, v_frac
 
 
-def check_labels(labels: list | np.ndarray, total_partitions: int):
+def check_labels(
+    labels: list[int] | NDArray[np.int_], total_partitions: int
+) -> tuple[NDArray[np.int_], NDArray[np.int_]]:
     """Check to make sure there are more input data than the total number of partitions requested
     Also converts labels to a numpy array, if it isn't already
 
@@ -88,7 +98,7 @@ def check_labels(labels: list | np.ndarray, total_partitions: int):
     return index, labels
 
 
-def check_stratifiable(labels: np.ndarray, total_partitions: int):
+def check_stratifiable(labels: NDArray[np.int_], total_partitions: int) -> bool:
     """
     Very basic check to see if dataset can be stratified by class label. This is not a
     comprehensive test, as factors such as grouping also affect the ability to stratify by label
@@ -124,7 +134,7 @@ def check_stratifiable(labels: np.ndarray, total_partitions: int):
     return stratifiable
 
 
-def check_groups(group_ids: np.ndarray, num_partitions: int):
+def check_groups(group_ids: NDArray[np.int_], num_partitions: int) -> bool:
     """
     Warns user if the number of unique group_ids is incompatible with a grouped partition containing
     num_folds folds. If this is the case, returns groups=None, which tells the partitioner not to
@@ -162,7 +172,7 @@ def check_groups(group_ids: np.ndarray, num_partitions: int):
     return groupable
 
 
-def bin_kmeans(array: np.ndarray):
+def bin_kmeans(array: NDArray[Any]) -> NDArray[np.int_]:
     """
     Find bins of continuous data by iteratively applying k-means clustering, and keeping the
     clustering with the highest silhouette score.
@@ -182,18 +192,18 @@ def bin_kmeans(array: np.ndarray):
         best_score = 0.60
     else:
         best_score = 0.50
-    bin_index = np.zeros(len(array))
+    bin_index = np.zeros(len(array), dtype=np.int_)
     for k in range(2, 20):
         clusterer = KMeans(n_clusters=k)
         cluster_labels = clusterer.fit_predict(array)
         score = silhouette_score(array, cluster_labels, sample_size=25_000)
         if score > best_score:
             best_score = score
-            bin_index = cluster_labels
+            bin_index = cluster_labels.astype(np.int_)
     return bin_index
 
 
-def angle2xy(angles: np.ndarray):
+def angle2xy(angles: NDArray[Any]) -> NDArray[Any]:
     """
     Converts angle measurements to xy coordinates on the unit circle. Needed for binning angle data.
 
@@ -213,7 +223,7 @@ def angle2xy(angles: np.ndarray):
     return xy
 
 
-def get_group_ids(metadata: dict, groupnames: list, num_samples: int):
+def get_group_ids(metadata: dict[str, Any], group_names: list[str], num_samples: int) -> NDArray[np.int_]:
     """Returns individual group numbers based on a subset of metadata defined by groupnames
 
     Parameters
@@ -235,7 +245,7 @@ def get_group_ids(metadata: dict, groupnames: list, num_samples: int):
     group_ids: np.ndarray
         group identifiers from metadata
     """
-    features2group = {k: np.array(v) for k, v in metadata.items() if k in groupnames}
+    features2group = {k: np.array(v) for k, v in metadata.items() if k in group_names}
     if not features2group:
         return np.zeros(num_samples, dtype=int)
     for name, feature in features2group.items():
@@ -252,8 +262,12 @@ def get_group_ids(metadata: dict, groupnames: list, num_samples: int):
 
 
 def make_splits(
-    index: np.…
-    …
+    index: NDArray[np.int_],
+    labels: NDArray[np.int_],
+    n_folds: int,
+    groups: NDArray[np.int_] | None = None,
+    stratified: bool = False,
+) -> list[dict[str, NDArray[np.int_]]]:
     """Split data into n_folds partitions of training and validation data.
 
     Parameters
@@ -290,9 +304,59 @@ def make_splits(
     return split_defs
 
 
+def find_best_split(
+    labels: NDArray[np.int_], split_defs: list[dict[str, NDArray[np.int_]]], stratified: bool, eval_frac: float
+) -> tuple[NDArray[np.int_], NDArray[np.int_]]:
+    """Finds the split that most closely satisfies a criterion determined by the arguments passed.
+    If stratified is True, returns the split whose class balance most closely resembles the overall
+    class balance. If false, returns the split with the size closest to the desired eval_frac
+
+    Parameters
+    ----------
+    labels : np.ndarray
+        Labels upon which splits are (optionally) stratified
+    split_defs : list[dict]
+        List of dictionaries, which specifying train index, validation index, and the ratio of
+        validation to all data.
+    stratified: bool
+        If True, maintain dataset class balance within each train/val split
+    eval_frac: float
+        Desired fraction of the dataset sequestered for evaluation
+
+    Returns
+    -------
+    train_index : np.ndarray
+        indices of data partitioned for training
+    eval_index : np.ndarray
+        indices of data partitioned for evaluation
+    """
+
+    def class_freq_diff(split):
+        train_labels = labels[split["train"]]
+        _, train_counts = np.unique(train_labels, return_counts=True)
+        train_freq = train_counts / train_counts.sum()
+        return np.square(train_freq - class_freq).sum()
+
+    if stratified:
+        _, class_counts = np.unique(labels, return_counts=True)
+        class_freq = class_counts / class_counts.sum()
+        best_split = min(split_defs, key=class_freq_diff)
+        return best_split["train"], best_split["eval"]
+    elif eval_frac <= 2 / 3:
+        best_split = min(split_defs, key=lambda x: abs(eval_frac - x["eval_frac"]))  # type: ignore
+        return best_split["train"], best_split["eval"]
+    else:
+        best_split = min(split_defs, key=lambda x: abs(eval_frac - (1 - x["eval_frac"])))  # type: ignore
+        return best_split["eval"], best_split["train"]
+
+
 def single_split(
-    index: np.…
-    …
+    index: NDArray[np.int_],
+    labels: NDArray[np.int_],
+    eval_frac: float,
+    groups: NDArray[np.int_] | None = None,
+    stratified: bool = False,
+) -> tuple[NDArray[np.int_], NDArray[np.int_]]:
     """Handles the special case where only 1 partition of the data is desired (such as when
     generating the test holdout split). In this case, the desired fraction of the data to be
     partitioned into the test data must be specified, and a single [train, eval] pair are returned.
@@ -317,27 +381,28 @@ def single_split(
     eval_index : np.ndarray
         indices of data partitioned for evaluation
     """
-    if eval_frac <= 2 / 3:
+    if groups is not None:
+        n_unique_groups = np.unique(groups).shape[0]
+        _, label_counts = np.unique(labels, return_counts=True)
+        n_folds = min(n_unique_groups, label_counts.min())
+    elif eval_frac <= 2 / 3:
         n_folds = max(2, int(round(1 / (eval_frac + 1e-6))))
-        split_candidates = make_splits(index, labels, n_folds, groups, stratified)
-        best_split = min(split_candidates, key=lambda x: abs(eval_frac - x["eval_frac"]))
-        return best_split["train"], best_split["eval"]
     else:
-        n_folds = max(2, int(round(1 / (1 - eval_frac …
-        split_candidates = make_splits(index, labels, n_folds, groups, stratified)
-        best_split = min(split_candidates, key=lambda x: abs(eval_frac - (1 - x["eval_frac"])))
-        return best_split["eval"], best_split["train"]
+        n_folds = max(2, int(round(1 / (1 - eval_frac - 1e-6))))
+    split_candidates = make_splits(index, labels, n_folds, groups, stratified)
+    best_train, best_eval = find_best_split(labels, split_candidates, stratified, eval_frac)
+    return best_train, best_eval
 
 
 def split_dataset(
-    labels: list | np.ndarray,
+    labels: list[int] | NDArray[np.int_],
     num_folds: int = 1,
     stratify: bool = False,
-    split_on: list | None = None,
-    metadata: dict | None = None,
+    split_on: list[str] | None = None,
+    metadata: dict[str, Any] | None = None,
     test_frac: float | None = None,
     val_frac: float | None = None,
-):
+) -> dict[str, dict[str, NDArray[np.int_]] | NDArray[np.int_]]:
     """Top level splitting function. Returns a dict with each key-value pair containing
     train and validation indices. Indices for a test holdout may also be optionally included
 
@@ -386,7 +451,7 @@ def split_dataset(
     }
     """
 
-    check_args(num_folds, test_frac, val_frac)
+    test_frac, val_frac = validate_test_val(num_folds, test_frac, val_frac)
     total_partitions = num_folds + 1 if test_frac else num_folds
     index, labels = check_labels(labels, total_partitions)
     stratify &= check_stratifiable(labels, total_partitions)
@@ -399,7 +464,7 @@ def split_dataset(
             groups = None
     else:
         groups = None
-    split_defs = {}
+    split_defs: dict[str, dict[str, NDArray[np.int_]] | NDArray[np.int_]] = {}
     if test_frac:
         tv_idx, test_idx = single_split(index, labels, test_frac, groups, stratify)
         tv_labels = labels[tv_idx]
@@ -410,7 +475,7 @@ def split_dataset(
         tv_labels = labels
         tv_groups = groups
     if num_folds == 1:
-        train_idx, val_idx = single_split(tv_idx, tv_labels, val_frac, tv_groups, stratify)
+        train_idx, val_idx = single_split(tv_idx, tv_labels, val_frac, tv_groups, stratify)
         split_defs["fold_0"] = {"train": tv_idx[train_idx].squeeze(), "val": tv_idx[val_idx].squeeze()}
     else:
         tv_splits = make_splits(tv_idx, tv_labels, num_folds, tv_groups, stratify)
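For orientation, here is a minimal sketch of calling the relocated splitter from its new public module, based only on the signature and return structure visible in this diff. The label array, fraction values, and the "test" result key are illustrative assumptions rather than documented behavior:

import numpy as np

from dataeval.utils.split_dataset import split_dataset  # new location per this diff

labels = np.random.randint(0, 5, size=1_000)  # hypothetical integer class labels

# One train/val fold plus a 20% test holdout, stratified by class label.
splits = split_dataset(labels, num_folds=1, stratify=True, test_frac=0.2, val_frac=0.1)

train_idx = splits["fold_0"]["train"]  # index arrays into the original data
val_idx = splits["fold_0"]["val"]
test_idx = splits.get("test")  # assumed key; present only when test_frac is given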
dataeval/utils/tensorflow/__init__.py

@@ -2,17 +2,18 @@
 TensorFlow models are used in :term:`out of distribution<Out-of-distribution (OOD)>` detectors in the
 :mod:`dataeval.detectors.ood` module.
 
-DataEval provides …
-as well as constructors which allow for customization of the encoder, decoder and any other applicable
-layers used by the model.
+DataEval provides basic default models through the utility :func:`dataeval.utils.tensorflow.create_model`.
 """
 
 from dataeval import _IS_TENSORFLOW_AVAILABLE
 
-from . import loss, models, recon
-
 __all__ = []
 
 
 if _IS_TENSORFLOW_AVAILABLE:
-    …
+    import dataeval.utils.tensorflow.loss as loss
+    from dataeval.utils.tensorflow._internal.utils import create_model
+
+    __all__ = ["create_model", "loss"]
+
+    del _IS_TENSORFLOW_AVAILABLE
|
|
6
6
|
Licensed under Apache Software License (Apache 2.0)
|
7
7
|
"""
|
8
8
|
|
9
|
-
# pyright: reportIncompatibleMethodOverride=false
|
10
|
-
|
11
9
|
from __future__ import annotations
|
12
10
|
|
13
|
-
from typing import
|
11
|
+
from typing import cast
|
14
12
|
|
15
13
|
import tensorflow as tf
|
16
14
|
import tf_keras as keras
|
15
|
+
from tensorflow.python.module.module import Module # noqa
|
16
|
+
from tf_keras import Sequential
|
17
17
|
from tf_keras.layers import (
|
18
18
|
Dense,
|
19
19
|
Flatten,
|
@@ -103,7 +103,7 @@ class Sampling(Layer):
|
|
103
103
|
|
104
104
|
|
105
105
|
class EncoderAE(Layer):
|
106
|
-
def __init__(self, encoder_net:
|
106
|
+
def __init__(self, encoder_net: Sequential) -> None:
|
107
107
|
"""
|
108
108
|
Encoder of AE.
|
109
109
|
|
@@ -115,14 +115,14 @@ class EncoderAE(Layer):
|
|
115
115
|
Name of encoder.
|
116
116
|
"""
|
117
117
|
super().__init__(name="encoder_ae")
|
118
|
-
self.encoder_net = encoder_net
|
118
|
+
self.encoder_net: Sequential = encoder_net
|
119
119
|
|
120
120
|
def call(self, x: tf.Tensor) -> tf.Tensor:
|
121
121
|
return cast(tf.Tensor, self.encoder_net(x))
|
122
122
|
|
123
123
|
|
124
124
|
class EncoderVAE(Layer):
|
125
|
-
def __init__(self, encoder_net:
|
125
|
+
def __init__(self, encoder_net: Sequential, latent_dim: int) -> None:
|
126
126
|
"""
|
127
127
|
Encoder of VAE.
|
128
128
|
|
@@ -136,23 +136,23 @@ class EncoderVAE(Layer):
|
|
136
136
|
Name of encoder.
|
137
137
|
"""
|
138
138
|
super().__init__(name="encoder_vae")
|
139
|
-
self.encoder_net = encoder_net
|
140
|
-
self.
|
141
|
-
self.
|
142
|
-
self.
|
139
|
+
self.encoder_net: Sequential = encoder_net
|
140
|
+
self._fc_mean = Dense(latent_dim, activation=None)
|
141
|
+
self._fc_log_var = Dense(latent_dim, activation=None)
|
142
|
+
self._sampling = Sampling()
|
143
143
|
|
144
144
|
def call(self, x: tf.Tensor) -> tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
|
145
145
|
x = cast(tf.Tensor, self.encoder_net(x))
|
146
146
|
if len(x.shape) > 2:
|
147
147
|
x = cast(tf.Tensor, Flatten()(x))
|
148
|
-
z_mean = cast(tf.Tensor, self.
|
149
|
-
z_log_var = cast(tf.Tensor, self.
|
150
|
-
z = cast(tf.Tensor, self.
|
148
|
+
z_mean = cast(tf.Tensor, self._fc_mean(x))
|
149
|
+
z_log_var = cast(tf.Tensor, self._fc_log_var(x))
|
150
|
+
z = cast(tf.Tensor, self._sampling((z_mean, z_log_var)))
|
151
151
|
return z_mean, z_log_var, z
|
152
152
|
|
153
153
|
|
154
154
|
class Decoder(Layer):
|
155
|
-
def __init__(self, decoder_net:
|
155
|
+
def __init__(self, decoder_net: Sequential) -> None:
|
156
156
|
"""
|
157
157
|
Decoder of AE and VAE.
|
158
158
|
|
@@ -164,10 +164,10 @@ class Decoder(Layer):
|
|
164
164
|
Name of decoder.
|
165
165
|
"""
|
166
166
|
super().__init__(name="decoder")
|
167
|
-
self.decoder_net = decoder_net
|
167
|
+
self.decoder_net: Sequential = decoder_net
|
168
168
|
|
169
|
-
def call(self,
|
170
|
-
return cast(tf.Tensor, self.decoder_net(
|
169
|
+
def call(self, inputs: tf.Tensor) -> tf.Tensor:
|
170
|
+
return cast(tf.Tensor, self.decoder_net(inputs))
|
171
171
|
|
172
172
|
|
173
173
|
class AE(keras.Model):
|
@@ -176,19 +176,19 @@ class AE(keras.Model):
|
|
176
176
|
|
177
177
|
Parameters
|
178
178
|
----------
|
179
|
-
encoder_net :
|
179
|
+
encoder_net : Sequential
|
180
180
|
Layers for the encoder wrapped in a keras.Sequential class.
|
181
|
-
decoder_net :
|
181
|
+
decoder_net : Sequential
|
182
182
|
Layers for the decoder wrapped in a keras.Sequential class.
|
183
183
|
"""
|
184
184
|
|
185
|
-
def __init__(self, encoder_net:
|
185
|
+
def __init__(self, encoder_net: Sequential, decoder_net: Sequential) -> None:
|
186
186
|
super().__init__(name="ae")
|
187
|
-
self.encoder = EncoderAE(encoder_net)
|
188
|
-
self.decoder = Decoder(decoder_net)
|
187
|
+
self.encoder: Layer = EncoderAE(encoder_net)
|
188
|
+
self.decoder: Layer = Decoder(decoder_net)
|
189
189
|
|
190
|
-
def call(self,
|
191
|
-
z = cast(tf.Tensor, self.encoder(
|
190
|
+
def call(self, inputs: tf.Tensor, training: bool | None = None, mask: tf.Tensor | None = None) -> tf.Tensor:
|
191
|
+
z = cast(tf.Tensor, self.encoder(inputs))
|
192
192
|
x_recon = cast(tf.Tensor, self.decoder(z))
|
193
193
|
return x_recon
|
194
194
|
|
@@ -199,9 +199,9 @@ class VAE(keras.Model):
|
|
199
199
|
|
200
200
|
Parameters
|
201
201
|
----------
|
202
|
-
encoder_net :
|
202
|
+
encoder_net : Sequential
|
203
203
|
Layers for the encoder wrapped in a keras.Sequential class.
|
204
|
-
decoder_net :
|
204
|
+
decoder_net : Sequential
|
205
205
|
Layers for the decoder wrapped in a keras.Sequential class.
|
206
206
|
latent_dim : int
|
207
207
|
Dimensionality of the :term:`latent space<Latent Space>`.
|
@@ -209,15 +209,15 @@ class VAE(keras.Model):
|
|
209
209
|
Beta parameter for KL-divergence loss term.
|
210
210
|
"""
|
211
211
|
|
212
|
-
def __init__(self, encoder_net:
|
212
|
+
def __init__(self, encoder_net: Sequential, decoder_net: Sequential, latent_dim: int, beta: float = 1.0) -> None:
|
213
213
|
super().__init__(name="vae_model")
|
214
|
-
self.encoder = EncoderVAE(encoder_net, latent_dim)
|
215
|
-
self.decoder = Decoder(decoder_net)
|
216
|
-
self.beta = beta
|
217
|
-
self.latent_dim = latent_dim
|
214
|
+
self.encoder: Layer = EncoderVAE(encoder_net, latent_dim)
|
215
|
+
self.decoder: Layer = Decoder(decoder_net)
|
216
|
+
self.beta: float = beta
|
217
|
+
self.latent_dim: int = latent_dim
|
218
218
|
|
219
|
-
def call(self,
|
220
|
-
z_mean, z_log_var, z = cast(tuple[tf.Tensor, tf.Tensor, tf.Tensor], self.encoder(
|
219
|
+
def call(self, inputs: tf.Tensor, training: bool | None = None, mask: tf.Tensor | None = None) -> tf.Tensor:
|
220
|
+
z_mean, z_log_var, z = cast(tuple[tf.Tensor, tf.Tensor, tf.Tensor], self.encoder(inputs))
|
221
221
|
x_recon = self.decoder(z)
|
222
222
|
# add KL divergence loss term
|
223
223
|
kl_loss = -0.5 * tf.reduce_mean(z_log_var - tf.square(z_mean) - tf.exp(z_log_var) + 1)
|
@@ -231,37 +231,35 @@ class AEGMM(keras.Model):
|
|
231
231
|
|
232
232
|
Parameters
|
233
233
|
----------
|
234
|
-
encoder_net :
|
234
|
+
encoder_net : Sequential
|
235
235
|
Layers for the encoder wrapped in a keras.Sequential class.
|
236
|
-
decoder_net :
|
236
|
+
decoder_net : Sequential
|
237
237
|
Layers for the decoder wrapped in a keras.Sequential class.
|
238
|
-
gmm_density_net :
|
238
|
+
gmm_density_net : Sequential
|
239
239
|
Layers for the GMM network wrapped in a keras.Sequential class.
|
240
240
|
n_gmm : int
|
241
241
|
Number of components in GMM.
|
242
|
-
recon_features : Callable, default eucl_cosim_features
|
243
|
-
Function to extract features from the reconstructed instance by the decoder.
|
244
242
|
"""
|
245
243
|
|
246
244
|
def __init__(
|
247
245
|
self,
|
248
|
-
encoder_net:
|
249
|
-
decoder_net:
|
250
|
-
gmm_density_net:
|
246
|
+
encoder_net: Sequential,
|
247
|
+
decoder_net: Sequential,
|
248
|
+
gmm_density_net: Sequential,
|
251
249
|
n_gmm: int,
|
252
|
-
recon_features: Callable = eucl_cosim_features,
|
253
250
|
) -> None:
|
254
251
|
super().__init__("aegmm")
|
255
252
|
self.encoder = encoder_net
|
256
253
|
self.decoder = decoder_net
|
257
254
|
self.gmm_density = gmm_density_net
|
258
255
|
self.n_gmm = n_gmm
|
259
|
-
self.recon_features = recon_features
|
260
256
|
|
261
|
-
def call(
|
262
|
-
|
257
|
+
def call(
|
258
|
+
self, inputs: tf.Tensor, training: bool | None = None, mask: tf.Tensor | None = None
|
259
|
+
) -> tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
|
260
|
+
enc = self.encoder(inputs)
|
263
261
|
x_recon = cast(tf.Tensor, self.decoder(enc))
|
264
|
-
recon_features =
|
262
|
+
recon_features = eucl_cosim_features(inputs, x_recon)
|
265
263
|
z = cast(tf.Tensor, tf.concat([enc, recon_features], -1))
|
266
264
|
gamma = cast(tf.Tensor, self.gmm_density(z))
|
267
265
|
return x_recon, z, gamma
|
@@ -273,45 +271,43 @@ class VAEGMM(keras.Model):
|
|
273
271
|
|
274
272
|
Parameters
|
275
273
|
----------
|
276
|
-
encoder_net :
|
274
|
+
encoder_net : Sequential
|
277
275
|
Layers for the encoder wrapped in a keras.Sequential class.
|
278
|
-
decoder_net :
|
276
|
+
decoder_net : Sequential
|
279
277
|
Layers for the decoder wrapped in a keras.Sequential class.
|
280
|
-
gmm_density_net :
|
278
|
+
gmm_density_net : Sequential
|
281
279
|
Layers for the GMM network wrapped in a keras.Sequential class.
|
282
280
|
n_gmm : int
|
283
281
|
Number of components in GMM.
|
284
282
|
latent_dim : int
|
285
283
|
Dimensionality of the :term:`latent space<Latent Space>`.
|
286
|
-
recon_features : Callable, default eucl_cosim_features
|
287
|
-
Function to extract features from the reconstructed instance by the decoder.
|
288
284
|
beta : float, default 1.0
|
289
285
|
Beta parameter for KL-divergence loss term.
|
290
286
|
"""
|
291
287
|
|
292
288
|
def __init__(
|
293
289
|
self,
|
294
|
-
encoder_net:
|
295
|
-
decoder_net:
|
296
|
-
gmm_density_net:
|
290
|
+
encoder_net: Sequential,
|
291
|
+
decoder_net: Sequential,
|
292
|
+
gmm_density_net: Sequential,
|
297
293
|
n_gmm: int,
|
298
294
|
latent_dim: int,
|
299
|
-
recon_features: Callable = eucl_cosim_features,
|
300
295
|
beta: float = 1.0,
|
301
296
|
) -> None:
|
302
297
|
super().__init__(name="vaegmm")
|
303
|
-
self.encoder = EncoderVAE(encoder_net, latent_dim)
|
304
|
-
self.decoder = decoder_net
|
305
|
-
self.gmm_density = gmm_density_net
|
306
|
-
self.n_gmm = n_gmm
|
307
|
-
self.latent_dim = latent_dim
|
308
|
-
self.recon_features = recon_features
|
298
|
+
self.encoder: Sequential = EncoderVAE(encoder_net, latent_dim)
|
299
|
+
self.decoder: Sequential = decoder_net
|
300
|
+
self.gmm_density: Sequential = gmm_density_net
|
301
|
+
self.n_gmm: int = n_gmm
|
302
|
+
self.latent_dim: int = latent_dim
|
309
303
|
self.beta = beta
|
310
304
|
|
311
|
-
def call(
|
312
|
-
|
305
|
+
def call(
|
306
|
+
self, inputs: tf.Tensor, training: bool | None = None, mask: tf.Tensor | None = None
|
307
|
+
) -> tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
|
308
|
+
enc_mean, enc_log_var, enc = cast(tuple[tf.Tensor, tf.Tensor, tf.Tensor], self.encoder(inputs))
|
313
309
|
x_recon = cast(tf.Tensor, self.decoder(enc))
|
314
|
-
recon_features =
|
310
|
+
recon_features = eucl_cosim_features(inputs, x_recon)
|
315
311
|
z = cast(tf.Tensor, tf.concat([enc, recon_features], -1))
|
316
312
|
gamma = cast(tf.Tensor, self.gmm_density(z))
|
317
313
|
# add KL divergence loss term
|
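A sketch of constructing one of these models with the now Sequential-typed arguments. The toy layer stack is invented for illustration, and the import path follows the _internal relocation in the file list above:

from tf_keras import Sequential
from tf_keras.layers import Dense, Flatten, InputLayer, Reshape

from dataeval.utils.tensorflow._internal.autoencoder import AE  # internal module per this diff

# Arbitrary toy architecture: 32x32x3 images down to a 64-dim code and back.
encoder_net = Sequential([InputLayer(input_shape=(32, 32, 3)), Flatten(), Dense(64, activation="relu")])
decoder_net = Sequential([Dense(32 * 32 * 3, activation=None), Reshape((32, 32, 3))])

ae = AE(encoder_net, decoder_net)  # AE.call() runs the encoder, then the decoder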
dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py}

@@ -10,6 +10,7 @@ from __future__ import annotations
 
 from typing import Literal, cast
 
+import numpy as np
 import tensorflow as tf
 from numpy.typing import NDArray
 from tensorflow_probability.python.distributions.mvn_diag import MultivariateNormalDiag
@@ -17,7 +18,7 @@ from tensorflow_probability.python.distributions.mvn_tril import MultivariateNormalTriL
 from tensorflow_probability.python.stats import covariance
 from tf_keras.layers import Flatten
 
-from dataeval._internal.models.tensorflow.gmm import gmm_energy, gmm_params
+from dataeval.utils.tensorflow._internal.gmm import gmm_energy, gmm_params
 
 
 class Elbo:
@@ -39,26 +40,26 @@ class Elbo:
     def __init__(
         self,
         cov_type: Literal["cov_full", "cov_diag"] | float = 1.0,
-        x: tf.Tensor | NDArray | None = None,
+        x: tf.Tensor | NDArray[np.float32] | None = None,
     ):
         if isinstance(cov_type, float):
-            self.cov = ("sim", cov_type)
+            self._cov = ("sim", cov_type)
         elif cov_type in ["cov_full", "cov_diag"]:
-            x_np: NDArray = x.numpy() if tf.is_tensor(x) else x  # type: ignore
+            x_np: NDArray[np.float32] = x.numpy().astype(np.float32) if tf.is_tensor(x) else x  # type: ignore
             cov = covariance(x_np.reshape(x_np.shape[0], -1))  # type: ignore py38
             if cov_type == "cov_diag":  # infer standard deviation from covariance matrix
                 cov = tf.math.sqrt(tf.linalg.diag_part(cov))
-            self.cov = (cov_type, cov)
+            self._cov = (cov_type, cov)
         else:
             raise ValueError("Only cov_full, cov_diag or sim value should be specified.")
 
     def __call__(self, y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor:
         y_pred_flat = cast(tf.Tensor, Flatten()(y_pred))
 
-        if self.cov[0] == "cov_full":
-            y_mn = MultivariateNormalTriL(y_pred_flat, scale_tril=tf.linalg.cholesky(self.cov[1]))
+        if self._cov[0] == "cov_full":
+            y_mn = MultivariateNormalTriL(y_pred_flat, scale_tril=tf.linalg.cholesky(self._cov[1]))
         else:  # cov_diag and sim
-            cov_diag = self.cov[1] if self.cov[0] == "cov_diag" else self.cov[1] * tf.ones(y_pred_flat.shape[-1])
+            cov_diag = self._cov[1] if self._cov[0] == "cov_diag" else self._cov[1] * tf.ones(y_pred_flat.shape[-1])
             y_mn = MultivariateNormalDiag(y_pred_flat, scale_diag=cov_diag)
 
         loss = -tf.reduce_mean(y_mn.log_prob(Flatten()(y_true)))
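Finally, a sketch exercising the renamed Elbo internals shown above. The batch data is made up, and the import path follows the losses.py → loss.py rename in the file list:

import numpy as np
import tensorflow as tf

from dataeval.utils.tensorflow._internal.loss import Elbo  # new path per this diff

x = np.random.rand(8, 4).astype(np.float32)  # hypothetical training batch

elbo_sim = Elbo(cov_type=1.0)  # a float selects the "sim" scalar-variance branch
elbo_diag = Elbo(cov_type="cov_diag", x=x)  # diagonal covariance estimated from x

# A perfect reconstruction for illustration; the result is the negative mean log-probability.
loss = elbo_sim(tf.convert_to_tensor(x), tf.convert_to_tensor(x))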