dataeval 0.72.1__py3-none-any.whl → 0.72.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +4 -4
- dataeval/detectors/__init__.py +4 -3
- dataeval/detectors/drift/__init__.py +9 -10
- dataeval/{_internal/detectors → detectors}/drift/base.py +39 -91
- dataeval/{_internal/detectors → detectors}/drift/cvm.py +4 -3
- dataeval/{_internal/detectors → detectors}/drift/ks.py +4 -3
- dataeval/{_internal/detectors → detectors}/drift/mmd.py +23 -25
- dataeval/{_internal/detectors → detectors}/drift/torch.py +13 -11
- dataeval/{_internal/detectors → detectors}/drift/uncertainty.py +7 -5
- dataeval/detectors/drift/updates.py +61 -0
- dataeval/detectors/linters/__init__.py +3 -3
- dataeval/{_internal/detectors → detectors/linters}/clusterer.py +41 -39
- dataeval/{_internal/detectors → detectors/linters}/duplicates.py +19 -9
- dataeval/{_internal/detectors → detectors/linters}/merged_stats.py +3 -1
- dataeval/{_internal/detectors → detectors/linters}/outliers.py +14 -21
- dataeval/detectors/ood/__init__.py +6 -6
- dataeval/{_internal/detectors → detectors}/ood/ae.py +7 -7
- dataeval/{_internal/detectors → detectors}/ood/aegmm.py +9 -29
- dataeval/{_internal/detectors → detectors}/ood/base.py +24 -18
- dataeval/{_internal/detectors → detectors}/ood/llr.py +24 -20
- dataeval/detectors/ood/metadata_ks_compare.py +99 -0
- dataeval/detectors/ood/metadata_least_likely.py +119 -0
- dataeval/detectors/ood/metadata_ood_mi.py +92 -0
- dataeval/{_internal/detectors → detectors}/ood/vae.py +10 -12
- dataeval/{_internal/detectors → detectors}/ood/vaegmm.py +10 -32
- dataeval/{_internal/interop.py → interop.py} +12 -7
- dataeval/metrics/__init__.py +1 -1
- dataeval/metrics/bias/__init__.py +4 -4
- dataeval/{_internal/metrics → metrics/bias}/balance.py +75 -9
- dataeval/{_internal/metrics → metrics/bias}/coverage.py +6 -4
- dataeval/{_internal/metrics → metrics/bias}/diversity.py +48 -14
- dataeval/metrics/bias/metadata.py +275 -0
- dataeval/{_internal/metrics → metrics/bias}/parity.py +12 -10
- dataeval/metrics/estimators/__init__.py +3 -3
- dataeval/{_internal/metrics → metrics/estimators}/ber.py +25 -22
- dataeval/{_internal/metrics → metrics/estimators}/divergence.py +11 -12
- dataeval/{_internal/metrics → metrics/estimators}/uap.py +5 -3
- dataeval/metrics/stats/__init__.py +7 -7
- dataeval/{_internal/metrics → metrics}/stats/base.py +59 -35
- dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py +18 -14
- dataeval/{_internal/metrics → metrics}/stats/datasetstats.py +18 -16
- dataeval/{_internal/metrics → metrics}/stats/dimensionstats.py +9 -7
- dataeval/metrics/stats/hashstats.py +156 -0
- dataeval/{_internal/metrics → metrics}/stats/labelstats.py +5 -3
- dataeval/{_internal/metrics → metrics}/stats/pixelstats.py +9 -8
- dataeval/{_internal/metrics → metrics}/stats/visualstats.py +10 -9
- dataeval/{_internal/output.py → output.py} +26 -6
- dataeval/utils/__init__.py +7 -3
- dataeval/utils/image.py +71 -0
- dataeval/utils/shared.py +151 -0
- dataeval/{_internal → utils}/split_dataset.py +98 -33
- dataeval/utils/tensorflow/__init__.py +7 -6
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/autoencoder.py +60 -64
- dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py} +9 -8
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/pixelcnn.py +16 -20
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py +3 -1
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py +17 -17
- dataeval/utils/tensorflow/loss/__init__.py +6 -2
- dataeval/utils/torch/__init__.py +7 -3
- dataeval/{_internal/models/pytorch → utils/torch}/blocks.py +19 -14
- dataeval/{_internal → utils/torch}/datasets.py +48 -42
- dataeval/utils/torch/models.py +138 -0
- dataeval/{_internal/models/pytorch/autoencoder.py → utils/torch/trainer.py} +7 -136
- dataeval/{_internal → utils/torch}/utils.py +3 -1
- dataeval/workflows/__init__.py +1 -1
- dataeval/{_internal/workflows → workflows}/sufficiency.py +39 -34
- {dataeval-0.72.1.dist-info → dataeval-0.72.2.dist-info}/METADATA +2 -1
- dataeval-0.72.2.dist-info/RECORD +72 -0
- dataeval/_internal/detectors/__init__.py +0 -0
- dataeval/_internal/detectors/drift/__init__.py +0 -0
- dataeval/_internal/detectors/ood/__init__.py +0 -0
- dataeval/_internal/metrics/__init__.py +0 -0
- dataeval/_internal/metrics/stats/hashstats.py +0 -75
- dataeval/_internal/metrics/utils.py +0 -447
- dataeval/_internal/models/__init__.py +0 -0
- dataeval/_internal/models/pytorch/__init__.py +0 -0
- dataeval/_internal/models/pytorch/utils.py +0 -67
- dataeval/_internal/models/tensorflow/__init__.py +0 -0
- dataeval/_internal/workflows/__init__.py +0 -0
- dataeval/detectors/drift/kernels/__init__.py +0 -10
- dataeval/detectors/drift/updates/__init__.py +0 -8
- dataeval/utils/tensorflow/models/__init__.py +0 -9
- dataeval/utils/tensorflow/recon/__init__.py +0 -3
- dataeval/utils/torch/datasets/__init__.py +0 -12
- dataeval/utils/torch/models/__init__.py +0 -11
- dataeval/utils/torch/trainer/__init__.py +0 -7
- dataeval-0.72.1.dist-info/RECORD +0 -81
- /dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/gmm.py +0 -0
- {dataeval-0.72.1.dist-info → dataeval-0.72.2.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.72.1.dist-info → dataeval-0.72.2.dist-info}/WHEEL +0 -0
dataeval/{_internal → utils}/split_dataset.py

@@ -1,20 +1,26 @@
 from __future__ import annotations
 
+__all__ = ["split_dataset"]
+
 import warnings
+from typing import Any
 
 import numpy as np
+from numpy.typing import NDArray
 from sklearn.cluster import KMeans
 from sklearn.metrics import silhouette_score
 from sklearn.model_selection import GroupKFold, KFold, StratifiedGroupKFold, StratifiedKFold
 from sklearn.utils.multiclass import type_of_target
 
 
-def check_args(num_folds: int = 1, test_frac: float | None = None, val_frac: float | None = None):
-    """Check input …
+def validate_test_val(num_folds: int, test_frac: float | None, val_frac: float | None) -> tuple[float, float]:
+    """Check input fractions to ensure unambiguous splitting arguments are passed return calculated
+    test and validation fractions.
+
 
     Parameters
     ----------
-    num_folds : int
+    num_folds : int
         number of [train, val] cross-validation folds to generate
     test_frac : float, optional
         If specified, also generate a test set containing (test_frac*100)% of the data
@@ -36,19 +42,23 @@ def check_args(num_folds: int = 1, test_frac: float | None = None, val_frac: float | None = None):
 
     Returns
     -------
-
+    tuple[float, float]
+        Tuple of the validated and calculated values as appropriate for test and validation fractions
     """
     if (num_folds > 1) and (val_frac is not None):
         raise ValueError("If specifying val_frac, num_folds must be None or 1")
     if (num_folds == 1) and (val_frac is None):
-        raise …
+        raise ValueError("If num_folds is None or 1, must assign a value to val_frac")
     t_frac = 0.0 if test_frac is None else test_frac
     v_frac = 1.0 / num_folds * (1.0 - t_frac) if val_frac is None else val_frac * (1.0 - t_frac)
     if (t_frac + v_frac) >= 1.0:
         raise ValueError(f"val_frac + test_frac must be less that 1.0, currently {v_frac+t_frac}")
+    return t_frac, v_frac
 
 
-def check_labels(labels: list | np.ndarray, total_partitions: int):
+def check_labels(
+    labels: list[int] | NDArray[np.int_], total_partitions: int
+) -> tuple[NDArray[np.int_], NDArray[np.int_]]:
     """Check to make sure there are more input data than the total number of partitions requested
     Also converts labels to a numpy array, if it isn't already
 
@@ -88,7 +98,7 @@ def check_labels(labels: list | np.ndarray, total_partitions: int):
     return index, labels
 
 
-def check_stratifiable(labels: np.ndarray, total_partitions: int):
+def check_stratifiable(labels: NDArray[np.int_], total_partitions: int) -> bool:
     """
     Very basic check to see if dataset can be stratified by class label. This is not a
     comprehensive test, as factors such as grouping also affect the ability to stratify by label
@@ -124,7 +134,7 @@ def check_stratifiable(labels: np.ndarray, total_partitions: int):
     return stratifiable
 
 
-def check_groups(group_ids: np.ndarray, num_partitions: int):
+def check_groups(group_ids: NDArray[np.int_], num_partitions: int) -> bool:
     """
     Warns user if the number of unique group_ids is incompatible with a grouped partition containing
     num_folds folds. If this is the case, returns groups=None, which tells the partitioner not to
@@ -162,7 +172,7 @@ def check_groups(group_ids: np.ndarray, num_partitions: int):
     return groupable
 
 
-def bin_kmeans(array: np.ndarray):
+def bin_kmeans(array: NDArray[Any]) -> NDArray[np.int_]:
     """
     Find bins of continuous data by iteratively applying k-means clustering, and keeping the
     clustering with the highest silhouette score.
@@ -182,18 +192,18 @@ def bin_kmeans(array: np.ndarray):
         best_score = 0.60
     else:
         best_score = 0.50
-    bin_index = np.zeros(len(array))
+    bin_index = np.zeros(len(array), dtype=np.int_)
     for k in range(2, 20):
         clusterer = KMeans(n_clusters=k)
         cluster_labels = clusterer.fit_predict(array)
         score = silhouette_score(array, cluster_labels, sample_size=25_000)
         if score > best_score:
             best_score = score
-            bin_index = cluster_labels
+            bin_index = cluster_labels.astype(np.int_)
     return bin_index
 
 
-def angle2xy(angles: np.ndarray):
+def angle2xy(angles: NDArray[Any]) -> NDArray[Any]:
     """
     Converts angle measurements to xy coordinates on the unit circle. Needed for binning angle data.
 
@@ -213,7 +223,7 @@ def angle2xy(angles: np.ndarray):
     return xy
 
 
-def get_group_ids(metadata: dict, groupnames: list, num_samples: int):
+def get_group_ids(metadata: dict[str, Any], group_names: list[str], num_samples: int) -> NDArray[np.int_]:
     """Returns individual group numbers based on a subset of metadata defined by groupnames
 
     Parameters
@@ -235,7 +245,7 @@ def get_group_ids(metadata: dict, groupnames: list, num_samples: int):
     group_ids: np.ndarray
         group identifiers from metadata
     """
-    features2group = {k: np.array(v) for k, v in metadata.items() if k in groupnames}
+    features2group = {k: np.array(v) for k, v in metadata.items() if k in group_names}
     if not features2group:
         return np.zeros(num_samples, dtype=int)
     for name, feature in features2group.items():
@@ -252,8 +262,12 @@ def get_group_ids(metadata: dict, groupnames: list, num_samples: int):
 
 
 def make_splits(
-    index: np.…
-    …
+    index: NDArray[np.int_],
+    labels: NDArray[np.int_],
+    n_folds: int,
+    groups: NDArray[np.int_] | None = None,
+    stratified: bool = False,
+) -> list[dict[str, NDArray[np.int_]]]:
     """Split data into n_folds partitions of training and validation data.
 
     Parameters
@@ -290,9 +304,59 @@ def make_splits(
     return split_defs
 
 
+def find_best_split(
+    labels: NDArray[np.int_], split_defs: list[dict[str, NDArray[np.int_]]], stratified: bool, eval_frac: float
+) -> tuple[NDArray[np.int_], NDArray[np.int_]]:
+    """Finds the split that most closely satisfies a criterion determined by the arguments passed.
+    If stratified is True, returns the split whose class balance most closely resembles the overall
+    class balance. If false, returns the split with the size closest to the desired eval_frac
+
+    Parameters
+    ----------
+    labels : np.ndarray
+        Labels upon which splits are (optionally) stratified
+    split_defs : list[dict]
+        List of dictionaries, which specifying train index, validation index, and the ratio of
+        validation to all data.
+    stratified: bool
+        If True, maintain dataset class balance within each train/val split
+    eval_frac: float
+        Desired fraction of the dataset sequestered for evaluation
+
+    Returns
+    -------
+    train_index : np.ndarray
+        indices of data partitioned for training
+    eval_index : np.ndarray
+        indices of data partitioned for evaluation
+    """
+
+    def class_freq_diff(split):
+        train_labels = labels[split["train"]]
+        _, train_counts = np.unique(train_labels, return_counts=True)
+        train_freq = train_counts / train_counts.sum()
+        return np.square(train_freq - class_freq).sum()
+
+    if stratified:
+        _, class_counts = np.unique(labels, return_counts=True)
+        class_freq = class_counts / class_counts.sum()
+        best_split = min(split_defs, key=class_freq_diff)
+        return best_split["train"], best_split["eval"]
+    elif eval_frac <= 2 / 3:
+        best_split = min(split_defs, key=lambda x: abs(eval_frac - x["eval_frac"]))  # type: ignore
+        return best_split["train"], best_split["eval"]
+    else:
+        best_split = min(split_defs, key=lambda x: abs(eval_frac - (1 - x["eval_frac"])))  # type: ignore
+        return best_split["eval"], best_split["train"]
+
+
 def single_split(
-    index: np.…
-    …
+    index: NDArray[np.int_],
+    labels: NDArray[np.int_],
+    eval_frac: float,
+    groups: NDArray[np.int_] | None = None,
+    stratified: bool = False,
+) -> tuple[NDArray[np.int_], NDArray[np.int_]]:
     """Handles the special case where only 1 partition of the data is desired (such as when
     generating the test holdout split). In this case, the desired fraction of the data to be
     partitioned into the test data must be specified, and a single [train, eval] pair are returned.
@@ -317,27 +381,28 @@ def single_split(
     eval_index : np.ndarray
         indices of data partitioned for evaluation
     """
-    if eval_frac <= 2 / 3:
+    if groups is not None:
+        n_unique_groups = np.unique(groups).shape[0]
+        _, label_counts = np.unique(labels, return_counts=True)
+        n_folds = min(n_unique_groups, label_counts.min())
+    elif eval_frac <= 2 / 3:
         n_folds = max(2, int(round(1 / (eval_frac + 1e-6))))
-        split_candidates = make_splits(index, labels, n_folds, groups, stratified)
-        best_split = min(split_candidates, key=lambda x: abs(eval_frac - x["eval_frac"]))
-        return best_split["train"], best_split["eval"]
     else:
-        n_folds = max(2, int(round(1 / (1 - eval_frac …
-        split_candidates = make_splits(index, labels, n_folds, groups, stratified)
-        best_split = min(split_candidates, key=lambda x: abs(eval_frac - (1 - x["eval_frac"])))
-        return best_split["eval"], best_split["train"]
+        n_folds = max(2, int(round(1 / (1 - eval_frac - 1e-6))))
+    split_candidates = make_splits(index, labels, n_folds, groups, stratified)
+    best_train, best_eval = find_best_split(labels, split_candidates, stratified, eval_frac)
+    return best_train, best_eval
 
 
 def split_dataset(
-    labels: list | np.ndarray,
+    labels: list[int] | NDArray[np.int_],
     num_folds: int = 1,
     stratify: bool = False,
-    split_on: list | None = None,
-    metadata: dict | None = None,
+    split_on: list[str] | None = None,
+    metadata: dict[str, Any] | None = None,
     test_frac: float | None = None,
     val_frac: float | None = None,
-):
+) -> dict[str, dict[str, NDArray[np.int_]] | NDArray[np.int_]]:
     """Top level splitting function. Returns a dict with each key-value pair containing
     train and validation indices. Indices for a test holdout may also be optionally included
 
@@ -386,7 +451,7 @@ def split_dataset(
     }
     """
 
-    check_args(num_folds, test_frac, val_frac)
+    test_frac, val_frac = validate_test_val(num_folds, test_frac, val_frac)
     total_partitions = num_folds + 1 if test_frac else num_folds
     index, labels = check_labels(labels, total_partitions)
     stratify &= check_stratifiable(labels, total_partitions)
@@ -399,7 +464,7 @@ def split_dataset(
             groups = None
     else:
         groups = None
-    split_defs = {}
+    split_defs: dict[str, dict[str, NDArray[np.int_]] | NDArray[np.int_]] = {}
     if test_frac:
         tv_idx, test_idx = single_split(index, labels, test_frac, groups, stratify)
         tv_labels = labels[tv_idx]
@@ -410,7 +475,7 @@ def split_dataset(
         tv_labels = labels
         tv_groups = groups
     if num_folds == 1:
-        train_idx, val_idx = single_split(tv_idx, tv_labels, val_frac, tv_groups, stratify)
+        train_idx, val_idx = single_split(tv_idx, tv_labels, val_frac, tv_groups, stratify)
         split_defs["fold_0"] = {"train": tv_idx[train_idx].squeeze(), "val": tv_idx[val_idx].squeeze()}
     else:
         tv_splits = make_splits(tv_idx, tv_labels, num_folds, tv_groups, stratify)
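For orientation, here is a minimal sketch of calling the relocated splitter from its new public module, based only on the signature and return structure visible in this diff. The label array, fraction values, and the "test" result key are illustrative assumptions rather than documented behavior:

import numpy as np

from dataeval.utils.split_dataset import split_dataset  # new location per this diff

labels = np.random.randint(0, 5, size=1_000)  # hypothetical integer class labels

# One train/val fold plus a 20% test holdout, stratified by class label.
splits = split_dataset(labels, num_folds=1, stratify=True, test_frac=0.2, val_frac=0.1)

train_idx = splits["fold_0"]["train"]  # index arrays into the original data
val_idx = splits["fold_0"]["val"]
test_idx = splits.get("test")  # assumed key; present only when test_frac is given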
dataeval/utils/tensorflow/__init__.py

@@ -2,17 +2,18 @@
 TensorFlow models are used in :term:`out of distribution<Out-of-distribution (OOD)>` detectors in the
 :mod:`dataeval.detectors.ood` module.
 
-DataEval provides …
-as well as constructors which allow for customization of the encoder, decoder and any other applicable
-layers used by the model.
+DataEval provides basic default models through the utility :func:`dataeval.utils.tensorflow.create_model`.
 """
 
 from dataeval import _IS_TENSORFLOW_AVAILABLE
 
-from . import loss, models, recon
-
 __all__ = []
 
 
 if _IS_TENSORFLOW_AVAILABLE:
-    …
+    import dataeval.utils.tensorflow.loss as loss
+    from dataeval.utils.tensorflow._internal.utils import create_model
+
+    __all__ = ["create_model", "loss"]
+
+    del _IS_TENSORFLOW_AVAILABLE
|
|
6
6
|
Licensed under Apache Software License (Apache 2.0)
|
7
7
|
"""
|
8
8
|
|
9
|
-
# pyright: reportIncompatibleMethodOverride=false
|
10
|
-
|
11
9
|
from __future__ import annotations
|
12
10
|
|
13
|
-
from typing import
|
11
|
+
from typing import cast
|
14
12
|
|
15
13
|
import tensorflow as tf
|
16
14
|
import tf_keras as keras
|
15
|
+
from tensorflow.python.module.module import Module # noqa
|
16
|
+
from tf_keras import Sequential
|
17
17
|
from tf_keras.layers import (
|
18
18
|
Dense,
|
19
19
|
Flatten,
|
@@ -103,7 +103,7 @@ class Sampling(Layer):
|
|
103
103
|
|
104
104
|
|
105
105
|
class EncoderAE(Layer):
|
106
|
-
def __init__(self, encoder_net:
|
106
|
+
def __init__(self, encoder_net: Sequential) -> None:
|
107
107
|
"""
|
108
108
|
Encoder of AE.
|
109
109
|
|
@@ -115,14 +115,14 @@ class EncoderAE(Layer):
|
|
115
115
|
Name of encoder.
|
116
116
|
"""
|
117
117
|
super().__init__(name="encoder_ae")
|
118
|
-
self.encoder_net = encoder_net
|
118
|
+
self.encoder_net: Sequential = encoder_net
|
119
119
|
|
120
120
|
def call(self, x: tf.Tensor) -> tf.Tensor:
|
121
121
|
return cast(tf.Tensor, self.encoder_net(x))
|
122
122
|
|
123
123
|
|
124
124
|
class EncoderVAE(Layer):
|
125
|
-
def __init__(self, encoder_net:
|
125
|
+
def __init__(self, encoder_net: Sequential, latent_dim: int) -> None:
|
126
126
|
"""
|
127
127
|
Encoder of VAE.
|
128
128
|
|
@@ -136,23 +136,23 @@ class EncoderVAE(Layer):
|
|
136
136
|
Name of encoder.
|
137
137
|
"""
|
138
138
|
super().__init__(name="encoder_vae")
|
139
|
-
self.encoder_net = encoder_net
|
140
|
-
self.
|
141
|
-
self.
|
142
|
-
self.
|
139
|
+
self.encoder_net: Sequential = encoder_net
|
140
|
+
self._fc_mean = Dense(latent_dim, activation=None)
|
141
|
+
self._fc_log_var = Dense(latent_dim, activation=None)
|
142
|
+
self._sampling = Sampling()
|
143
143
|
|
144
144
|
def call(self, x: tf.Tensor) -> tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
|
145
145
|
x = cast(tf.Tensor, self.encoder_net(x))
|
146
146
|
if len(x.shape) > 2:
|
147
147
|
x = cast(tf.Tensor, Flatten()(x))
|
148
|
-
z_mean = cast(tf.Tensor, self.
|
149
|
-
z_log_var = cast(tf.Tensor, self.
|
150
|
-
z = cast(tf.Tensor, self.
|
148
|
+
z_mean = cast(tf.Tensor, self._fc_mean(x))
|
149
|
+
z_log_var = cast(tf.Tensor, self._fc_log_var(x))
|
150
|
+
z = cast(tf.Tensor, self._sampling((z_mean, z_log_var)))
|
151
151
|
return z_mean, z_log_var, z
|
152
152
|
|
153
153
|
|
154
154
|
class Decoder(Layer):
|
155
|
-
def __init__(self, decoder_net:
|
155
|
+
def __init__(self, decoder_net: Sequential) -> None:
|
156
156
|
"""
|
157
157
|
Decoder of AE and VAE.
|
158
158
|
|
@@ -164,10 +164,10 @@ class Decoder(Layer):
|
|
164
164
|
Name of decoder.
|
165
165
|
"""
|
166
166
|
super().__init__(name="decoder")
|
167
|
-
self.decoder_net = decoder_net
|
167
|
+
self.decoder_net: Sequential = decoder_net
|
168
168
|
|
169
|
-
def call(self,
|
170
|
-
return cast(tf.Tensor, self.decoder_net(
|
169
|
+
def call(self, inputs: tf.Tensor) -> tf.Tensor:
|
170
|
+
return cast(tf.Tensor, self.decoder_net(inputs))
|
171
171
|
|
172
172
|
|
173
173
|
class AE(keras.Model):
|
@@ -176,19 +176,19 @@ class AE(keras.Model):
|
|
176
176
|
|
177
177
|
Parameters
|
178
178
|
----------
|
179
|
-
encoder_net :
|
179
|
+
encoder_net : Sequential
|
180
180
|
Layers for the encoder wrapped in a keras.Sequential class.
|
181
|
-
decoder_net :
|
181
|
+
decoder_net : Sequential
|
182
182
|
Layers for the decoder wrapped in a keras.Sequential class.
|
183
183
|
"""
|
184
184
|
|
185
|
-
def __init__(self, encoder_net:
|
185
|
+
def __init__(self, encoder_net: Sequential, decoder_net: Sequential) -> None:
|
186
186
|
super().__init__(name="ae")
|
187
|
-
self.encoder = EncoderAE(encoder_net)
|
188
|
-
self.decoder = Decoder(decoder_net)
|
187
|
+
self.encoder: Layer = EncoderAE(encoder_net)
|
188
|
+
self.decoder: Layer = Decoder(decoder_net)
|
189
189
|
|
190
|
-
def call(self,
|
191
|
-
z = cast(tf.Tensor, self.encoder(
|
190
|
+
def call(self, inputs: tf.Tensor, training: bool | None = None, mask: tf.Tensor | None = None) -> tf.Tensor:
|
191
|
+
z = cast(tf.Tensor, self.encoder(inputs))
|
192
192
|
x_recon = cast(tf.Tensor, self.decoder(z))
|
193
193
|
return x_recon
|
194
194
|
|
@@ -199,9 +199,9 @@ class VAE(keras.Model):
|
|
199
199
|
|
200
200
|
Parameters
|
201
201
|
----------
|
202
|
-
encoder_net :
|
202
|
+
encoder_net : Sequential
|
203
203
|
Layers for the encoder wrapped in a keras.Sequential class.
|
204
|
-
decoder_net :
|
204
|
+
decoder_net : Sequential
|
205
205
|
Layers for the decoder wrapped in a keras.Sequential class.
|
206
206
|
latent_dim : int
|
207
207
|
Dimensionality of the :term:`latent space<Latent Space>`.
|
@@ -209,15 +209,15 @@ class VAE(keras.Model):
|
|
209
209
|
Beta parameter for KL-divergence loss term.
|
210
210
|
"""
|
211
211
|
|
212
|
-
def __init__(self, encoder_net:
|
212
|
+
def __init__(self, encoder_net: Sequential, decoder_net: Sequential, latent_dim: int, beta: float = 1.0) -> None:
|
213
213
|
super().__init__(name="vae_model")
|
214
|
-
self.encoder = EncoderVAE(encoder_net, latent_dim)
|
215
|
-
self.decoder = Decoder(decoder_net)
|
216
|
-
self.beta = beta
|
217
|
-
self.latent_dim = latent_dim
|
214
|
+
self.encoder: Layer = EncoderVAE(encoder_net, latent_dim)
|
215
|
+
self.decoder: Layer = Decoder(decoder_net)
|
216
|
+
self.beta: float = beta
|
217
|
+
self.latent_dim: int = latent_dim
|
218
218
|
|
219
|
-
def call(self,
|
220
|
-
z_mean, z_log_var, z = cast(tuple[tf.Tensor, tf.Tensor, tf.Tensor], self.encoder(
|
219
|
+
def call(self, inputs: tf.Tensor, training: bool | None = None, mask: tf.Tensor | None = None) -> tf.Tensor:
|
220
|
+
z_mean, z_log_var, z = cast(tuple[tf.Tensor, tf.Tensor, tf.Tensor], self.encoder(inputs))
|
221
221
|
x_recon = self.decoder(z)
|
222
222
|
# add KL divergence loss term
|
223
223
|
kl_loss = -0.5 * tf.reduce_mean(z_log_var - tf.square(z_mean) - tf.exp(z_log_var) + 1)
|
@@ -231,37 +231,35 @@ class AEGMM(keras.Model):
|
|
231
231
|
|
232
232
|
Parameters
|
233
233
|
----------
|
234
|
-
encoder_net :
|
234
|
+
encoder_net : Sequential
|
235
235
|
Layers for the encoder wrapped in a keras.Sequential class.
|
236
|
-
decoder_net :
|
236
|
+
decoder_net : Sequential
|
237
237
|
Layers for the decoder wrapped in a keras.Sequential class.
|
238
|
-
gmm_density_net :
|
238
|
+
gmm_density_net : Sequential
|
239
239
|
Layers for the GMM network wrapped in a keras.Sequential class.
|
240
240
|
n_gmm : int
|
241
241
|
Number of components in GMM.
|
242
|
-
recon_features : Callable, default eucl_cosim_features
|
243
|
-
Function to extract features from the reconstructed instance by the decoder.
|
244
242
|
"""
|
245
243
|
|
246
244
|
def __init__(
|
247
245
|
self,
|
248
|
-
encoder_net:
|
249
|
-
decoder_net:
|
250
|
-
gmm_density_net:
|
246
|
+
encoder_net: Sequential,
|
247
|
+
decoder_net: Sequential,
|
248
|
+
gmm_density_net: Sequential,
|
251
249
|
n_gmm: int,
|
252
|
-
recon_features: Callable = eucl_cosim_features,
|
253
250
|
) -> None:
|
254
251
|
super().__init__("aegmm")
|
255
252
|
self.encoder = encoder_net
|
256
253
|
self.decoder = decoder_net
|
257
254
|
self.gmm_density = gmm_density_net
|
258
255
|
self.n_gmm = n_gmm
|
259
|
-
self.recon_features = recon_features
|
260
256
|
|
261
|
-
def call(
|
262
|
-
|
257
|
+
def call(
|
258
|
+
self, inputs: tf.Tensor, training: bool | None = None, mask: tf.Tensor | None = None
|
259
|
+
) -> tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
|
260
|
+
enc = self.encoder(inputs)
|
263
261
|
x_recon = cast(tf.Tensor, self.decoder(enc))
|
264
|
-
recon_features =
|
262
|
+
recon_features = eucl_cosim_features(inputs, x_recon)
|
265
263
|
z = cast(tf.Tensor, tf.concat([enc, recon_features], -1))
|
266
264
|
gamma = cast(tf.Tensor, self.gmm_density(z))
|
267
265
|
return x_recon, z, gamma
|
@@ -273,45 +271,43 @@ class VAEGMM(keras.Model):
|
|
273
271
|
|
274
272
|
Parameters
|
275
273
|
----------
|
276
|
-
encoder_net :
|
274
|
+
encoder_net : Sequential
|
277
275
|
Layers for the encoder wrapped in a keras.Sequential class.
|
278
|
-
decoder_net :
|
276
|
+
decoder_net : Sequential
|
279
277
|
Layers for the decoder wrapped in a keras.Sequential class.
|
280
|
-
gmm_density_net :
|
278
|
+
gmm_density_net : Sequential
|
281
279
|
Layers for the GMM network wrapped in a keras.Sequential class.
|
282
280
|
n_gmm : int
|
283
281
|
Number of components in GMM.
|
284
282
|
latent_dim : int
|
285
283
|
Dimensionality of the :term:`latent space<Latent Space>`.
|
286
|
-
recon_features : Callable, default eucl_cosim_features
|
287
|
-
Function to extract features from the reconstructed instance by the decoder.
|
288
284
|
beta : float, default 1.0
|
289
285
|
Beta parameter for KL-divergence loss term.
|
290
286
|
"""
|
291
287
|
|
292
288
|
def __init__(
|
293
289
|
self,
|
294
|
-
encoder_net:
|
295
|
-
decoder_net:
|
296
|
-
gmm_density_net:
|
290
|
+
encoder_net: Sequential,
|
291
|
+
decoder_net: Sequential,
|
292
|
+
gmm_density_net: Sequential,
|
297
293
|
n_gmm: int,
|
298
294
|
latent_dim: int,
|
299
|
-
recon_features: Callable = eucl_cosim_features,
|
300
295
|
beta: float = 1.0,
|
301
296
|
) -> None:
|
302
297
|
super().__init__(name="vaegmm")
|
303
|
-
self.encoder = EncoderVAE(encoder_net, latent_dim)
|
304
|
-
self.decoder = decoder_net
|
305
|
-
self.gmm_density = gmm_density_net
|
306
|
-
self.n_gmm = n_gmm
|
307
|
-
self.latent_dim = latent_dim
|
308
|
-
self.recon_features = recon_features
|
298
|
+
self.encoder: Sequential = EncoderVAE(encoder_net, latent_dim)
|
299
|
+
self.decoder: Sequential = decoder_net
|
300
|
+
self.gmm_density: Sequential = gmm_density_net
|
301
|
+
self.n_gmm: int = n_gmm
|
302
|
+
self.latent_dim: int = latent_dim
|
309
303
|
self.beta = beta
|
310
304
|
|
311
|
-
def call(
|
312
|
-
|
305
|
+
def call(
|
306
|
+
self, inputs: tf.Tensor, training: bool | None = None, mask: tf.Tensor | None = None
|
307
|
+
) -> tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
|
308
|
+
enc_mean, enc_log_var, enc = cast(tuple[tf.Tensor, tf.Tensor, tf.Tensor], self.encoder(inputs))
|
313
309
|
x_recon = cast(tf.Tensor, self.decoder(enc))
|
314
|
-
recon_features =
|
310
|
+
recon_features = eucl_cosim_features(inputs, x_recon)
|
315
311
|
z = cast(tf.Tensor, tf.concat([enc, recon_features], -1))
|
316
312
|
gamma = cast(tf.Tensor, self.gmm_density(z))
|
317
313
|
# add KL divergence loss term
|
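A sketch of constructing one of these models with the now Sequential-typed arguments. The toy layer stack is invented for illustration, and the import path follows the _internal relocation in the file list above:

from tf_keras import Sequential
from tf_keras.layers import Dense, Flatten, InputLayer, Reshape

from dataeval.utils.tensorflow._internal.autoencoder import AE  # internal module per this diff

# Arbitrary toy architecture: 32x32x3 images down to a 64-dim code and back.
encoder_net = Sequential([InputLayer(input_shape=(32, 32, 3)), Flatten(), Dense(64, activation="relu")])
decoder_net = Sequential([Dense(32 * 32 * 3, activation=None), Reshape((32, 32, 3))])

ae = AE(encoder_net, decoder_net)  # AE.call() runs the encoder, then the decoder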
dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py}

@@ -10,6 +10,7 @@ from __future__ import annotations
 
 from typing import Literal, cast
 
+import numpy as np
 import tensorflow as tf
 from numpy.typing import NDArray
 from tensorflow_probability.python.distributions.mvn_diag import MultivariateNormalDiag
@@ -17,7 +18,7 @@ from tensorflow_probability.python.distributions.mvn_tril import MultivariateNormalTriL
 from tensorflow_probability.python.stats import covariance
 from tf_keras.layers import Flatten
 
-from dataeval._internal.models.tensorflow.gmm import gmm_energy, gmm_params
+from dataeval.utils.tensorflow._internal.gmm import gmm_energy, gmm_params
 
 
 class Elbo:
@@ -39,26 +40,26 @@ class Elbo:
     def __init__(
         self,
         cov_type: Literal["cov_full", "cov_diag"] | float = 1.0,
-        x: tf.Tensor | NDArray | None = None,
+        x: tf.Tensor | NDArray[np.float32] | None = None,
     ):
         if isinstance(cov_type, float):
-            self.cov = ("sim", cov_type)
+            self._cov = ("sim", cov_type)
         elif cov_type in ["cov_full", "cov_diag"]:
-            x_np: NDArray = x.numpy() if tf.is_tensor(x) else x  # type: ignore
+            x_np: NDArray[np.float32] = x.numpy().astype(np.float32) if tf.is_tensor(x) else x  # type: ignore
             cov = covariance(x_np.reshape(x_np.shape[0], -1))  # type: ignore py38
             if cov_type == "cov_diag":  # infer standard deviation from covariance matrix
                 cov = tf.math.sqrt(tf.linalg.diag_part(cov))
-            self.cov = (cov_type, cov)
+            self._cov = (cov_type, cov)
         else:
             raise ValueError("Only cov_full, cov_diag or sim value should be specified.")
 
     def __call__(self, y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor:
         y_pred_flat = cast(tf.Tensor, Flatten()(y_pred))
 
-        if self.cov[0] == "cov_full":
-            y_mn = MultivariateNormalTriL(y_pred_flat, scale_tril=tf.linalg.cholesky(self.cov[1]))
+        if self._cov[0] == "cov_full":
+            y_mn = MultivariateNormalTriL(y_pred_flat, scale_tril=tf.linalg.cholesky(self._cov[1]))
         else:  # cov_diag and sim
-            cov_diag = self.cov[1] if self.cov[0] == "cov_diag" else self.cov[1] * tf.ones(y_pred_flat.shape[-1])
+            cov_diag = self._cov[1] if self._cov[0] == "cov_diag" else self._cov[1] * tf.ones(y_pred_flat.shape[-1])
             y_mn = MultivariateNormalDiag(y_pred_flat, scale_diag=cov_diag)
 
         loss = -tf.reduce_mean(y_mn.log_prob(Flatten()(y_true)))
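Finally, a sketch exercising the renamed Elbo internals shown above. The batch data is made up, and the import path follows the losses.py → loss.py rename in the file list:

import numpy as np
import tensorflow as tf

from dataeval.utils.tensorflow._internal.loss import Elbo  # new path per this diff

x = np.random.rand(8, 4).astype(np.float32)  # hypothetical training batch

elbo_sim = Elbo(cov_type=1.0)  # a float selects the "sim" scalar-variance branch
elbo_diag = Elbo(cov_type="cov_diag", x=x)  # diagonal covariance estimated from x

# A perfect reconstruction for illustration; the result is the negative mean log-probability.
loss = elbo_sim(tf.convert_to_tensor(x), tf.convert_to_tensor(x))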