PyPI - sdg-core-lib - Versions diffs - 0.1.0__py3-none-any.whl - Mend

sdg-core-lib 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77) hide show

sdg_core_lib/data_generator/models/keras/VAE.py ADDED Viewed

@@ -0,0 +1,61 @@
+import keras
+from keras.api import layers, ops
+import tensorflow as tf
+class Sampling(layers.Layer):
+    """Reparameterization layer: samples z from the pair (z_mean, z_log_var).
+    The layer owns a ``keras.random.SeedGenerator`` built from ``seed`` and passes
+    it to every noise draw.
+    """
+    def __init__(self, seed: int = 42, **kwargs):
+        super().__init__(**kwargs)
+        # Generator handed to keras.random.normal on each call.
+        self.seed_generator = keras.random.SeedGenerator(seed)
+    def call(self, inputs):
+        """Return ``z_mean + exp(0.5 * z_log_var) * epsilon`` with epsilon drawn from keras.random.normal."""
+        mean, log_var = inputs
+        mean_shape = ops.shape(mean)
+        # Noise is drawn with the first two dimensions of z_mean.
+        noise = keras.random.normal(
+            shape=(mean_shape[0], mean_shape[1]), seed=self.seed_generator
+        )
+        std_dev = ops.exp(0.5 * log_var)
+        return mean + std_dev * noise
+class VAE(keras.Model):
+    """Keras model that pairs an encoder with a decoder and trains them with a custom step.
+    ``train_step`` minimises an L1 reconstruction error plus ``beta`` times the
+    KL term computed from the encoder's ``z_mean`` and ``z_log_var`` outputs.
+    """
+    def __init__(self, encoder, decoder, beta=1, **kwargs):
+        super().__init__(**kwargs)
+        self.encoder = encoder
+        self.decoder = decoder
+        self._beta = beta
+        # Running means exposed through ``metrics`` and returned by ``train_step``.
+        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
+        self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
+        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")
+    @property
+    def metrics(self):
+        """List the three loss trackers: total, reconstruction, KL."""
+        trackers = [self.total_loss_tracker]
+        trackers.append(self.reconstruction_loss_tracker)
+        trackers.append(self.kl_loss_tracker)
+        return trackers
+    def train_step(self, data):
+        """Run one optimisation step on ``data`` and return the running loss means."""
+        with tf.GradientTape() as tape:
+            z_mean, z_log_var, z = self.encoder(data)
+            decoded = self.decoder(z)
+            # Absolute error summed over the last axis, then averaged.
+            abs_error = ops.abs(data - decoded)
+            reconstruction_loss = ops.mean(ops.sum(abs_error, axis=-1))
+            # KL term summed over axis 1, then averaged.
+            kl_terms = -0.5 * (1 + z_log_var - ops.square(z_mean) - ops.exp(z_log_var))
+            kl_loss = ops.mean(ops.sum(kl_terms, axis=1))
+            total_loss = reconstruction_loss + self._beta * kl_loss
+        gradients = tape.gradient(total_loss, self.trainable_weights)
+        self.optimizer.apply_gradients(zip(gradients, self.trainable_weights))
+        updates = (
+            (self.total_loss_tracker, total_loss),
+            (self.reconstruction_loss_tracker, reconstruction_loss),
+            (self.kl_loss_tracker, kl_loss),
+        )
+        for tracker, loss_value in updates:
+            tracker.update_state(loss_value)
+        results = {"loss": self.total_loss_tracker.result()}
+        results["reconstruction_loss"] = self.reconstruction_loss_tracker.result()
+        results["kl_loss"] = self.kl_loss_tracker.result()
+        return results

sdg_core_lib/data_generator/models/keras/__init__.py ADDED Viewed

File without changes

sdg_core_lib/data_generator/models/keras/implementation/TabularVAE.py ADDED Viewed

@@ -0,0 +1,96 @@
+import keras
+from keras import layers
+from sdg_core_lib import NumericDataset
+from sdg_core_lib.data_generator.models.ModelInfo import ModelInfo, AllowedData
+from sdg_core_lib.data_generator.models.keras.KerasBaseVAE import KerasBaseVAE
+from sdg_core_lib.preprocess.scale import standardize_simple_tabular_input
+from sdg_core_lib.data_generator.models.keras.VAE import Sampling, VAE
+class TabularVAE(KerasBaseVAE):
+    """
+    Variational Autoencoder for tabular data, built on KerasBaseVAE.
+    The encoder and decoder are stacks of Dense layers; the decoder ends in a
+    linear Dense layer whose width is ``input_shape[0]``.
+    Attributes set by this class:
+        _beta: weight of the KL term passed to the VAE model (fixed to 1 here).
+        _learning_rate (float): value received by the constructor.
+        _batch_size (int): value received by the constructor.
+        _epochs (int): value received by the constructor.
+        _scaler: assigned by _pre_process on the first call that finds it unset.
+    Methods:
+        _load_model: wraps a given encoder/decoder pair in a VAE.
+        _build: creates encoder, decoder and the VAE that joins them.
+        _pre_process: standardizes the continuous columns of a dataset.
+        self_describe: returns the model metadata.
+    """
+    def __init__(
+        self,
+        metadata: dict,
+        model_name: str,
+        input_shape: str,
+        load_path: str | None,
+        latent_dim: int = 2,
+        learning_rate: float = 1e-3,
+        batch_size: int = 8,
+        epochs: int = 200,
+    ):
+        super().__init__(metadata, model_name, input_shape, load_path, latent_dim)
+        self._beta = 1
+        self._learning_rate = learning_rate
+        self._epochs = epochs
+        self._batch_size = batch_size
+        # Defined outside this class (presumably on KerasBaseVAE).
+        self._instantiate()
+    def _load_model(self, encoder, decoder):
+        """Store a VAE made of the given encoder and decoder in ``self._model``."""
+        self._model = VAE(encoder, decoder, self._beta)
+    def _build(self, input_shape: tuple[int, ...]):
+        """Create the Dense encoder/decoder pair and return the VAE combining them."""
+        # Encoder: 32 -> 64 -> 16 ReLU units, then the two latent heads.
+        encoder_inputs = keras.Input(shape=input_shape)
+        hidden = encoder_inputs
+        for units in (32, 64, 16):
+            hidden = layers.Dense(units, activation="relu")(hidden)
+        z_mean = layers.Dense(self._latent_dim, name="z_mean")(hidden)
+        z_log_var = layers.Dense(self._latent_dim, name="z_log_var")(hidden)
+        z = Sampling()([z_mean, z_log_var])
+        encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
+        # Decoder: mirror of the encoder widths, closed by a linear output layer.
+        latent_inputs = keras.Input(shape=(self._latent_dim,))
+        decoded = latent_inputs
+        for units in (16, 64, 32):
+            decoded = layers.Dense(units, activation="relu")(decoded)
+        decoder_outputs = layers.Dense(input_shape[0], activation="linear")(decoded)
+        decoder = keras.Model(latent_inputs, decoder_outputs, name="decoder")
+        model = VAE(encoder, decoder, self._beta, name="TabularVAE")
+        model.summary()
+        return model
+    def _pre_process(self, data: NumericDataset, **kwargs):
+        """Return the scaled continuous data, fitting and storing a scaler if none is set."""
+        continuous = data.continuous_data.to_numpy()
+        if self._scaler:
+            # A scaler is already stored: reuse it.
+            return self._scale(continuous)
+        fitted_scaler, scaled, _ = standardize_simple_tabular_input(
+            train_data=continuous
+        )
+        self._scaler = fitted_scaler
+        return scaled
+    @classmethod
+    def self_describe(cls):
+        """Return the ModelInfo description of this class via ``get_model_info()``."""
+        accepted = [
+            AllowedData(dtype_name, False)
+            for dtype_name in ("float32", "int32", "int64")
+        ]
+        info = ModelInfo(
+            name=f"{cls.__module__}.{cls.__qualname__}",
+            default_loss_function="ELBO LOSS",
+            description="A Variational Autoencoder for data generation",
+            allowed_data=accepted,
+        )
+        return info.get_model_info()

sdg_core_lib/data_generator/models/keras/implementation/TimeSeriesVAE.py ADDED Viewed

@@ -0,0 +1,156 @@
+import numpy as np
+import keras
+from sdg_core_lib.NumericDataset import NumericDataset
+from sdg_core_lib.data_generator.models.ModelInfo import ModelInfo, AllowedData
+from sdg_core_lib.data_generator.models.keras.KerasBaseVAE import KerasBaseVAE
+from keras import layers
+from sdg_core_lib.preprocess.scale import standardize_simple_tabular_time_series
+from sdg_core_lib.data_generator.models.keras.VAE import Sampling, VAE
+class TimeSeriesVAE(KerasBaseVAE):
+    """
+    Convolutional beta-VAE for generating synthetic time series, built on KerasBaseVAE.
+    Arrays handled by ``_scale`` / ``_inverse_scale`` are unpacked as
+    (batch, features, steps); ``_build`` permutes the two non-batch axes before the
+    Conv1D layers and permutes them back on the decoder output.
+    Attributes set by this class:
+        _beta (float): coefficient of the KL term passed to the VAE model (fixed to 0.15 here).
+        _learning_rate (float): value received by the constructor.
+        _epochs (int): value received by the constructor.
+        _batch_size (int): value received by the constructor.
+        _scaler: assigned by _pre_process on the first call that finds it unset.
+    Methods:
+        _load_model(encoder, decoder): wraps a given encoder/decoder pair in a VAE.
+        _build(input_shape): creates encoder, decoder and the VAE that joins them.
+        _scale(data) / _inverse_scale(data): apply the stored scaler per feature.
+        _pre_process(data, **kwargs): scales the dataset, fitting a scaler if needed.
+        self_describe(): returns the model metadata.
+    """
+    def __init__(
+        self,
+        metadata: dict,
+        model_name: str,
+        input_shape: str,
+        load_path: str,
+        latent_dim: int = 2,
+        learning_rate: float = 1e-3,
+        batch_size: int = 16,
+        epochs: int = 60,
+    ):
+        super().__init__(metadata, model_name, input_shape, load_path, latent_dim)
+        self._beta = 0.15
+        self._learning_rate = learning_rate
+        self._epochs = epochs
+        self._batch_size = batch_size
+        # Defined outside this class (presumably on KerasBaseVAE).
+        self._instantiate()
+    def _load_model(self, encoder, decoder):
+        """Store a VAE made of the given encoder and decoder in ``self._model``."""
+        self._model = VAE(encoder, decoder, self._beta)
+    def _build(self, input_shape: tuple[int, ...]):
+        """Create the convolutional encoder/decoder pair and return the VAE combining them."""
+        print(input_shape)
+        encoder_inputs = keras.Input(shape=input_shape)
+        # Swap the two non-batch axes so Conv1D (channels_last) convolves along the former last axis.
+        encoder_inputs_permute = layers.Permute((2, 1))(encoder_inputs)
+        x = layers.Conv1D(
+            32,
+            9,
+            activation="relu",
+            padding="valid",
+            strides=1,
+            data_format="channels_last",
+        )(encoder_inputs_permute)
+        x = layers.Conv1D(
+            64,
+            5,
+            activation="relu",
+            padding="valid",
+            strides=1,
+            data_format="channels_last",
+        )(x)
+        # Remembered so the decoder can rebuild the same feature map from a flat vector.
+        shape_before_flatten = x.shape[1:]
+        x = layers.Flatten()(x)
+        x = layers.Dense(16, activation="relu")(x)
+        z_mean = layers.Dense(self._latent_dim, name="z_mean")(x)
+        z_log_var = layers.Dense(self._latent_dim, name="z_log_var")(x)
+        z = Sampling()([z_mean, z_log_var])
+        encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
+        latent_inputs = keras.Input(shape=(self._latent_dim,))
+        # np.prod returns a NumPy scalar; hand the layer a plain int width.
+        y = layers.Dense(int(np.prod(shape_before_flatten)), activation="relu")(
+            latent_inputs
+        )
+        y = layers.Reshape(shape_before_flatten)(y)
+        y = layers.Conv1DTranspose(
+            64,
+            5,
+            activation="relu",
+            padding="valid",
+            strides=1,
+            data_format="channels_last",
+        )(y)
+        y = layers.Conv1DTranspose(
+            32,
+            9,
+            activation="relu",
+            padding="valid",
+            strides=1,
+            data_format="channels_last",
+        )(y)
+        # NOTE(review): the output layer uses ReLU, so reconstructions are never
+        # negative, while _pre_process feeds standardized data. Left unchanged to
+        # keep already-trained models behaving the same; confirm this is intended.
+        decoder_outputs = layers.Conv1DTranspose(
+            input_shape[0], 3, activation="relu", padding="same"
+        )(y)
+        decoder_outputs_permute = layers.Permute((2, 1))(decoder_outputs)
+        decoder = keras.Model(latent_inputs, decoder_outputs_permute, name="decoder")
+        vae = VAE(encoder, decoder, self._beta, name="TimeSeriesVAE")
+        encoder.summary()
+        decoder.summary()
+        vae.summary()
+        return vae
+    def _scale(self, data: np.ndarray):
+        """Apply the stored scaler to a (batch, features, steps) array; return ``data`` untouched if no scaler is set."""
+        # Check the scaler before touching the shape, exactly as _inverse_scale does,
+        # so the pass-through path does not fail on input that is not 3-D.
+        if self._scaler is None:
+            return data
+        batch, feats, steps = data.shape
+        # Flatten to (batch * steps, features) so the scaler sees one row per time step.
+        data_reshaped = data.transpose(0, 2, 1).reshape(-1, feats)
+        data_scaled = self._scaler.transform(data_reshaped)
+        data_scaled = data_scaled.reshape(batch, steps, feats).transpose(0, 2, 1)
+        return data_scaled
+    def _inverse_scale(self, data: np.ndarray):
+        """Undo the stored scaler on a (batch, features, steps) array; return ``data`` untouched if no scaler is set."""
+        if self._scaler is None:
+            return data
+        batch, feats, steps = data.shape
+        data_reshaped = data.transpose(0, 2, 1).reshape(-1, feats)
+        data_unscaled = self._scaler.inverse_transform(data_reshaped)
+        data_unscaled = data_unscaled.reshape(batch, steps, feats).transpose(0, 2, 1)
+        return data_unscaled
+    def _pre_process(self, data: NumericDataset, **kwargs):
+        """Return the scaled dataset values, fitting and storing a scaler if none is set."""
+        np_data = np.array(data.dataframe.values.tolist())
+        if not self._scaler:
+            scaler, np_input_scaled, _ = standardize_simple_tabular_time_series(
+                train_data=np_data
+            )
+            self._scaler = scaler
+        else:
+            np_input_scaled = self._scale(np_data)
+        return np_input_scaled
+    @classmethod
+    def self_describe(cls):
+        """Return the ModelInfo description of this class via ``get_model_info()``."""
+        return ModelInfo(
+            name=f"{cls.__module__}.{cls.__qualname__}",
+            default_loss_function="ELBO LOSS",
+            description="A Beta-Variational Autoencoder for time series generation",
+            allowed_data=[
+                AllowedData("float32", False),
+                AllowedData("int32", False),
+                AllowedData("int64", False),
+            ],
+        ).get_model_info()

sdg_core_lib/data_generator/models/keras/implementation/__init__.py ADDED Viewed

File without changes

sdg_core_lib/evaluate/Metrics.py ADDED Viewed

@@ -0,0 +1,48 @@
+class Metric:
+    """Base record for one evaluation metric: a title, a unit of measure and a value.
+    ``type`` is ``None`` on the base class.
+    """
+    def __init__(self, title: str, unit_measure: str, value: float | int | dict):
+        self.title, self.unit_measure, self.value = title, unit_measure, value
+        self.type = None
+    def to_json(self):
+        """Return title, unit_measure and value as a dict (``type`` is not included)."""
+        exported = ("title", "unit_measure", "value")
+        return {field: getattr(self, field) for field in exported}
+class StatisticalMetric(Metric):
+    """Metric whose ``type`` is ``"statistical_metrics"``."""
+    def __init__(self, title: str, unit_measure: str, value: float | int | dict):
+        super().__init__(title=title, unit_measure=unit_measure, value=value)
+        # Set after the base initialiser, which resets ``type`` to None.
+        self.type = "statistical_metrics"
+class AdherenceMetric(Metric):
+    """Metric whose ``type`` is ``"adherence_metrics"``."""
+    def __init__(self, title: str, unit_measure: str, value: float | int | dict):
+        super().__init__(title=title, unit_measure=unit_measure, value=value)
+        # Set after the base initialiser, which resets ``type`` to None.
+        self.type = "adherence_metrics"
+class NoveltyMetric(Metric):
+    """Metric whose ``type`` is ``"novelty_metrics"``."""
+    def __init__(self, title: str, unit_measure: str, value: float | int | dict):
+        super().__init__(title=title, unit_measure=unit_measure, value=value)
+        # Set after the base initialiser, which resets ``type`` to None.
+        self.type = "novelty_metrics"
+class MetricReport:
+    """Collects serialized metrics in ``report``, grouped under each metric's ``type``."""
+    def __init__(self):
+        self.report = {}
+    def add_metric(self, metric: Metric):
+        """Append ``metric.to_json()`` to the list stored under ``metric.type``."""
+        if metric.type in self.report:
+            self.report[metric.type].append(metric.to_json())
+            return
+        # First metric of this type: start its list.
+        self.report[metric.type] = [metric.to_json()]
+    def to_json(self):
+        """Return the grouped report, or a new empty dict when nothing was added."""
+        return self.report if self.report else {}

sdg_core_lib/evaluate/TabularComparison.py ADDED Viewed

@@ -0,0 +1,276 @@
+import numpy as np
+import pandas as pd
+import scipy.stats as ss
+from sdg_core_lib.evaluate.Metrics import (
+    MetricReport,
+    StatisticalMetric,
+    AdherenceMetric,
+    NoveltyMetric,
+)
+class TabularComparisonEvaluator:
+    """
+    Evaluates the quality of a synthetic dataset with respect to a real one.
+    The evaluation is based on the following metrics:
+    - Statistical properties: Wasserstein distance (numerical columns) and Cramer's V (categorical columns)
+    - Adherence: share of synthetic values that stay within the categories / min-max range of the real data
+    - Novelty: share of unique and of new rows in the synthetic dataset
+    Results are collected in ``self.report`` (a MetricReport).
+    """
+    def __init__(
+        self,
+        real_data: pd.DataFrame,
+        synthetic_data: pd.DataFrame,
+        numerical_columns: list[str],
+        categorical_columns: list[str],
+    ):
+        self._real_data = real_data
+        self._synthetic_data = synthetic_data
+        self._numerical_columns = numerical_columns
+        self._categorical_columns = categorical_columns
+        self.report = MetricReport()
+    def compute(self):
+        """
+        Runs the statistical, adherence and novelty evaluations.
+        :return: the report as a dict, or None when both column lists hold at most one name
+        :raises ValueError: if the synthetic dataset has no rows
+        """
+        # NOTE(review): this guard also skips datasets with exactly one numerical
+        # and/or one categorical column, and returns None rather than an empty
+        # report. Kept unchanged because callers may rely on it; confirm the intent.
+        if len(self._numerical_columns) <= 1 and len(self._categorical_columns) <= 1:
+            return None
+        self._evaluate_statistical_properties()
+        self._evaluate_adherence()
+        self._evaluate_novelty()
+        return self.report.to_json()
+    @staticmethod
+    def _compute_cramer_v(data1: np.ndarray, data2: np.ndarray):
+        """
+        Computes the bias-corrected Cramer's V on a pair of categorical columns
+        :param data1: first column
+        :param data2: second column
+        :return: Cramer's V, or 0.0 when the contingency table holds fewer than two
+        observations or the corrected denominator is not positive
+        """
+        # Empty input cannot be tabulated; answer before building the table.
+        if np.size(data1) == 0 or np.size(data2) == 0:
+            return 0.0
+        confusion_matrix = pd.crosstab(data1, data2)
+        # Total number of observations.
+        n = confusion_matrix.to_numpy().sum()
+        # These guards run before chi2_contingency, which rejects an empty table;
+        # n == 1 would also make the correction terms below divide by zero.
+        if n == 0 or n - 1 == 0:
+            return 0.0
+        chi2 = ss.chi2_contingency(confusion_matrix)[0]
+        phi2 = chi2 / n
+        r, k = confusion_matrix.shape
+        phi2_corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
+        r_corr = r - ((r - 1) ** 2) / (n - 1)
+        k_corr = k - ((k - 1) ** 2) / (n - 1)
+        denominator = min(k_corr - 1, r_corr - 1)
+        if denominator <= 0:
+            return 0.0
+        return np.sqrt(phi2_corr / denominator)
+    def _evaluate_cramer_v_distance(self) -> float:
+        """
+        Evaluates Cramer's V with bias correction https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V on categorical data,
+        over every unordered pair of categorical columns. Each pair is scored on both datasets and the absolute
+        difference of the two scores is collected.
+        :return: 1 - mean(differences), clipped to [0, 1]; 1 means the pairwise associations of the synthetic data
+        match those of the real data. Returns 0 when there are fewer than two categorical columns.
+        """
+        if len(self._categorical_columns) < 2:
+            return 0
+        contingency_scores_distances = []
+        for idx, col in enumerate(self._categorical_columns[:-1]):
+            for col2 in self._categorical_columns[idx + 1 :]:
+                v_real = self._compute_cramer_v(
+                    self._real_data[col].to_numpy(), self._real_data[col2].to_numpy()
+                )
+                v_synth = self._compute_cramer_v(
+                    self._synthetic_data[col].to_numpy(),
+                    self._synthetic_data[col2].to_numpy(),
+                )
+                contingency_scores_distances.append(np.abs(v_real - v_synth))
+        final_score = 1 - np.mean(contingency_scores_distances)
+        return np.clip(final_score, 0, 1)
+    def _evaluate_wasserstein_distance(self) -> float:
+        """
+        Computes the Wasserstein distance for each numerical column and turns it into a score between 0 and 1,
+        where 1 means the distributions are aligned and 0 means they are largely unaligned.
+        Each distance is clipped to [0, |max - min|], where max and min come from the real column, and divided
+        by that range. A constant real column has no range: it scores a distance of 0 when the synthetic
+        column matches it exactly and the maximum distance of 1 otherwise.
+        :return: A single score, computed as 1 - mean(scores); 0 when there are no numerical columns
+        """
+        if len(self._numerical_columns) < 1:
+            return 0
+        wass_distance_scores = []
+        for col in self._numerical_columns:
+            real_data = self._real_data[col].to_numpy()
+            synth_data = self._synthetic_data[col].to_numpy()
+            distance = np.abs(np.max(real_data) - np.min(real_data))
+            wass_dist = ss.wasserstein_distance(real_data, synth_data)
+            if distance == 0:
+                # Dividing by a zero range would produce NaN and poison the mean.
+                wass_distance_scores.append(0.0 if wass_dist == 0 else 1.0)
+                continue
+            wass_distance_scores.append(np.clip(wass_dist, 0, distance) / distance)
+        return 1 - np.mean(wass_distance_scores)
+    def _evaluate_statistical_properties(self):
+        """
+        Evaluates both the Wasserstein score for numerical features and the Cramer's V score for categorical ones.
+        When both kinds of feature are present it also reports their mean, weighted by the number of
+        columns of each kind.
+        """
+        cramer_v = self._evaluate_cramer_v_distance()
+        wass_distance = self._evaluate_wasserstein_distance()
+        n_categorical = len(self._categorical_columns)
+        n_numerical = len(self._numerical_columns)
+        if n_categorical != 0 and n_numerical != 0:
+            # Normalise by the evaluated columns only, so the two weights sum to 1
+            # even if the DataFrame carries columns listed in neither group.
+            stat_compliance = (
+                n_categorical * cramer_v + n_numerical * wass_distance
+            ) / (n_categorical + n_numerical)
+            self.report.add_metric(
+                StatisticalMetric(
+                    title="Total Statistical Compliance",
+                    unit_measure="%",
+                    value=np.round(stat_compliance * 100, 2).item(),
+                )
+            )
+        if n_categorical != 0:
+            self.report.add_metric(
+                StatisticalMetric(
+                    title="Categorical Features Cramer's V",
+                    unit_measure="%",
+                    value=np.round(cramer_v * 100, 2).item(),
+                )
+            )
+        if n_numerical != 0:
+            self.report.add_metric(
+                StatisticalMetric(
+                    title="Numerical Features Wasserstein Distance",
+                    unit_measure="%",
+                    value=np.round(wass_distance * 100, 2).item(),
+                )
+            )
+    def _evaluate_novelty(self):
+        """
+        Adds two novelty metrics to the report, both expressed as a percentage of the number of
+        distinct rows found in the real and synthetic datasets together:
+        1) "Unique Synthetic Data": the number of distinct synthetic rows
+        2) "New Synthetic Data": the synthetic row count minus the number of distinct rows shared with the real data
+        """
+        synth_len = self._synthetic_data.shape[0]
+        synth_unique = self._synthetic_data.drop_duplicates()
+        synth_unique_len = synth_unique.shape[0]
+        real_unique = self._real_data.drop_duplicates()
+        real_unique_len = real_unique.shape[0]
+        concat_df = pd.concat([real_unique, synth_unique], axis=0)
+        concat_unique = concat_df.drop_duplicates()
+        conc_unique_len = concat_unique.shape[0]
+        # Rows present in both datasets are counted once in the concatenation.
+        shared_rows = (real_unique_len + synth_unique_len) - conc_unique_len
+        # NOTE(review): this starts from the full synthetic row count (duplicates
+        # included), not from the distinct count - confirm that is intended.
+        new_synt_data = synth_len - shared_rows
+        self.report.add_metric(
+            NoveltyMetric(
+                title="Unique Synthetic Data",
+                unit_measure="%",
+                value=np.round(synth_unique_len / conc_unique_len * 100, 2).item(),
+            )
+        )
+        self.report.add_metric(
+            NoveltyMetric(
+                title="New Synthetic Data",
+                unit_measure="%",
+                value=np.round(new_synt_data / conc_unique_len * 100, 2).item(),
+            )
+        )
+    def _evaluate_adherence(self):
+        """
+        Computes adherence metrics and adds them to the report:
+        - Synthetic Categories Adherence to Real Categories: dict mapping each categorical column to the
+          percentage of synthetic rows whose value also occurs in the real column
+        - Synthetic Numerical Min-Max Boundaries Adherence: dict mapping each numerical column to the
+          percentage of synthetic rows lying within the real column's [min, max]
+        Nothing is returned; a metric is only added when its column list is not empty.
+        :raises ValueError: if the synthetic dataset has no rows
+        """
+        # Ensure synthetic data is not empty
+        total_records = self._synthetic_data.shape[0]
+        if total_records == 0:
+            raise ValueError("Synthetic data is empty.")
+        # --- Categorical Adherence ---
+        category_adherence_score: dict[str, float] = {}
+        real_categorical = self._real_data[self._categorical_columns]
+        synth_categorical = self._synthetic_data[self._categorical_columns]
+        for col in self._categorical_columns:
+            # Identify values present in synthetic data but missing in real data.
+            extra_values = set(synth_categorical[col].unique()) - set(
+                real_categorical[col].unique()
+            )
+            # Count how many synthetic records use these extra values.
+            extra_count = synth_categorical[col].isin(extra_values).sum()
+            # Adherence is the percentage of records that do NOT have extra values.
+            adherence_percentage = np.round((1 - extra_count / total_records) * 100, 2)
+            category_adherence_score[col] = float(adherence_percentage)
+        # --- Numerical Boundary Adherence ---
+        boundary_adherence_score: dict[str, float] = {}
+        real_numerical = self._real_data[self._numerical_columns]
+        synth_numerical = self._synthetic_data[self._numerical_columns]
+        for col in self._numerical_columns:
+            # Min and max boundaries come from the real data.
+            min_boundary = real_numerical[col].min()
+            max_boundary = real_numerical[col].max()
+            # Count the synthetic records that fall within these boundaries.
+            within_bounds = (synth_numerical[col] >= min_boundary) & (
+                synth_numerical[col] <= max_boundary
+            )
+            in_boundary_count = int(within_bounds.sum())
+            adherence_percentage = np.round(in_boundary_count / total_records * 100, 2)
+            boundary_adherence_score[col] = float(adherence_percentage)
+        if len(self._categorical_columns) != 0:
+            self.report.add_metric(
+                AdherenceMetric(
+                    title="Synthetic Categories Adherence to Real Categories",
+                    unit_measure="%",
+                    value=category_adherence_score,
+                )
+            )
+        if len(self._numerical_columns) != 0:
+            self.report.add_metric(
+                AdherenceMetric(
+                    title="Synthetic Numerical Min-Max Boundaries Adherence",
+                    unit_measure="%",
+                    value=boundary_adherence_score,
+                )
+            )

sdg_core_lib/evaluate/__init__.py ADDED Viewed

File without changes