sdg_core_lib-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. sdg_core_lib/NumericDataset.py +150 -0
  2. sdg_core_lib/__init__.py +0 -0
  3. sdg_core_lib/browser.py +73 -0
  4. sdg_core_lib/data_generator/__init__.py +0 -0
  5. sdg_core_lib/data_generator/model_factory.py +72 -0
  6. sdg_core_lib/data_generator/models/ModelInfo.py +42 -0
  7. sdg_core_lib/data_generator/models/TrainingInfo.py +40 -0
  8. sdg_core_lib/data_generator/models/UnspecializedModel.py +106 -0
  9. sdg_core_lib/data_generator/models/__init__.py +0 -0
  10. sdg_core_lib/data_generator/models/keras/KerasBaseVAE.py +172 -0
  11. sdg_core_lib/data_generator/models/keras/VAE.py +61 -0
  12. sdg_core_lib/data_generator/models/keras/__init__.py +0 -0
  13. sdg_core_lib/data_generator/models/keras/implementation/TabularVAE.py +96 -0
  14. sdg_core_lib/data_generator/models/keras/implementation/TimeSeriesVAE.py +156 -0
  15. sdg_core_lib/data_generator/models/keras/implementation/__init__.py +0 -0
  16. sdg_core_lib/evaluate/Metrics.py +48 -0
  17. sdg_core_lib/evaluate/TabularComparison.py +276 -0
  18. sdg_core_lib/evaluate/__init__.py +0 -0
  19. sdg_core_lib/job.py +56 -0
  20. sdg_core_lib/post_process/FunctionApplier.py +14 -0
  21. sdg_core_lib/post_process/__init__.py +0 -0
  22. sdg_core_lib/post_process/function_factory.py +41 -0
  23. sdg_core_lib/post_process/functions/FunctionInfo.py +25 -0
  24. sdg_core_lib/post_process/functions/FunctionResult.py +15 -0
  25. sdg_core_lib/post_process/functions/Parameter.py +33 -0
  26. sdg_core_lib/post_process/functions/UnspecializedFunction.py +42 -0
  27. sdg_core_lib/post_process/functions/__init__.py +0 -0
  28. sdg_core_lib/post_process/functions/distribution_evaluator/__init__.py +0 -0
  29. sdg_core_lib/post_process/functions/distribution_evaluator/implementation/NormalTester.py +65 -0
  30. sdg_core_lib/post_process/functions/distribution_evaluator/implementation/__init__.py +0 -0
  31. sdg_core_lib/post_process/functions/filter/IntervalThreshold.py +32 -0
  32. sdg_core_lib/post_process/functions/filter/MonoThreshold.py +28 -0
  33. sdg_core_lib/post_process/functions/filter/__init__.py +0 -0
  34. sdg_core_lib/post_process/functions/filter/implementation/InnerThreshold.py +43 -0
  35. sdg_core_lib/post_process/functions/filter/implementation/LowerThreshold.py +32 -0
  36. sdg_core_lib/post_process/functions/filter/implementation/OuterThreshold.py +42 -0
  37. sdg_core_lib/post_process/functions/filter/implementation/UpperThreshold.py +32 -0
  38. sdg_core_lib/post_process/functions/filter/implementation/__init__.py +0 -0
  39. sdg_core_lib/preprocess/__init__.py +0 -0
  40. sdg_core_lib/preprocess/scale.py +51 -0
  41. sdg_core_lib/test/__init__.py +0 -0
  42. sdg_core_lib/test/data_generator/__init__.py +0 -0
  43. sdg_core_lib/test/data_generator/models/__init__.py +0 -0
  44. sdg_core_lib/test/data_generator/models/keras/__init__.py +0 -0
  45. sdg_core_lib/test/data_generator/models/keras/implementation/__init__.py +0 -0
  46. sdg_core_lib/test/data_generator/models/keras/implementation/test_TabularVAE.py +120 -0
  47. sdg_core_lib/test/data_generator/models/keras/implementation/test_TimeSeriesVAE.py +110 -0
  48. sdg_core_lib/test/data_generator/models/keras/test_KerasBaseVAE.py +74 -0
  49. sdg_core_lib/test/data_generator/models/test_ModelInfo.py +27 -0
  50. sdg_core_lib/test/data_generator/models/test_TrainingInfo.py +30 -0
  51. sdg_core_lib/test/data_generator/models/test_UnspecializedModel.py +32 -0
  52. sdg_core_lib/test/data_generator/test_model_factory.py +52 -0
  53. sdg_core_lib/test/evaluate/__init__.py +0 -0
  54. sdg_core_lib/test/evaluate/test_Metrics.py +62 -0
  55. sdg_core_lib/test/evaluate/test_TabularComparisonEvaluator.py +75 -0
  56. sdg_core_lib/test/infer_test.json +168 -0
  57. sdg_core_lib/test/infer_test_nodata.json +77 -0
  58. sdg_core_lib/test/infer_test_nodata_wrong.json +11 -0
  59. sdg_core_lib/test/post_process/__init__.py +0 -0
  60. sdg_core_lib/test/post_process/functions/__init__.py +0 -0
  61. sdg_core_lib/test/post_process/functions/distribution_evaluator/__init__.py +0 -0
  62. sdg_core_lib/test/post_process/functions/distribution_evaluator/implementation/__init__.py +0 -0
  63. sdg_core_lib/test/post_process/functions/distribution_evaluator/implementation/test_NormalTester.py +55 -0
  64. sdg_core_lib/test/post_process/functions/filters/__init__.py +0 -0
  65. sdg_core_lib/test/post_process/functions/filters/implementation/__init__.py +0 -0
  66. sdg_core_lib/test/post_process/functions/filters/implementation/test_InnerThreshold.py +30 -0
  67. sdg_core_lib/test/pre_process/__init__.py +0 -0
  68. sdg_core_lib/test/pre_process/test_scaling.py +55 -0
  69. sdg_core_lib/test/test_browser.py +11 -0
  70. sdg_core_lib/test/test_dataset.py +149 -0
  71. sdg_core_lib/test/test_job.py +128 -0
  72. sdg_core_lib/test/train_test.json +166 -0
  73. sdg_core_lib/test/train_test_2.json +9 -0
  74. sdg_core_lib-0.1.0.dist-info/METADATA +9 -0
  75. sdg_core_lib-0.1.0.dist-info/RECORD +77 -0
  76. sdg_core_lib-0.1.0.dist-info/WHEEL +4 -0
  77. sdg_core_lib-0.1.0.dist-info/entry_points.txt +3 -0
sdg_core_lib/data_generator/models/keras/VAE.py
@@ -0,0 +1,61 @@
+import keras
+from keras.api import layers, ops
+import tensorflow as tf
+
+
+class Sampling(layers.Layer):
+    """Uses (z_mean, z_log_var) to sample z, the latent vector."""
+
+    def __init__(self, seed: int = 42, **kwargs):
+        super().__init__(**kwargs)
+        self.seed_generator = keras.random.SeedGenerator(seed)
+
+    def call(self, inputs):
+        z_mean, z_log_var = inputs
+        batch = ops.shape(z_mean)[0]
+        dim = ops.shape(z_mean)[1]
+        epsilon = keras.random.normal(shape=(batch, dim), seed=self.seed_generator)
+        return z_mean + ops.exp(0.5 * z_log_var) * epsilon
+
+
+class VAE(keras.Model):
+    def __init__(self, encoder, decoder, beta=1, **kwargs):
+        super().__init__(**kwargs)
+        self.encoder = encoder
+        self.decoder = decoder
+        self._beta = beta
+        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
+        self.reconstruction_loss_tracker = keras.metrics.Mean(
+            name="reconstruction_loss"
+        )
+        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")
+
+    @property
+    def metrics(self):
+        return [
+            self.total_loss_tracker,
+            self.reconstruction_loss_tracker,
+            self.kl_loss_tracker,
+        ]
+
+    def train_step(self, data):
+        with tf.GradientTape() as tape:
+            z_mean, z_log_var, z = self.encoder(data)
+            reconstruction = self.decoder(z)
+            reconstruction_loss = ops.mean(
+                ops.sum(ops.abs(data - reconstruction), axis=-1)
+            )
+            kl_loss = -0.5 * (1 + z_log_var - ops.square(z_mean) - ops.exp(z_log_var))
+            kl_loss = ops.mean(ops.sum(kl_loss, axis=1))
+            total_loss = reconstruction_loss + self._beta * kl_loss
+        grads = tape.gradient(total_loss, self.trainable_weights)
+        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
+        self.total_loss_tracker.update_state(total_loss)
+        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
+        self.kl_loss_tracker.update_state(kl_loss)
+
+        return {
+            "loss": self.total_loss_tracker.result(),
+            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
+            "kl_loss": self.kl_loss_tracker.result(),
+        }
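Note: because VAE overrides train_step, it trains with the standard Keras fit loop once an encoder and a decoder are supplied. A minimal sketch, assuming the TensorFlow backend (train_step uses tf.GradientTape); the tiny dense encoder/decoder and the random data are illustrative assumptions, not part of the package:

import numpy as np
import keras
from keras import layers
from sdg_core_lib.data_generator.models.keras.VAE import Sampling, VAE

latent_dim = 2
# Toy encoder: 4 input features -> (z_mean, z_log_var, z)
inputs = keras.Input(shape=(4,))
h = layers.Dense(8, activation="relu")(inputs)
z_mean = layers.Dense(latent_dim, name="z_mean")(h)
z_log_var = layers.Dense(latent_dim, name="z_log_var")(h)
z = Sampling()([z_mean, z_log_var])
encoder = keras.Model(inputs, [z_mean, z_log_var, z], name="encoder")

# Toy decoder: latent vector -> 4 reconstructed features
latent_inputs = keras.Input(shape=(latent_dim,))
h = layers.Dense(8, activation="relu")(latent_inputs)
outputs = layers.Dense(4, activation="linear")(h)
decoder = keras.Model(latent_inputs, outputs, name="decoder")

vae = VAE(encoder, decoder, beta=1)
vae.compile(optimizer=keras.optimizers.Adam(1e-3))
vae.fit(np.random.rand(64, 4).astype("float32"), epochs=2, batch_size=8)

Only an optimizer is passed to compile: the loss is assembled inside train_step as the reconstruction term plus beta times the KL term.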
File without changes
sdg_core_lib/data_generator/models/keras/implementation/TabularVAE.py
@@ -0,0 +1,96 @@
+import keras
+from keras import layers
+
+from sdg_core_lib import NumericDataset
+from sdg_core_lib.data_generator.models.ModelInfo import ModelInfo, AllowedData
+from sdg_core_lib.data_generator.models.keras.KerasBaseVAE import KerasBaseVAE
+from sdg_core_lib.preprocess.scale import standardize_simple_tabular_input
+from sdg_core_lib.data_generator.models.keras.VAE import Sampling, VAE
+
+
+class TabularVAE(KerasBaseVAE):
+    """
+    TabularVAE is a class that implements a Variational Autoencoder (VAE) for tabular data generation.
+    It inherits from the KerasBaseVAE class and provides functionality specific to handling tabular data.
+
+    Attributes:
+        _latent_dim (int): The dimensionality of the latent space.
+        _beta (float): The beta parameter for the VAE loss function.
+        _learning_rate (float): Learning rate for the optimizer.
+        _batch_size (int): Number of samples per batch during training.
+        _epochs (int): Number of training epochs.
+        _scaler: Scaler used for standardizing input data.
+
+    Methods:
+        __init__: Initializes the TabularVAE with model parameters.
+        _load_model: Loads the VAE model with specified encoder and decoder.
+        _build: Builds the VAE model architecture.
+        _pre_process: Pre-processes input data using standardization.
+        self_describe: Provides metadata information about the model.
+    """
+
+    def __init__(
+        self,
+        metadata: dict,
+        model_name: str,
+        input_shape: str,
+        load_path: str | None,
+        latent_dim: int = 2,
+        learning_rate: float = 1e-3,
+        batch_size: int = 8,
+        epochs: int = 200,
+    ):
+        super().__init__(metadata, model_name, input_shape, load_path, latent_dim)
+        self._beta = 1
+        self._learning_rate = learning_rate
+        self._epochs = epochs
+        self._batch_size = batch_size
+        self._instantiate()
+
+    def _load_model(self, encoder, decoder):
+        self._model = VAE(encoder, decoder, self._beta)
+
+    def _build(self, input_shape: tuple[int, ...]):
+        encoder_inputs = keras.Input(shape=input_shape)
+        x = layers.Dense(32, activation="relu")(encoder_inputs)
+        x = layers.Dense(64, activation="relu")(x)
+        x = layers.Dense(16, activation="relu")(x)
+        z_mean = layers.Dense(self._latent_dim, name="z_mean")(x)
+        z_log_var = layers.Dense(self._latent_dim, name="z_log_var")(x)
+        z = Sampling()([z_mean, z_log_var])
+        encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
+
+        latent_inputs = keras.Input(shape=(self._latent_dim,))
+        y = layers.Dense(16, activation="relu")(latent_inputs)
+        y = layers.Dense(64, activation="relu")(y)
+        y = layers.Dense(32, activation="relu")(y)
+        decoder_outputs = layers.Dense(input_shape[0], activation="linear")(y)
+        decoder = keras.Model(latent_inputs, decoder_outputs, name="decoder")
+
+        vae = VAE(encoder, decoder, self._beta, name="TabularVAE")
+        vae.summary()
+        return vae
+
+    def _pre_process(self, data: NumericDataset, **kwargs):
+        cont_np_data = data.continuous_data.to_numpy()
+        if not self._scaler:
+            scaler, np_input_scaled, _ = standardize_simple_tabular_input(
+                train_data=cont_np_data
+            )
+            self._scaler = scaler
+        else:
+            np_input_scaled = self._scale(cont_np_data)
+        return np_input_scaled
+
+    @classmethod
+    def self_describe(cls):
+        return ModelInfo(
+            name=f"{cls.__module__}.{cls.__qualname__}",
+            default_loss_function="ELBO LOSS",
+            description="A Variational Autoencoder for data generation",
+            allowed_data=[
+                AllowedData("float32", False),
+                AllowedData("int32", False),
+                AllowedData("int64", False),
+            ],
+        ).get_model_info()
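Note: _pre_process fits a scaler on the first call and reuses it on later calls. A rough standalone sketch of that fit-once, reuse-later pattern, with scikit-learn's StandardScaler standing in for whatever standardize_simple_tabular_input returns (an assumption; that helper lives in preprocess/scale.py and is not shown here):

import numpy as np
from sklearn.preprocessing import StandardScaler  # assumed stand-in for the package's scaler

class ScalerHolder:
    def __init__(self):
        self._scaler = None  # mirrors TabularVAE._scaler before the first _pre_process call

    def pre_process(self, cont_np_data: np.ndarray) -> np.ndarray:
        if self._scaler is None:
            # First call: fit on the training data and keep the scaler for later calls.
            self._scaler = StandardScaler().fit(cont_np_data)
        return self._scaler.transform(cont_np_data)

holder = ScalerHolder()
train = np.random.rand(32, 4)
later = np.random.rand(8, 4)
print(holder.pre_process(train).shape)  # (32, 4); the scaler is fitted here
print(holder.pre_process(later).shape)  # (8, 4); reuses the training statistics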
sdg_core_lib/data_generator/models/keras/implementation/TimeSeriesVAE.py
@@ -0,0 +1,156 @@
+import numpy as np
+import keras
+
+from sdg_core_lib.NumericDataset import NumericDataset
+from sdg_core_lib.data_generator.models.ModelInfo import ModelInfo, AllowedData
+from sdg_core_lib.data_generator.models.keras.KerasBaseVAE import KerasBaseVAE
+from keras import layers
+
+from sdg_core_lib.preprocess.scale import standardize_simple_tabular_time_series
+from sdg_core_lib.data_generator.models.keras.VAE import Sampling, VAE
+
+
+class TimeSeriesVAE(KerasBaseVAE):
+    """
+    TimeSeriesVAE is a Variational Autoencoder designed for generating synthetic time series data.
+
+    This model is particularly useful in scenarios where time series data needs to be generated for
+    testing or simulation purposes. It leverages the power of variational inference to learn latent
+    representations of time series data, enabling the generation of new, similar sequences.
+
+    Attributes:
+        _beta (float): Coefficient for the KL divergence term in the VAE loss.
+        _learning_rate (float): Learning rate for the optimizer.
+        _epochs (int): Number of training epochs.
+        _batch_size (int): Number of samples per gradient update.
+
+    Methods:
+        _load_model(encoder, decoder): Loads the VAE model with the specified encoder and decoder.
+        _build(input_shape): Constructs the VAE model architecture.
+        _pre_process(data, **kwargs): Pre-processes the input data for training.
+        self_describe(): Provides a description of the model, including its name, loss function, and allowed data types.
+    """
+
+    def __init__(
+        self,
+        metadata: dict,
+        model_name: str,
+        input_shape: str,
+        load_path: str,
+        latent_dim: int = 2,
+        learning_rate: float = 1e-3,
+        batch_size: int = 16,
+        epochs: int = 60,
+    ):
+        super().__init__(metadata, model_name, input_shape, load_path, latent_dim)
+        self._beta = 0.15
+        self._learning_rate = learning_rate
+        self._epochs = epochs
+        self._batch_size = batch_size
+        self._instantiate()
+
+    def _load_model(self, encoder, decoder):
+        self._model = VAE(encoder, decoder, self._beta)
+
+    def _build(self, input_shape: tuple[int, ...]):
+        print(input_shape)
+        encoder_inputs = keras.Input(shape=input_shape)
+        encoder_inputs_permute = layers.Permute((2, 1))(encoder_inputs)
+        x = layers.Conv1D(
+            32,
+            9,
+            activation="relu",
+            padding="valid",
+            strides=1,
+            data_format="channels_last",
+        )(encoder_inputs_permute)
+        x = layers.Conv1D(
+            64,
+            5,
+            activation="relu",
+            padding="valid",
+            strides=1,
+            data_format="channels_last",
+        )(x)
+        shape_before_flatten = x.shape[1:]
+        x = layers.Flatten()(x)
+        x = layers.Dense(16, activation="relu")(x)
+        z_mean = layers.Dense(self._latent_dim, name="z_mean")(x)
+        z_log_var = layers.Dense(self._latent_dim, name="z_log_var")(x)
+        z = Sampling()([z_mean, z_log_var])
+        encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
+
+        latent_inputs = keras.Input(shape=(self._latent_dim,))
+        y = layers.Dense(np.prod(shape_before_flatten), activation="relu")(
+            latent_inputs
+        )
+        y = layers.Reshape(shape_before_flatten)(y)
+        y = layers.Conv1DTranspose(
+            64,
+            5,
+            activation="relu",
+            padding="valid",
+            strides=1,
+            data_format="channels_last",
+        )(y)
+        y = layers.Conv1DTranspose(
+            32,
+            9,
+            activation="relu",
+            padding="valid",
+            strides=1,
+            data_format="channels_last",
+        )(y)
+        decoder_outputs = layers.Conv1DTranspose(
+            input_shape[0], 3, activation="relu", padding="same"
+        )(y)
+        decoder_outputs_permute = layers.Permute((2, 1))(decoder_outputs)
+        decoder = keras.Model(latent_inputs, decoder_outputs_permute, name="decoder")
+
+        vae = VAE(encoder, decoder, self._beta, name="TimeSeriesVAE")
+        encoder.summary()
+        decoder.summary()
+        vae.summary()
+        return vae
+
+    def _scale(self, data: np.array):
+        batch, feats, steps = data.shape
+        if self._scaler is None:
+            return data
+        data_reshaped = data.transpose(0, 2, 1).reshape(-1, feats)
+        data_scaled = self._scaler.transform(data_reshaped)
+        data_scaled = data_scaled.reshape(batch, steps, feats).transpose(0, 2, 1)
+        return data_scaled
+
+    def _inverse_scale(self, data: np.array):
+        if self._scaler is None:
+            return data
+        batch, feats, steps = data.shape
+        data_reshaped = data.transpose(0, 2, 1).reshape(-1, feats)
+        data_unscaled = self._scaler.inverse_transform(data_reshaped)
+        data_unscaled = data_unscaled.reshape(batch, steps, feats).transpose(0, 2, 1)
+        return data_unscaled
+
+    def _pre_process(self, data: NumericDataset, **kwargs):
+        np_data = np.array(data.dataframe.values.tolist())
+        if not self._scaler:
+            scaler, np_input_scaled, _ = standardize_simple_tabular_time_series(
+                train_data=np_data
+            )
+            self._scaler = scaler
+        else:
+            np_input_scaled = self._scale(np_data)
+        return np_input_scaled
+
+    @classmethod
+    def self_describe(cls):
+        return ModelInfo(
+            name=f"{cls.__module__}.{cls.__qualname__}",
+            default_loss_function="ELBO LOSS",
+            description="A Beta-Variational Autoencoder for time series generation",
+            allowed_data=[
+                AllowedData("float32", False),
+                AllowedData("int32", False),
+                AllowedData("int64", False),
+            ],
+        ).get_model_info()
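Note: the _scale / _inverse_scale pair flattens (batch, features, steps) data to (batch*steps, features) so a per-feature scaler can be applied, then restores the original layout. A small numpy round-trip sketch, again with StandardScaler as an assumed stand-in for the scaler produced by standardize_simple_tabular_time_series:

import numpy as np
from sklearn.preprocessing import StandardScaler  # assumed stand-in for the fitted scaler

batch, feats, steps = 4, 3, 10
data = np.random.rand(batch, feats, steps)

# Fit per-feature statistics on the flattened (batch*steps, feats) view.
flat = data.transpose(0, 2, 1).reshape(-1, feats)
scaler = StandardScaler().fit(flat)

# _scale: flatten, transform, then restore the (batch, feats, steps) layout.
scaled = scaler.transform(flat).reshape(batch, steps, feats).transpose(0, 2, 1)

# _inverse_scale: the same reshuffle in reverse recovers the original values.
restored = (
    scaler.inverse_transform(scaled.transpose(0, 2, 1).reshape(-1, feats))
    .reshape(batch, steps, feats)
    .transpose(0, 2, 1)
)
print(np.allclose(restored, data))  # True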
sdg_core_lib/evaluate/Metrics.py
@@ -0,0 +1,48 @@
+class Metric:
+    def __init__(self, title: str, unit_measure: str, value: float | int | dict):
+        self.title = title
+        self.unit_measure = unit_measure
+        self.value = value
+        self.type = None
+
+    def to_json(self):
+        return {
+            "title": self.title,
+            "unit_measure": self.unit_measure,
+            "value": self.value,
+        }
+
+
+class StatisticalMetric(Metric):
+    def __init__(self, title: str, unit_measure: str, value: float | int | dict):
+        super().__init__(title, unit_measure, value)
+        self.type = "statistical_metrics"
+
+
+class AdherenceMetric(Metric):
+    def __init__(self, title: str, unit_measure: str, value: float | int | dict):
+        super().__init__(title, unit_measure, value)
+        self.type = "adherence_metrics"
+
+
+class NoveltyMetric(Metric):
+    def __init__(self, title: str, unit_measure: str, value: float | int | dict):
+        super().__init__(title, unit_measure, value)
+        self.type = "novelty_metrics"
+
+
+class MetricReport:
+    def __init__(self):
+        self.report = {}
+
+    def add_metric(self, metric: Metric):
+        if metric.type not in self.report:
+            self.report[metric.type] = [metric.to_json()]
+        else:
+            self.report[metric.type].append(metric.to_json())
+
+    def to_json(self):
+        if len(self.report) == 0:
+            return {}
+
+        return self.report
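Note: MetricReport groups entries by the type string set in each Metric subclass (the base class leaves type as None, which is why the typed subclasses are the ones added in practice). A short usage sketch; the metric values are made up for illustration:

from sdg_core_lib.evaluate.Metrics import MetricReport, StatisticalMetric, NoveltyMetric

report = MetricReport()
report.add_metric(
    StatisticalMetric(title="Total Statistical Compliance", unit_measure="%", value=92.5)
)
report.add_metric(
    NoveltyMetric(title="Unique Synthetic Data", unit_measure="%", value=87.3)
)
print(report.to_json())
# {'statistical_metrics': [{'title': 'Total Statistical Compliance', 'unit_measure': '%', 'value': 92.5}],
#  'novelty_metrics': [{'title': 'Unique Synthetic Data', 'unit_measure': '%', 'value': 87.3}]}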
sdg_core_lib/evaluate/TabularComparison.py
@@ -0,0 +1,276 @@
+import numpy as np
+import pandas as pd
+import scipy.stats as ss
+
+from sdg_core_lib.evaluate.Metrics import (
+    MetricReport,
+    StatisticalMetric,
+    AdherenceMetric,
+    NoveltyMetric,
+)
+
+
+class TabularComparisonEvaluator:
+    """
+    Evaluates the quality of a synthetic dataset with respect to a real one.
+
+    The evaluation is based on the following metrics:
+    - Statistical properties: Wasserstein distance and Cramer's V
+    - Adherence: evaluates how well the synthetic data adheres to the real data distribution
+    - Novelty: evaluates how many new values are generated in the synthetic dataset
+
+    The evaluation is performed on a per-column basis, and the results are aggregated.
+    """
+
+    def __init__(
+        self,
+        real_data: pd.DataFrame,
+        synthetic_data: pd.DataFrame,
+        numerical_columns: list[str],
+        categorical_columns: list[str],
+    ):
+        self._real_data = real_data
+        self._synthetic_data = synthetic_data
+        self._numerical_columns = numerical_columns
+        self._categorical_columns = categorical_columns
+        self.report = MetricReport()
+
+    def compute(self):
+        if len(self._numerical_columns) <= 1 and len(self._categorical_columns) <= 1:
+            return
+
+        self._evaluate_statistical_properties()
+        self._evaluate_adherence()
+        self._evaluate_novelty()
+
+        return self.report.to_json()
+
+    @staticmethod
+    def _compute_cramer_v(data1: np.array, data2: np.array):
+        """
+        Computes Cramer's V on a pair of categorical columns
+        :param data1: first column
+        :param data2: second column
+        :return: Cramer's V
+        """
+        confusion_matrix = pd.crosstab(data1, data2)
+        chi2 = ss.chi2_contingency(confusion_matrix)[0]
+        # Total number of observations.
+        n = confusion_matrix.to_numpy().sum()
+        if n == 0:
+            return 0.0
+        phi2 = chi2 / n
+        r, k = confusion_matrix.shape
+        # Check for potential division by zero in the correction terms.
+        if n - 1 == 0:
+            return 0.0
+        phi2_corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
+        r_corr = r - ((r - 1) ** 2) / (n - 1)
+        k_corr = k - ((k - 1) ** 2) / (n - 1)
+        denominator = min(k_corr - 1, r_corr - 1)
+        if denominator <= 0:
+            return 0.0
+        V = np.sqrt(phi2_corr / denominator)
+        return V
+
+    def _evaluate_cramer_v_distance(self) -> float:
+        """
+        Evaluates Cramer's V with Bias Correction https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V on categorical data,
+        evaluating pairwise columns. Each pair of columns is evaluated on both datasets, appending scores in a list
+        and returning the aggregate.
+
+        :return: A score between 0 and 1, where 1 is the best possible score and means that the
+        pairwise Cramer's V values of the synthetic data match those of the real data
+        """
+        if len(self._categorical_columns) < 2:
+            return 0
+
+        contingency_scores_distances = []
+        for idx, col in enumerate(self._categorical_columns[:-1]):
+            for col2 in self._categorical_columns[idx + 1 :]:
+                v_real = self._compute_cramer_v(
+                    self._real_data[col].to_numpy(), self._real_data[col2].to_numpy()
+                )
+                v_synth = self._compute_cramer_v(
+                    self._synthetic_data[col].to_numpy(),
+                    self._synthetic_data[col2].to_numpy(),
+                )
+                contingency_scores_distances.append(np.abs(v_real - v_synth))
+
+        final_score = 1 - np.mean(contingency_scores_distances)
+        return np.clip(final_score, 0, 1)
+
+    def _evaluate_wasserstein_distance(self) -> float:
+        """
+        Computes the Wasserstein distance for each numerical column and converts it into a score
+        between 0 and 1, where 1 means the real and synthetic distributions are aligned and 0
+        means they are largely unaligned.
+        Each per-column distance is clipped between 0 and |max - min|, where max and min refer
+        to the real dataset distribution, and is then scaled to the [0, 1] range
+        :return: A single score, computed as 1 - mean(scores)
+        """
+        if len(self._numerical_columns) < 1:
+            return 0
+
+        wass_distance_scores = []
+        for col in self._numerical_columns:
+            real_data = self._real_data[col].to_numpy()
+            synth_data = self._synthetic_data[col].to_numpy()
+            distance = np.abs(np.max(real_data) - np.min(real_data))
+            wass_dist = ss.wasserstein_distance(real_data, synth_data)
+            wass_dist = np.clip(wass_dist, 0, distance) / distance
+            wass_distance_scores.append(wass_dist)
+
+        return 1 - np.mean(wass_distance_scores)
+
+    def _evaluate_statistical_properties(self):
+        """
+        This function evaluates both Wasserstein distance for numerical features and Cramer's V for categorical ones,
+        providing a weighted mean of the scores based on the number of features
+        """
+        cramer_v = self._evaluate_cramer_v_distance()
+        wass_distance = self._evaluate_wasserstein_distance()
+        n_features = len(self._real_data.columns)
+        stat_compliance = (
+            len(self._categorical_columns) * cramer_v
+            + len(self._numerical_columns) * wass_distance
+        ) / n_features
+
+        if not (
+            len(self._numerical_columns) == 0 or len(self._categorical_columns) == 0
+        ):
+            self.report.add_metric(
+                StatisticalMetric(
+                    title="Total Statistical Compliance",
+                    unit_measure="%",
+                    value=np.round(stat_compliance * 100, 2).item(),
+                )
+            )
+
+        if not len(self._categorical_columns) == 0:
+            self.report.add_metric(
+                StatisticalMetric(
+                    title="Categorical Features Cramer's V",
+                    unit_measure="%",
+                    value=np.round(cramer_v * 100, 2).item(),
+                )
+            )
+
+        if not len(self._numerical_columns) == 0:
+            self.report.add_metric(
+                StatisticalMetric(
+                    title="Numerical Features Wasserstein Distance",
+                    unit_measure="%",
+                    value=np.round(wass_distance * 100, 2).item(),
+                )
+            )
+
+    def _evaluate_novelty(self):
+        """
+        This function evaluates the following metrics in two steps:
+        1) The number of unique samples generated in the synthetic dataset with respect to the real data
+        2) The number of duplicated samples in the synthetic dataset
+        """
+        synth_len = self._synthetic_data.shape[0]
+
+        synth_unique = self._synthetic_data.drop_duplicates()
+        synth_unique_len = synth_unique.shape[0]
+
+        real_unique = self._real_data.drop_duplicates()
+        real_unique_len = real_unique.shape[0]
+
+        concat_df = pd.concat([real_unique, synth_unique], axis=0)
+        concat_unique = concat_df.drop_duplicates()
+        conc_unique_len = concat_unique.shape[0]
+
+        new_synt_data = synth_len - (
+            (real_unique_len + synth_unique_len) - conc_unique_len
+        )
+
+        self.report.add_metric(
+            NoveltyMetric(
+                title="Unique Synthetic Data",
+                unit_measure="%",
+                value=np.round(synth_unique_len / conc_unique_len * 100, 2).item(),
+            )
+        )
+
+        self.report.add_metric(
+            NoveltyMetric(
+                title="New Synthetic Data",
+                unit_measure="%",
+                value=np.round(new_synt_data / conc_unique_len * 100, 2).item(),
+            )
+        )
+
+    def _evaluate_adherence(self):
+        """
+        Computes adherence metrics such as:
+        - Synthetic Categories Adherence to Real Categories
+        - Numerical min-max boundaries adherence
+
+        The following scores are added to the report (nothing is returned):
+            - category_adherence_score: dict mapping column name to adherence percentage.
+            - boundary_adherence_score: dict mapping column name to adherence percentage.
+        """
+        # Ensure synthetic data is not empty
+        total_records = self._synthetic_data.shape[0]
+        if total_records == 0:
+            raise ValueError("Synthetic data is empty.")
+
+        # --- Categorical Adherence ---
+        # For each categorical column, compute the percentage of synthetic entries
+        # that have values found in the real data.
+        category_adherence_score: dict[str, float] = {}
+        real_categorical = self._real_data[self._categorical_columns]
+        synth_categorical = self._synthetic_data[self._categorical_columns]
+
+        for col in self._categorical_columns:
+            # Identify values present in synthetic data but missing in real data.
+            extra_values = set(synth_categorical[col].unique()) - set(
+                real_categorical[col].unique()
+            )
+            # Count how many synthetic records use these extra values.
+            extra_count = synth_categorical[col].isin(extra_values).sum()
+            # Define adherence as the percentage of records that do NOT have extra values.
+            adherence_percentage = np.round((1 - extra_count / total_records) * 100, 2)
+            category_adherence_score[col] = float(adherence_percentage)
+
+        # --- Numerical Boundary Adherence ---
+        # For each numerical column, compute the percentage of synthetic entries
+        # that lie within the min-max boundaries of the real data.
+        boundary_adherence_score: dict[str, float] = {}
+        real_numerical = self._real_data[self._numerical_columns]
+        synth_numerical = self._synthetic_data[self._numerical_columns]
+
+        for col in self._numerical_columns:
+            # Obtain min and max boundaries from the real data.
+            stats = real_numerical[col].describe()
+            min_boundary = stats["min"]
+            max_boundary = stats["max"]
+            # Filter synthetic records that fall within these boundaries.
+            in_boundary = synth_numerical[
+                (synth_numerical[col] >= min_boundary)
+                & (synth_numerical[col] <= max_boundary)
+            ]
+            in_boundary_count = in_boundary.shape[0]
+            adherence_percentage = np.round(in_boundary_count / total_records * 100, 2)
+            boundary_adherence_score[col] = float(adherence_percentage)
+
+        if not len(self._categorical_columns) == 0:
+            self.report.add_metric(
+                AdherenceMetric(
+                    title="Synthetic Categories Adherence to Real Categories",
+                    unit_measure="%",
+                    value=category_adherence_score,
+                )
+            )
+
+        if not len(self._numerical_columns) == 0:
+            self.report.add_metric(
+                AdherenceMetric(
+                    title="Synthetic Numerical Min-Max Boundaries Adherence",
+                    unit_measure="%",
+                    value=boundary_adherence_score,
+                )
+            )
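Note: a usage sketch for the evaluator with toy, purely illustrative DataFrames (at least two columns of one kind are needed, since compute() returns early when both column lists have one entry or fewer):

import pandas as pd
from sdg_core_lib.evaluate.TabularComparison import TabularComparisonEvaluator

real = pd.DataFrame({
    "age": [23, 31, 45, 52, 38, 29],
    "income": [28000, 42000, 61000, 75000, 51000, 33000],
    "sex": ["F", "M", "F", "M", "F", "M"],
    "region": ["north", "south", "north", "south", "north", "south"],
})
synthetic = pd.DataFrame({
    "age": [25, 30, 47, 50, 40, 27],
    "income": [30000, 40000, 60000, 70000, 52000, 35000],
    "sex": ["F", "M", "M", "M", "F", "F"],
    "region": ["north", "north", "south", "south", "north", "south"],
})

evaluator = TabularComparisonEvaluator(
    real_data=real,
    synthetic_data=synthetic,
    numerical_columns=["age", "income"],
    categorical_columns=["sex", "region"],
)
print(evaluator.compute())  # dict of statistical, adherence and novelty metrics, in percent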
File without changes