PyPI - sdg-core-lib - Versions diffs - 0.1.8.dev1__tar.gz → 0.1.8.dev2__tar.gz - Mend

sdg-core-lib 0.1.8.dev1 (tar.gz) → 0.1.8.dev2 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

{sdg_core_lib-0.1.8.dev1 → sdg_core_lib-0.1.8.dev2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sdg-core-lib
-Version: 0.1.8.dev1
+Version: 0.1.8.dev2
 Summary: Add your description here
 Author: emiliocimino
 Author-email: emiliocimino <emilio.cimino@outlook.it>

{sdg_core_lib-0.1.8.dev1 → sdg_core_lib-0.1.8.dev2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "sdg-core-lib"
-version = "0.1.8.dev1"
+version = "0.1.8.dev2"
 description = "Add your description here"
 license = "AGPL-3.0"
 readme = "README.md"

{sdg_core_lib-0.1.8.dev1 → sdg_core_lib-0.1.8.dev2}/src/sdg_core_lib/data_generator/models/GANs/CTGANComponents.py RENAMED Viewed

@@ -185,8 +185,8 @@ class CTGANModel(keras.Model):
     def metrics(self):
         return [self.gen_loss_tracker, self.critic_loss_tracker]
-    @tf.function
     def generate_batch_cond(self, batch_size):
+        batch_size = int(batch_size)  # Convert symbolic tensor to int
         num_cats = len(self.generator.cats_disc)
         total_cond_dim = sum(self.generator.cats_disc)
         cats_disc = tf.convert_to_tensor(self.generator.cats_disc, dtype=tf.int32)
@@ -195,25 +195,22 @@ class CTGANModel(keras.Model):
             shape=[batch_size], minval=0, maxval=num_cats, dtype=tf.int32
         )
-        relevant_pmfs = tf.gather(self.probability_mass_function_list, col_indices)
-        cat_indices = tf.random.categorical(tf.math.log(relevant_pmfs), num_samples=1)
-        cat_indices = tf.cast(tf.squeeze(cat_indices, axis=1), tf.int32)
-        offsets_table = tf.concat([[0], tf.cumsum(cats_disc)[:-1]], axis=0)
-        batch_offsets = tf.gather(offsets_table, col_indices)
-        global_hot_indices = batch_offsets + cat_indices
-        row_indices = tf.range(batch_size)
-        scatter_indices = tf.stack([row_indices, global_hot_indices], axis=1)
-        cond_batch = tf.scatter_nd(
-            indices=scatter_indices,
-            updates=tf.ones([batch_size], dtype=tf.float32),
-            shape=[batch_size, total_cond_dim],
-        )
-        return cond_batch
+        # Create condition vector by sampling from PMFs and creating one-hot encoding
+        condition_list = []
+        for i in range(batch_size):
+            idx = col_indices[i].numpy()
+            pmf = self.probability_mass_function_list[idx].numpy().flatten()
+            # Sample from PMF
+            cat_idx = np.random.choice(len(pmf), p=pmf)
+            # Create one-hot vector for total_cond_dim
+            one_hot = np.zeros(total_cond_dim)
+            # Calculate offset for this categorical variable
+            offset = sum(self.generator.cats_disc[:idx])
+            one_hot[offset + cat_idx] = 1.0
+            condition_list.append(one_hot)
+        cond = tf.convert_to_tensor(condition_list, dtype=tf.float32)
+        return cond
     @staticmethod
     @tf.function
@@ -291,9 +288,12 @@ class CTGANModel(keras.Model):
         pmfs = []
         curr = 0
         for sz in self.generator.cats_disc:
-            chunk = onehot_all[:, curr : curr + sz]
-            log_freqs = tf.math.log(tf.reduce_sum(chunk, axis=0) + 1.0)
-            pmfs.append(log_freqs / tf.reduce_sum(log_freqs))
+            chunk = onehot_all[:, curr : curr + sz]  # (N_row, cats)
+            chunk_np = chunk.numpy()
+            log_freqs = np.log(np.sum(chunk_np, axis=0) + 1.0).reshape(1, -1)
+            pmfs.append(
+                tf.convert_to_tensor(log_freqs / np.sum(log_freqs), dtype=tf.float32)
+            )
             curr += sz
         return pmfs
@@ -301,7 +301,16 @@ class CTGANModel(keras.Model):
         batch = ops.shape(data)[0]
         self.row_dim = ops.shape(data)[1]
         z = tf.random.normal([batch, self.row_dim - sum(self.generator.cats_disc)])
-        cond = self.generate_batch_cond(batch)
+        # Use tf.py_function to call generate_batch_cond in eager mode
+        def generate_cond_eager(batch_size):
+            return self.generate_batch_cond(batch_size)
+        cond = tf.py_function(generate_cond_eager, [batch], tf.float32)
+        # Set the shape explicitly - it should be [batch_size, total_cond_dim]
+        total_cond_dim = sum(self.generator.cats_disc)
+        cond.set_shape([None, total_cond_dim])
         real_batch = CTGANModel.sample_real_data(
             self._train_data, cond, self.onehot_discrete_indexes
         )

{sdg_core_lib-0.1.8.dev1 → sdg_core_lib-0.1.8.dev2}/src/sdg_core_lib/data_generator/models/GANs/implementation/CTGAN.py RENAMED Viewed

@@ -11,6 +11,7 @@ from sdg_core_lib.data_generator.models.GANs.CTGANComponents import (
     CTGANModel,
 )
 import keras
+import tensorflow as tf
 from sdg_core_lib.data_generator.models.TrainingInfo import TrainingInfo
 import numpy as np
@@ -18,7 +19,7 @@ import numpy as np
 class CTGAN(UnspecializedModel):
     def __init__(
         self,
-        metadata: dict,
+        metadata: list[dict],
         model_name: str,
         input_shape: str = None,
         load_path: str = None,
@@ -31,7 +32,9 @@ class CTGAN(UnspecializedModel):
         gen_steps=4,
         critic_dropout=0.2,
     ):
-        super().__init__(metadata, model_name, input_shape, load_path)
+        super().__init__(
+            self._clean_skeleton(metadata), model_name, input_shape, load_path
+        )
         self._batch_size = batch_size
         self._epochs = epochs
         self._gen_steps = gen_steps
@@ -43,10 +46,19 @@ class CTGAN(UnspecializedModel):
         self._instantiate()
     @staticmethod
-    def infer_data_structure(skeleton):
+    def _clean_skeleton(skeleton):
+        if skeleton != [{}]:
+            return [
+                item
+                for item in skeleton
+                if item["feature_type"] in ["continuous", "categorical"]
+            ]
+        return skeleton
+    def infer_data_structure(self):
         cats, modes, idxs = [], [], []
         true_index = 0
-        for col in skeleton:
+        for col in self._metadata:
             try:
                 f_size = int(col["feature_size"])
                 if col["feature_type"] == "categorical":
@@ -90,7 +102,7 @@ class CTGAN(UnspecializedModel):
             categories_per_discrete_column,
             modes_per_continuous_column,
             onehot_discrete_indexes,
-        ) = CTGAN.infer_data_structure(self._metadata)
+        ) = self.infer_data_structure()
         self.generator = CTGANGenerator(
             self._metadata,
             modes_per_continuous_column,
@@ -106,7 +118,7 @@ class CTGAN(UnspecializedModel):
         # Should set the _model variable CTGAN Model complete with Generator and Critic
         # Does NOT return the model
         # self._metadata is available
-        _, _, onehot_discrete_indexes = CTGAN.infer_data_structure(self._metadata)
+        _, _, onehot_discrete_indexes = self.infer_data_structure()
         critic = keras.saving.load_model(os.path.join(folder_path, "critic.keras"))
         generator = keras.saving.load_model(
             os.path.join(folder_path, "generator.keras")
@@ -114,11 +126,33 @@ class CTGAN(UnspecializedModel):
         self._model = CTGANModel(generator, critic, onehot_discrete_indexes)
         # Load probability_mass_function_list if it exists
-        pmf_path = os.path.join(folder_path, "probability_mass_function_list.npy")
+        pmf_path = os.path.join(folder_path, "probability_mass_function_list.npz")
         if os.path.exists(pmf_path):
-            self._model.probability_mass_function_list = np.load(
-                pmf_path, allow_pickle=True
+            pmf_data = np.load(pmf_path)
+            # Convert back to list of TensorFlow tensors
+            pmf_list = []
+            for key in sorted(pmf_data.keys()):
+                pmf_list.append(tf.convert_to_tensor(pmf_data[key], dtype=tf.float32))
+            self._model.probability_mass_function_list = pmf_list
+        # Also check for old .npy format for backward compatibility
+        elif os.path.exists(
+            os.path.join(folder_path, "probability_mass_function_list.npy")
+        ):
+            pmf_list = np.load(
+                os.path.join(folder_path, "probability_mass_function_list.npy"),
+                allow_pickle=True,
             )
+            # Convert to TensorFlow tensors if needed
+            if (
+                isinstance(pmf_list, list)
+                and len(pmf_list) > 0
+                and isinstance(pmf_list[0], np.ndarray)
+            ):
+                self._model.probability_mass_function_list = [
+                    tf.convert_to_tensor(pmf, dtype=tf.float32) for pmf in pmf_list
+                ]
+            else:
+                self._model.probability_mass_function_list = pmf_list
     def save(self, folder_path: str):
         keras.saving.save_model(
@@ -132,9 +166,13 @@ class CTGAN(UnspecializedModel):
             hasattr(self._model, "probability_mass_function_list")
             and self._model.probability_mass_function_list is not None
         ):
-            np.save(
-                os.path.join(folder_path, "probability_mass_function_list.npy"),
-                self._model.probability_mass_function_list,
+            # Convert TensorFlow tensors to numpy arrays before saving
+            pmf_list = [
+                tensor.numpy() for tensor in self._model.probability_mass_function_list
+            ]
+            np.savez(
+                os.path.join(folder_path, "probability_mass_function_list.npz"),
+                *pmf_list,
             )
     def train(self, data: np.ndarray):
@@ -157,9 +195,7 @@ class CTGAN(UnspecializedModel):
         )
         self._model._train_data = data
         probability_mass_function_list = self._model.get_pmfs(data)
-        self._model.probability_mass_function_list = keras.ops.convert_to_numpy(
-            probability_mass_function_list
-        )
+        self._model.probability_mass_function_list = probability_mass_function_list
         history = self._model.fit(
             data, batch_size=self._batch_size, epochs=self._epochs, verbose=1
         )

{sdg_core_lib-0.1.8.dev1 → sdg_core_lib-0.1.8.dev2}/src/sdg_core_lib/preprocess/strategies/steps.py RENAMED Viewed

@@ -180,33 +180,32 @@ class PerModeNormalization(Step):
         if self.operator is None:
             raise ValueError("Operator not initialized")
         column = data.reshape(-1, 1)
-        active_weights_indx = np.where(self.operator.weights_ > 0.01)
-        weights = self.operator.weights_[active_weights_indx]
-        means = self.operator.means_[active_weights_indx].flatten()
-        stds = np.sqrt(self.operator.covariances_[active_weights_indx].flatten())
+        active_weights_indexes = np.where(self.operator.weights_ > 0.01)
+        weights = self.operator.weights_[active_weights_indexes]
+        means = self.operator.means_[active_weights_indexes].flatten()
+        stds = np.sqrt(self.operator.covariances_[active_weights_indexes].flatten())
         mixture_probability_density = []
-        for w, m, s in zip(weights, means, stds):
+        for weight, mean, std in zip(weights, means, stds):
             mixture_probability_density.append(
-                w
-                * PerModeNormalization._gaussian_probability_density_function(
-                    column, m, s
-                )
+                weight * self._gaussian_probability_density_function(column, mean, std)
             )
         marginal_mixture_probability_density = np.hstack(mixture_probability_density)
-        responsibilities = PerModeNormalization._compute_responsibilities(
+        responsibilities = self._compute_responsibilities(
             marginal_mixture_probability_density
         )
         rng = np.random.default_rng(self.random_state)
-        n, K = responsibilities.shape
+        n, k = responsibilities.shape
         sampled_mode = np.array(
-            [rng.choice(K, p=responsibilities[i]) for i in range(n)]
+            [rng.choice(k, p=responsibilities[i]) for i in range(n)]
         )
-        f = np.zeros((n, K), dtype=int)
-        f[np.arange(n), sampled_mode] = 1
+        mode_assignment = np.zeros((n, k), dtype=int)
+        mode_assignment[np.arange(n), sampled_mode] = 1
         mu_sel = means[sampled_mode]
         std_sel = stds[sampled_mode]
         normalized_value = (column.reshape(-1) - mu_sel) / (4.0 * std_sel)
-        to_return = np.concatenate([normalized_value.reshape(-1, 1), f], axis=1)
+        to_return = np.concatenate(
+            [normalized_value.reshape(-1, 1), mode_assignment], axis=1
+        )
         return to_return
     def inverse_transform(self, data: np.ndarray) -> np.ndarray:
@@ -221,18 +220,10 @@ class PerModeNormalization(Step):
             data = data.reshape(1, -1)
         active_modes = np.argmax(data[:, 1:], axis=1)
-        # Get the means and stds for the active modes
         selected_mus = means[active_modes]
         selected_devs = stds[active_modes]
-        # Get the normalized values (first column)
         normalized_values = data[:, 0]
-        # Denormalize the values
         values = (normalized_values * 4 * selected_devs) + selected_mus
-        # Always return 2D array with shape (n_samples, 1) for consistency
         return values.reshape(-1, 1)
     @staticmethod