PyPI - flexynesis - Versions diffs - 0.2.1__tar.gz → 0.2.3__tar.gz - Mend

flexynesis 0.2.1.tar.gz → 0.2.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

{flexynesis-0.2.1 → flexynesis-0.2.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flexynesis
-Version: 0.2.1
+Version: 0.2.3
 Summary: A deep-learning based multi-omics bulk sequencing data integration suite with a focus on (pre-)clinical endpoint prediction.
 Author-email: Bora Uyar <bora.uyar@mdc-berlin.de>, Taras Savchyn <Taras.Savchyn@mdc-berlin.de>, Ricardo Wurmus <Ricardo.Wurmus@mdc-berlin.de>, Ahmet Sarigun <Ahmet.Sariguen@mdc-berlin.de>
 Project-URL: homepage, https://github.com/BIMSBbioinfo/flexynesis

{flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/__main__.py RENAMED Viewed

@@ -46,6 +46,7 @@ def main():
         --use_loss_weighting (str): Whether to apply loss-balancing using uncertainty weights method. Choices are ['True', 'False']. Default is 'True'.
         --evaluate_baseline_performance (str): Whether to run Random Forest + SVMs to see the performance of off-the-shelf tools on the same dataset. Choices are ['True', 'False']. Default is 'True'.
         --threads (int): How many threads to use when using CPU. Default is 4.
+        --num_workers (int): How many workers to use for model training. Default is 2
         --use_gpu (bool): If set, the system will attempt to use CUDA/GPU if available.
         --disable_marker_finding (bool): If set, marker discovery after model training is disabled.
         --string_organism (int): STRING DB organism id. Default is 9606.
@@ -99,6 +100,7 @@ def main():
     parser.add_argument("--use_loss_weighting", help="whether to apply loss-balancing using uncertainty weights method", type=str, choices=['True', 'False'], default = 'True')
     parser.add_argument("--evaluate_baseline_performance", help="whether to run Random Forest + SVMs to see the performance of off-the-shelf tools on the same dataset", type=str, choices=['True', 'False'], default = 'True')
     parser.add_argument("--threads", help="(Optional) How many threads to use when using CPU (default is 4)", type=int, default = 4)
+    parser.add_argument("--num_workers", help="(Optional) How many workers to use for model training (default is 2)", type=int, default = 2)
     parser.add_argument("--use_gpu", action="store_true",
                         help="(Optional) If set, the system will attempt to use CUDA/GPU if available.")
     parser.add_argument("--disable_marker_finding", action="store_true",
@@ -253,7 +255,8 @@ def main():
                                             device_type = device_type,
                                             gnn_conv_type = gnn_conv_type,
                                             input_layers = input_layers,
-                                            output_layers = output_layers)
+                                            output_layers = output_layers,
+                                            num_workers = args.num_workers)
     # do a hyperparameter search training multiple models and get the best_configuration
     model, best_params = tuner.perform_tuning(hpo_patience = args.hpo_patience)

{flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/config.py RENAMED Viewed

@@ -6,28 +6,28 @@ epochs = [500]
 search_spaces = {
     'DirectPred': [
         Integer(16, 128, name='latent_dim'),
-        Real(0.2, 1, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
+        Real(0.2, 0.5, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
         Real(0.0001, 0.01, prior='log-uniform', name='lr'),
         Integer(8, 32, name='supervisor_hidden_dim'),
         Categorical(epochs, name='epochs')
     ],
     'supervised_vae': [
         Integer(16, 128, name='latent_dim'),
-        Real(0.2, 1, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
+        Real(0.2, 0.5, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
         Integer(8, 32, name='supervisor_hidden_dim'),
         Real(0.0001, 0.01, prior='log-uniform', name='lr'),
         Categorical(epochs, name='epochs')
     ],
     'CrossModalPred': [
         Integer(16, 128, name='latent_dim'),
-        Real(0.2, 1, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
+        Real(0.2, 0.5, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
         Integer(8, 32, name='supervisor_hidden_dim'),
         Real(0.0001, 0.01, prior='log-uniform', name='lr'),
         Categorical(epochs, name='epochs')
     ],
     'MultiTripletNetwork': [
         Integer(16, 128, name='latent_dim'),
-        Real(0.2, 1, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
+        Real(0.2, 0.5, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
         Integer(8, 32, name='supervisor_hidden_dim'),
         Real(0.0001, 0.01, prior='log-uniform', name='lr'),
         Categorical(epochs, name='epochs')

{flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/data.py RENAMED Viewed

@@ -525,7 +525,7 @@ class MultiOmicDataset(Dataset):
         """
         subset_dat = {x: self.dat[x][index] for x in self.dat.keys()}
         subset_ann = {x: self.ann[x][index] for x in self.ann.keys()}
-        return subset_dat, subset_ann
+        return subset_dat, subset_ann, self.samples[index]
     def __len__ (self):
         """Get the total number of samples in the dataset.

{flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/main.py RENAMED Viewed

@@ -56,7 +56,7 @@ class HyperparameterTuning:
                  cv_splits=5, use_loss_weighting=True, early_stop_patience=-1, device_type=None, gnn_conv_type=None,
                  input_layers=None, output_layers=None): Initializes the hyperparameter tuner with specific settings.
-        get_batch_space(min_size=16, max_size=256): Determines the batch size search space based on the dataset size.
+        get_batch_space(min_size=16, max_size=128): Determines the batch size search space based on the dataset size.
         setup_trainer(params, current_step, total_steps, full_train=False): Sets up the trainer with appropriate callbacks
             and configurations for either full training or validation based training.
@@ -80,7 +80,7 @@ class HyperparameterTuning:
                  val_size = 0.2,  use_cv = False, cv_splits = 5,
                  use_loss_weighting = True, early_stop_patience = -1,
                  device_type = None, gnn_conv_type = None,
-                 input_layers = None, output_layers = None):
+                 input_layers = None, output_layers = None, num_workers = 2):
         self.dataset = dataset # dataset for model initiation
         self.loader_dataset = dataset # dataset for defining data loaders (this can be model specific)
         self.model_class = model_class
@@ -107,6 +107,7 @@ class HyperparameterTuning:
         self.gnn_conv_type = gnn_conv_type
         self.input_layers = input_layers
         self.output_layers = output_layers
+        self.num_workers = num_workers
         self.DataLoader = torch.utils.data.DataLoader # use torch data loader by default
@@ -128,7 +129,7 @@ class HyperparameterTuning:
             else:
                 raise ValueError(f"'{self.config_name}' not found in the default config.")
-    def get_batch_space(self, min_size = 32, max_size = 256):
+    def get_batch_space(self, min_size = 32, max_size = 128):
         m = int(np.log2(len(self.dataset) * 0.8))
         st = int(np.log2(min_size))
         end = int(np.log2(max_size))
@@ -214,9 +215,11 @@ class HyperparameterTuning:
                 train_subset = torch.utils.data.Subset(self.loader_dataset, train_index)
                 val_subset = torch.utils.data.Subset(self.loader_dataset, val_index)
                 train_loader = self.DataLoader(train_subset, batch_size=int(params['batch_size']),
-                                               pin_memory=True, shuffle=True, drop_last=True, num_workers = 4, prefetch_factor = None, persistent_workers = True)
+                                               pin_memory=True, shuffle=True, drop_last=True, num_workers = self.num_workers, prefetch_factor = None,
+                                               persistent_workers = self.num_workers > 0)
                 val_loader = self.DataLoader(val_subset, batch_size=int(params['batch_size']),
-                                             pin_memory=True, shuffle=False, num_workers = 4, prefetch_factor = None, persistent_workers = True)
+                                             pin_memory=True, shuffle=False, num_workers = self.num_workers, prefetch_factor = None,
+                                             persistent_workers = self.num_workers > 0)
                 model = self.model_class(**model_args)
                 trainer, early_stop_callback = self.setup_trainer(params, current_step, total_steps)

{flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/models/crossmodal_pred.py RENAMED Viewed

@@ -262,7 +262,7 @@ class CrossModalPred(pl.LightningModule):
         dataset, particularly handling survival analysis if applicable. All losses are aggregated to compute a total loss,
         which is logged and returned.
         """
-        dat, y_dict = train_batch
+        dat, y_dict, samples = train_batch
         # get input omics modalities and encode them; decode them to output layers
         x_list_input = [dat[x] for x in self.input_layers]
@@ -315,7 +315,7 @@ class CrossModalPred(pl.LightningModule):
         analysis where applicable. The aggregated losses are then summed up to form the total validation loss, which is logged
         and returned.
         """
-        dat, y_dict = val_batch
+        dat, y_dict, samples = val_batch
         # get input omics modalities and encode them
         x_list_input = [dat[x] for x in self.input_layers]
@@ -515,7 +515,7 @@ class CrossModalPred(pl.LightningModule):
         aggregated_attributions = [[] for _ in range(num_class)]
         for batch in dataloader:
-            dat, _ = batch
+            dat, _, _ = batch
             x_list = [dat[x].to(device) for x in self.input_layers]
             input_data = tuple([data.unsqueeze(0).requires_grad_() for data in x_list])
             baseline = tuple(torch.zeros_like(x) for x in input_data)

{flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/models/direct_pred.py RENAMED Viewed

@@ -190,7 +190,7 @@ class DirectPred(pl.LightningModule):
             torch.Tensor: The total loss computed for the batch.
         """
-        dat, y_dict = train_batch
+        dat, y_dict, samples = train_batch
         layers = dat.keys()
         x_list = [dat[x] for x in layers]
         outputs = self.forward(x_list)
@@ -226,7 +226,7 @@ class DirectPred(pl.LightningModule):
         Returns:
             torch.Tensor: The total loss computed for the batch.
         """
-        dat, y_dict = val_batch
+        dat, y_dict, samples = val_batch
         layers = dat.keys()
         x_list = [dat[x] for x in layers]
         outputs = self.forward(x_list)
@@ -250,28 +250,44 @@ class DirectPred(pl.LightningModule):
     def predict(self, dataset):
         """
-        Make predictions on an entire dataset.
+        Evaluate the model on a dataset using batching.
         Args:
-            dataset: The MultiOmicDataset object to evaluate the model on.
+            dataset (MultiOmicDataset): dataset containing input matrices for each omics layer.
         Returns:
-            dict: Predictions mapped by target variable names.
+            dict: Predicted values mapped by target variable names.
         """
-        self.eval()
-        layers = dataset.dat.keys()
-        x_list = [dataset.dat[x] for x in layers]
-        outputs = self.forward(x_list)
+        self.eval()  # Set the model to evaluation mode
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.to(device)  # Move the model to the appropriate device
-        predictions = {}
-        for var in self.variables:
-            y_pred = outputs[var].detach().numpy()
-            if self.variable_types[var] == 'categorical':
-                predictions[var] = np.argmax(y_pred, axis=1)
-            else:
-                predictions[var] = y_pred
-        return predictions
+        # Create a DataLoader with a practical batch size
+        dataloader = DataLoader(dataset, batch_size=64, shuffle=False)  # Adjust the batch size as needed
+        predictions = {var: [] for var in self.variables}  # Initialize prediction storage
+        # Process each batch
+        for batch in dataloader:
+            dat, y_dict, samples = batch
+            x_list = [dat[x].to(device) for x in dat.keys()]  # Prepare the data batch for processing
+            # Perform the forward pass
+            outputs = self.forward(x_list)
+            # Collect predictions for each variable
+            for var in self.variables:
+                y_pred = outputs[var].detach().cpu().numpy()  # Move outputs back to CPU and convert to numpy
+                if dataset.variable_types[var] == 'categorical':
+                    predictions[var].extend(np.argmax(y_pred, axis=1))
+                else:
+                    predictions[var].extend(y_pred)
+        # Convert lists to arrays if necessary, depending on the downstream use-case
+        predictions = {var: np.array(predictions[var]) for var in predictions}
+        return predictions
     def transform(self, dataset):
         """
         Transforms the input data into a lower-dimensional representation using trained encoders.
@@ -282,16 +298,36 @@ class DirectPred(pl.LightningModule):
         Returns:
             pd.DataFrame: DataFrame containing the transformed data.
         """
-        self.eval()
-        embeddings_list = []
-        # Process each input matrix with its corresponding Encoder
-        for i, x in enumerate(dataset.dat.values()):
-            embeddings_list.append(self.encoders[i](x))
-        embeddings_concat = torch.cat(embeddings_list, dim=1)
+        self.eval()  # Set the model to evaluation mode
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.to(device)  # Move the model to the appropriate device
+        dataloader = DataLoader(dataset, batch_size=64, shuffle=False)  # Adjust the batch size as needed
+        embeddings_list = []  # Initialize a list to collect all batch embeddings
+        sample_names = []  # List to collect sample names
+        # Process each batch
+        for batch in dataloader:
+            dat, _, samples = batch
+            batch_embeddings = []
+            # Process each input matrix with its corresponding Encoder
+            for i, x in enumerate(dat.values()):
+                x = x.to(device)  # Move data to GPU
+                encoded_x = self.encoders[i](x)  # Transform data using the corresponding encoder
+                batch_embeddings.append(encoded_x)
+            # Concatenate all embeddings from the current batch
+            embeddings_batch_concat = torch.cat(batch_embeddings, dim=1)
+            embeddings_list.append(embeddings_batch_concat.detach().cpu())  # Move tensor back to CPU and detach
+            sample_names.extend(samples)  # Collect sample names
+        # Concatenate all batch embeddings into one tensor
+        embeddings_concat = torch.cat(embeddings_list, dim=0)
         # Converting tensor to numpy array and then to DataFrame
-        embeddings_df = pd.DataFrame(embeddings_concat.detach().numpy(),
-                                     index=dataset.samples,
+        embeddings_df = pd.DataFrame(embeddings_concat.numpy(),
+                                     index=sample_names,  # Set DataFrame index to sample names
                                      columns=[f"E{dim}" for dim in range(embeddings_concat.shape[1])])
         return embeddings_df
@@ -337,11 +373,11 @@ class DirectPred(pl.LightningModule):
         if dataset.variable_types[target_var] == 'numerical':
             num_class = 1
         else:
-            num_class = len(np.unique([y[target_var] for _, y in dataset]))
+            num_class = len(np.unique([y[target_var] for _, y, _ in dataset]))
         aggregated_attributions = [[] for _ in range(num_class)]
         for batch in dataloader:
-            dat, _ = batch
+            dat, _, _ = batch
             x_list = [dat[x].to(device) for x in dat.keys()]
             input_data = tuple([data.unsqueeze(0).requires_grad_() for data in x_list])
             baseline = tuple(torch.zeros_like(x) for x in input_data)

{flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/models/gnn_early.py RENAMED Viewed

@@ -375,7 +375,7 @@ class GNN(pl.LightningModule):
         return torch.cat(outputs_list, dim = 0)
-    def compute_feature_importance(self, dataset, target_var, steps=5, batch_size = 32):
+    def compute_feature_importance(self, dataset, target_var, steps=5, batch_size = 64):
         """
         Computes the feature importance for each variable in the dataset using the Integrated Gradients method.
         This method measures the importance of each feature by attributing the prediction output to each input feature.
@@ -397,13 +397,9 @@ class GNN(pl.LightningModule):
         """
         def bytes_to_gb(bytes):
             return bytes / 1024 ** 2
-        print("Memory before moving model to device: {:.3f} MB".format(bytes_to_gb(torch.cuda.max_memory_reserved())))
         device = torch.device("cuda" if self.device_type == 'gpu' and torch.cuda.is_available() else 'cpu')
         self.to(device)
-        print("Memory before edges: {:.3f} MB".format(bytes_to_gb(torch.cuda.max_memory_reserved())))
         self.dataset_edge_index = dataset.edge_index.to(device)
-        print("Memory after edges: {:.3f} MB".format(bytes_to_gb(torch.cuda.max_memory_reserved())))
         dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
         ig = IntegratedGradients(self.forward_target)

{flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/models/supervised_vae.py RENAMED Viewed

@@ -259,7 +259,7 @@ class supervised_vae(pl.LightningModule):
         Returns:
             torch.Tensor: The total loss computed for the batch.
         """
-        dat, y_dict = train_batch
+        dat, y_dict, samples = train_batch
         layers = dat.keys()
         x_list = [dat[x] for x in layers]
@@ -303,7 +303,7 @@ class supervised_vae(pl.LightningModule):
         Returns:
             torch.Tensor: The total loss computed for the batch.
         """
-        dat, y_dict = val_batch
+        dat, y_dict, samples = val_batch
         layers = dat.keys()
         x_list = [dat[x] for x in layers]
@@ -335,7 +335,7 @@ class supervised_vae(pl.LightningModule):
     def transform(self, dataset):
         """
-        Transform the input dataset to latent representation.
+        Transform the input dataset to latent representation using batching.
         Args:
             dataset (MultiOmicDataset): MultiOmicDataset containing input matrices for each omics layer.
@@ -343,37 +343,71 @@ class supervised_vae(pl.LightningModule):
         Returns:
             pd.DataFrame: Transformed dataset as a pandas DataFrame.
         """
-        self.eval()
-        layers = list(dataset.dat.keys())
-        x_list = [dataset.dat[x] for x in layers]
-        M = self.forward(x_list)[1].detach().numpy()
-        z = pd.DataFrame(M)
-        z.columns = [''.join(['E', str(x)]) for x in z.columns]
-        z.index = dataset.samples
+        self.eval()  # Set the model to evaluation mode
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.to(device)  # Move the model to the appropriate device
+        dataloader = DataLoader(dataset, batch_size=64, shuffle=False)  # Adjust the batch size as needed
+        all_latent_representations = []  # Initialize a list to collect all batch latent representations
+        sample_names = []  # List to collect sample names
+        # Process each batch
+        for batch in dataloader:
+            dat, _, samples = batch
+            x_list = [dat[x].to(device) for x in dat.keys()]  # Prepare the data batch for processing
+            # Perform the forward pass and extract the latent representation
+            latent_representation = self.forward(x_list)[1].detach().cpu().numpy()  # Index [1] assumes second return is the latent rep
+            all_latent_representations.append(latent_representation)  # Store the batch's latent representation
+            sample_names.extend(samples)  # Collect sample names for this batch
+        # Concatenate all batch latent representations into one array
+        concatenated_latents = np.concatenate(all_latent_representations, axis=0)
+        # Convert the array to a DataFrame
+        z = pd.DataFrame(concatenated_latents)
+        z.columns = ['E' + str(i) for i in range(z.shape[1])]  # Name columns
+        z.index = sample_names  # Set DataFrame index to sample names
         return z
     def predict(self, dataset):
         """
-        Evaluate the model on a dataset.
+        Evaluate the model on a dataset using batching.
         Args:
-            dataset (MultiOmicDataset): dataset containing input matrices for each omics layer.
+            dataset (MultiOmicDataset): Dataset containing input matrices for each omics layer.
         Returns:
-            predicted values.
+            dict: Predicted values mapped by target variable names.
         """
-        self.eval()
-        layers = list(dataset.dat.keys())
-        x_list = [dataset.dat[x] for x in layers]
-        X_hat, z, mean, log_var, outputs = self.forward(x_list)
-        predictions = {}
-        for var in self.variables:
-            y_pred = outputs[var].detach().numpy()
-            if self.dataset.variable_types[var] == 'categorical':
-                predictions[var] = np.argmax(y_pred, axis=1)
-            else:
-                predictions[var] = y_pred
+        self.eval()  # Set the model to evaluation mode
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.to(device)  # Move the model to the appropriate device
+        dataloader = DataLoader(dataset, batch_size=64, shuffle=False)  # Adjust the batch size as needed
+        predictions = {var: [] for var in self.variables}  # Initialize prediction storage
+        # Process each batch
+        for batch in dataloader:
+            dat, _, _ = batch
+            x_list = [dat[x].to(device) for x in dat.keys()]  # Prepare the data batch for processing
+            # Perform the forward pass
+            X_hat, z, mean, log_var, outputs = self.forward(x_list)
+            # Collect predictions for each variable
+            for var in self.variables:
+                y_pred = outputs[var].detach().cpu().numpy()  # Move outputs back to CPU and convert to numpy
+                if dataset.variable_types[var] == 'categorical':
+                    predictions[var].extend(np.argmax(y_pred, axis=1))
+                else:
+                    predictions[var].extend(y_pred)
+        # Convert lists to arrays if necessary, depending on the downstream use-case
+        predictions = {var: np.array(predictions[var]) for var in predictions}
         return predictions
@@ -484,7 +518,7 @@ class supervised_vae(pl.LightningModule):
         aggregated_attributions = [[] for _ in range(num_class)]
         for batch in dataloader:
-            dat, _ = batch
+            dat, _, _ = batch
             x_list = [dat[x].to(device) for x in dat.keys()]
             input_data = tuple([data.unsqueeze(0).requires_grad_() for data in x_list])
             baseline = tuple(torch.zeros_like(x) for x in input_data)

{flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/models/triplet_encoder.py RENAMED Viewed

@@ -405,7 +405,7 @@ class MultiTripletNetwork(pl.LightningModule):
         if self.variable_types[target_var] == 'numerical':
             num_class = 1
         else:
-            num_class = len(np.unique([y[target_var] for _, y in dataset]))
+            num_class = len(np.unique([y[target_var] for _, y, _ in dataset]))
         aggregated_attributions = [[] for _ in range(num_class)]
         for batch in dataloader:

{flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flexynesis
-Version: 0.2.1
+Version: 0.2.3
 Summary: A deep-learning based multi-omics bulk sequencing data integration suite with a focus on (pre-)clinical endpoint prediction.
 Author-email: Bora Uyar <bora.uyar@mdc-berlin.de>, Taras Savchyn <Taras.Savchyn@mdc-berlin.de>, Ricardo Wurmus <Ricardo.Wurmus@mdc-berlin.de>, Ahmet Sarigun <Ahmet.Sariguen@mdc-berlin.de>
 Project-URL: homepage, https://github.com/BIMSBbioinfo/flexynesis

{flexynesis-0.2.1 → flexynesis-0.2.3}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "flexynesis"
-version = "0.2.1"
+version = "0.2.3"
 authors = [
     {name = "Bora Uyar", email = "bora.uyar@mdc-berlin.de"},
     {name = "Taras Savchyn", email = "Taras.Savchyn@mdc-berlin.de"},