PyPI - flexynesis - Version diff - 0.2.1 (tar.gz) → 0.2.2 (tar.gz) - Mend

flexynesis 0.2.1 (tar.gz) → 0.2.2 (tar.gz)

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Files changed (32)

{flexynesis-0.2.1 → flexynesis-0.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flexynesis
-Version: 0.2.1
+Version: 0.2.2
 Summary: A deep-learning based multi-omics bulk sequencing data integration suite with a focus on (pre-)clinical endpoint prediction.
 Author-email: Bora Uyar <bora.uyar@mdc-berlin.de>, Taras Savchyn <Taras.Savchyn@mdc-berlin.de>, Ricardo Wurmus <Ricardo.Wurmus@mdc-berlin.de>, Ahmet Sarigun <Ahmet.Sariguen@mdc-berlin.de>
 Project-URL: homepage, https://github.com/BIMSBbioinfo/flexynesis

{flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/data.py RENAMED Viewed

@@ -525,7 +525,7 @@ class MultiOmicDataset(Dataset):
         """
         subset_dat = {x: self.dat[x][index] for x in self.dat.keys()}
         subset_ann = {x: self.ann[x][index] for x in self.ann.keys()}
-        return subset_dat, subset_ann
+        return subset_dat, subset_ann, self.samples[index]
     def __len__ (self):
         """Get the total number of samples in the dataset.

{flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/models/crossmodal_pred.py RENAMED Viewed

@@ -262,7 +262,7 @@ class CrossModalPred(pl.LightningModule):
         dataset, particularly handling survival analysis if applicable. All losses are aggregated to compute a total loss,
         which is logged and returned.
         """
-        dat, y_dict = train_batch
+        dat, y_dict, samples = train_batch
         # get input omics modalities and encode them; decode them to output layers
         x_list_input = [dat[x] for x in self.input_layers]
@@ -315,7 +315,7 @@ class CrossModalPred(pl.LightningModule):
         analysis where applicable. The aggregated losses are then summed up to form the total validation loss, which is logged
         and returned.
         """
-        dat, y_dict = val_batch
+        dat, y_dict, samples = val_batch
         # get input omics modalities and encode them
         x_list_input = [dat[x] for x in self.input_layers]
@@ -515,7 +515,7 @@ class CrossModalPred(pl.LightningModule):
         aggregated_attributions = [[] for _ in range(num_class)]
         for batch in dataloader:
-            dat, _ = batch
+            dat, _, _ = batch
             x_list = [dat[x].to(device) for x in self.input_layers]
             input_data = tuple([data.unsqueeze(0).requires_grad_() for data in x_list])
             baseline = tuple(torch.zeros_like(x) for x in input_data)

{flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/models/direct_pred.py RENAMED Viewed

@@ -190,7 +190,7 @@ class DirectPred(pl.LightningModule):
             torch.Tensor: The total loss computed for the batch.
         """
-        dat, y_dict = train_batch
+        dat, y_dict, samples = train_batch
         layers = dat.keys()
         x_list = [dat[x] for x in layers]
         outputs = self.forward(x_list)
@@ -226,7 +226,7 @@ class DirectPred(pl.LightningModule):
         Returns:
             torch.Tensor: The total loss computed for the batch.
         """
-        dat, y_dict = val_batch
+        dat, y_dict, samples = val_batch
         layers = dat.keys()
         x_list = [dat[x] for x in layers]
         outputs = self.forward(x_list)
@@ -250,28 +250,44 @@ class DirectPred(pl.LightningModule):
     def predict(self, dataset):
         """
-        Make predictions on an entire dataset.
+        Evaluate the model on a dataset using batching.
         Args:
-            dataset: The MultiOmicDataset object to evaluate the model on.
+            dataset (MultiOmicDataset): dataset containing input matrices for each omics layer.
         Returns:
-            dict: Predictions mapped by target variable names.
+            dict: Predicted values mapped by target variable names.
         """
-        self.eval()
-        layers = dataset.dat.keys()
-        x_list = [dataset.dat[x] for x in layers]
-        outputs = self.forward(x_list)
+        self.eval()  # Set the model to evaluation mode
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.to(device)  # Move the model to the appropriate device
-        predictions = {}
-        for var in self.variables:
-            y_pred = outputs[var].detach().numpy()
-            if self.variable_types[var] == 'categorical':
-                predictions[var] = np.argmax(y_pred, axis=1)
-            else:
-                predictions[var] = y_pred
-        return predictions
+        # Create a DataLoader with a practical batch size
+        dataloader = DataLoader(dataset, batch_size=64, shuffle=False)  # Adjust the batch size as needed
+        predictions = {var: [] for var in self.variables}  # Initialize prediction storage
+        # Process each batch
+        for batch in dataloader:
+            dat, y_dict, samples = batch
+            x_list = [dat[x].to(device) for x in dat.keys()]  # Prepare the data batch for processing
+            # Perform the forward pass
+            outputs = self.forward(x_list)
+            # Collect predictions for each variable
+            for var in self.variables:
+                y_pred = outputs[var].detach().cpu().numpy()  # Move outputs back to CPU and convert to numpy
+                if dataset.variable_types[var] == 'categorical':
+                    predictions[var].extend(np.argmax(y_pred, axis=1))
+                else:
+                    predictions[var].extend(y_pred)
+        # Convert lists to arrays if necessary, depending on the downstream use-case
+        predictions = {var: np.array(predictions[var]) for var in predictions}
+        return predictions
     def transform(self, dataset):
         """
         Transforms the input data into a lower-dimensional representation using trained encoders.
@@ -282,16 +298,36 @@ class DirectPred(pl.LightningModule):
         Returns:
             pd.DataFrame: DataFrame containing the transformed data.
         """
-        self.eval()
-        embeddings_list = []
-        # Process each input matrix with its corresponding Encoder
-        for i, x in enumerate(dataset.dat.values()):
-            embeddings_list.append(self.encoders[i](x))
-        embeddings_concat = torch.cat(embeddings_list, dim=1)
+        self.eval()  # Set the model to evaluation mode
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.to(device)  # Move the model to the appropriate device
+        dataloader = DataLoader(dataset, batch_size=64, shuffle=False)  # Adjust the batch size as needed
+        embeddings_list = []  # Initialize a list to collect all batch embeddings
+        sample_names = []  # List to collect sample names
+        # Process each batch
+        for batch in dataloader:
+            dat, _, samples = batch
+            batch_embeddings = []
+            # Process each input matrix with its corresponding Encoder
+            for i, x in enumerate(dat.values()):
+                x = x.to(device)  # Move data to GPU
+                encoded_x = self.encoders[i](x)  # Transform data using the corresponding encoder
+                batch_embeddings.append(encoded_x)
+            # Concatenate all embeddings from the current batch
+            embeddings_batch_concat = torch.cat(batch_embeddings, dim=1)
+            embeddings_list.append(embeddings_batch_concat.detach().cpu())  # Move tensor back to CPU and detach
+            sample_names.extend(samples)  # Collect sample names
+        # Concatenate all batch embeddings into one tensor
+        embeddings_concat = torch.cat(embeddings_list, dim=0)
         # Converting tensor to numpy array and then to DataFrame
-        embeddings_df = pd.DataFrame(embeddings_concat.detach().numpy(),
-                                     index=dataset.samples,
+        embeddings_df = pd.DataFrame(embeddings_concat.numpy(),
+                                     index=sample_names,  # Set DataFrame index to sample names
                                      columns=[f"E{dim}" for dim in range(embeddings_concat.shape[1])])
         return embeddings_df
@@ -337,11 +373,11 @@ class DirectPred(pl.LightningModule):
         if dataset.variable_types[target_var] == 'numerical':
             num_class = 1
         else:
-            num_class = len(np.unique([y[target_var] for _, y in dataset]))
+            num_class = len(np.unique([y[target_var] for _, y, _ in dataset]))
         aggregated_attributions = [[] for _ in range(num_class)]
         for batch in dataloader:
-            dat, _ = batch
+            dat, _, _ = batch
             x_list = [dat[x].to(device) for x in dat.keys()]
             input_data = tuple([data.unsqueeze(0).requires_grad_() for data in x_list])
             baseline = tuple(torch.zeros_like(x) for x in input_data)

{flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/models/gnn_early.py RENAMED Viewed

@@ -375,7 +375,7 @@ class GNN(pl.LightningModule):
         return torch.cat(outputs_list, dim = 0)
-    def compute_feature_importance(self, dataset, target_var, steps=5, batch_size = 32):
+    def compute_feature_importance(self, dataset, target_var, steps=5, batch_size = 64):
         """
         Computes the feature importance for each variable in the dataset using the Integrated Gradients method.
         This method measures the importance of each feature by attributing the prediction output to each input feature.
@@ -397,13 +397,9 @@ class GNN(pl.LightningModule):
         """
         def bytes_to_gb(bytes):
             return bytes / 1024 ** 2
-        print("Memory before moving model to device: {:.3f} MB".format(bytes_to_gb(torch.cuda.max_memory_reserved())))
         device = torch.device("cuda" if self.device_type == 'gpu' and torch.cuda.is_available() else 'cpu')
         self.to(device)
-        print("Memory before edges: {:.3f} MB".format(bytes_to_gb(torch.cuda.max_memory_reserved())))
         self.dataset_edge_index = dataset.edge_index.to(device)
-        print("Memory after edges: {:.3f} MB".format(bytes_to_gb(torch.cuda.max_memory_reserved())))
         dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
         ig = IntegratedGradients(self.forward_target)

{flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/models/supervised_vae.py RENAMED Viewed

@@ -259,7 +259,7 @@ class supervised_vae(pl.LightningModule):
         Returns:
             torch.Tensor: The total loss computed for the batch.
         """
-        dat, y_dict = train_batch
+        dat, y_dict, samples = train_batch
         layers = dat.keys()
         x_list = [dat[x] for x in layers]
@@ -303,7 +303,7 @@ class supervised_vae(pl.LightningModule):
         Returns:
             torch.Tensor: The total loss computed for the batch.
         """
-        dat, y_dict = val_batch
+        dat, y_dict, samples = val_batch
         layers = dat.keys()
         x_list = [dat[x] for x in layers]
@@ -335,7 +335,7 @@ class supervised_vae(pl.LightningModule):
     def transform(self, dataset):
         """
-        Transform the input dataset to latent representation.
+        Transform the input dataset to latent representation using batching.
         Args:
             dataset (MultiOmicDataset): MultiOmicDataset containing input matrices for each omics layer.
@@ -343,37 +343,71 @@ class supervised_vae(pl.LightningModule):
         Returns:
             pd.DataFrame: Transformed dataset as a pandas DataFrame.
         """
-        self.eval()
-        layers = list(dataset.dat.keys())
-        x_list = [dataset.dat[x] for x in layers]
-        M = self.forward(x_list)[1].detach().numpy()
-        z = pd.DataFrame(M)
-        z.columns = [''.join(['E', str(x)]) for x in z.columns]
-        z.index = dataset.samples
+        self.eval()  # Set the model to evaluation mode
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.to(device)  # Move the model to the appropriate device
+        dataloader = DataLoader(dataset, batch_size=64, shuffle=False)  # Adjust the batch size as needed
+        all_latent_representations = []  # Initialize a list to collect all batch latent representations
+        sample_names = []  # List to collect sample names
+        # Process each batch
+        for batch in dataloader:
+            dat, _, samples = batch
+            x_list = [dat[x].to(device) for x in dat.keys()]  # Prepare the data batch for processing
+            # Perform the forward pass and extract the latent representation
+            latent_representation = self.forward(x_list)[1].detach().cpu().numpy()  # Index [1] assumes second return is the latent rep
+            all_latent_representations.append(latent_representation)  # Store the batch's latent representation
+            sample_names.extend(samples)  # Collect sample names for this batch
+        # Concatenate all batch latent representations into one array
+        concatenated_latents = np.concatenate(all_latent_representations, axis=0)
+        # Convert the array to a DataFrame
+        z = pd.DataFrame(concatenated_latents)
+        z.columns = ['E' + str(i) for i in range(z.shape[1])]  # Name columns
+        z.index = sample_names  # Set DataFrame index to sample names
         return z
     def predict(self, dataset):
         """
-        Evaluate the model on a dataset.
+        Evaluate the model on a dataset using batching.
         Args:
-            dataset (MultiOmicDataset): dataset containing input matrices for each omics layer.
+            dataset (MultiOmicDataset): Dataset containing input matrices for each omics layer.
         Returns:
-            predicted values.
+            dict: Predicted values mapped by target variable names.
         """
-        self.eval()
-        layers = list(dataset.dat.keys())
-        x_list = [dataset.dat[x] for x in layers]
-        X_hat, z, mean, log_var, outputs = self.forward(x_list)
-        predictions = {}
-        for var in self.variables:
-            y_pred = outputs[var].detach().numpy()
-            if self.dataset.variable_types[var] == 'categorical':
-                predictions[var] = np.argmax(y_pred, axis=1)
-            else:
-                predictions[var] = y_pred
+        self.eval()  # Set the model to evaluation mode
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.to(device)  # Move the model to the appropriate device
+        dataloader = DataLoader(dataset, batch_size=64, shuffle=False)  # Adjust the batch size as needed
+        predictions = {var: [] for var in self.variables}  # Initialize prediction storage
+        # Process each batch
+        for batch in dataloader:
+            dat, _, _ = batch
+            x_list = [dat[x].to(device) for x in dat.keys()]  # Prepare the data batch for processing
+            # Perform the forward pass
+            X_hat, z, mean, log_var, outputs = self.forward(x_list)
+            # Collect predictions for each variable
+            for var in self.variables:
+                y_pred = outputs[var].detach().cpu().numpy()  # Move outputs back to CPU and convert to numpy
+                if dataset.variable_types[var] == 'categorical':
+                    predictions[var].extend(np.argmax(y_pred, axis=1))
+                else:
+                    predictions[var].extend(y_pred)
+        # Convert lists to arrays if necessary, depending on the downstream use-case
+        predictions = {var: np.array(predictions[var]) for var in predictions}
         return predictions
@@ -484,7 +518,7 @@ class supervised_vae(pl.LightningModule):
         aggregated_attributions = [[] for _ in range(num_class)]
         for batch in dataloader:
-            dat, _ = batch
+            dat, _, _ = batch
             x_list = [dat[x].to(device) for x in dat.keys()]
             input_data = tuple([data.unsqueeze(0).requires_grad_() for data in x_list])
             baseline = tuple(torch.zeros_like(x) for x in input_data)

{flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/models/triplet_encoder.py RENAMED Viewed

@@ -405,7 +405,7 @@ class MultiTripletNetwork(pl.LightningModule):
         if self.variable_types[target_var] == 'numerical':
             num_class = 1
         else:
-            num_class = len(np.unique([y[target_var] for _, y in dataset]))
+            num_class = len(np.unique([y[target_var] for _, y, _ in dataset]))
         aggregated_attributions = [[] for _ in range(num_class)]
         for batch in dataloader:

{flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flexynesis
-Version: 0.2.1
+Version: 0.2.2
 Summary: A deep-learning based multi-omics bulk sequencing data integration suite with a focus on (pre-)clinical endpoint prediction.
 Author-email: Bora Uyar <bora.uyar@mdc-berlin.de>, Taras Savchyn <Taras.Savchyn@mdc-berlin.de>, Ricardo Wurmus <Ricardo.Wurmus@mdc-berlin.de>, Ahmet Sarigun <Ahmet.Sariguen@mdc-berlin.de>
 Project-URL: homepage, https://github.com/BIMSBbioinfo/flexynesis

{flexynesis-0.2.1 → flexynesis-0.2.2}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "flexynesis"
-version = "0.2.1"
+version = "0.2.2"
 authors = [
     {name = "Bora Uyar", email = "bora.uyar@mdc-berlin.de"},
     {name = "Taras Savchyn", email = "Taras.Savchyn@mdc-berlin.de"},