PyPI - flexynesis - Versions diffs - 0.2.0__tar.gz → 0.2.2__tar.gz - Mend

flexynesis-0.2.0.tar.gz → flexynesis-0.2.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

{flexynesis-0.2.0 → flexynesis-0.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flexynesis
-Version: 0.2.0
+Version: 0.2.2
 Summary: A deep-learning based multi-omics bulk sequencing data integration suite with a focus on (pre-)clinical endpoint prediction.
 Author-email: Bora Uyar <bora.uyar@mdc-berlin.de>, Taras Savchyn <Taras.Savchyn@mdc-berlin.de>, Ricardo Wurmus <Ricardo.Wurmus@mdc-berlin.de>, Ahmet Sarigun <Ahmet.Sariguen@mdc-berlin.de>
 Project-URL: homepage, https://github.com/BIMSBbioinfo/flexynesis

{flexynesis-0.2.0 → flexynesis-0.2.2}/flexynesis/data.py RENAMED Viewed

@@ -525,7 +525,7 @@ class MultiOmicDataset(Dataset):
         """
         subset_dat = {x: self.dat[x][index] for x in self.dat.keys()}
         subset_ann = {x: self.ann[x][index] for x in self.ann.keys()}
-        return subset_dat, subset_ann
+        return subset_dat, subset_ann, self.samples[index]
     def __len__ (self):
         """Get the total number of samples in the dataset.
@@ -680,7 +680,15 @@ class MultiOmicDatasetNW(Dataset):
             if indices:  # Ensure there are common features in this data type
                 all_features[:, :, i] = data_matrix[:, indices]
         return all_features
+    def subset(self, indices):
+        # Create a subset of the main multiomic dataset
+        dataset_subset = self.multiomic_dataset.subset(indices)
+        # Create a new instance of MultiOmicDatasetNW with the subsetted multiomic dataset
+        return MultiOmicDatasetNW(dataset_subset, self.interaction_df.copy())
     def __getitem__(self, idx):
         node_features_tensor = self.node_features_tensor[idx]
         y_dict = {target_name: self.labels[target_name][idx] for target_name in self.labels}

{flexynesis-0.2.0 → flexynesis-0.2.2}/flexynesis/models/__init__.py RENAMED Viewed

@@ -3,5 +3,4 @@ from .supervised_vae import supervised_vae
 from .triplet_encoder import MultiTripletNetwork
 from .crossmodal_pred import CrossModalPred
 from .gnn_early import GNN
 __all__ = ["DirectPred", "supervised_vae", "MultiTripletNetwork", "CrossModalPred", "GNN"]

{flexynesis-0.2.0 → flexynesis-0.2.2}/flexynesis/models/crossmodal_pred.py RENAMED Viewed

@@ -262,7 +262,7 @@ class CrossModalPred(pl.LightningModule):
         dataset, particularly handling survival analysis if applicable. All losses are aggregated to compute a total loss,
         which is logged and returned.
         """
-        dat, y_dict = train_batch
+        dat, y_dict, samples = train_batch
         # get input omics modalities and encode them; decode them to output layers
         x_list_input = [dat[x] for x in self.input_layers]
@@ -315,7 +315,7 @@ class CrossModalPred(pl.LightningModule):
         analysis where applicable. The aggregated losses are then summed up to form the total validation loss, which is logged
         and returned.
         """
-        dat, y_dict = val_batch
+        dat, y_dict, samples = val_batch
         # get input omics modalities and encode them
         x_list_input = [dat[x] for x in self.input_layers]
@@ -515,7 +515,7 @@ class CrossModalPred(pl.LightningModule):
         aggregated_attributions = [[] for _ in range(num_class)]
         for batch in dataloader:
-            dat, _ = batch
+            dat, _, _ = batch
             x_list = [dat[x].to(device) for x in self.input_layers]
             input_data = tuple([data.unsqueeze(0).requires_grad_() for data in x_list])
             baseline = tuple(torch.zeros_like(x) for x in input_data)

{flexynesis-0.2.0 → flexynesis-0.2.2}/flexynesis/models/direct_pred.py RENAMED Viewed

@@ -190,7 +190,7 @@ class DirectPred(pl.LightningModule):
             torch.Tensor: The total loss computed for the batch.
         """
-        dat, y_dict = train_batch
+        dat, y_dict, samples = train_batch
         layers = dat.keys()
         x_list = [dat[x] for x in layers]
         outputs = self.forward(x_list)
@@ -226,7 +226,7 @@ class DirectPred(pl.LightningModule):
         Returns:
             torch.Tensor: The total loss computed for the batch.
         """
-        dat, y_dict = val_batch
+        dat, y_dict, samples = val_batch
         layers = dat.keys()
         x_list = [dat[x] for x in layers]
         outputs = self.forward(x_list)
@@ -250,28 +250,44 @@ class DirectPred(pl.LightningModule):
     def predict(self, dataset):
         """
-        Make predictions on an entire dataset.
+        Evaluate the model on a dataset using batching.
         Args:
-            dataset: The MultiOmicDataset object to evaluate the model on.
+            dataset (MultiOmicDataset): dataset containing input matrices for each omics layer.
         Returns:
-            dict: Predictions mapped by target variable names.
+            dict: Predicted values mapped by target variable names.
         """
-        self.eval()
-        layers = dataset.dat.keys()
-        x_list = [dataset.dat[x] for x in layers]
-        outputs = self.forward(x_list)
+        self.eval()  # Set the model to evaluation mode
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.to(device)  # Move the model to the appropriate device
-        predictions = {}
-        for var in self.variables:
-            y_pred = outputs[var].detach().numpy()
-            if self.variable_types[var] == 'categorical':
-                predictions[var] = np.argmax(y_pred, axis=1)
-            else:
-                predictions[var] = y_pred
-        return predictions
+        # Create a DataLoader with a practical batch size
+        dataloader = DataLoader(dataset, batch_size=64, shuffle=False)  # Adjust the batch size as needed
+        predictions = {var: [] for var in self.variables}  # Initialize prediction storage
+        # Process each batch
+        for batch in dataloader:
+            dat, y_dict, samples = batch
+            x_list = [dat[x].to(device) for x in dat.keys()]  # Prepare the data batch for processing
+            # Perform the forward pass
+            outputs = self.forward(x_list)
+            # Collect predictions for each variable
+            for var in self.variables:
+                y_pred = outputs[var].detach().cpu().numpy()  # Move outputs back to CPU and convert to numpy
+                if dataset.variable_types[var] == 'categorical':
+                    predictions[var].extend(np.argmax(y_pred, axis=1))
+                else:
+                    predictions[var].extend(y_pred)
+        # Convert lists to arrays if necessary, depending on the downstream use-case
+        predictions = {var: np.array(predictions[var]) for var in predictions}
+        return predictions
     def transform(self, dataset):
         """
         Transforms the input data into a lower-dimensional representation using trained encoders.
@@ -282,16 +298,36 @@ class DirectPred(pl.LightningModule):
         Returns:
             pd.DataFrame: DataFrame containing the transformed data.
         """
-        self.eval()
-        embeddings_list = []
-        # Process each input matrix with its corresponding Encoder
-        for i, x in enumerate(dataset.dat.values()):
-            embeddings_list.append(self.encoders[i](x))
-        embeddings_concat = torch.cat(embeddings_list, dim=1)
+        self.eval()  # Set the model to evaluation mode
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.to(device)  # Move the model to the appropriate device
+        dataloader = DataLoader(dataset, batch_size=64, shuffle=False)  # Adjust the batch size as needed
+        embeddings_list = []  # Initialize a list to collect all batch embeddings
+        sample_names = []  # List to collect sample names
+        # Process each batch
+        for batch in dataloader:
+            dat, _, samples = batch
+            batch_embeddings = []
+            # Process each input matrix with its corresponding Encoder
+            for i, x in enumerate(dat.values()):
+                x = x.to(device)  # Move data to GPU
+                encoded_x = self.encoders[i](x)  # Transform data using the corresponding encoder
+                batch_embeddings.append(encoded_x)
+            # Concatenate all embeddings from the current batch
+            embeddings_batch_concat = torch.cat(batch_embeddings, dim=1)
+            embeddings_list.append(embeddings_batch_concat.detach().cpu())  # Move tensor back to CPU and detach
+            sample_names.extend(samples)  # Collect sample names
+        # Concatenate all batch embeddings into one tensor
+        embeddings_concat = torch.cat(embeddings_list, dim=0)
         # Converting tensor to numpy array and then to DataFrame
-        embeddings_df = pd.DataFrame(embeddings_concat.detach().numpy(),
-                                     index=dataset.samples,
+        embeddings_df = pd.DataFrame(embeddings_concat.numpy(),
+                                     index=sample_names,  # Set DataFrame index to sample names
                                      columns=[f"E{dim}" for dim in range(embeddings_concat.shape[1])])
         return embeddings_df
@@ -337,11 +373,11 @@ class DirectPred(pl.LightningModule):
         if dataset.variable_types[target_var] == 'numerical':
             num_class = 1
         else:
-            num_class = len(np.unique([y[target_var] for _, y in dataset]))
+            num_class = len(np.unique([y[target_var] for _, y, _ in dataset]))
         aggregated_attributions = [[] for _ in range(num_class)]
         for batch in dataloader:
-            dat, _ = batch
+            dat, _, _ = batch
             x_list = [dat[x].to(device) for x in dat.keys()]
             input_data = tuple([data.unsqueeze(0).requires_grad_() for data in x_list])
             baseline = tuple(torch.zeros_like(x) for x in input_data)

{flexynesis-0.2.0 → flexynesis-0.2.2}/flexynesis/models/gnn_early.py RENAMED Viewed

@@ -95,15 +95,16 @@ class GNN(pl.LightningModule):
             for var in self.variables:
                 self.log_vars[var] = nn.Parameter(torch.zeros(1))
-        self.encoders = flexGCN(
-                        node_count = dataset[0][0].shape[0], #number of nodes
-                        node_feature_count= dataset[0][0].shape[1], # number of node features
-                        node_embedding_dim=int(self.config["node_embedding_dim"]),
-                        num_convs = int(self.config['num_convs']), # Number of convolutional layers
-                        output_dim=self.config["latent_dim"],
-                        act = self.config['activation'],
-                        conv = self.gnn_conv_type
-        )
+        self.encoders = nn.ModuleList([
+            flexGCN(
+                node_count = dataset[0][0].shape[0], #number of nodes
+                node_feature_count= dataset[0][0].shape[1], # number of node features
+                node_embedding_dim=int(self.config["node_embedding_dim"]),
+                num_convs = int(self.config['num_convs']), # Number of convolutional layers
+                output_dim=self.config["latent_dim"],
+                act = self.config['activation'],
+                conv = self.gnn_conv_type
+            )])
         # Init output layers
         self.MLPs = nn.ModuleDict()
@@ -129,14 +130,15 @@ class GNN(pl.LightningModule):
         Returns:
             dict: Outputs from the MLPs, one for each target variable.
         """
-        embeddings = self.encoders(x, edge_index)
+        # notice we are using the first encoder (it is currently a early fusion method)
+        embeddings = self.encoders[0](x, edge_index)
         outputs = {}
         for var, mlp in self.MLPs.items():
             outputs[var] = mlp(embeddings)
         return outputs
-    def training_step(self, batch):
+    def training_step(self, batch, batch_idx, log = True):
         """
         Performs a training step including loss calculation and logging.
@@ -164,10 +166,11 @@ class GNN(pl.LightningModule):
         total_loss = self.compute_total_loss(losses)
         losses["train_loss"] = total_loss
-        self.log_dict(losses, on_step=False, on_epoch=True, prog_bar=True, batch_size=len(batch))
+        if log:
+            self.log_dict(losses, on_step=False, on_epoch=True, prog_bar=True, batch_size=len(batch))
         return total_loss
-    def validation_step(self, batch):
+    def validation_step(self, batch, batch_idx, log = True):
         """
         Performs a validation step, computing losses for a batch of data.
@@ -194,7 +197,8 @@ class GNN(pl.LightningModule):
         total_loss = sum(losses.values())
         losses["val_loss"] = total_loss
-        self.log_dict(losses, on_step=False, on_epoch=True, prog_bar=True, batch_size=len(batch))
+        if log:
+            self.log_dict(losses, on_step=False, on_epoch=True, prog_bar=True, batch_size=len(batch))
         return total_loss
     def configure_optimizers(self):
@@ -341,7 +345,8 @@ class GNN(pl.LightningModule):
         for x, y_dict, samples in dataloader:
             x = x.to(device)  # Move data to GPU
-            embeddings = self.encoders(x, edge_index).detach().cpu().numpy()  # Compute embeddings and move to CPU
+            # notice we are using the first encoder (it is currently a early fusion method)
+            embeddings = self.encoders[0](x, edge_index).detach().cpu().numpy()  # Compute embeddings and move to CPU
             all_embeddings.append(embeddings)
             sample_ids.extend(samples)
@@ -370,7 +375,7 @@ class GNN(pl.LightningModule):
         return torch.cat(outputs_list, dim = 0)
-    def compute_feature_importance(self, dataset, target_var, steps=5, batch_size = 32):
+    def compute_feature_importance(self, dataset, target_var, steps=5, batch_size = 64):
         """
         Computes the feature importance for each variable in the dataset using the Integrated Gradients method.
         This method measures the importance of each feature by attributing the prediction output to each input feature.
@@ -392,13 +397,9 @@ class GNN(pl.LightningModule):
         """
         def bytes_to_gb(bytes):
             return bytes / 1024 ** 2
-        print("Memory before moving model to device: {:.3f} MB".format(bytes_to_gb(torch.cuda.max_memory_reserved())))
         device = torch.device("cuda" if self.device_type == 'gpu' and torch.cuda.is_available() else 'cpu')
         self.to(device)
-        print("Memory before edges: {:.3f} MB".format(bytes_to_gb(torch.cuda.max_memory_reserved())))
         self.dataset_edge_index = dataset.edge_index.to(device)
-        print("Memory after edges: {:.3f} MB".format(bytes_to_gb(torch.cuda.max_memory_reserved())))
         dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
         ig = IntegratedGradients(self.forward_target)
@@ -455,8 +456,12 @@ class GNN(pl.LightningModule):
             features = dataset.common_features
             target_class_label = dataset.label_mappings[target_var].get(i) if target_var in dataset.label_mappings else ''
             for l in range(len(layers)):
-                # extracting node feature attributes coming from different omic layers
-                importances = imp[i].squeeze().detach().numpy()[:,l]
+                # Extracting node feature attributes coming from different omic layers
+                importances_array = imp[i].squeeze().detach().numpy()
+                if importances_array.ndim == 1:
+                    importances = importances_array  # Use the array as is if it is 1-dimensional
+                else:
+                    importances = importances_array[:, l]  # Use the original indexing for 2D arrays
                 df_list.append(pd.DataFrame({'target_variable': target_var,
                                              'target_class': i,
                                              'target_class_label': target_class_label,

{flexynesis-0.2.0 → flexynesis-0.2.2}/flexynesis/models/supervised_vae.py RENAMED Viewed

@@ -259,7 +259,7 @@ class supervised_vae(pl.LightningModule):
         Returns:
             torch.Tensor: The total loss computed for the batch.
         """
-        dat, y_dict = train_batch
+        dat, y_dict, samples = train_batch
         layers = dat.keys()
         x_list = [dat[x] for x in layers]
@@ -303,7 +303,7 @@ class supervised_vae(pl.LightningModule):
         Returns:
             torch.Tensor: The total loss computed for the batch.
         """
-        dat, y_dict = val_batch
+        dat, y_dict, samples = val_batch
         layers = dat.keys()
         x_list = [dat[x] for x in layers]
@@ -335,7 +335,7 @@ class supervised_vae(pl.LightningModule):
     def transform(self, dataset):
         """
-        Transform the input dataset to latent representation.
+        Transform the input dataset to latent representation using batching.
         Args:
             dataset (MultiOmicDataset): MultiOmicDataset containing input matrices for each omics layer.
@@ -343,37 +343,71 @@ class supervised_vae(pl.LightningModule):
         Returns:
             pd.DataFrame: Transformed dataset as a pandas DataFrame.
         """
-        self.eval()
-        layers = list(dataset.dat.keys())
-        x_list = [dataset.dat[x] for x in layers]
-        M = self.forward(x_list)[1].detach().numpy()
-        z = pd.DataFrame(M)
-        z.columns = [''.join(['E', str(x)]) for x in z.columns]
-        z.index = dataset.samples
+        self.eval()  # Set the model to evaluation mode
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.to(device)  # Move the model to the appropriate device
+        dataloader = DataLoader(dataset, batch_size=64, shuffle=False)  # Adjust the batch size as needed
+        all_latent_representations = []  # Initialize a list to collect all batch latent representations
+        sample_names = []  # List to collect sample names
+        # Process each batch
+        for batch in dataloader:
+            dat, _, samples = batch
+            x_list = [dat[x].to(device) for x in dat.keys()]  # Prepare the data batch for processing
+            # Perform the forward pass and extract the latent representation
+            latent_representation = self.forward(x_list)[1].detach().cpu().numpy()  # Index [1] assumes second return is the latent rep
+            all_latent_representations.append(latent_representation)  # Store the batch's latent representation
+            sample_names.extend(samples)  # Collect sample names for this batch
+        # Concatenate all batch latent representations into one array
+        concatenated_latents = np.concatenate(all_latent_representations, axis=0)
+        # Convert the array to a DataFrame
+        z = pd.DataFrame(concatenated_latents)
+        z.columns = ['E' + str(i) for i in range(z.shape[1])]  # Name columns
+        z.index = sample_names  # Set DataFrame index to sample names
         return z
     def predict(self, dataset):
         """
-        Evaluate the model on a dataset.
+        Evaluate the model on a dataset using batching.
         Args:
-            dataset (MultiOmicDataset): dataset containing input matrices for each omics layer.
+            dataset (MultiOmicDataset): Dataset containing input matrices for each omics layer.
         Returns:
-            predicted values.
+            dict: Predicted values mapped by target variable names.
         """
-        self.eval()
-        layers = list(dataset.dat.keys())
-        x_list = [dataset.dat[x] for x in layers]
-        X_hat, z, mean, log_var, outputs = self.forward(x_list)
-        predictions = {}
-        for var in self.variables:
-            y_pred = outputs[var].detach().numpy()
-            if self.dataset.variable_types[var] == 'categorical':
-                predictions[var] = np.argmax(y_pred, axis=1)
-            else:
-                predictions[var] = y_pred
+        self.eval()  # Set the model to evaluation mode
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.to(device)  # Move the model to the appropriate device
+        dataloader = DataLoader(dataset, batch_size=64, shuffle=False)  # Adjust the batch size as needed
+        predictions = {var: [] for var in self.variables}  # Initialize prediction storage
+        # Process each batch
+        for batch in dataloader:
+            dat, _, _ = batch
+            x_list = [dat[x].to(device) for x in dat.keys()]  # Prepare the data batch for processing
+            # Perform the forward pass
+            X_hat, z, mean, log_var, outputs = self.forward(x_list)
+            # Collect predictions for each variable
+            for var in self.variables:
+                y_pred = outputs[var].detach().cpu().numpy()  # Move outputs back to CPU and convert to numpy
+                if dataset.variable_types[var] == 'categorical':
+                    predictions[var].extend(np.argmax(y_pred, axis=1))
+                else:
+                    predictions[var].extend(y_pred)
+        # Convert lists to arrays if necessary, depending on the downstream use-case
+        predictions = {var: np.array(predictions[var]) for var in predictions}
         return predictions
@@ -484,7 +518,7 @@ class supervised_vae(pl.LightningModule):
         aggregated_attributions = [[] for _ in range(num_class)]
         for batch in dataloader:
-            dat, _ = batch
+            dat, _, _ = batch
             x_list = [dat[x].to(device) for x in dat.keys()]
             input_data = tuple([data.unsqueeze(0).requires_grad_() for data in x_list])
             baseline = tuple(torch.zeros_like(x) for x in input_data)

{flexynesis-0.2.0 → flexynesis-0.2.2}/flexynesis/models/triplet_encoder.py RENAMED Viewed

@@ -405,7 +405,7 @@ class MultiTripletNetwork(pl.LightningModule):
         if self.variable_types[target_var] == 'numerical':
             num_class = 1
         else:
-            num_class = len(np.unique([y[target_var] for _, y in dataset]))
+            num_class = len(np.unique([y[target_var] for _, y, _ in dataset]))
         aggregated_attributions = [[] for _ in range(num_class)]
         for batch in dataloader:

{flexynesis-0.2.0 → flexynesis-0.2.2}/flexynesis.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flexynesis
-Version: 0.2.0
+Version: 0.2.2
 Summary: A deep-learning based multi-omics bulk sequencing data integration suite with a focus on (pre-)clinical endpoint prediction.
 Author-email: Bora Uyar <bora.uyar@mdc-berlin.de>, Taras Savchyn <Taras.Savchyn@mdc-berlin.de>, Ricardo Wurmus <Ricardo.Wurmus@mdc-berlin.de>, Ahmet Sarigun <Ahmet.Sariguen@mdc-berlin.de>
 Project-URL: homepage, https://github.com/BIMSBbioinfo/flexynesis

{flexynesis-0.2.0 → flexynesis-0.2.2}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "flexynesis"
-version = "0.2.0"
+version = "0.2.2"
 authors = [
     {name = "Bora Uyar", email = "bora.uyar@mdc-berlin.de"},
     {name = "Taras Savchyn", email = "Taras.Savchyn@mdc-berlin.de"},