flexynesis 0.2.1__tar.gz → 0.2.2__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (32)
  1. {flexynesis-0.2.1 → flexynesis-0.2.2}/PKG-INFO +1 -1
  2. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/data.py +1 -1
  3. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/models/crossmodal_pred.py +3 -3
  4. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/models/direct_pred.py +63 -27
  5. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/models/gnn_early.py +1 -5
  6. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/models/supervised_vae.py +60 -26
  7. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/models/triplet_encoder.py +1 -1
  8. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis.egg-info/PKG-INFO +1 -1
  9. {flexynesis-0.2.1 → flexynesis-0.2.2}/pyproject.toml +1 -1
  10. {flexynesis-0.2.1 → flexynesis-0.2.2}/LICENCE.md +0 -0
  11. {flexynesis-0.2.1 → flexynesis-0.2.2}/README.md +0 -0
  12. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/__init__.py +0 -0
  13. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/__main__.py +0 -0
  14. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/cli.py +0 -0
  15. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/config.py +0 -0
  16. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/feature_selection.py +0 -0
  17. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/main.py +0 -0
  18. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/models/__init__.py +0 -0
  19. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/models/on_ice/direct_pred_cnn.py +0 -0
  20. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/models/on_ice/direct_pred_gcnn.py +0 -0
  21. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/models/on_ice/modules_on_ice.py +0 -0
  22. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/modules.py +0 -0
  23. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis/utils.py +0 -0
  24. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis.egg-info/SOURCES.txt +0 -0
  25. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis.egg-info/dependency_links.txt +0 -0
  26. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis.egg-info/entry_points.txt +0 -0
  27. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis.egg-info/requires.txt +0 -0
  28. {flexynesis-0.2.1 → flexynesis-0.2.2}/flexynesis.egg-info/top_level.txt +0 -0
  29. {flexynesis-0.2.1 → flexynesis-0.2.2}/setup.cfg +0 -0
  30. {flexynesis-0.2.1 → flexynesis-0.2.2}/tests/__init__.py +0 -0
  31. {flexynesis-0.2.1 → flexynesis-0.2.2}/tests/unit/__init__.py +0 -0
  32. {flexynesis-0.2.1 → flexynesis-0.2.2}/tests/unit/test_smoke.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: flexynesis
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: A deep-learning based multi-omics bulk sequencing data integration suite with a focus on (pre-)clinical endpoint prediction.
5
5
  Author-email: Bora Uyar <bora.uyar@mdc-berlin.de>, Taras Savchyn <Taras.Savchyn@mdc-berlin.de>, Ricardo Wurmus <Ricardo.Wurmus@mdc-berlin.de>, Ahmet Sarigun <Ahmet.Sariguen@mdc-berlin.de>
6
6
  Project-URL: homepage, https://github.com/BIMSBbioinfo/flexynesis
@@ -525,7 +525,7 @@ class MultiOmicDataset(Dataset):
525
525
  """
526
526
  subset_dat = {x: self.dat[x][index] for x in self.dat.keys()}
527
527
  subset_ann = {x: self.ann[x][index] for x in self.ann.keys()}
528
- return subset_dat, subset_ann
528
+ return subset_dat, subset_ann, self.samples[index]
529
529
 
530
530
  def __len__ (self):
531
531
  """Get the total number of samples in the dataset.
@@ -262,7 +262,7 @@ class CrossModalPred(pl.LightningModule):
262
262
  dataset, particularly handling survival analysis if applicable. All losses are aggregated to compute a total loss,
263
263
  which is logged and returned.
264
264
  """
265
- dat, y_dict = train_batch
265
+ dat, y_dict, samples = train_batch
266
266
 
267
267
  # get input omics modalities and encode them; decode them to output layers
268
268
  x_list_input = [dat[x] for x in self.input_layers]
@@ -315,7 +315,7 @@ class CrossModalPred(pl.LightningModule):
315
315
  analysis where applicable. The aggregated losses are then summed up to form the total validation loss, which is logged
316
316
  and returned.
317
317
  """
318
- dat, y_dict = val_batch
318
+ dat, y_dict, samples = val_batch
319
319
 
320
320
  # get input omics modalities and encode them
321
321
  x_list_input = [dat[x] for x in self.input_layers]
@@ -515,7 +515,7 @@ class CrossModalPred(pl.LightningModule):
515
515
  aggregated_attributions = [[] for _ in range(num_class)]
516
516
 
517
517
  for batch in dataloader:
518
- dat, _ = batch
518
+ dat, _, _ = batch
519
519
  x_list = [dat[x].to(device) for x in self.input_layers]
520
520
  input_data = tuple([data.unsqueeze(0).requires_grad_() for data in x_list])
521
521
  baseline = tuple(torch.zeros_like(x) for x in input_data)
@@ -190,7 +190,7 @@ class DirectPred(pl.LightningModule):
190
190
  torch.Tensor: The total loss computed for the batch.
191
191
  """
192
192
 
193
- dat, y_dict = train_batch
193
+ dat, y_dict, samples = train_batch
194
194
  layers = dat.keys()
195
195
  x_list = [dat[x] for x in layers]
196
196
  outputs = self.forward(x_list)
@@ -226,7 +226,7 @@ class DirectPred(pl.LightningModule):
226
226
  Returns:
227
227
  torch.Tensor: The total loss computed for the batch.
228
228
  """
229
- dat, y_dict = val_batch
229
+ dat, y_dict, samples = val_batch
230
230
  layers = dat.keys()
231
231
  x_list = [dat[x] for x in layers]
232
232
  outputs = self.forward(x_list)
@@ -250,28 +250,44 @@ class DirectPred(pl.LightningModule):
250
250
 
251
251
  def predict(self, dataset):
252
252
  """
253
- Make predictions on an entire dataset.
253
+ Evaluate the model on a dataset using batching.
254
254
 
255
255
  Args:
256
- dataset: The MultiOmicDataset object to evaluate the model on.
256
+ dataset (MultiOmicDataset): dataset containing input matrices for each omics layer.
257
257
 
258
258
  Returns:
259
- dict: Predictions mapped by target variable names.
259
+ dict: Predicted values mapped by target variable names.
260
260
  """
261
- self.eval()
262
- layers = dataset.dat.keys()
263
- x_list = [dataset.dat[x] for x in layers]
264
- outputs = self.forward(x_list)
261
+ self.eval() # Set the model to evaluation mode
262
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
263
+ self.to(device) # Move the model to the appropriate device
265
264
 
266
- predictions = {}
267
- for var in self.variables:
268
- y_pred = outputs[var].detach().numpy()
269
- if self.variable_types[var] == 'categorical':
270
- predictions[var] = np.argmax(y_pred, axis=1)
271
- else:
272
- predictions[var] = y_pred
273
- return predictions
265
+ # Create a DataLoader with a practical batch size
266
+ dataloader = DataLoader(dataset, batch_size=64, shuffle=False) # Adjust the batch size as needed
267
+
268
+ predictions = {var: [] for var in self.variables} # Initialize prediction storage
269
+
270
+ # Process each batch
271
+ for batch in dataloader:
272
+ dat, y_dict, samples = batch
273
+ x_list = [dat[x].to(device) for x in dat.keys()] # Prepare the data batch for processing
274
+
275
+ # Perform the forward pass
276
+ outputs = self.forward(x_list)
277
+
278
+ # Collect predictions for each variable
279
+ for var in self.variables:
280
+ y_pred = outputs[var].detach().cpu().numpy() # Move outputs back to CPU and convert to numpy
281
+ if dataset.variable_types[var] == 'categorical':
282
+ predictions[var].extend(np.argmax(y_pred, axis=1))
283
+ else:
284
+ predictions[var].extend(y_pred)
285
+
286
+ # Convert lists to arrays if necessary, depending on the downstream use-case
287
+ predictions = {var: np.array(predictions[var]) for var in predictions}
274
288
 
289
+ return predictions
290
+
275
291
  def transform(self, dataset):
276
292
  """
277
293
  Transforms the input data into a lower-dimensional representation using trained encoders.
@@ -282,16 +298,36 @@ class DirectPred(pl.LightningModule):
282
298
  Returns:
283
299
  pd.DataFrame: DataFrame containing the transformed data.
284
300
  """
285
- self.eval()
286
- embeddings_list = []
287
- # Process each input matrix with its corresponding Encoder
288
- for i, x in enumerate(dataset.dat.values()):
289
- embeddings_list.append(self.encoders[i](x))
290
- embeddings_concat = torch.cat(embeddings_list, dim=1)
301
+ self.eval() # Set the model to evaluation mode
302
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
303
+ self.to(device) # Move the model to the appropriate device
304
+
305
+ dataloader = DataLoader(dataset, batch_size=64, shuffle=False) # Adjust the batch size as needed
306
+
307
+ embeddings_list = [] # Initialize a list to collect all batch embeddings
308
+ sample_names = [] # List to collect sample names
309
+
310
+ # Process each batch
311
+ for batch in dataloader:
312
+ dat, _, samples = batch
313
+ batch_embeddings = []
314
+ # Process each input matrix with its corresponding Encoder
315
+ for i, x in enumerate(dat.values()):
316
+ x = x.to(device) # Move data to GPU
317
+ encoded_x = self.encoders[i](x) # Transform data using the corresponding encoder
318
+ batch_embeddings.append(encoded_x)
319
+
320
+ # Concatenate all embeddings from the current batch
321
+ embeddings_batch_concat = torch.cat(batch_embeddings, dim=1)
322
+ embeddings_list.append(embeddings_batch_concat.detach().cpu()) # Move tensor back to CPU and detach
323
+ sample_names.extend(samples) # Collect sample names
324
+
325
+ # Concatenate all batch embeddings into one tensor
326
+ embeddings_concat = torch.cat(embeddings_list, dim=0)
291
327
 
292
328
  # Converting tensor to numpy array and then to DataFrame
293
- embeddings_df = pd.DataFrame(embeddings_concat.detach().numpy(),
294
- index=dataset.samples,
329
+ embeddings_df = pd.DataFrame(embeddings_concat.numpy(),
330
+ index=sample_names, # Set DataFrame index to sample names
295
331
  columns=[f"E{dim}" for dim in range(embeddings_concat.shape[1])])
296
332
  return embeddings_df
297
333
 
@@ -337,11 +373,11 @@ class DirectPred(pl.LightningModule):
337
373
  if dataset.variable_types[target_var] == 'numerical':
338
374
  num_class = 1
339
375
  else:
340
- num_class = len(np.unique([y[target_var] for _, y in dataset]))
376
+ num_class = len(np.unique([y[target_var] for _, y, _ in dataset]))
341
377
 
342
378
  aggregated_attributions = [[] for _ in range(num_class)]
343
379
  for batch in dataloader:
344
- dat, _ = batch
380
+ dat, _, _ = batch
345
381
  x_list = [dat[x].to(device) for x in dat.keys()]
346
382
  input_data = tuple([data.unsqueeze(0).requires_grad_() for data in x_list])
347
383
  baseline = tuple(torch.zeros_like(x) for x in input_data)
@@ -375,7 +375,7 @@ class GNN(pl.LightningModule):
375
375
  return torch.cat(outputs_list, dim = 0)
376
376
 
377
377
 
378
- def compute_feature_importance(self, dataset, target_var, steps=5, batch_size = 32):
378
+ def compute_feature_importance(self, dataset, target_var, steps=5, batch_size = 64):
379
379
  """
380
380
  Computes the feature importance for each variable in the dataset using the Integrated Gradients method.
381
381
  This method measures the importance of each feature by attributing the prediction output to each input feature.
@@ -397,13 +397,9 @@ class GNN(pl.LightningModule):
397
397
  """
398
398
  def bytes_to_gb(bytes):
399
399
  return bytes / 1024 ** 2
400
- print("Memory before moving model to device: {:.3f} MB".format(bytes_to_gb(torch.cuda.max_memory_reserved())))
401
400
  device = torch.device("cuda" if self.device_type == 'gpu' and torch.cuda.is_available() else 'cpu')
402
401
  self.to(device)
403
- print("Memory before edges: {:.3f} MB".format(bytes_to_gb(torch.cuda.max_memory_reserved())))
404
402
  self.dataset_edge_index = dataset.edge_index.to(device)
405
- print("Memory after edges: {:.3f} MB".format(bytes_to_gb(torch.cuda.max_memory_reserved())))
406
-
407
403
 
408
404
  dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
409
405
  ig = IntegratedGradients(self.forward_target)
@@ -259,7 +259,7 @@ class supervised_vae(pl.LightningModule):
259
259
  Returns:
260
260
  torch.Tensor: The total loss computed for the batch.
261
261
  """
262
- dat, y_dict = train_batch
262
+ dat, y_dict, samples = train_batch
263
263
  layers = dat.keys()
264
264
  x_list = [dat[x] for x in layers]
265
265
 
@@ -303,7 +303,7 @@ class supervised_vae(pl.LightningModule):
303
303
  Returns:
304
304
  torch.Tensor: The total loss computed for the batch.
305
305
  """
306
- dat, y_dict = val_batch
306
+ dat, y_dict, samples = val_batch
307
307
  layers = dat.keys()
308
308
  x_list = [dat[x] for x in layers]
309
309
 
@@ -335,7 +335,7 @@ class supervised_vae(pl.LightningModule):
335
335
 
336
336
  def transform(self, dataset):
337
337
  """
338
- Transform the input dataset to latent representation.
338
+ Transform the input dataset to latent representation using batching.
339
339
 
340
340
  Args:
341
341
  dataset (MultiOmicDataset): MultiOmicDataset containing input matrices for each omics layer.
@@ -343,37 +343,71 @@ class supervised_vae(pl.LightningModule):
343
343
  Returns:
344
344
  pd.DataFrame: Transformed dataset as a pandas DataFrame.
345
345
  """
346
- self.eval()
347
- layers = list(dataset.dat.keys())
348
- x_list = [dataset.dat[x] for x in layers]
349
- M = self.forward(x_list)[1].detach().numpy()
350
- z = pd.DataFrame(M)
351
- z.columns = [''.join(['E', str(x)]) for x in z.columns]
352
- z.index = dataset.samples
346
+ self.eval() # Set the model to evaluation mode
347
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
348
+ self.to(device) # Move the model to the appropriate device
349
+
350
+ dataloader = DataLoader(dataset, batch_size=64, shuffle=False) # Adjust the batch size as needed
351
+ all_latent_representations = [] # Initialize a list to collect all batch latent representations
352
+ sample_names = [] # List to collect sample names
353
+
354
+ # Process each batch
355
+ for batch in dataloader:
356
+ dat, _, samples = batch
357
+ x_list = [dat[x].to(device) for x in dat.keys()] # Prepare the data batch for processing
358
+
359
+ # Perform the forward pass and extract the latent representation
360
+ latent_representation = self.forward(x_list)[1].detach().cpu().numpy() # Index [1] assumes second return is the latent rep
361
+
362
+ all_latent_representations.append(latent_representation) # Store the batch's latent representation
363
+ sample_names.extend(samples) # Collect sample names for this batch
364
+
365
+ # Concatenate all batch latent representations into one array
366
+ concatenated_latents = np.concatenate(all_latent_representations, axis=0)
367
+
368
+ # Convert the array to a DataFrame
369
+ z = pd.DataFrame(concatenated_latents)
370
+ z.columns = ['E' + str(i) for i in range(z.shape[1])] # Name columns
371
+ z.index = sample_names # Set DataFrame index to sample names
372
+
353
373
  return z
354
374
 
355
375
  def predict(self, dataset):
356
376
  """
357
- Evaluate the model on a dataset.
377
+ Evaluate the model on a dataset using batching.
358
378
 
359
379
  Args:
360
- dataset (MultiOmicDataset): dataset containing input matrices for each omics layer.
380
+ dataset (MultiOmicDataset): Dataset containing input matrices for each omics layer.
361
381
 
362
382
  Returns:
363
- predicted values.
383
+ dict: Predicted values mapped by target variable names.
364
384
  """
365
- self.eval()
366
- layers = list(dataset.dat.keys())
367
- x_list = [dataset.dat[x] for x in layers]
368
- X_hat, z, mean, log_var, outputs = self.forward(x_list)
369
-
370
- predictions = {}
371
- for var in self.variables:
372
- y_pred = outputs[var].detach().numpy()
373
- if self.dataset.variable_types[var] == 'categorical':
374
- predictions[var] = np.argmax(y_pred, axis=1)
375
- else:
376
- predictions[var] = y_pred
385
+ self.eval() # Set the model to evaluation mode
386
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
387
+ self.to(device) # Move the model to the appropriate device
388
+
389
+ dataloader = DataLoader(dataset, batch_size=64, shuffle=False) # Adjust the batch size as needed
390
+
391
+ predictions = {var: [] for var in self.variables} # Initialize prediction storage
392
+
393
+ # Process each batch
394
+ for batch in dataloader:
395
+ dat, _, _ = batch
396
+ x_list = [dat[x].to(device) for x in dat.keys()] # Prepare the data batch for processing
397
+
398
+ # Perform the forward pass
399
+ X_hat, z, mean, log_var, outputs = self.forward(x_list)
400
+
401
+ # Collect predictions for each variable
402
+ for var in self.variables:
403
+ y_pred = outputs[var].detach().cpu().numpy() # Move outputs back to CPU and convert to numpy
404
+ if dataset.variable_types[var] == 'categorical':
405
+ predictions[var].extend(np.argmax(y_pred, axis=1))
406
+ else:
407
+ predictions[var].extend(y_pred)
408
+
409
+ # Convert lists to arrays if necessary, depending on the downstream use-case
410
+ predictions = {var: np.array(predictions[var]) for var in predictions}
377
411
 
378
412
  return predictions
379
413
 
@@ -484,7 +518,7 @@ class supervised_vae(pl.LightningModule):
484
518
  aggregated_attributions = [[] for _ in range(num_class)]
485
519
 
486
520
  for batch in dataloader:
487
- dat, _ = batch
521
+ dat, _, _ = batch
488
522
  x_list = [dat[x].to(device) for x in dat.keys()]
489
523
  input_data = tuple([data.unsqueeze(0).requires_grad_() for data in x_list])
490
524
  baseline = tuple(torch.zeros_like(x) for x in input_data)
@@ -405,7 +405,7 @@ class MultiTripletNetwork(pl.LightningModule):
405
405
  if self.variable_types[target_var] == 'numerical':
406
406
  num_class = 1
407
407
  else:
408
- num_class = len(np.unique([y[target_var] for _, y in dataset]))
408
+ num_class = len(np.unique([y[target_var] for _, y, _ in dataset]))
409
409
 
410
410
  aggregated_attributions = [[] for _ in range(num_class)]
411
411
  for batch in dataloader:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: flexynesis
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: A deep-learning based multi-omics bulk sequencing data integration suite with a focus on (pre-)clinical endpoint prediction.
5
5
  Author-email: Bora Uyar <bora.uyar@mdc-berlin.de>, Taras Savchyn <Taras.Savchyn@mdc-berlin.de>, Ricardo Wurmus <Ricardo.Wurmus@mdc-berlin.de>, Ahmet Sarigun <Ahmet.Sariguen@mdc-berlin.de>
6
6
  Project-URL: homepage, https://github.com/BIMSBbioinfo/flexynesis
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "flexynesis"
7
- version = "0.2.1"
7
+ version = "0.2.2"
8
8
  authors = [
9
9
  {name = "Bora Uyar", email = "bora.uyar@mdc-berlin.de"},
10
10
  {name = "Taras Savchyn", email = "Taras.Savchyn@mdc-berlin.de"},
File without changes
File without changes
File without changes
File without changes
File without changes