flexynesis 0.2.1__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {flexynesis-0.2.1 → flexynesis-0.2.3}/PKG-INFO +1 -1
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/__main__.py +4 -1
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/config.py +4 -4
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/data.py +1 -1
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/main.py +8 -5
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/models/crossmodal_pred.py +3 -3
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/models/direct_pred.py +63 -27
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/models/gnn_early.py +1 -5
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/models/supervised_vae.py +60 -26
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/models/triplet_encoder.py +1 -1
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis.egg-info/PKG-INFO +1 -1
- {flexynesis-0.2.1 → flexynesis-0.2.3}/pyproject.toml +1 -1
- {flexynesis-0.2.1 → flexynesis-0.2.3}/LICENCE.md +0 -0
- {flexynesis-0.2.1 → flexynesis-0.2.3}/README.md +0 -0
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/__init__.py +0 -0
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/cli.py +0 -0
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/feature_selection.py +0 -0
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/models/__init__.py +0 -0
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/models/on_ice/direct_pred_cnn.py +0 -0
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/models/on_ice/direct_pred_gcnn.py +0 -0
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/models/on_ice/modules_on_ice.py +0 -0
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/modules.py +0 -0
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis/utils.py +0 -0
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis.egg-info/SOURCES.txt +0 -0
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis.egg-info/dependency_links.txt +0 -0
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis.egg-info/entry_points.txt +0 -0
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis.egg-info/requires.txt +0 -0
- {flexynesis-0.2.1 → flexynesis-0.2.3}/flexynesis.egg-info/top_level.txt +0 -0
- {flexynesis-0.2.1 → flexynesis-0.2.3}/setup.cfg +0 -0
- {flexynesis-0.2.1 → flexynesis-0.2.3}/tests/__init__.py +0 -0
- {flexynesis-0.2.1 → flexynesis-0.2.3}/tests/unit/__init__.py +0 -0
- {flexynesis-0.2.1 → flexynesis-0.2.3}/tests/unit/test_smoke.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: flexynesis
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: A deep-learning based multi-omics bulk sequencing data integration suite with a focus on (pre-)clinical endpoint prediction.
|
|
5
5
|
Author-email: Bora Uyar <bora.uyar@mdc-berlin.de>, Taras Savchyn <Taras.Savchyn@mdc-berlin.de>, Ricardo Wurmus <Ricardo.Wurmus@mdc-berlin.de>, Ahmet Sarigun <Ahmet.Sariguen@mdc-berlin.de>
|
|
6
6
|
Project-URL: homepage, https://github.com/BIMSBbioinfo/flexynesis
|
|
@@ -46,6 +46,7 @@ def main():
|
|
|
46
46
|
--use_loss_weighting (str): Whether to apply loss-balancing using uncertainty weights method. Choices are ['True', 'False']. Default is 'True'.
|
|
47
47
|
--evaluate_baseline_performance (str): Whether to run Random Forest + SVMs to see the performance of off-the-shelf tools on the same dataset. Choices are ['True', 'False']. Default is 'True'.
|
|
48
48
|
--threads (int): How many threads to use when using CPU. Default is 4.
|
|
49
|
+
--num_workers (int): How many workers to use for model training. Default is 2
|
|
49
50
|
--use_gpu (bool): If set, the system will attempt to use CUDA/GPU if available.
|
|
50
51
|
--disable_marker_finding (bool): If set, marker discovery after model training is disabled.
|
|
51
52
|
--string_organism (int): STRING DB organism id. Default is 9606.
|
|
@@ -99,6 +100,7 @@ def main():
|
|
|
99
100
|
parser.add_argument("--use_loss_weighting", help="whether to apply loss-balancing using uncertainty weights method", type=str, choices=['True', 'False'], default = 'True')
|
|
100
101
|
parser.add_argument("--evaluate_baseline_performance", help="whether to run Random Forest + SVMs to see the performance of off-the-shelf tools on the same dataset", type=str, choices=['True', 'False'], default = 'True')
|
|
101
102
|
parser.add_argument("--threads", help="(Optional) How many threads to use when using CPU (default is 4)", type=int, default = 4)
|
|
103
|
+
parser.add_argument("--num_workers", help="(Optional) How many workers to use for model training (default is 2)", type=int, default = 2)
|
|
102
104
|
parser.add_argument("--use_gpu", action="store_true",
|
|
103
105
|
help="(Optional) If set, the system will attempt to use CUDA/GPU if available.")
|
|
104
106
|
parser.add_argument("--disable_marker_finding", action="store_true",
|
|
@@ -253,7 +255,8 @@ def main():
|
|
|
253
255
|
device_type = device_type,
|
|
254
256
|
gnn_conv_type = gnn_conv_type,
|
|
255
257
|
input_layers = input_layers,
|
|
256
|
-
output_layers = output_layers
|
|
258
|
+
output_layers = output_layers,
|
|
259
|
+
num_workers = args.num_workers)
|
|
257
260
|
|
|
258
261
|
# do a hyperparameter search training multiple models and get the best_configuration
|
|
259
262
|
model, best_params = tuner.perform_tuning(hpo_patience = args.hpo_patience)
|
|
@@ -6,28 +6,28 @@ epochs = [500]
|
|
|
6
6
|
search_spaces = {
|
|
7
7
|
'DirectPred': [
|
|
8
8
|
Integer(16, 128, name='latent_dim'),
|
|
9
|
-
Real(0.2,
|
|
9
|
+
Real(0.2, 0.5, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
|
|
10
10
|
Real(0.0001, 0.01, prior='log-uniform', name='lr'),
|
|
11
11
|
Integer(8, 32, name='supervisor_hidden_dim'),
|
|
12
12
|
Categorical(epochs, name='epochs')
|
|
13
13
|
],
|
|
14
14
|
'supervised_vae': [
|
|
15
15
|
Integer(16, 128, name='latent_dim'),
|
|
16
|
-
Real(0.2,
|
|
16
|
+
Real(0.2, 0.5, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
|
|
17
17
|
Integer(8, 32, name='supervisor_hidden_dim'),
|
|
18
18
|
Real(0.0001, 0.01, prior='log-uniform', name='lr'),
|
|
19
19
|
Categorical(epochs, name='epochs')
|
|
20
20
|
],
|
|
21
21
|
'CrossModalPred': [
|
|
22
22
|
Integer(16, 128, name='latent_dim'),
|
|
23
|
-
Real(0.2,
|
|
23
|
+
Real(0.2, 0.5, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
|
|
24
24
|
Integer(8, 32, name='supervisor_hidden_dim'),
|
|
25
25
|
Real(0.0001, 0.01, prior='log-uniform', name='lr'),
|
|
26
26
|
Categorical(epochs, name='epochs')
|
|
27
27
|
],
|
|
28
28
|
'MultiTripletNetwork': [
|
|
29
29
|
Integer(16, 128, name='latent_dim'),
|
|
30
|
-
Real(0.2,
|
|
30
|
+
Real(0.2, 0.5, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
|
|
31
31
|
Integer(8, 32, name='supervisor_hidden_dim'),
|
|
32
32
|
Real(0.0001, 0.01, prior='log-uniform', name='lr'),
|
|
33
33
|
Categorical(epochs, name='epochs')
|
|
@@ -525,7 +525,7 @@ class MultiOmicDataset(Dataset):
|
|
|
525
525
|
"""
|
|
526
526
|
subset_dat = {x: self.dat[x][index] for x in self.dat.keys()}
|
|
527
527
|
subset_ann = {x: self.ann[x][index] for x in self.ann.keys()}
|
|
528
|
-
return subset_dat, subset_ann
|
|
528
|
+
return subset_dat, subset_ann, self.samples[index]
|
|
529
529
|
|
|
530
530
|
def __len__ (self):
|
|
531
531
|
"""Get the total number of samples in the dataset.
|
|
@@ -56,7 +56,7 @@ class HyperparameterTuning:
|
|
|
56
56
|
cv_splits=5, use_loss_weighting=True, early_stop_patience=-1, device_type=None, gnn_conv_type=None,
|
|
57
57
|
input_layers=None, output_layers=None): Initializes the hyperparameter tuner with specific settings.
|
|
58
58
|
|
|
59
|
-
get_batch_space(min_size=16, max_size=
|
|
59
|
+
get_batch_space(min_size=16, max_size=128): Determines the batch size search space based on the dataset size.
|
|
60
60
|
|
|
61
61
|
setup_trainer(params, current_step, total_steps, full_train=False): Sets up the trainer with appropriate callbacks
|
|
62
62
|
and configurations for either full training or validation based training.
|
|
@@ -80,7 +80,7 @@ class HyperparameterTuning:
|
|
|
80
80
|
val_size = 0.2, use_cv = False, cv_splits = 5,
|
|
81
81
|
use_loss_weighting = True, early_stop_patience = -1,
|
|
82
82
|
device_type = None, gnn_conv_type = None,
|
|
83
|
-
input_layers = None, output_layers = None):
|
|
83
|
+
input_layers = None, output_layers = None, num_workers = 2):
|
|
84
84
|
self.dataset = dataset # dataset for model initiation
|
|
85
85
|
self.loader_dataset = dataset # dataset for defining data loaders (this can be model specific)
|
|
86
86
|
self.model_class = model_class
|
|
@@ -107,6 +107,7 @@ class HyperparameterTuning:
|
|
|
107
107
|
self.gnn_conv_type = gnn_conv_type
|
|
108
108
|
self.input_layers = input_layers
|
|
109
109
|
self.output_layers = output_layers
|
|
110
|
+
self.num_workers = num_workers
|
|
110
111
|
|
|
111
112
|
self.DataLoader = torch.utils.data.DataLoader # use torch data loader by default
|
|
112
113
|
|
|
@@ -128,7 +129,7 @@ class HyperparameterTuning:
|
|
|
128
129
|
else:
|
|
129
130
|
raise ValueError(f"'{self.config_name}' not found in the default config.")
|
|
130
131
|
|
|
131
|
-
def get_batch_space(self, min_size = 32, max_size =
|
|
132
|
+
def get_batch_space(self, min_size = 32, max_size = 128):
|
|
132
133
|
m = int(np.log2(len(self.dataset) * 0.8))
|
|
133
134
|
st = int(np.log2(min_size))
|
|
134
135
|
end = int(np.log2(max_size))
|
|
@@ -214,9 +215,11 @@ class HyperparameterTuning:
|
|
|
214
215
|
train_subset = torch.utils.data.Subset(self.loader_dataset, train_index)
|
|
215
216
|
val_subset = torch.utils.data.Subset(self.loader_dataset, val_index)
|
|
216
217
|
train_loader = self.DataLoader(train_subset, batch_size=int(params['batch_size']),
|
|
217
|
-
pin_memory=True, shuffle=True, drop_last=True, num_workers =
|
|
218
|
+
pin_memory=True, shuffle=True, drop_last=True, num_workers = self.num_workers, prefetch_factor = None,
|
|
219
|
+
persistent_workers = self.num_workers > 0)
|
|
218
220
|
val_loader = self.DataLoader(val_subset, batch_size=int(params['batch_size']),
|
|
219
|
-
pin_memory=True, shuffle=False, num_workers =
|
|
221
|
+
pin_memory=True, shuffle=False, num_workers = self.num_workers, prefetch_factor = None,
|
|
222
|
+
persistent_workers = self.num_workers > 0)
|
|
220
223
|
|
|
221
224
|
model = self.model_class(**model_args)
|
|
222
225
|
trainer, early_stop_callback = self.setup_trainer(params, current_step, total_steps)
|
|
@@ -262,7 +262,7 @@ class CrossModalPred(pl.LightningModule):
|
|
|
262
262
|
dataset, particularly handling survival analysis if applicable. All losses are aggregated to compute a total loss,
|
|
263
263
|
which is logged and returned.
|
|
264
264
|
"""
|
|
265
|
-
dat, y_dict = train_batch
|
|
265
|
+
dat, y_dict, samples = train_batch
|
|
266
266
|
|
|
267
267
|
# get input omics modalities and encode them; decode them to output layers
|
|
268
268
|
x_list_input = [dat[x] for x in self.input_layers]
|
|
@@ -315,7 +315,7 @@ class CrossModalPred(pl.LightningModule):
|
|
|
315
315
|
analysis where applicable. The aggregated losses are then summed up to form the total validation loss, which is logged
|
|
316
316
|
and returned.
|
|
317
317
|
"""
|
|
318
|
-
dat, y_dict = val_batch
|
|
318
|
+
dat, y_dict, samples = val_batch
|
|
319
319
|
|
|
320
320
|
# get input omics modalities and encode them
|
|
321
321
|
x_list_input = [dat[x] for x in self.input_layers]
|
|
@@ -515,7 +515,7 @@ class CrossModalPred(pl.LightningModule):
|
|
|
515
515
|
aggregated_attributions = [[] for _ in range(num_class)]
|
|
516
516
|
|
|
517
517
|
for batch in dataloader:
|
|
518
|
-
dat, _ = batch
|
|
518
|
+
dat, _, _ = batch
|
|
519
519
|
x_list = [dat[x].to(device) for x in self.input_layers]
|
|
520
520
|
input_data = tuple([data.unsqueeze(0).requires_grad_() for data in x_list])
|
|
521
521
|
baseline = tuple(torch.zeros_like(x) for x in input_data)
|
|
@@ -190,7 +190,7 @@ class DirectPred(pl.LightningModule):
|
|
|
190
190
|
torch.Tensor: The total loss computed for the batch.
|
|
191
191
|
"""
|
|
192
192
|
|
|
193
|
-
dat, y_dict = train_batch
|
|
193
|
+
dat, y_dict, samples = train_batch
|
|
194
194
|
layers = dat.keys()
|
|
195
195
|
x_list = [dat[x] for x in layers]
|
|
196
196
|
outputs = self.forward(x_list)
|
|
@@ -226,7 +226,7 @@ class DirectPred(pl.LightningModule):
|
|
|
226
226
|
Returns:
|
|
227
227
|
torch.Tensor: The total loss computed for the batch.
|
|
228
228
|
"""
|
|
229
|
-
dat, y_dict = val_batch
|
|
229
|
+
dat, y_dict, samples = val_batch
|
|
230
230
|
layers = dat.keys()
|
|
231
231
|
x_list = [dat[x] for x in layers]
|
|
232
232
|
outputs = self.forward(x_list)
|
|
@@ -250,28 +250,44 @@ class DirectPred(pl.LightningModule):
|
|
|
250
250
|
|
|
251
251
|
def predict(self, dataset):
|
|
252
252
|
"""
|
|
253
|
-
|
|
253
|
+
Evaluate the model on a dataset using batching.
|
|
254
254
|
|
|
255
255
|
Args:
|
|
256
|
-
dataset:
|
|
256
|
+
dataset (MultiOmicDataset): dataset containing input matrices for each omics layer.
|
|
257
257
|
|
|
258
258
|
Returns:
|
|
259
|
-
dict:
|
|
259
|
+
dict: Predicted values mapped by target variable names.
|
|
260
260
|
"""
|
|
261
|
-
self.eval()
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
outputs = self.forward(x_list)
|
|
261
|
+
self.eval() # Set the model to evaluation mode
|
|
262
|
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
263
|
+
self.to(device) # Move the model to the appropriate device
|
|
265
264
|
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
265
|
+
# Create a DataLoader with a practical batch size
|
|
266
|
+
dataloader = DataLoader(dataset, batch_size=64, shuffle=False) # Adjust the batch size as needed
|
|
267
|
+
|
|
268
|
+
predictions = {var: [] for var in self.variables} # Initialize prediction storage
|
|
269
|
+
|
|
270
|
+
# Process each batch
|
|
271
|
+
for batch in dataloader:
|
|
272
|
+
dat, y_dict, samples = batch
|
|
273
|
+
x_list = [dat[x].to(device) for x in dat.keys()] # Prepare the data batch for processing
|
|
274
|
+
|
|
275
|
+
# Perform the forward pass
|
|
276
|
+
outputs = self.forward(x_list)
|
|
277
|
+
|
|
278
|
+
# Collect predictions for each variable
|
|
279
|
+
for var in self.variables:
|
|
280
|
+
y_pred = outputs[var].detach().cpu().numpy() # Move outputs back to CPU and convert to numpy
|
|
281
|
+
if dataset.variable_types[var] == 'categorical':
|
|
282
|
+
predictions[var].extend(np.argmax(y_pred, axis=1))
|
|
283
|
+
else:
|
|
284
|
+
predictions[var].extend(y_pred)
|
|
285
|
+
|
|
286
|
+
# Convert lists to arrays if necessary, depending on the downstream use-case
|
|
287
|
+
predictions = {var: np.array(predictions[var]) for var in predictions}
|
|
274
288
|
|
|
289
|
+
return predictions
|
|
290
|
+
|
|
275
291
|
def transform(self, dataset):
|
|
276
292
|
"""
|
|
277
293
|
Transforms the input data into a lower-dimensional representation using trained encoders.
|
|
@@ -282,16 +298,36 @@ class DirectPred(pl.LightningModule):
|
|
|
282
298
|
Returns:
|
|
283
299
|
pd.DataFrame: DataFrame containing the transformed data.
|
|
284
300
|
"""
|
|
285
|
-
self.eval()
|
|
286
|
-
|
|
287
|
-
#
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
301
|
+
self.eval() # Set the model to evaluation mode
|
|
302
|
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
303
|
+
self.to(device) # Move the model to the appropriate device
|
|
304
|
+
|
|
305
|
+
dataloader = DataLoader(dataset, batch_size=64, shuffle=False) # Adjust the batch size as needed
|
|
306
|
+
|
|
307
|
+
embeddings_list = [] # Initialize a list to collect all batch embeddings
|
|
308
|
+
sample_names = [] # List to collect sample names
|
|
309
|
+
|
|
310
|
+
# Process each batch
|
|
311
|
+
for batch in dataloader:
|
|
312
|
+
dat, _, samples = batch
|
|
313
|
+
batch_embeddings = []
|
|
314
|
+
# Process each input matrix with its corresponding Encoder
|
|
315
|
+
for i, x in enumerate(dat.values()):
|
|
316
|
+
x = x.to(device) # Move data to GPU
|
|
317
|
+
encoded_x = self.encoders[i](x) # Transform data using the corresponding encoder
|
|
318
|
+
batch_embeddings.append(encoded_x)
|
|
319
|
+
|
|
320
|
+
# Concatenate all embeddings from the current batch
|
|
321
|
+
embeddings_batch_concat = torch.cat(batch_embeddings, dim=1)
|
|
322
|
+
embeddings_list.append(embeddings_batch_concat.detach().cpu()) # Move tensor back to CPU and detach
|
|
323
|
+
sample_names.extend(samples) # Collect sample names
|
|
324
|
+
|
|
325
|
+
# Concatenate all batch embeddings into one tensor
|
|
326
|
+
embeddings_concat = torch.cat(embeddings_list, dim=0)
|
|
291
327
|
|
|
292
328
|
# Converting tensor to numpy array and then to DataFrame
|
|
293
|
-
embeddings_df = pd.DataFrame(embeddings_concat.
|
|
294
|
-
index=
|
|
329
|
+
embeddings_df = pd.DataFrame(embeddings_concat.numpy(),
|
|
330
|
+
index=sample_names, # Set DataFrame index to sample names
|
|
295
331
|
columns=[f"E{dim}" for dim in range(embeddings_concat.shape[1])])
|
|
296
332
|
return embeddings_df
|
|
297
333
|
|
|
@@ -337,11 +373,11 @@ class DirectPred(pl.LightningModule):
|
|
|
337
373
|
if dataset.variable_types[target_var] == 'numerical':
|
|
338
374
|
num_class = 1
|
|
339
375
|
else:
|
|
340
|
-
num_class = len(np.unique([y[target_var] for _, y in dataset]))
|
|
376
|
+
num_class = len(np.unique([y[target_var] for _, y, _ in dataset]))
|
|
341
377
|
|
|
342
378
|
aggregated_attributions = [[] for _ in range(num_class)]
|
|
343
379
|
for batch in dataloader:
|
|
344
|
-
dat, _ = batch
|
|
380
|
+
dat, _, _ = batch
|
|
345
381
|
x_list = [dat[x].to(device) for x in dat.keys()]
|
|
346
382
|
input_data = tuple([data.unsqueeze(0).requires_grad_() for data in x_list])
|
|
347
383
|
baseline = tuple(torch.zeros_like(x) for x in input_data)
|
|
@@ -375,7 +375,7 @@ class GNN(pl.LightningModule):
|
|
|
375
375
|
return torch.cat(outputs_list, dim = 0)
|
|
376
376
|
|
|
377
377
|
|
|
378
|
-
def compute_feature_importance(self, dataset, target_var, steps=5, batch_size =
|
|
378
|
+
def compute_feature_importance(self, dataset, target_var, steps=5, batch_size = 64):
|
|
379
379
|
"""
|
|
380
380
|
Computes the feature importance for each variable in the dataset using the Integrated Gradients method.
|
|
381
381
|
This method measures the importance of each feature by attributing the prediction output to each input feature.
|
|
@@ -397,13 +397,9 @@ class GNN(pl.LightningModule):
|
|
|
397
397
|
"""
|
|
398
398
|
def bytes_to_gb(bytes):
|
|
399
399
|
return bytes / 1024 ** 2
|
|
400
|
-
print("Memory before moving model to device: {:.3f} MB".format(bytes_to_gb(torch.cuda.max_memory_reserved())))
|
|
401
400
|
device = torch.device("cuda" if self.device_type == 'gpu' and torch.cuda.is_available() else 'cpu')
|
|
402
401
|
self.to(device)
|
|
403
|
-
print("Memory before edges: {:.3f} MB".format(bytes_to_gb(torch.cuda.max_memory_reserved())))
|
|
404
402
|
self.dataset_edge_index = dataset.edge_index.to(device)
|
|
405
|
-
print("Memory after edges: {:.3f} MB".format(bytes_to_gb(torch.cuda.max_memory_reserved())))
|
|
406
|
-
|
|
407
403
|
|
|
408
404
|
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
|
|
409
405
|
ig = IntegratedGradients(self.forward_target)
|
|
@@ -259,7 +259,7 @@ class supervised_vae(pl.LightningModule):
|
|
|
259
259
|
Returns:
|
|
260
260
|
torch.Tensor: The total loss computed for the batch.
|
|
261
261
|
"""
|
|
262
|
-
dat, y_dict = train_batch
|
|
262
|
+
dat, y_dict, samples = train_batch
|
|
263
263
|
layers = dat.keys()
|
|
264
264
|
x_list = [dat[x] for x in layers]
|
|
265
265
|
|
|
@@ -303,7 +303,7 @@ class supervised_vae(pl.LightningModule):
|
|
|
303
303
|
Returns:
|
|
304
304
|
torch.Tensor: The total loss computed for the batch.
|
|
305
305
|
"""
|
|
306
|
-
dat, y_dict = val_batch
|
|
306
|
+
dat, y_dict, samples = val_batch
|
|
307
307
|
layers = dat.keys()
|
|
308
308
|
x_list = [dat[x] for x in layers]
|
|
309
309
|
|
|
@@ -335,7 +335,7 @@ class supervised_vae(pl.LightningModule):
|
|
|
335
335
|
|
|
336
336
|
def transform(self, dataset):
|
|
337
337
|
"""
|
|
338
|
-
Transform the input dataset to latent representation.
|
|
338
|
+
Transform the input dataset to latent representation using batching.
|
|
339
339
|
|
|
340
340
|
Args:
|
|
341
341
|
dataset (MultiOmicDataset): MultiOmicDataset containing input matrices for each omics layer.
|
|
@@ -343,37 +343,71 @@ class supervised_vae(pl.LightningModule):
|
|
|
343
343
|
Returns:
|
|
344
344
|
pd.DataFrame: Transformed dataset as a pandas DataFrame.
|
|
345
345
|
"""
|
|
346
|
-
self.eval()
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
346
|
+
self.eval() # Set the model to evaluation mode
|
|
347
|
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
348
|
+
self.to(device) # Move the model to the appropriate device
|
|
349
|
+
|
|
350
|
+
dataloader = DataLoader(dataset, batch_size=64, shuffle=False) # Adjust the batch size as needed
|
|
351
|
+
all_latent_representations = [] # Initialize a list to collect all batch latent representations
|
|
352
|
+
sample_names = [] # List to collect sample names
|
|
353
|
+
|
|
354
|
+
# Process each batch
|
|
355
|
+
for batch in dataloader:
|
|
356
|
+
dat, _, samples = batch
|
|
357
|
+
x_list = [dat[x].to(device) for x in dat.keys()] # Prepare the data batch for processing
|
|
358
|
+
|
|
359
|
+
# Perform the forward pass and extract the latent representation
|
|
360
|
+
latent_representation = self.forward(x_list)[1].detach().cpu().numpy() # Index [1] assumes second return is the latent rep
|
|
361
|
+
|
|
362
|
+
all_latent_representations.append(latent_representation) # Store the batch's latent representation
|
|
363
|
+
sample_names.extend(samples) # Collect sample names for this batch
|
|
364
|
+
|
|
365
|
+
# Concatenate all batch latent representations into one array
|
|
366
|
+
concatenated_latents = np.concatenate(all_latent_representations, axis=0)
|
|
367
|
+
|
|
368
|
+
# Convert the array to a DataFrame
|
|
369
|
+
z = pd.DataFrame(concatenated_latents)
|
|
370
|
+
z.columns = ['E' + str(i) for i in range(z.shape[1])] # Name columns
|
|
371
|
+
z.index = sample_names # Set DataFrame index to sample names
|
|
372
|
+
|
|
353
373
|
return z
|
|
354
374
|
|
|
355
375
|
def predict(self, dataset):
|
|
356
376
|
"""
|
|
357
|
-
Evaluate the model on a dataset.
|
|
377
|
+
Evaluate the model on a dataset using batching.
|
|
358
378
|
|
|
359
379
|
Args:
|
|
360
|
-
dataset (MultiOmicDataset):
|
|
380
|
+
dataset (MultiOmicDataset): Dataset containing input matrices for each omics layer.
|
|
361
381
|
|
|
362
382
|
Returns:
|
|
363
|
-
|
|
383
|
+
dict: Predicted values mapped by target variable names.
|
|
364
384
|
"""
|
|
365
|
-
self.eval()
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
for var in self.variables
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
385
|
+
self.eval() # Set the model to evaluation mode
|
|
386
|
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
387
|
+
self.to(device) # Move the model to the appropriate device
|
|
388
|
+
|
|
389
|
+
dataloader = DataLoader(dataset, batch_size=64, shuffle=False) # Adjust the batch size as needed
|
|
390
|
+
|
|
391
|
+
predictions = {var: [] for var in self.variables} # Initialize prediction storage
|
|
392
|
+
|
|
393
|
+
# Process each batch
|
|
394
|
+
for batch in dataloader:
|
|
395
|
+
dat, _, _ = batch
|
|
396
|
+
x_list = [dat[x].to(device) for x in dat.keys()] # Prepare the data batch for processing
|
|
397
|
+
|
|
398
|
+
# Perform the forward pass
|
|
399
|
+
X_hat, z, mean, log_var, outputs = self.forward(x_list)
|
|
400
|
+
|
|
401
|
+
# Collect predictions for each variable
|
|
402
|
+
for var in self.variables:
|
|
403
|
+
y_pred = outputs[var].detach().cpu().numpy() # Move outputs back to CPU and convert to numpy
|
|
404
|
+
if dataset.variable_types[var] == 'categorical':
|
|
405
|
+
predictions[var].extend(np.argmax(y_pred, axis=1))
|
|
406
|
+
else:
|
|
407
|
+
predictions[var].extend(y_pred)
|
|
408
|
+
|
|
409
|
+
# Convert lists to arrays if necessary, depending on the downstream use-case
|
|
410
|
+
predictions = {var: np.array(predictions[var]) for var in predictions}
|
|
377
411
|
|
|
378
412
|
return predictions
|
|
379
413
|
|
|
@@ -484,7 +518,7 @@ class supervised_vae(pl.LightningModule):
|
|
|
484
518
|
aggregated_attributions = [[] for _ in range(num_class)]
|
|
485
519
|
|
|
486
520
|
for batch in dataloader:
|
|
487
|
-
dat, _ = batch
|
|
521
|
+
dat, _, _ = batch
|
|
488
522
|
x_list = [dat[x].to(device) for x in dat.keys()]
|
|
489
523
|
input_data = tuple([data.unsqueeze(0).requires_grad_() for data in x_list])
|
|
490
524
|
baseline = tuple(torch.zeros_like(x) for x in input_data)
|
|
@@ -405,7 +405,7 @@ class MultiTripletNetwork(pl.LightningModule):
|
|
|
405
405
|
if self.variable_types[target_var] == 'numerical':
|
|
406
406
|
num_class = 1
|
|
407
407
|
else:
|
|
408
|
-
num_class = len(np.unique([y[target_var] for _, y in dataset]))
|
|
408
|
+
num_class = len(np.unique([y[target_var] for _, y, _ in dataset]))
|
|
409
409
|
|
|
410
410
|
aggregated_attributions = [[] for _ in range(num_class)]
|
|
411
411
|
for batch in dataloader:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: flexynesis
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: A deep-learning based multi-omics bulk sequencing data integration suite with a focus on (pre-)clinical endpoint prediction.
|
|
5
5
|
Author-email: Bora Uyar <bora.uyar@mdc-berlin.de>, Taras Savchyn <Taras.Savchyn@mdc-berlin.de>, Ricardo Wurmus <Ricardo.Wurmus@mdc-berlin.de>, Ahmet Sarigun <Ahmet.Sariguen@mdc-berlin.de>
|
|
6
6
|
Project-URL: homepage, https://github.com/BIMSBbioinfo/flexynesis
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|