flexynesis 0.2.2__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {flexynesis-0.2.2 → flexynesis-0.2.4}/PKG-INFO +1 -1
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/__main__.py +85 -44
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/config.py +4 -4
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/data.py +29 -18
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/main.py +8 -5
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/utils.py +24 -44
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis.egg-info/PKG-INFO +1 -1
- {flexynesis-0.2.2 → flexynesis-0.2.4}/pyproject.toml +1 -1
- {flexynesis-0.2.2 → flexynesis-0.2.4}/LICENCE.md +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/README.md +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/__init__.py +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/cli.py +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/feature_selection.py +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/models/__init__.py +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/models/crossmodal_pred.py +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/models/direct_pred.py +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/models/gnn_early.py +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/models/on_ice/direct_pred_cnn.py +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/models/on_ice/direct_pred_gcnn.py +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/models/on_ice/modules_on_ice.py +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/models/supervised_vae.py +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/models/triplet_encoder.py +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/modules.py +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis.egg-info/SOURCES.txt +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis.egg-info/dependency_links.txt +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis.egg-info/entry_points.txt +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis.egg-info/requires.txt +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis.egg-info/top_level.txt +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/setup.cfg +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/tests/__init__.py +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/tests/unit/__init__.py +0 -0
- {flexynesis-0.2.2 → flexynesis-0.2.4}/tests/unit/test_smoke.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: flexynesis
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: A deep-learning based multi-omics bulk sequencing data integration suite with a focus on (pre-)clinical endpoint prediction.
|
|
5
5
|
Author-email: Bora Uyar <bora.uyar@mdc-berlin.de>, Taras Savchyn <Taras.Savchyn@mdc-berlin.de>, Ricardo Wurmus <Ricardo.Wurmus@mdc-berlin.de>, Ahmet Sarigun <Ahmet.Sariguen@mdc-berlin.de>
|
|
6
6
|
Project-URL: homepage, https://github.com/BIMSBbioinfo/flexynesis
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from lightning import seed_everything
|
|
2
2
|
import lightning as pl
|
|
3
3
|
from typing import NamedTuple
|
|
4
|
-
import os, yaml, torch, time, random, warnings, argparse
|
|
4
|
+
import os, yaml, torch, time, random, warnings, argparse, sys
|
|
5
5
|
os.environ["OMP_NUM_THREADS"] = "1"
|
|
6
6
|
import pandas as pd
|
|
7
7
|
import flexynesis
|
|
@@ -18,7 +18,8 @@ def main():
|
|
|
18
18
|
|
|
19
19
|
Args:
|
|
20
20
|
--data_path (str): Path to the folder with train/test data files. (Required)
|
|
21
|
-
--model_class (str): The kind of model class to instantiate. Choices are ["DirectPred", "GNN", "supervised_vae",
|
|
21
|
+
--model_class (str): The kind of model class to instantiate. Choices are ["DirectPred", "GNN", "supervised_vae",
|
|
22
|
+
"MultiTripletNetwork", "CrossModalPred", "RandomForest", "SVM", "RandomSurvivalForest"]. (Required)
|
|
22
23
|
--gnn_conv_type (str): If model_class is set to GNN, choose which graph convolution type to use. Choices are ["GC", "GCN", "SAGE"].
|
|
23
24
|
--target_variables (str): Which variables in 'clin.csv' to use for predictions, comma-separated if multiple. Optional if survival variables are not set to None.
|
|
24
25
|
--batch_variables (str): Which variables in 'clin.csv' to use for data integration/batch correction, comma-separated if multiple. Optional.
|
|
@@ -44,8 +45,9 @@ def main():
|
|
|
44
45
|
--hpo_patience (int): How many hyperparameter optimisation iterations to wait for when no improvements are observed. Default is 10; set to 0 to disable early stopping.
|
|
45
46
|
--use_cv (bool): If set, a 5-fold cross-validation training will be done. Otherwise, a single training on 80 percent of the dataset is done.
|
|
46
47
|
--use_loss_weighting (str): Whether to apply loss-balancing using uncertainty weights method. Choices are ['True', 'False']. Default is 'True'.
|
|
47
|
-
--evaluate_baseline_performance (
|
|
48
|
+
--evaluate_baseline_performance (bool): Enables modeling also with Random Forest + SVMs to see the performance of off-the-shelf tools on the same dataset.
|
|
48
49
|
--threads (int): How many threads to use when using CPU. Default is 4.
|
|
50
|
+
--num_workers (int): How many workers to use for model training. Default is 2
|
|
49
51
|
--use_gpu (bool): If set, the system will attempt to use CUDA/GPU if available.
|
|
50
52
|
--disable_marker_finding (bool): If set, marker discovery after model training is disabled.
|
|
51
53
|
--string_organism (int): STRING DB organism id. Default is 9606.
|
|
@@ -56,7 +58,7 @@ def main():
|
|
|
56
58
|
|
|
57
59
|
parser.add_argument("--data_path", help="(Required) Path to the folder with train/test data files", type=str, required = True)
|
|
58
60
|
parser.add_argument("--model_class", help="(Required) The kind of model class to instantiate", type=str,
|
|
59
|
-
choices=["DirectPred", "supervised_vae", "MultiTripletNetwork", "CrossModalPred", "GNN"], required = True)
|
|
61
|
+
choices=["DirectPred", "supervised_vae", "MultiTripletNetwork", "CrossModalPred", "GNN", "RandomForest", "SVM", "RandomSurvivalForest"], required = True)
|
|
60
62
|
parser.add_argument("--gnn_conv_type", help="If model_class is set to GNN, choose which graph convolution type to use", type=str,
|
|
61
63
|
choices=["GC", "GCN", "SAGE"])
|
|
62
64
|
parser.add_argument("--target_variables",
|
|
@@ -97,8 +99,10 @@ def main():
|
|
|
97
99
|
parser.add_argument("--use_cv", action="store_true",
|
|
98
100
|
help="(Optional) If set, the a 5-fold cross-validation training will be done. Otherwise, a single trainig on 80 percent of the dataset is done.")
|
|
99
101
|
parser.add_argument("--use_loss_weighting", help="whether to apply loss-balancing using uncertainty weights method", type=str, choices=['True', 'False'], default = 'True')
|
|
100
|
-
parser.add_argument("--evaluate_baseline_performance",
|
|
102
|
+
parser.add_argument("--evaluate_baseline_performance", action="store_true",
|
|
103
|
+
help="whether to run Random Forest + SVMs to see the performance of off-the-shelf tools on the same dataset")
|
|
101
104
|
parser.add_argument("--threads", help="(Optional) How many threads to use when using CPU (default is 4)", type=int, default = 4)
|
|
105
|
+
parser.add_argument("--num_workers", help="(Optional) How many workers to use for model training (default is 2)", type=int, default = 2)
|
|
102
106
|
parser.add_argument("--use_gpu", action="store_true",
|
|
103
107
|
help="(Optional) If set, the system will attempt to use CUDA/GPU if available.")
|
|
104
108
|
parser.add_argument("--disable_marker_finding", action="store_true",
|
|
@@ -184,20 +188,24 @@ def main():
|
|
|
184
188
|
if not os.path.exists(args.outdir):
|
|
185
189
|
raise FileNotFoundError(f"Path to --outdir doesn't exist at:", {args.outdir})
|
|
186
190
|
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
191
|
+
available_models = {
|
|
192
|
+
"DirectPred": (DirectPred, "DirectPred"),
|
|
193
|
+
"supervised_vae": (supervised_vae, "supervised_vae"),
|
|
194
|
+
"MultiTripletNetwork": (MultiTripletNetwork, "MultiTripletNetwork"),
|
|
195
|
+
"CrossModalPred": (CrossModalPred, "CrossModalPred"),
|
|
196
|
+
"GNN": (GNN, "GNN"),
|
|
197
|
+
"RandomForest": ("RandomForest", None),
|
|
198
|
+
"SVM": ("SVM", None),
|
|
199
|
+
"RandomSurvivalForest": ("RandomSurvivalForest", None)
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
model_info = available_models.get(args.model_class)
|
|
203
|
+
|
|
204
|
+
if model_info is None:
|
|
205
|
+
raise ValueError(f"Unsupported model class {args.model_class}")
|
|
206
|
+
|
|
207
|
+
# Unpack the tuple into model class and config name
|
|
208
|
+
model_class, config_name = model_info
|
|
201
209
|
|
|
202
210
|
# import assays and labels
|
|
203
211
|
inputDir = args.data_path
|
|
@@ -219,6 +227,34 @@ def main():
|
|
|
219
227
|
downsample = args.subsample)
|
|
220
228
|
train_dataset, test_dataset = data_importer.import_data()
|
|
221
229
|
|
|
230
|
+
if args.model_class in ["RandomForest", "SVM"]:
|
|
231
|
+
if args.target_variables:
|
|
232
|
+
var = args.target_variables.strip().split(',')[0]
|
|
233
|
+
print(f"Training {args.model_class} on variable: {var}")
|
|
234
|
+
metrics = flexynesis.evaluate_baseline_performance(train_dataset, test_dataset, variable_name=var,
|
|
235
|
+
methods=[args.model_class], n_folds=5, n_jobs=args.threads)
|
|
236
|
+
metrics.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'stats.csv'])), header=True, index=False)
|
|
237
|
+
print(f"{args.model_class} evaluation complete. Results saved.")
|
|
238
|
+
# we skip everything related to deep learning models here
|
|
239
|
+
sys.exit(0)
|
|
240
|
+
else:
|
|
241
|
+
raise ValueError(f"At least one target variable is required to run RandomForest/SVM models. Set --target_variables argument")
|
|
242
|
+
|
|
243
|
+
if args.model_class == "RandomSurvivalForest":
|
|
244
|
+
if args.surv_event_var and args.surv_time_var:
|
|
245
|
+
print(f"Training {args.model_class} on survival variables: {args.surv_event_var} and {args.surv_time_var}")
|
|
246
|
+
metrics = flexynesis.evaluate_baseline_survival_performance(train_dataset, test_dataset,
|
|
247
|
+
args.surv_time_var,
|
|
248
|
+
args.surv_event_var,
|
|
249
|
+
n_folds = 5,
|
|
250
|
+
n_jobs = int(args.threads))
|
|
251
|
+
metrics.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'stats.csv'])), header=True, index=False)
|
|
252
|
+
print(f"{args.model_class} evaluation complete. Results saved.")
|
|
253
|
+
# we skip everything related to deep learning models here
|
|
254
|
+
sys.exit(0)
|
|
255
|
+
else:
|
|
256
|
+
raise ValueError(f"Missing survival variables. Set --surv_event_var --surv_time_var arguments")
|
|
257
|
+
|
|
222
258
|
if args.model_class == 'GNN':
|
|
223
259
|
# overlay datasets with network info
|
|
224
260
|
# this is a temporary solution
|
|
@@ -253,7 +289,8 @@ def main():
|
|
|
253
289
|
device_type = device_type,
|
|
254
290
|
gnn_conv_type = gnn_conv_type,
|
|
255
291
|
input_layers = input_layers,
|
|
256
|
-
output_layers = output_layers
|
|
292
|
+
output_layers = output_layers,
|
|
293
|
+
num_workers = args.num_workers)
|
|
257
294
|
|
|
258
295
|
# do a hyperparameter search training multiple models and get the best_configuration
|
|
259
296
|
model, best_params = tuner.perform_tuning(hpo_patience = args.hpo_patience)
|
|
@@ -279,20 +316,16 @@ def main():
|
|
|
279
316
|
# update the test dataset to exclude finetuning samples
|
|
280
317
|
test_dataset = holdout_dataset
|
|
281
318
|
|
|
319
|
+
# get sample embeddings and save
|
|
320
|
+
print("[INFO] Extracting sample embeddings")
|
|
321
|
+
embeddings_train = model.transform(train_dataset)
|
|
322
|
+
embeddings_test = model.transform(test_dataset)
|
|
323
|
+
|
|
324
|
+
embeddings_train.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'embeddings_train.csv'])), header=True)
|
|
325
|
+
embeddings_test.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'embeddings_test.csv'])), header=True)
|
|
326
|
+
|
|
282
327
|
# evaluate predictions; (if any supervised learning happened)
|
|
283
328
|
if any([args.target_variables, args.surv_event_var, args.batch_variables]):
|
|
284
|
-
print("[INFO] Computing model evaluation metrics")
|
|
285
|
-
metrics_df = flexynesis.evaluate_wrapper(model.predict(test_dataset), test_dataset,
|
|
286
|
-
surv_event_var=model.surv_event_var,
|
|
287
|
-
surv_time_var=model.surv_time_var)
|
|
288
|
-
metrics_df.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'stats.csv'])), header=True, index=False)
|
|
289
|
-
|
|
290
|
-
# print known/predicted labels
|
|
291
|
-
predicted_labels = pd.concat([flexynesis.get_predicted_labels(model.predict(train_dataset), train_dataset, 'train'),
|
|
292
|
-
flexynesis.get_predicted_labels(model.predict(test_dataset), test_dataset, 'test')],
|
|
293
|
-
ignore_index=True)
|
|
294
|
-
predicted_labels.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'predicted_labels.csv'])), header=True, index=False)
|
|
295
|
-
|
|
296
329
|
if not args.disable_marker_finding: # unless marker discovery is disabled
|
|
297
330
|
# compute feature importance values
|
|
298
331
|
print("[INFO] Computing variable importance scores")
|
|
@@ -302,14 +335,19 @@ def main():
|
|
|
302
335
|
ignore_index = True)
|
|
303
336
|
df_imp.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'feature_importance.csv'])), header=True, index=False)
|
|
304
337
|
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
338
|
+
# print known/predicted labels
|
|
339
|
+
predicted_labels = pd.concat([flexynesis.get_predicted_labels(model.predict(train_dataset), train_dataset, 'train'),
|
|
340
|
+
flexynesis.get_predicted_labels(model.predict(test_dataset), test_dataset, 'test')],
|
|
341
|
+
ignore_index=True)
|
|
342
|
+
predicted_labels.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'predicted_labels.csv'])), header=True, index=False)
|
|
343
|
+
|
|
344
|
+
print("[INFO] Computing model evaluation metrics")
|
|
345
|
+
metrics_df = flexynesis.evaluate_wrapper(args.model_class, model.predict(test_dataset), test_dataset,
|
|
346
|
+
surv_event_var=model.surv_event_var,
|
|
347
|
+
surv_time_var=model.surv_time_var)
|
|
348
|
+
metrics_df.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'stats.csv'])), header=True, index=False)
|
|
349
|
+
|
|
350
|
+
|
|
313
351
|
# also filter embeddings to remove batch-associated dims and only keep target-variable associated dims
|
|
314
352
|
if args.batch_variables is not None:
|
|
315
353
|
print("[INFO] Printing filtered embeddings")
|
|
@@ -336,7 +374,7 @@ def main():
|
|
|
336
374
|
|
|
337
375
|
|
|
338
376
|
# evaluate off-the-shelf methods on the main target variable
|
|
339
|
-
if args.evaluate_baseline_performance
|
|
377
|
+
if args.evaluate_baseline_performance:
|
|
340
378
|
print("[INFO] Computing off-the-shelf method performance on first target variable:",model.target_variables[0])
|
|
341
379
|
var = model.target_variables[0]
|
|
342
380
|
metrics = pd.DataFrame()
|
|
@@ -348,9 +386,10 @@ def main():
|
|
|
348
386
|
|
|
349
387
|
if var != model.surv_event_var:
|
|
350
388
|
metrics = flexynesis.evaluate_baseline_performance(train, test,
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
389
|
+
variable_name = var,
|
|
390
|
+
methods = ['RandomForest', 'SVM'],
|
|
391
|
+
n_folds = 5,
|
|
392
|
+
n_jobs = int(args.threads))
|
|
354
393
|
if model.surv_event_var and model.surv_time_var:
|
|
355
394
|
print("[INFO] Computing off-the-shelf method performance on survival variable:",model.surv_time_var)
|
|
356
395
|
metrics_baseline_survival = flexynesis.evaluate_baseline_survival_performance(train, test,
|
|
@@ -366,5 +405,7 @@ def main():
|
|
|
366
405
|
# save the trained model in file
|
|
367
406
|
torch.save(model, os.path.join(args.outdir, '.'.join([args.prefix, 'final_model.pth'])))
|
|
368
407
|
|
|
408
|
+
|
|
369
409
|
if __name__ == "__main__":
|
|
370
410
|
main()
|
|
411
|
+
print("[INFO] Finished the analysis!")
|
|
@@ -6,28 +6,28 @@ epochs = [500]
|
|
|
6
6
|
search_spaces = {
|
|
7
7
|
'DirectPred': [
|
|
8
8
|
Integer(16, 128, name='latent_dim'),
|
|
9
|
-
Real(0.2,
|
|
9
|
+
Real(0.2, 0.5, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
|
|
10
10
|
Real(0.0001, 0.01, prior='log-uniform', name='lr'),
|
|
11
11
|
Integer(8, 32, name='supervisor_hidden_dim'),
|
|
12
12
|
Categorical(epochs, name='epochs')
|
|
13
13
|
],
|
|
14
14
|
'supervised_vae': [
|
|
15
15
|
Integer(16, 128, name='latent_dim'),
|
|
16
|
-
Real(0.2,
|
|
16
|
+
Real(0.2, 0.5, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
|
|
17
17
|
Integer(8, 32, name='supervisor_hidden_dim'),
|
|
18
18
|
Real(0.0001, 0.01, prior='log-uniform', name='lr'),
|
|
19
19
|
Categorical(epochs, name='epochs')
|
|
20
20
|
],
|
|
21
21
|
'CrossModalPred': [
|
|
22
22
|
Integer(16, 128, name='latent_dim'),
|
|
23
|
-
Real(0.2,
|
|
23
|
+
Real(0.2, 0.5, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
|
|
24
24
|
Integer(8, 32, name='supervisor_hidden_dim'),
|
|
25
25
|
Real(0.0001, 0.01, prior='log-uniform', name='lr'),
|
|
26
26
|
Categorical(epochs, name='epochs')
|
|
27
27
|
],
|
|
28
28
|
'MultiTripletNetwork': [
|
|
29
29
|
Integer(16, 128, name='latent_dim'),
|
|
30
|
-
Real(0.2,
|
|
30
|
+
Real(0.2, 0.5, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
|
|
31
31
|
Integer(8, 32, name='supervisor_hidden_dim'),
|
|
32
32
|
Real(0.0001, 0.01, prior='log-uniform', name='lr'),
|
|
33
33
|
Categorical(epochs, name='epochs')
|
|
@@ -627,13 +627,14 @@ class TripletMultiOmicDataset(Dataset):
|
|
|
627
627
|
for label in labels_set}
|
|
628
628
|
return labels_set, label_to_indices
|
|
629
629
|
|
|
630
|
+
|
|
630
631
|
class MultiOmicDatasetNW(Dataset):
|
|
631
632
|
def __init__(self, multiomic_dataset, interaction_df):
|
|
632
633
|
self.multiomic_dataset = multiomic_dataset
|
|
633
634
|
self.interaction_df = interaction_df
|
|
634
635
|
|
|
635
|
-
#
|
|
636
|
-
self.common_features = self.
|
|
636
|
+
# Compute union of features in the data matrices that also appear in the network
|
|
637
|
+
self.common_features = self.find_union_features()
|
|
637
638
|
self.gene_to_index = {gene: idx for idx, gene in enumerate(self.common_features)}
|
|
638
639
|
self.edge_index = self.create_edge_index()
|
|
639
640
|
self.samples = self.multiomic_dataset.samples
|
|
@@ -647,38 +648,48 @@ class MultiOmicDatasetNW(Dataset):
|
|
|
647
648
|
# Store labels for all samples
|
|
648
649
|
self.labels = {target_name: labels for target_name, labels in self.multiomic_dataset.ann.items()}
|
|
649
650
|
|
|
650
|
-
def
|
|
651
|
-
|
|
651
|
+
def find_union_features(self):
|
|
652
|
+
# Find the union of all features in the multiomic dataset
|
|
653
|
+
all_omic_features = set().union(*(set(features) for features in self.multiomic_dataset.features.values()))
|
|
654
|
+
# Find the union of proteins involved in interactions
|
|
652
655
|
interaction_genes = set(self.interaction_df['protein1']).union(set(self.interaction_df['protein2']))
|
|
653
|
-
|
|
656
|
+
# Return the intersection of omic features and interaction genes
|
|
657
|
+
return list(all_omic_features.intersection(interaction_genes))
|
|
654
658
|
|
|
655
659
|
def create_edge_index(self):
|
|
660
|
+
# Create edges only if both proteins are within the available features
|
|
656
661
|
filtered_df = self.interaction_df[
|
|
657
662
|
(self.interaction_df['protein1'].isin(self.common_features)) &
|
|
658
663
|
(self.interaction_df['protein2'].isin(self.common_features))
|
|
659
664
|
]
|
|
660
665
|
edge_list = [(self.gene_to_index[row['protein1']], self.gene_to_index[row['protein2']]) for index, row in filtered_df.iterrows()]
|
|
661
666
|
return torch.tensor(edge_list, dtype=torch.long).t()
|
|
662
|
-
|
|
667
|
+
|
|
663
668
|
def precompute_node_features(self):
|
|
664
|
-
# Find indices of common features in each data matrix
|
|
665
|
-
feature_indices = {data_type: [self.multiomic_dataset.features[data_type].get_loc(gene)
|
|
666
|
-
for gene in self.common_features]
|
|
667
|
-
for data_type in self.multiomic_dataset.dat}
|
|
668
|
-
# Create a tensor to store all features [num_samples, num_nodes, num_data_types]
|
|
669
669
|
num_samples = len(self.samples)
|
|
670
670
|
num_nodes = len(self.common_features)
|
|
671
671
|
num_data_types = len(self.multiomic_dataset.dat)
|
|
672
|
-
all_features = torch.
|
|
672
|
+
all_features = torch.full((num_samples, num_nodes, num_data_types), float('nan'), dtype=torch.float)
|
|
673
673
|
|
|
674
|
-
# Extract features for each data type and place them in the tensor
|
|
675
674
|
for i, data_type in enumerate(self.multiomic_dataset.dat):
|
|
676
|
-
# Get the data matrix
|
|
677
675
|
data_matrix = self.multiomic_dataset.dat[data_type]
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
676
|
+
feature_indices = {
|
|
677
|
+
gene: self.multiomic_dataset.features[data_type].get_loc(gene)
|
|
678
|
+
for gene in self.common_features if gene in self.multiomic_dataset.features[data_type]
|
|
679
|
+
}
|
|
680
|
+
valid_indices = torch.tensor(list(feature_indices.values()))
|
|
681
|
+
feature_positions = torch.tensor([self.gene_to_index[gene] for gene in feature_indices.keys()])
|
|
682
|
+
|
|
683
|
+
# Fill in the available data
|
|
684
|
+
all_features[:, feature_positions, i] = data_matrix[:, valid_indices]
|
|
685
|
+
|
|
686
|
+
# Precompute medians for all data types, ignoring NaN values
|
|
687
|
+
medians = torch.nanmedian(all_features, dim=1, keepdim=True).values # Use .values to get the actual median tensor
|
|
688
|
+
|
|
689
|
+
# Replace all NaN values in all_features with their corresponding median values
|
|
690
|
+
isnan = torch.isnan(all_features)
|
|
691
|
+
all_features[isnan] = medians.expand_as(all_features)[isnan]
|
|
692
|
+
|
|
682
693
|
return all_features
|
|
683
694
|
|
|
684
695
|
def subset(self, indices):
|
|
@@ -56,7 +56,7 @@ class HyperparameterTuning:
|
|
|
56
56
|
cv_splits=5, use_loss_weighting=True, early_stop_patience=-1, device_type=None, gnn_conv_type=None,
|
|
57
57
|
input_layers=None, output_layers=None): Initializes the hyperparameter tuner with specific settings.
|
|
58
58
|
|
|
59
|
-
get_batch_space(min_size=16, max_size=
|
|
59
|
+
get_batch_space(min_size=16, max_size=128): Determines the batch size search space based on the dataset size.
|
|
60
60
|
|
|
61
61
|
setup_trainer(params, current_step, total_steps, full_train=False): Sets up the trainer with appropriate callbacks
|
|
62
62
|
and configurations for either full training or validation based training.
|
|
@@ -80,7 +80,7 @@ class HyperparameterTuning:
|
|
|
80
80
|
val_size = 0.2, use_cv = False, cv_splits = 5,
|
|
81
81
|
use_loss_weighting = True, early_stop_patience = -1,
|
|
82
82
|
device_type = None, gnn_conv_type = None,
|
|
83
|
-
input_layers = None, output_layers = None):
|
|
83
|
+
input_layers = None, output_layers = None, num_workers = 2):
|
|
84
84
|
self.dataset = dataset # dataset for model initiation
|
|
85
85
|
self.loader_dataset = dataset # dataset for defining data loaders (this can be model specific)
|
|
86
86
|
self.model_class = model_class
|
|
@@ -107,6 +107,7 @@ class HyperparameterTuning:
|
|
|
107
107
|
self.gnn_conv_type = gnn_conv_type
|
|
108
108
|
self.input_layers = input_layers
|
|
109
109
|
self.output_layers = output_layers
|
|
110
|
+
self.num_workers = num_workers
|
|
110
111
|
|
|
111
112
|
self.DataLoader = torch.utils.data.DataLoader # use torch data loader by default
|
|
112
113
|
|
|
@@ -128,7 +129,7 @@ class HyperparameterTuning:
|
|
|
128
129
|
else:
|
|
129
130
|
raise ValueError(f"'{self.config_name}' not found in the default config.")
|
|
130
131
|
|
|
131
|
-
def get_batch_space(self, min_size = 32, max_size =
|
|
132
|
+
def get_batch_space(self, min_size = 32, max_size = 128):
|
|
132
133
|
m = int(np.log2(len(self.dataset) * 0.8))
|
|
133
134
|
st = int(np.log2(min_size))
|
|
134
135
|
end = int(np.log2(max_size))
|
|
@@ -214,9 +215,11 @@ class HyperparameterTuning:
|
|
|
214
215
|
train_subset = torch.utils.data.Subset(self.loader_dataset, train_index)
|
|
215
216
|
val_subset = torch.utils.data.Subset(self.loader_dataset, val_index)
|
|
216
217
|
train_loader = self.DataLoader(train_subset, batch_size=int(params['batch_size']),
|
|
217
|
-
pin_memory=True, shuffle=True, drop_last=True, num_workers =
|
|
218
|
+
pin_memory=True, shuffle=True, drop_last=True, num_workers = self.num_workers, prefetch_factor = None,
|
|
219
|
+
persistent_workers = self.num_workers > 0)
|
|
218
220
|
val_loader = self.DataLoader(val_subset, batch_size=int(params['batch_size']),
|
|
219
|
-
pin_memory=True, shuffle=False, num_workers =
|
|
221
|
+
pin_memory=True, shuffle=False, num_workers = self.num_workers, prefetch_factor = None,
|
|
222
|
+
persistent_workers = self.num_workers > 0)
|
|
220
223
|
|
|
221
224
|
model = self.model_class(**model_args)
|
|
222
225
|
trainer, early_stop_callback = self.setup_trainer(params, current_step, total_steps)
|
|
@@ -218,7 +218,7 @@ def evaluate_regressor(y_true, y_pred):
|
|
|
218
218
|
r2 = r_value**2
|
|
219
219
|
return {"mse": mse, "r2": r2, "pearson_corr": r_value}
|
|
220
220
|
|
|
221
|
-
def evaluate_wrapper(y_pred_dict, dataset, surv_event_var = None, surv_time_var = None):
|
|
221
|
+
def evaluate_wrapper(method, y_pred_dict, dataset, surv_event_var = None, surv_time_var = None):
|
|
222
222
|
metrics_list = []
|
|
223
223
|
for var in y_pred_dict.keys():
|
|
224
224
|
if dataset.variable_types[var] == 'numerical':
|
|
@@ -235,6 +235,7 @@ def evaluate_wrapper(y_pred_dict, dataset, surv_event_var = None, surv_time_var
|
|
|
235
235
|
|
|
236
236
|
for metric, value in metrics.items():
|
|
237
237
|
metrics_list.append({
|
|
238
|
+
'method': method,
|
|
238
239
|
'var': var,
|
|
239
240
|
'variable_type': dataset.variable_types[var],
|
|
240
241
|
'metric': metric,
|
|
@@ -263,8 +264,7 @@ def get_predicted_labels(y_pred_dict, dataset, split):
|
|
|
263
264
|
dfs.append(df)
|
|
264
265
|
return pd.concat(dfs, ignore_index=True)
|
|
265
266
|
|
|
266
|
-
|
|
267
|
-
def evaluate_baseline_performance(train_dataset, test_dataset, variable_name, n_folds=5, n_jobs = 4):
|
|
267
|
+
def evaluate_baseline_performance(train_dataset, test_dataset, variable_name, methods, n_folds=5, n_jobs=4):
|
|
268
268
|
def prepare_data(data_object):
|
|
269
269
|
# Concatenate Data Matrices
|
|
270
270
|
X = np.concatenate([tensor for tensor in data_object.dat.values()], axis=1)
|
|
@@ -281,52 +281,32 @@ def evaluate_baseline_performance(train_dataset, test_dataset, variable_name, n_
|
|
|
281
281
|
# Determine variable type
|
|
282
282
|
variable_type = train_dataset.variable_types[variable_name]
|
|
283
283
|
|
|
284
|
-
# Initialize models and parameter grids
|
|
285
|
-
if variable_type == 'categorical':
|
|
286
|
-
model_params = {
|
|
287
|
-
'RandomForestClassifier': {
|
|
288
|
-
'model': RandomForestClassifier(random_state=42),
|
|
289
|
-
'params': {
|
|
290
|
-
'n_estimators': [100, 200, 300],
|
|
291
|
-
'max_depth': [10, 20, None]
|
|
292
|
-
}
|
|
293
|
-
},
|
|
294
|
-
'SVC': {
|
|
295
|
-
'model': SVC(),
|
|
296
|
-
'params': {
|
|
297
|
-
'C': [0.1, 1, 10],
|
|
298
|
-
'kernel': ['rbf', 'poly']
|
|
299
|
-
}
|
|
300
|
-
}
|
|
301
|
-
}
|
|
302
|
-
elif variable_type == 'numerical':
|
|
303
|
-
model_params = {
|
|
304
|
-
'RandomForestRegressor': {
|
|
305
|
-
'model': RandomForestRegressor(random_state=42),
|
|
306
|
-
'params': {
|
|
307
|
-
'n_estimators': [100, 200, 300],
|
|
308
|
-
'max_depth': [10, 20, None]
|
|
309
|
-
}
|
|
310
|
-
},
|
|
311
|
-
'SVR': {
|
|
312
|
-
'model': SVR(),
|
|
313
|
-
'params': {
|
|
314
|
-
'C': [0.1, 1, 10],
|
|
315
|
-
'kernel': ['rbf', 'poly']
|
|
316
|
-
}
|
|
317
|
-
}
|
|
318
|
-
}
|
|
319
|
-
|
|
320
284
|
# Cross-Validation and Training
|
|
321
285
|
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
|
|
322
286
|
X_train, y_train = prepare_data(train_dataset)
|
|
323
|
-
print("Train:",X_train.shape)
|
|
287
|
+
print("Train:", X_train.shape)
|
|
324
288
|
X_test, y_test = prepare_data(test_dataset)
|
|
325
|
-
print("Test:",X_test.shape)
|
|
289
|
+
print("Test:", X_test.shape)
|
|
326
290
|
|
|
327
291
|
metrics_list = []
|
|
328
|
-
|
|
329
|
-
|
|
292
|
+
|
|
293
|
+
for method in methods:
|
|
294
|
+
if variable_type == 'categorical':
|
|
295
|
+
if method == 'RandomForest':
|
|
296
|
+
model = RandomForestClassifier(random_state=42)
|
|
297
|
+
params = {'n_estimators': [100, 200, 300], 'max_depth': [10, 20, None]}
|
|
298
|
+
elif method == 'SVM':
|
|
299
|
+
model = SVC(random_state=42)
|
|
300
|
+
params = {'C': [0.1, 1, 10], 'kernel': ['rbf', 'poly']}
|
|
301
|
+
elif variable_type == 'numerical':
|
|
302
|
+
if method == 'RandomForest':
|
|
303
|
+
model = RandomForestRegressor(random_state=42)
|
|
304
|
+
params = {'n_estimators': [100, 200, 300], 'max_depth': [10, 20, None]}
|
|
305
|
+
elif method == 'SVM':
|
|
306
|
+
model = SVR()
|
|
307
|
+
params = {'C': [0.1, 1, 10], 'kernel': ['rbf', 'poly']}
|
|
308
|
+
|
|
309
|
+
grid_search = GridSearchCV(model, params, cv=kf, n_jobs=n_jobs)
|
|
330
310
|
grid_search.fit(X_train, y_train)
|
|
331
311
|
best_model = grid_search.best_estimator_
|
|
332
312
|
|
|
@@ -341,7 +321,7 @@ def evaluate_baseline_performance(train_dataset, test_dataset, variable_name, n_
|
|
|
341
321
|
|
|
342
322
|
for metric, value in metrics.items():
|
|
343
323
|
metrics_list.append({
|
|
344
|
-
'method':
|
|
324
|
+
'method': method + ('Classifier' if variable_type == 'categorical' else 'Regressor'),
|
|
345
325
|
'var': variable_name,
|
|
346
326
|
'variable_type': variable_type,
|
|
347
327
|
'metric': metric,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: flexynesis
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: A deep-learning based multi-omics bulk sequencing data integration suite with a focus on (pre-)clinical endpoint prediction.
|
|
5
5
|
Author-email: Bora Uyar <bora.uyar@mdc-berlin.de>, Taras Savchyn <Taras.Savchyn@mdc-berlin.de>, Ricardo Wurmus <Ricardo.Wurmus@mdc-berlin.de>, Ahmet Sarigun <Ahmet.Sariguen@mdc-berlin.de>
|
|
6
6
|
Project-URL: homepage, https://github.com/BIMSBbioinfo/flexynesis
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|