flexynesis 0.2.3__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {flexynesis-0.2.3 → flexynesis-0.2.4}/PKG-INFO +1 -1
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/__main__.py +81 -43
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/data.py +29 -18
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/utils.py +24 -44
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis.egg-info/PKG-INFO +1 -1
- {flexynesis-0.2.3 → flexynesis-0.2.4}/pyproject.toml +1 -1
- {flexynesis-0.2.3 → flexynesis-0.2.4}/LICENCE.md +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/README.md +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/__init__.py +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/cli.py +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/config.py +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/feature_selection.py +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/main.py +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/models/__init__.py +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/models/crossmodal_pred.py +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/models/direct_pred.py +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/models/gnn_early.py +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/models/on_ice/direct_pred_cnn.py +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/models/on_ice/direct_pred_gcnn.py +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/models/on_ice/modules_on_ice.py +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/models/supervised_vae.py +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/models/triplet_encoder.py +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/modules.py +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis.egg-info/SOURCES.txt +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis.egg-info/dependency_links.txt +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis.egg-info/entry_points.txt +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis.egg-info/requires.txt +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis.egg-info/top_level.txt +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/setup.cfg +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/tests/__init__.py +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/tests/unit/__init__.py +0 -0
- {flexynesis-0.2.3 → flexynesis-0.2.4}/tests/unit/test_smoke.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: flexynesis
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: A deep-learning based multi-omics bulk sequencing data integration suite with a focus on (pre-)clinical endpoint prediction.
|
|
5
5
|
Author-email: Bora Uyar <bora.uyar@mdc-berlin.de>, Taras Savchyn <Taras.Savchyn@mdc-berlin.de>, Ricardo Wurmus <Ricardo.Wurmus@mdc-berlin.de>, Ahmet Sarigun <Ahmet.Sariguen@mdc-berlin.de>
|
|
6
6
|
Project-URL: homepage, https://github.com/BIMSBbioinfo/flexynesis
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from lightning import seed_everything
|
|
2
2
|
import lightning as pl
|
|
3
3
|
from typing import NamedTuple
|
|
4
|
-
import os, yaml, torch, time, random, warnings, argparse
|
|
4
|
+
import os, yaml, torch, time, random, warnings, argparse, sys
|
|
5
5
|
os.environ["OMP_NUM_THREADS"] = "1"
|
|
6
6
|
import pandas as pd
|
|
7
7
|
import flexynesis
|
|
@@ -18,7 +18,8 @@ def main():
|
|
|
18
18
|
|
|
19
19
|
Args:
|
|
20
20
|
--data_path (str): Path to the folder with train/test data files. (Required)
|
|
21
|
-
--model_class (str): The kind of model class to instantiate. Choices are ["DirectPred", "GNN", "supervised_vae",
|
|
21
|
+
--model_class (str): The kind of model class to instantiate. Choices are ["DirectPred", "GNN", "supervised_vae",
|
|
22
|
+
"MultiTripletNetwork", "CrossModalPred", "RandomForest", "SVM", "RandomSurvivalForest"]. (Required)
|
|
22
23
|
--gnn_conv_type (str): If model_class is set to GNN, choose which graph convolution type to use. Choices are ["GC", "GCN", "SAGE"].
|
|
23
24
|
--target_variables (str): Which variables in 'clin.csv' to use for predictions, comma-separated if multiple. Optional if survival variables are not set to None.
|
|
24
25
|
--batch_variables (str): Which variables in 'clin.csv' to use for data integration/batch correction, comma-separated if multiple. Optional.
|
|
@@ -44,7 +45,7 @@ def main():
|
|
|
44
45
|
--hpo_patience (int): How many hyperparameter optimisation iterations to wait for when no improvements are observed. Default is 10; set to 0 to disable early stopping.
|
|
45
46
|
--use_cv (bool): If set, a 5-fold cross-validation training will be done. Otherwise, a single training on 80 percent of the dataset is done.
|
|
46
47
|
--use_loss_weighting (str): Whether to apply loss-balancing using uncertainty weights method. Choices are ['True', 'False']. Default is 'True'.
|
|
47
|
-
--evaluate_baseline_performance (
|
|
48
|
+
--evaluate_baseline_performance (bool): Enables modeling also with Random Forest + SVMs to see the performance of off-the-shelf tools on the same dataset.
|
|
48
49
|
--threads (int): How many threads to use when using CPU. Default is 4.
|
|
49
50
|
--num_workers (int): How many workers to use for model training. Default is 2
|
|
50
51
|
--use_gpu (bool): If set, the system will attempt to use CUDA/GPU if available.
|
|
@@ -57,7 +58,7 @@ def main():
|
|
|
57
58
|
|
|
58
59
|
parser.add_argument("--data_path", help="(Required) Path to the folder with train/test data files", type=str, required = True)
|
|
59
60
|
parser.add_argument("--model_class", help="(Required) The kind of model class to instantiate", type=str,
|
|
60
|
-
choices=["DirectPred", "supervised_vae", "MultiTripletNetwork", "CrossModalPred", "GNN"], required = True)
|
|
61
|
+
choices=["DirectPred", "supervised_vae", "MultiTripletNetwork", "CrossModalPred", "GNN", "RandomForest", "SVM", "RandomSurvivalForest"], required = True)
|
|
61
62
|
parser.add_argument("--gnn_conv_type", help="If model_class is set to GNN, choose which graph convolution type to use", type=str,
|
|
62
63
|
choices=["GC", "GCN", "SAGE"])
|
|
63
64
|
parser.add_argument("--target_variables",
|
|
@@ -98,7 +99,8 @@ def main():
|
|
|
98
99
|
parser.add_argument("--use_cv", action="store_true",
|
|
99
100
|
help="(Optional) If set, the a 5-fold cross-validation training will be done. Otherwise, a single trainig on 80 percent of the dataset is done.")
|
|
100
101
|
parser.add_argument("--use_loss_weighting", help="whether to apply loss-balancing using uncertainty weights method", type=str, choices=['True', 'False'], default = 'True')
|
|
101
|
-
parser.add_argument("--evaluate_baseline_performance",
|
|
102
|
+
parser.add_argument("--evaluate_baseline_performance", action="store_true",
|
|
103
|
+
help="whether to run Random Forest + SVMs to see the performance of off-the-shelf tools on the same dataset")
|
|
102
104
|
parser.add_argument("--threads", help="(Optional) How many threads to use when using CPU (default is 4)", type=int, default = 4)
|
|
103
105
|
parser.add_argument("--num_workers", help="(Optional) How many workers to use for model training (default is 2)", type=int, default = 2)
|
|
104
106
|
parser.add_argument("--use_gpu", action="store_true",
|
|
@@ -186,20 +188,24 @@ def main():
|
|
|
186
188
|
if not os.path.exists(args.outdir):
|
|
187
189
|
raise FileNotFoundError(f"Path to --outdir doesn't exist at:", {args.outdir})
|
|
188
190
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
191
|
+
available_models = {
|
|
192
|
+
"DirectPred": (DirectPred, "DirectPred"),
|
|
193
|
+
"supervised_vae": (supervised_vae, "supervised_vae"),
|
|
194
|
+
"MultiTripletNetwork": (MultiTripletNetwork, "MultiTripletNetwork"),
|
|
195
|
+
"CrossModalPred": (CrossModalPred, "CrossModalPred"),
|
|
196
|
+
"GNN": (GNN, "GNN"),
|
|
197
|
+
"RandomForest": ("RandomForest", None),
|
|
198
|
+
"SVM": ("SVM", None),
|
|
199
|
+
"RandomSurvivalForest": ("RandomSurvivalForest", None)
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
model_info = available_models.get(args.model_class)
|
|
203
|
+
|
|
204
|
+
if model_info is None:
|
|
205
|
+
raise ValueError(f"Unsupported model class {args.model_class}")
|
|
206
|
+
|
|
207
|
+
# Unpack the tuple into model class and config name
|
|
208
|
+
model_class, config_name = model_info
|
|
203
209
|
|
|
204
210
|
# import assays and labels
|
|
205
211
|
inputDir = args.data_path
|
|
@@ -221,6 +227,34 @@ def main():
|
|
|
221
227
|
downsample = args.subsample)
|
|
222
228
|
train_dataset, test_dataset = data_importer.import_data()
|
|
223
229
|
|
|
230
|
+
if args.model_class in ["RandomForest", "SVM"]:
|
|
231
|
+
if args.target_variables:
|
|
232
|
+
var = args.target_variables.strip().split(',')[0]
|
|
233
|
+
print(f"Training {args.model_class} on variable: {var}")
|
|
234
|
+
metrics = flexynesis.evaluate_baseline_performance(train_dataset, test_dataset, variable_name=var,
|
|
235
|
+
methods=[args.model_class], n_folds=5, n_jobs=args.threads)
|
|
236
|
+
metrics.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'stats.csv'])), header=True, index=False)
|
|
237
|
+
print(f"{args.model_class} evaluation complete. Results saved.")
|
|
238
|
+
# we skip everything related to deep learning models here
|
|
239
|
+
sys.exit(0)
|
|
240
|
+
else:
|
|
241
|
+
raise ValueError(f"At least one target variable is required to run RandomForest/SVM models. Set --target_variables argument")
|
|
242
|
+
|
|
243
|
+
if args.model_class == "RandomSurvivalForest":
|
|
244
|
+
if args.surv_event_var and args.surv_time_var:
|
|
245
|
+
print(f"Training {args.model_class} on survival variables: {args.surv_event_var} and {args.surv_time_var}")
|
|
246
|
+
metrics = flexynesis.evaluate_baseline_survival_performance(train_dataset, test_dataset,
|
|
247
|
+
args.surv_time_var,
|
|
248
|
+
args.surv_event_var,
|
|
249
|
+
n_folds = 5,
|
|
250
|
+
n_jobs = int(args.threads))
|
|
251
|
+
metrics.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'stats.csv'])), header=True, index=False)
|
|
252
|
+
print(f"{args.model_class} evaluation complete. Results saved.")
|
|
253
|
+
# we skip everything related to deep learning models here
|
|
254
|
+
sys.exit(0)
|
|
255
|
+
else:
|
|
256
|
+
raise ValueError(f"Missing survival variables. Set --surv_event_var --surv_time_var arguments")
|
|
257
|
+
|
|
224
258
|
if args.model_class == 'GNN':
|
|
225
259
|
# overlay datasets with network info
|
|
226
260
|
# this is a temporary solution
|
|
@@ -282,20 +316,16 @@ def main():
|
|
|
282
316
|
# update the test dataset to exclude finetuning samples
|
|
283
317
|
test_dataset = holdout_dataset
|
|
284
318
|
|
|
319
|
+
# get sample embeddings and save
|
|
320
|
+
print("[INFO] Extracting sample embeddings")
|
|
321
|
+
embeddings_train = model.transform(train_dataset)
|
|
322
|
+
embeddings_test = model.transform(test_dataset)
|
|
323
|
+
|
|
324
|
+
embeddings_train.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'embeddings_train.csv'])), header=True)
|
|
325
|
+
embeddings_test.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'embeddings_test.csv'])), header=True)
|
|
326
|
+
|
|
285
327
|
# evaluate predictions; (if any supervised learning happened)
|
|
286
328
|
if any([args.target_variables, args.surv_event_var, args.batch_variables]):
|
|
287
|
-
print("[INFO] Computing model evaluation metrics")
|
|
288
|
-
metrics_df = flexynesis.evaluate_wrapper(model.predict(test_dataset), test_dataset,
|
|
289
|
-
surv_event_var=model.surv_event_var,
|
|
290
|
-
surv_time_var=model.surv_time_var)
|
|
291
|
-
metrics_df.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'stats.csv'])), header=True, index=False)
|
|
292
|
-
|
|
293
|
-
# print known/predicted labels
|
|
294
|
-
predicted_labels = pd.concat([flexynesis.get_predicted_labels(model.predict(train_dataset), train_dataset, 'train'),
|
|
295
|
-
flexynesis.get_predicted_labels(model.predict(test_dataset), test_dataset, 'test')],
|
|
296
|
-
ignore_index=True)
|
|
297
|
-
predicted_labels.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'predicted_labels.csv'])), header=True, index=False)
|
|
298
|
-
|
|
299
329
|
if not args.disable_marker_finding: # unless marker discovery is disabled
|
|
300
330
|
# compute feature importance values
|
|
301
331
|
print("[INFO] Computing variable importance scores")
|
|
@@ -305,14 +335,19 @@ def main():
|
|
|
305
335
|
ignore_index = True)
|
|
306
336
|
df_imp.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'feature_importance.csv'])), header=True, index=False)
|
|
307
337
|
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
338
|
+
# print known/predicted labels
|
|
339
|
+
predicted_labels = pd.concat([flexynesis.get_predicted_labels(model.predict(train_dataset), train_dataset, 'train'),
|
|
340
|
+
flexynesis.get_predicted_labels(model.predict(test_dataset), test_dataset, 'test')],
|
|
341
|
+
ignore_index=True)
|
|
342
|
+
predicted_labels.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'predicted_labels.csv'])), header=True, index=False)
|
|
343
|
+
|
|
344
|
+
print("[INFO] Computing model evaluation metrics")
|
|
345
|
+
metrics_df = flexynesis.evaluate_wrapper(args.model_class, model.predict(test_dataset), test_dataset,
|
|
346
|
+
surv_event_var=model.surv_event_var,
|
|
347
|
+
surv_time_var=model.surv_time_var)
|
|
348
|
+
metrics_df.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'stats.csv'])), header=True, index=False)
|
|
349
|
+
|
|
350
|
+
|
|
316
351
|
# also filter embeddings to remove batch-associated dims and only keep target-variable associated dims
|
|
317
352
|
if args.batch_variables is not None:
|
|
318
353
|
print("[INFO] Printing filtered embeddings")
|
|
@@ -339,7 +374,7 @@ def main():
|
|
|
339
374
|
|
|
340
375
|
|
|
341
376
|
# evaluate off-the-shelf methods on the main target variable
|
|
342
|
-
if args.evaluate_baseline_performance
|
|
377
|
+
if args.evaluate_baseline_performance:
|
|
343
378
|
print("[INFO] Computing off-the-shelf method performance on first target variable:",model.target_variables[0])
|
|
344
379
|
var = model.target_variables[0]
|
|
345
380
|
metrics = pd.DataFrame()
|
|
@@ -351,9 +386,10 @@ def main():
|
|
|
351
386
|
|
|
352
387
|
if var != model.surv_event_var:
|
|
353
388
|
metrics = flexynesis.evaluate_baseline_performance(train, test,
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
389
|
+
variable_name = var,
|
|
390
|
+
methods = ['RandomForest', 'SVM'],
|
|
391
|
+
n_folds = 5,
|
|
392
|
+
n_jobs = int(args.threads))
|
|
357
393
|
if model.surv_event_var and model.surv_time_var:
|
|
358
394
|
print("[INFO] Computing off-the-shelf method performance on survival variable:",model.surv_time_var)
|
|
359
395
|
metrics_baseline_survival = flexynesis.evaluate_baseline_survival_performance(train, test,
|
|
@@ -369,5 +405,7 @@ def main():
|
|
|
369
405
|
# save the trained model in file
|
|
370
406
|
torch.save(model, os.path.join(args.outdir, '.'.join([args.prefix, 'final_model.pth'])))
|
|
371
407
|
|
|
408
|
+
|
|
372
409
|
if __name__ == "__main__":
|
|
373
410
|
main()
|
|
411
|
+
print("[INFO] Finished the analysis!")
|
|
@@ -627,13 +627,14 @@ class TripletMultiOmicDataset(Dataset):
|
|
|
627
627
|
for label in labels_set}
|
|
628
628
|
return labels_set, label_to_indices
|
|
629
629
|
|
|
630
|
+
|
|
630
631
|
class MultiOmicDatasetNW(Dataset):
|
|
631
632
|
def __init__(self, multiomic_dataset, interaction_df):
|
|
632
633
|
self.multiomic_dataset = multiomic_dataset
|
|
633
634
|
self.interaction_df = interaction_df
|
|
634
635
|
|
|
635
|
-
#
|
|
636
|
-
self.common_features = self.
|
|
636
|
+
# Compute union of features in the data matrices that also appear in the network
|
|
637
|
+
self.common_features = self.find_union_features()
|
|
637
638
|
self.gene_to_index = {gene: idx for idx, gene in enumerate(self.common_features)}
|
|
638
639
|
self.edge_index = self.create_edge_index()
|
|
639
640
|
self.samples = self.multiomic_dataset.samples
|
|
@@ -647,38 +648,48 @@ class MultiOmicDatasetNW(Dataset):
|
|
|
647
648
|
# Store labels for all samples
|
|
648
649
|
self.labels = {target_name: labels for target_name, labels in self.multiomic_dataset.ann.items()}
|
|
649
650
|
|
|
650
|
-
def
|
|
651
|
-
|
|
651
|
+
def find_union_features(self):
|
|
652
|
+
# Find the union of all features in the multiomic dataset
|
|
653
|
+
all_omic_features = set().union(*(set(features) for features in self.multiomic_dataset.features.values()))
|
|
654
|
+
# Find the union of proteins involved in interactions
|
|
652
655
|
interaction_genes = set(self.interaction_df['protein1']).union(set(self.interaction_df['protein2']))
|
|
653
|
-
|
|
656
|
+
# Return the intersection of omic features and interaction genes
|
|
657
|
+
return list(all_omic_features.intersection(interaction_genes))
|
|
654
658
|
|
|
655
659
|
def create_edge_index(self):
|
|
660
|
+
# Create edges only if both proteins are within the available features
|
|
656
661
|
filtered_df = self.interaction_df[
|
|
657
662
|
(self.interaction_df['protein1'].isin(self.common_features)) &
|
|
658
663
|
(self.interaction_df['protein2'].isin(self.common_features))
|
|
659
664
|
]
|
|
660
665
|
edge_list = [(self.gene_to_index[row['protein1']], self.gene_to_index[row['protein2']]) for index, row in filtered_df.iterrows()]
|
|
661
666
|
return torch.tensor(edge_list, dtype=torch.long).t()
|
|
662
|
-
|
|
667
|
+
|
|
663
668
|
def precompute_node_features(self):
|
|
664
|
-
# Find indices of common features in each data matrix
|
|
665
|
-
feature_indices = {data_type: [self.multiomic_dataset.features[data_type].get_loc(gene)
|
|
666
|
-
for gene in self.common_features]
|
|
667
|
-
for data_type in self.multiomic_dataset.dat}
|
|
668
|
-
# Create a tensor to store all features [num_samples, num_nodes, num_data_types]
|
|
669
669
|
num_samples = len(self.samples)
|
|
670
670
|
num_nodes = len(self.common_features)
|
|
671
671
|
num_data_types = len(self.multiomic_dataset.dat)
|
|
672
|
-
all_features = torch.
|
|
672
|
+
all_features = torch.full((num_samples, num_nodes, num_data_types), float('nan'), dtype=torch.float)
|
|
673
673
|
|
|
674
|
-
# Extract features for each data type and place them in the tensor
|
|
675
674
|
for i, data_type in enumerate(self.multiomic_dataset.dat):
|
|
676
|
-
# Get the data matrix
|
|
677
675
|
data_matrix = self.multiomic_dataset.dat[data_type]
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
676
|
+
feature_indices = {
|
|
677
|
+
gene: self.multiomic_dataset.features[data_type].get_loc(gene)
|
|
678
|
+
for gene in self.common_features if gene in self.multiomic_dataset.features[data_type]
|
|
679
|
+
}
|
|
680
|
+
valid_indices = torch.tensor(list(feature_indices.values()))
|
|
681
|
+
feature_positions = torch.tensor([self.gene_to_index[gene] for gene in feature_indices.keys()])
|
|
682
|
+
|
|
683
|
+
# Fill in the available data
|
|
684
|
+
all_features[:, feature_positions, i] = data_matrix[:, valid_indices]
|
|
685
|
+
|
|
686
|
+
# Precompute medians for all data types, ignoring NaN values
|
|
687
|
+
medians = torch.nanmedian(all_features, dim=1, keepdim=True).values # Use .values to get the actual median tensor
|
|
688
|
+
|
|
689
|
+
# Replace all NaN values in all_features with their corresponding median values
|
|
690
|
+
isnan = torch.isnan(all_features)
|
|
691
|
+
all_features[isnan] = medians.expand_as(all_features)[isnan]
|
|
692
|
+
|
|
682
693
|
return all_features
|
|
683
694
|
|
|
684
695
|
def subset(self, indices):
|
|
@@ -218,7 +218,7 @@ def evaluate_regressor(y_true, y_pred):
|
|
|
218
218
|
r2 = r_value**2
|
|
219
219
|
return {"mse": mse, "r2": r2, "pearson_corr": r_value}
|
|
220
220
|
|
|
221
|
-
def evaluate_wrapper(y_pred_dict, dataset, surv_event_var = None, surv_time_var = None):
|
|
221
|
+
def evaluate_wrapper(method, y_pred_dict, dataset, surv_event_var = None, surv_time_var = None):
|
|
222
222
|
metrics_list = []
|
|
223
223
|
for var in y_pred_dict.keys():
|
|
224
224
|
if dataset.variable_types[var] == 'numerical':
|
|
@@ -235,6 +235,7 @@ def evaluate_wrapper(y_pred_dict, dataset, surv_event_var = None, surv_time_var
|
|
|
235
235
|
|
|
236
236
|
for metric, value in metrics.items():
|
|
237
237
|
metrics_list.append({
|
|
238
|
+
'method': method,
|
|
238
239
|
'var': var,
|
|
239
240
|
'variable_type': dataset.variable_types[var],
|
|
240
241
|
'metric': metric,
|
|
@@ -263,8 +264,7 @@ def get_predicted_labels(y_pred_dict, dataset, split):
|
|
|
263
264
|
dfs.append(df)
|
|
264
265
|
return pd.concat(dfs, ignore_index=True)
|
|
265
266
|
|
|
266
|
-
|
|
267
|
-
def evaluate_baseline_performance(train_dataset, test_dataset, variable_name, n_folds=5, n_jobs = 4):
|
|
267
|
+
def evaluate_baseline_performance(train_dataset, test_dataset, variable_name, methods, n_folds=5, n_jobs=4):
|
|
268
268
|
def prepare_data(data_object):
|
|
269
269
|
# Concatenate Data Matrices
|
|
270
270
|
X = np.concatenate([tensor for tensor in data_object.dat.values()], axis=1)
|
|
@@ -281,52 +281,32 @@ def evaluate_baseline_performance(train_dataset, test_dataset, variable_name, n_
|
|
|
281
281
|
# Determine variable type
|
|
282
282
|
variable_type = train_dataset.variable_types[variable_name]
|
|
283
283
|
|
|
284
|
-
# Initialize models and parameter grids
|
|
285
|
-
if variable_type == 'categorical':
|
|
286
|
-
model_params = {
|
|
287
|
-
'RandomForestClassifier': {
|
|
288
|
-
'model': RandomForestClassifier(random_state=42),
|
|
289
|
-
'params': {
|
|
290
|
-
'n_estimators': [100, 200, 300],
|
|
291
|
-
'max_depth': [10, 20, None]
|
|
292
|
-
}
|
|
293
|
-
},
|
|
294
|
-
'SVC': {
|
|
295
|
-
'model': SVC(),
|
|
296
|
-
'params': {
|
|
297
|
-
'C': [0.1, 1, 10],
|
|
298
|
-
'kernel': ['rbf', 'poly']
|
|
299
|
-
}
|
|
300
|
-
}
|
|
301
|
-
}
|
|
302
|
-
elif variable_type == 'numerical':
|
|
303
|
-
model_params = {
|
|
304
|
-
'RandomForestRegressor': {
|
|
305
|
-
'model': RandomForestRegressor(random_state=42),
|
|
306
|
-
'params': {
|
|
307
|
-
'n_estimators': [100, 200, 300],
|
|
308
|
-
'max_depth': [10, 20, None]
|
|
309
|
-
}
|
|
310
|
-
},
|
|
311
|
-
'SVR': {
|
|
312
|
-
'model': SVR(),
|
|
313
|
-
'params': {
|
|
314
|
-
'C': [0.1, 1, 10],
|
|
315
|
-
'kernel': ['rbf', 'poly']
|
|
316
|
-
}
|
|
317
|
-
}
|
|
318
|
-
}
|
|
319
|
-
|
|
320
284
|
# Cross-Validation and Training
|
|
321
285
|
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
|
|
322
286
|
X_train, y_train = prepare_data(train_dataset)
|
|
323
|
-
print("Train:",X_train.shape)
|
|
287
|
+
print("Train:", X_train.shape)
|
|
324
288
|
X_test, y_test = prepare_data(test_dataset)
|
|
325
|
-
print("Test:",X_test.shape)
|
|
289
|
+
print("Test:", X_test.shape)
|
|
326
290
|
|
|
327
291
|
metrics_list = []
|
|
328
|
-
|
|
329
|
-
|
|
292
|
+
|
|
293
|
+
for method in methods:
|
|
294
|
+
if variable_type == 'categorical':
|
|
295
|
+
if method == 'RandomForest':
|
|
296
|
+
model = RandomForestClassifier(random_state=42)
|
|
297
|
+
params = {'n_estimators': [100, 200, 300], 'max_depth': [10, 20, None]}
|
|
298
|
+
elif method == 'SVM':
|
|
299
|
+
model = SVC(random_state=42)
|
|
300
|
+
params = {'C': [0.1, 1, 10], 'kernel': ['rbf', 'poly']}
|
|
301
|
+
elif variable_type == 'numerical':
|
|
302
|
+
if method == 'RandomForest':
|
|
303
|
+
model = RandomForestRegressor(random_state=42)
|
|
304
|
+
params = {'n_estimators': [100, 200, 300], 'max_depth': [10, 20, None]}
|
|
305
|
+
elif method == 'SVM':
|
|
306
|
+
model = SVR()
|
|
307
|
+
params = {'C': [0.1, 1, 10], 'kernel': ['rbf', 'poly']}
|
|
308
|
+
|
|
309
|
+
grid_search = GridSearchCV(model, params, cv=kf, n_jobs=n_jobs)
|
|
330
310
|
grid_search.fit(X_train, y_train)
|
|
331
311
|
best_model = grid_search.best_estimator_
|
|
332
312
|
|
|
@@ -341,7 +321,7 @@ def evaluate_baseline_performance(train_dataset, test_dataset, variable_name, n_
|
|
|
341
321
|
|
|
342
322
|
for metric, value in metrics.items():
|
|
343
323
|
metrics_list.append({
|
|
344
|
-
'method':
|
|
324
|
+
'method': method + ('Classifier' if variable_type == 'categorical' else 'Regressor'),
|
|
345
325
|
'var': variable_name,
|
|
346
326
|
'variable_type': variable_type,
|
|
347
327
|
'metric': metric,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: flexynesis
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: A deep-learning based multi-omics bulk sequencing data integration suite with a focus on (pre-)clinical endpoint prediction.
|
|
5
5
|
Author-email: Bora Uyar <bora.uyar@mdc-berlin.de>, Taras Savchyn <Taras.Savchyn@mdc-berlin.de>, Ricardo Wurmus <Ricardo.Wurmus@mdc-berlin.de>, Ahmet Sarigun <Ahmet.Sariguen@mdc-berlin.de>
|
|
6
6
|
Project-URL: homepage, https://github.com/BIMSBbioinfo/flexynesis
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|