flexynesis 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. {flexynesis-0.2.2 → flexynesis-0.2.4}/PKG-INFO +1 -1
  2. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/__main__.py +85 -44
  3. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/config.py +4 -4
  4. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/data.py +29 -18
  5. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/main.py +8 -5
  6. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/utils.py +24 -44
  7. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis.egg-info/PKG-INFO +1 -1
  8. {flexynesis-0.2.2 → flexynesis-0.2.4}/pyproject.toml +1 -1
  9. {flexynesis-0.2.2 → flexynesis-0.2.4}/LICENCE.md +0 -0
  10. {flexynesis-0.2.2 → flexynesis-0.2.4}/README.md +0 -0
  11. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/__init__.py +0 -0
  12. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/cli.py +0 -0
  13. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/feature_selection.py +0 -0
  14. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/models/__init__.py +0 -0
  15. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/models/crossmodal_pred.py +0 -0
  16. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/models/direct_pred.py +0 -0
  17. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/models/gnn_early.py +0 -0
  18. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/models/on_ice/direct_pred_cnn.py +0 -0
  19. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/models/on_ice/direct_pred_gcnn.py +0 -0
  20. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/models/on_ice/modules_on_ice.py +0 -0
  21. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/models/supervised_vae.py +0 -0
  22. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/models/triplet_encoder.py +0 -0
  23. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis/modules.py +0 -0
  24. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis.egg-info/SOURCES.txt +0 -0
  25. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis.egg-info/dependency_links.txt +0 -0
  26. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis.egg-info/entry_points.txt +0 -0
  27. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis.egg-info/requires.txt +0 -0
  28. {flexynesis-0.2.2 → flexynesis-0.2.4}/flexynesis.egg-info/top_level.txt +0 -0
  29. {flexynesis-0.2.2 → flexynesis-0.2.4}/setup.cfg +0 -0
  30. {flexynesis-0.2.2 → flexynesis-0.2.4}/tests/__init__.py +0 -0
  31. {flexynesis-0.2.2 → flexynesis-0.2.4}/tests/unit/__init__.py +0 -0
  32. {flexynesis-0.2.2 → flexynesis-0.2.4}/tests/unit/test_smoke.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: flexynesis
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: A deep-learning based multi-omics bulk sequencing data integration suite with a focus on (pre-)clinical endpoint prediction.
5
5
  Author-email: Bora Uyar <bora.uyar@mdc-berlin.de>, Taras Savchyn <Taras.Savchyn@mdc-berlin.de>, Ricardo Wurmus <Ricardo.Wurmus@mdc-berlin.de>, Ahmet Sarigun <Ahmet.Sariguen@mdc-berlin.de>
6
6
  Project-URL: homepage, https://github.com/BIMSBbioinfo/flexynesis
@@ -1,7 +1,7 @@
1
1
  from lightning import seed_everything
2
2
  import lightning as pl
3
3
  from typing import NamedTuple
4
- import os, yaml, torch, time, random, warnings, argparse
4
+ import os, yaml, torch, time, random, warnings, argparse, sys
5
5
  os.environ["OMP_NUM_THREADS"] = "1"
6
6
  import pandas as pd
7
7
  import flexynesis
@@ -18,7 +18,8 @@ def main():
18
18
 
19
19
  Args:
20
20
  --data_path (str): Path to the folder with train/test data files. (Required)
21
- --model_class (str): The kind of model class to instantiate. Choices are ["DirectPred", "GNN", "supervised_vae", "MultiTripletNetwork", "CrossModalPred"]. (Required)
21
+ --model_class (str): The kind of model class to instantiate. Choices are ["DirectPred", "GNN", "supervised_vae",
22
+ "MultiTripletNetwork", "CrossModalPred", "RandomForest", "SVM", "RandomSurvivalForest"]. (Required)
22
23
  --gnn_conv_type (str): If model_class is set to GNN, choose which graph convolution type to use. Choices are ["GC", "GCN", "SAGE"].
23
24
  --target_variables (str): Which variables in 'clin.csv' to use for predictions, comma-separated if multiple. Optional if survival variables are not set to None.
24
25
  --batch_variables (str): Which variables in 'clin.csv' to use for data integration/batch correction, comma-separated if multiple. Optional.
@@ -44,8 +45,9 @@ def main():
44
45
  --hpo_patience (int): How many hyperparameter optimisation iterations to wait for when no improvements are observed. Default is 10; set to 0 to disable early stopping.
45
46
  --use_cv (bool): If set, a 5-fold cross-validation training will be done. Otherwise, a single training on 80 percent of the dataset is done.
46
47
  --use_loss_weighting (str): Whether to apply loss-balancing using uncertainty weights method. Choices are ['True', 'False']. Default is 'True'.
47
- --evaluate_baseline_performance (str): Whether to run Random Forest + SVMs to see the performance of off-the-shelf tools on the same dataset. Choices are ['True', 'False']. Default is 'True'.
48
+ --evaluate_baseline_performance (bool): Enables modeling also with Random Forest + SVMs to see the performance of off-the-shelf tools on the same dataset.
48
49
  --threads (int): How many threads to use when using CPU. Default is 4.
50
+ --num_workers (int): How many workers to use for model training. Default is 2
49
51
  --use_gpu (bool): If set, the system will attempt to use CUDA/GPU if available.
50
52
  --disable_marker_finding (bool): If set, marker discovery after model training is disabled.
51
53
  --string_organism (int): STRING DB organism id. Default is 9606.
@@ -56,7 +58,7 @@ def main():
56
58
 
57
59
  parser.add_argument("--data_path", help="(Required) Path to the folder with train/test data files", type=str, required = True)
58
60
  parser.add_argument("--model_class", help="(Required) The kind of model class to instantiate", type=str,
59
- choices=["DirectPred", "supervised_vae", "MultiTripletNetwork", "CrossModalPred", "GNN"], required = True)
61
+ choices=["DirectPred", "supervised_vae", "MultiTripletNetwork", "CrossModalPred", "GNN", "RandomForest", "SVM", "RandomSurvivalForest"], required = True)
60
62
  parser.add_argument("--gnn_conv_type", help="If model_class is set to GNN, choose which graph convolution type to use", type=str,
61
63
  choices=["GC", "GCN", "SAGE"])
62
64
  parser.add_argument("--target_variables",
@@ -97,8 +99,10 @@ def main():
97
99
  parser.add_argument("--use_cv", action="store_true",
98
100
  help="(Optional) If set, the a 5-fold cross-validation training will be done. Otherwise, a single trainig on 80 percent of the dataset is done.")
99
101
  parser.add_argument("--use_loss_weighting", help="whether to apply loss-balancing using uncertainty weights method", type=str, choices=['True', 'False'], default = 'True')
100
- parser.add_argument("--evaluate_baseline_performance", help="whether to run Random Forest + SVMs to see the performance of off-the-shelf tools on the same dataset", type=str, choices=['True', 'False'], default = 'True')
102
+ parser.add_argument("--evaluate_baseline_performance", action="store_true",
103
+ help="whether to run Random Forest + SVMs to see the performance of off-the-shelf tools on the same dataset")
101
104
  parser.add_argument("--threads", help="(Optional) How many threads to use when using CPU (default is 4)", type=int, default = 4)
105
+ parser.add_argument("--num_workers", help="(Optional) How many workers to use for model training (default is 2)", type=int, default = 2)
102
106
  parser.add_argument("--use_gpu", action="store_true",
103
107
  help="(Optional) If set, the system will attempt to use CUDA/GPU if available.")
104
108
  parser.add_argument("--disable_marker_finding", action="store_true",
@@ -184,20 +188,24 @@ def main():
184
188
  if not os.path.exists(args.outdir):
185
189
  raise FileNotFoundError(f"Path to --outdir doesn't exist at:", {args.outdir})
186
190
 
187
- class AvailableModels(NamedTuple):
188
- # type AvailableModel = ModelClass: Type, ModelConfig: str
189
- DirectPred: tuple[DirectPred, str] = DirectPred, "DirectPred"
190
- supervised_vae: tuple[supervised_vae, str] = supervised_vae, "supervised_vae"
191
- MultiTripletNetwork: tuple[MultiTripletNetwork, str] = MultiTripletNetwork, "MultiTripletNetwork"
192
- CrossModalPred: tuple[CrossModalPred, str] = CrossModalPred, "CrossModalPred"
193
- GNN: tuple[GNN, str] = GNN, "GNN"
194
-
195
- available_models = AvailableModels()
196
- model_class = getattr(available_models, args.model_class, None)
197
- if model_class is None:
198
- raise ValueError(f"Invalid model_class: {args.model_class}")
199
- else:
200
- model_class, config_name = model_class
191
+ available_models = {
192
+ "DirectPred": (DirectPred, "DirectPred"),
193
+ "supervised_vae": (supervised_vae, "supervised_vae"),
194
+ "MultiTripletNetwork": (MultiTripletNetwork, "MultiTripletNetwork"),
195
+ "CrossModalPred": (CrossModalPred, "CrossModalPred"),
196
+ "GNN": (GNN, "GNN"),
197
+ "RandomForest": ("RandomForest", None),
198
+ "SVM": ("SVM", None),
199
+ "RandomSurvivalForest": ("RandomSurvivalForest", None)
200
+ }
201
+
202
+ model_info = available_models.get(args.model_class)
203
+
204
+ if model_info is None:
205
+ raise ValueError(f"Unsupported model class {args.model_class}")
206
+
207
+ # Unpack the tuple into model class and config name
208
+ model_class, config_name = model_info
201
209
 
202
210
  # import assays and labels
203
211
  inputDir = args.data_path
@@ -219,6 +227,34 @@ def main():
219
227
  downsample = args.subsample)
220
228
  train_dataset, test_dataset = data_importer.import_data()
221
229
 
230
+ if args.model_class in ["RandomForest", "SVM"]:
231
+ if args.target_variables:
232
+ var = args.target_variables.strip().split(',')[0]
233
+ print(f"Training {args.model_class} on variable: {var}")
234
+ metrics = flexynesis.evaluate_baseline_performance(train_dataset, test_dataset, variable_name=var,
235
+ methods=[args.model_class], n_folds=5, n_jobs=args.threads)
236
+ metrics.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'stats.csv'])), header=True, index=False)
237
+ print(f"{args.model_class} evaluation complete. Results saved.")
238
+ # we skip everything related to deep learning models here
239
+ sys.exit(0)
240
+ else:
241
+ raise ValueError(f"At least one target variable is required to run RandomForest/SVM models. Set --target_variables argument")
242
+
243
+ if args.model_class == "RandomSurvivalForest":
244
+ if args.surv_event_var and args.surv_time_var:
245
+ print(f"Training {args.model_class} on survival variables: {args.surv_event_var} and {args.surv_time_var}")
246
+ metrics = flexynesis.evaluate_baseline_survival_performance(train_dataset, test_dataset,
247
+ args.surv_time_var,
248
+ args.surv_event_var,
249
+ n_folds = 5,
250
+ n_jobs = int(args.threads))
251
+ metrics.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'stats.csv'])), header=True, index=False)
252
+ print(f"{args.model_class} evaluation complete. Results saved.")
253
+ # we skip everything related to deep learning models here
254
+ sys.exit(0)
255
+ else:
256
+ raise ValueError(f"Missing survival variables. Set --surv_event_var --surv_time_var arguments")
257
+
222
258
  if args.model_class == 'GNN':
223
259
  # overlay datasets with network info
224
260
  # this is a temporary solution
@@ -253,7 +289,8 @@ def main():
253
289
  device_type = device_type,
254
290
  gnn_conv_type = gnn_conv_type,
255
291
  input_layers = input_layers,
256
- output_layers = output_layers)
292
+ output_layers = output_layers,
293
+ num_workers = args.num_workers)
257
294
 
258
295
  # do a hyperparameter search training multiple models and get the best_configuration
259
296
  model, best_params = tuner.perform_tuning(hpo_patience = args.hpo_patience)
@@ -279,20 +316,16 @@ def main():
279
316
  # update the test dataset to exclude finetuning samples
280
317
  test_dataset = holdout_dataset
281
318
 
319
+ # get sample embeddings and save
320
+ print("[INFO] Extracting sample embeddings")
321
+ embeddings_train = model.transform(train_dataset)
322
+ embeddings_test = model.transform(test_dataset)
323
+
324
+ embeddings_train.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'embeddings_train.csv'])), header=True)
325
+ embeddings_test.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'embeddings_test.csv'])), header=True)
326
+
282
327
  # evaluate predictions; (if any supervised learning happened)
283
328
  if any([args.target_variables, args.surv_event_var, args.batch_variables]):
284
- print("[INFO] Computing model evaluation metrics")
285
- metrics_df = flexynesis.evaluate_wrapper(model.predict(test_dataset), test_dataset,
286
- surv_event_var=model.surv_event_var,
287
- surv_time_var=model.surv_time_var)
288
- metrics_df.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'stats.csv'])), header=True, index=False)
289
-
290
- # print known/predicted labels
291
- predicted_labels = pd.concat([flexynesis.get_predicted_labels(model.predict(train_dataset), train_dataset, 'train'),
292
- flexynesis.get_predicted_labels(model.predict(test_dataset), test_dataset, 'test')],
293
- ignore_index=True)
294
- predicted_labels.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'predicted_labels.csv'])), header=True, index=False)
295
-
296
329
  if not args.disable_marker_finding: # unless marker discovery is disabled
297
330
  # compute feature importance values
298
331
  print("[INFO] Computing variable importance scores")
@@ -302,14 +335,19 @@ def main():
302
335
  ignore_index = True)
303
336
  df_imp.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'feature_importance.csv'])), header=True, index=False)
304
337
 
305
- # get sample embeddings and save
306
- print("[INFO] Extracting sample embeddings")
307
- embeddings_train = model.transform(train_dataset)
308
- embeddings_test = model.transform(test_dataset)
309
-
310
- embeddings_train.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'embeddings_train.csv'])), header=True)
311
- embeddings_test.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'embeddings_test.csv'])), header=True)
312
-
338
+ # print known/predicted labels
339
+ predicted_labels = pd.concat([flexynesis.get_predicted_labels(model.predict(train_dataset), train_dataset, 'train'),
340
+ flexynesis.get_predicted_labels(model.predict(test_dataset), test_dataset, 'test')],
341
+ ignore_index=True)
342
+ predicted_labels.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'predicted_labels.csv'])), header=True, index=False)
343
+
344
+ print("[INFO] Computing model evaluation metrics")
345
+ metrics_df = flexynesis.evaluate_wrapper(args.model_class, model.predict(test_dataset), test_dataset,
346
+ surv_event_var=model.surv_event_var,
347
+ surv_time_var=model.surv_time_var)
348
+ metrics_df.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'stats.csv'])), header=True, index=False)
349
+
350
+
313
351
  # also filter embeddings to remove batch-associated dims and only keep target-variable associated dims
314
352
  if args.batch_variables is not None:
315
353
  print("[INFO] Printing filtered embeddings")
@@ -336,7 +374,7 @@ def main():
336
374
 
337
375
 
338
376
  # evaluate off-the-shelf methods on the main target variable
339
- if args.evaluate_baseline_performance == 'True':
377
+ if args.evaluate_baseline_performance:
340
378
  print("[INFO] Computing off-the-shelf method performance on first target variable:",model.target_variables[0])
341
379
  var = model.target_variables[0]
342
380
  metrics = pd.DataFrame()
@@ -348,9 +386,10 @@ def main():
348
386
 
349
387
  if var != model.surv_event_var:
350
388
  metrics = flexynesis.evaluate_baseline_performance(train, test,
351
- variable_name = var,
352
- n_folds=5,
353
- n_jobs = int(args.threads))
389
+ variable_name = var,
390
+ methods = ['RandomForest', 'SVM'],
391
+ n_folds = 5,
392
+ n_jobs = int(args.threads))
354
393
  if model.surv_event_var and model.surv_time_var:
355
394
  print("[INFO] Computing off-the-shelf method performance on survival variable:",model.surv_time_var)
356
395
  metrics_baseline_survival = flexynesis.evaluate_baseline_survival_performance(train, test,
@@ -366,5 +405,7 @@ def main():
366
405
  # save the trained model in file
367
406
  torch.save(model, os.path.join(args.outdir, '.'.join([args.prefix, 'final_model.pth'])))
368
407
 
408
+
369
409
  if __name__ == "__main__":
370
410
  main()
411
+ print("[INFO] Finished the analysis!")
@@ -6,28 +6,28 @@ epochs = [500]
6
6
  search_spaces = {
7
7
  'DirectPred': [
8
8
  Integer(16, 128, name='latent_dim'),
9
- Real(0.2, 1, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
9
+ Real(0.2, 0.5, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
10
10
  Real(0.0001, 0.01, prior='log-uniform', name='lr'),
11
11
  Integer(8, 32, name='supervisor_hidden_dim'),
12
12
  Categorical(epochs, name='epochs')
13
13
  ],
14
14
  'supervised_vae': [
15
15
  Integer(16, 128, name='latent_dim'),
16
- Real(0.2, 1, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
16
+ Real(0.2, 0.5, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
17
17
  Integer(8, 32, name='supervisor_hidden_dim'),
18
18
  Real(0.0001, 0.01, prior='log-uniform', name='lr'),
19
19
  Categorical(epochs, name='epochs')
20
20
  ],
21
21
  'CrossModalPred': [
22
22
  Integer(16, 128, name='latent_dim'),
23
- Real(0.2, 1, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
23
+ Real(0.2, 0.5, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
24
24
  Integer(8, 32, name='supervisor_hidden_dim'),
25
25
  Real(0.0001, 0.01, prior='log-uniform', name='lr'),
26
26
  Categorical(epochs, name='epochs')
27
27
  ],
28
28
  'MultiTripletNetwork': [
29
29
  Integer(16, 128, name='latent_dim'),
30
- Real(0.2, 1, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
30
+ Real(0.2, 0.5, name='hidden_dim_factor'), # relative size of the hidden_dim w.r.t input_dim
31
31
  Integer(8, 32, name='supervisor_hidden_dim'),
32
32
  Real(0.0001, 0.01, prior='log-uniform', name='lr'),
33
33
  Categorical(epochs, name='epochs')
@@ -627,13 +627,14 @@ class TripletMultiOmicDataset(Dataset):
627
627
  for label in labels_set}
628
628
  return labels_set, label_to_indices
629
629
 
630
+
630
631
  class MultiOmicDatasetNW(Dataset):
631
632
  def __init__(self, multiomic_dataset, interaction_df):
632
633
  self.multiomic_dataset = multiomic_dataset
633
634
  self.interaction_df = interaction_df
634
635
 
635
- # Precompute common features and edge index
636
- self.common_features = self.find_common_features()
636
+ # Compute union of features in the data matrices that also appear in the network
637
+ self.common_features = self.find_union_features()
637
638
  self.gene_to_index = {gene: idx for idx, gene in enumerate(self.common_features)}
638
639
  self.edge_index = self.create_edge_index()
639
640
  self.samples = self.multiomic_dataset.samples
@@ -647,38 +648,48 @@ class MultiOmicDatasetNW(Dataset):
647
648
  # Store labels for all samples
648
649
  self.labels = {target_name: labels for target_name, labels in self.multiomic_dataset.ann.items()}
649
650
 
650
- def find_common_features(self):
651
- common_features = set.intersection(*(set(features) for features in self.multiomic_dataset.features.values()))
651
+ def find_union_features(self):
652
+ # Find the union of all features in the multiomic dataset
653
+ all_omic_features = set().union(*(set(features) for features in self.multiomic_dataset.features.values()))
654
+ # Find the union of proteins involved in interactions
652
655
  interaction_genes = set(self.interaction_df['protein1']).union(set(self.interaction_df['protein2']))
653
- return list(common_features.intersection(interaction_genes))
656
+ # Return the intersection of omic features and interaction genes
657
+ return list(all_omic_features.intersection(interaction_genes))
654
658
 
655
659
  def create_edge_index(self):
660
+ # Create edges only if both proteins are within the available features
656
661
  filtered_df = self.interaction_df[
657
662
  (self.interaction_df['protein1'].isin(self.common_features)) &
658
663
  (self.interaction_df['protein2'].isin(self.common_features))
659
664
  ]
660
665
  edge_list = [(self.gene_to_index[row['protein1']], self.gene_to_index[row['protein2']]) for index, row in filtered_df.iterrows()]
661
666
  return torch.tensor(edge_list, dtype=torch.long).t()
662
-
667
+
663
668
  def precompute_node_features(self):
664
- # Find indices of common features in each data matrix
665
- feature_indices = {data_type: [self.multiomic_dataset.features[data_type].get_loc(gene)
666
- for gene in self.common_features]
667
- for data_type in self.multiomic_dataset.dat}
668
- # Create a tensor to store all features [num_samples, num_nodes, num_data_types]
669
669
  num_samples = len(self.samples)
670
670
  num_nodes = len(self.common_features)
671
671
  num_data_types = len(self.multiomic_dataset.dat)
672
- all_features = torch.empty((num_samples, num_nodes, num_data_types), dtype=torch.float)
672
+ all_features = torch.full((num_samples, num_nodes, num_data_types), float('nan'), dtype=torch.float)
673
673
 
674
- # Extract features for each data type and place them in the tensor
675
674
  for i, data_type in enumerate(self.multiomic_dataset.dat):
676
- # Get the data matrix
677
675
  data_matrix = self.multiomic_dataset.dat[data_type]
678
- # Use advanced indexing to extract features for all samples at once
679
- indices = feature_indices[data_type]
680
- if indices: # Ensure there are common features in this data type
681
- all_features[:, :, i] = data_matrix[:, indices]
676
+ feature_indices = {
677
+ gene: self.multiomic_dataset.features[data_type].get_loc(gene)
678
+ for gene in self.common_features if gene in self.multiomic_dataset.features[data_type]
679
+ }
680
+ valid_indices = torch.tensor(list(feature_indices.values()))
681
+ feature_positions = torch.tensor([self.gene_to_index[gene] for gene in feature_indices.keys()])
682
+
683
+ # Fill in the available data
684
+ all_features[:, feature_positions, i] = data_matrix[:, valid_indices]
685
+
686
+ # Precompute medians for all data types, ignoring NaN values
687
+ medians = torch.nanmedian(all_features, dim=1, keepdim=True).values # Use .values to get the actual median tensor
688
+
689
+ # Replace all NaN values in all_features with their corresponding median values
690
+ isnan = torch.isnan(all_features)
691
+ all_features[isnan] = medians.expand_as(all_features)[isnan]
692
+
682
693
  return all_features
683
694
 
684
695
  def subset(self, indices):
@@ -56,7 +56,7 @@ class HyperparameterTuning:
56
56
  cv_splits=5, use_loss_weighting=True, early_stop_patience=-1, device_type=None, gnn_conv_type=None,
57
57
  input_layers=None, output_layers=None): Initializes the hyperparameter tuner with specific settings.
58
58
 
59
- get_batch_space(min_size=16, max_size=256): Determines the batch size search space based on the dataset size.
59
+ get_batch_space(min_size=16, max_size=128): Determines the batch size search space based on the dataset size.
60
60
 
61
61
  setup_trainer(params, current_step, total_steps, full_train=False): Sets up the trainer with appropriate callbacks
62
62
  and configurations for either full training or validation based training.
@@ -80,7 +80,7 @@ class HyperparameterTuning:
80
80
  val_size = 0.2, use_cv = False, cv_splits = 5,
81
81
  use_loss_weighting = True, early_stop_patience = -1,
82
82
  device_type = None, gnn_conv_type = None,
83
- input_layers = None, output_layers = None):
83
+ input_layers = None, output_layers = None, num_workers = 2):
84
84
  self.dataset = dataset # dataset for model initiation
85
85
  self.loader_dataset = dataset # dataset for defining data loaders (this can be model specific)
86
86
  self.model_class = model_class
@@ -107,6 +107,7 @@ class HyperparameterTuning:
107
107
  self.gnn_conv_type = gnn_conv_type
108
108
  self.input_layers = input_layers
109
109
  self.output_layers = output_layers
110
+ self.num_workers = num_workers
110
111
 
111
112
  self.DataLoader = torch.utils.data.DataLoader # use torch data loader by default
112
113
 
@@ -128,7 +129,7 @@ class HyperparameterTuning:
128
129
  else:
129
130
  raise ValueError(f"'{self.config_name}' not found in the default config.")
130
131
 
131
- def get_batch_space(self, min_size = 32, max_size = 256):
132
+ def get_batch_space(self, min_size = 32, max_size = 128):
132
133
  m = int(np.log2(len(self.dataset) * 0.8))
133
134
  st = int(np.log2(min_size))
134
135
  end = int(np.log2(max_size))
@@ -214,9 +215,11 @@ class HyperparameterTuning:
214
215
  train_subset = torch.utils.data.Subset(self.loader_dataset, train_index)
215
216
  val_subset = torch.utils.data.Subset(self.loader_dataset, val_index)
216
217
  train_loader = self.DataLoader(train_subset, batch_size=int(params['batch_size']),
217
- pin_memory=True, shuffle=True, drop_last=True, num_workers = 4, prefetch_factor = None, persistent_workers = True)
218
+ pin_memory=True, shuffle=True, drop_last=True, num_workers = self.num_workers, prefetch_factor = None,
219
+ persistent_workers = self.num_workers > 0)
218
220
  val_loader = self.DataLoader(val_subset, batch_size=int(params['batch_size']),
219
- pin_memory=True, shuffle=False, num_workers = 4, prefetch_factor = None, persistent_workers = True)
221
+ pin_memory=True, shuffle=False, num_workers = self.num_workers, prefetch_factor = None,
222
+ persistent_workers = self.num_workers > 0)
220
223
 
221
224
  model = self.model_class(**model_args)
222
225
  trainer, early_stop_callback = self.setup_trainer(params, current_step, total_steps)
@@ -218,7 +218,7 @@ def evaluate_regressor(y_true, y_pred):
218
218
  r2 = r_value**2
219
219
  return {"mse": mse, "r2": r2, "pearson_corr": r_value}
220
220
 
221
- def evaluate_wrapper(y_pred_dict, dataset, surv_event_var = None, surv_time_var = None):
221
+ def evaluate_wrapper(method, y_pred_dict, dataset, surv_event_var = None, surv_time_var = None):
222
222
  metrics_list = []
223
223
  for var in y_pred_dict.keys():
224
224
  if dataset.variable_types[var] == 'numerical':
@@ -235,6 +235,7 @@ def evaluate_wrapper(y_pred_dict, dataset, surv_event_var = None, surv_time_var
235
235
 
236
236
  for metric, value in metrics.items():
237
237
  metrics_list.append({
238
+ 'method': method,
238
239
  'var': var,
239
240
  'variable_type': dataset.variable_types[var],
240
241
  'metric': metric,
@@ -263,8 +264,7 @@ def get_predicted_labels(y_pred_dict, dataset, split):
263
264
  dfs.append(df)
264
265
  return pd.concat(dfs, ignore_index=True)
265
266
 
266
- # evaluate performance of off-the-shelf methods such as Random Forests and SVMs on regression/classification tasks
267
- def evaluate_baseline_performance(train_dataset, test_dataset, variable_name, n_folds=5, n_jobs = 4):
267
+ def evaluate_baseline_performance(train_dataset, test_dataset, variable_name, methods, n_folds=5, n_jobs=4):
268
268
  def prepare_data(data_object):
269
269
  # Concatenate Data Matrices
270
270
  X = np.concatenate([tensor for tensor in data_object.dat.values()], axis=1)
@@ -281,52 +281,32 @@ def evaluate_baseline_performance(train_dataset, test_dataset, variable_name, n_
281
281
  # Determine variable type
282
282
  variable_type = train_dataset.variable_types[variable_name]
283
283
 
284
- # Initialize models and parameter grids
285
- if variable_type == 'categorical':
286
- model_params = {
287
- 'RandomForestClassifier': {
288
- 'model': RandomForestClassifier(random_state=42),
289
- 'params': {
290
- 'n_estimators': [100, 200, 300],
291
- 'max_depth': [10, 20, None]
292
- }
293
- },
294
- 'SVC': {
295
- 'model': SVC(),
296
- 'params': {
297
- 'C': [0.1, 1, 10],
298
- 'kernel': ['rbf', 'poly']
299
- }
300
- }
301
- }
302
- elif variable_type == 'numerical':
303
- model_params = {
304
- 'RandomForestRegressor': {
305
- 'model': RandomForestRegressor(random_state=42),
306
- 'params': {
307
- 'n_estimators': [100, 200, 300],
308
- 'max_depth': [10, 20, None]
309
- }
310
- },
311
- 'SVR': {
312
- 'model': SVR(),
313
- 'params': {
314
- 'C': [0.1, 1, 10],
315
- 'kernel': ['rbf', 'poly']
316
- }
317
- }
318
- }
319
-
320
284
  # Cross-Validation and Training
321
285
  kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
322
286
  X_train, y_train = prepare_data(train_dataset)
323
- print("Train:",X_train.shape)
287
+ print("Train:", X_train.shape)
324
288
  X_test, y_test = prepare_data(test_dataset)
325
- print("Test:",X_test.shape)
289
+ print("Test:", X_test.shape)
326
290
 
327
291
  metrics_list = []
328
- for model_name, mp in model_params.items():
329
- grid_search = GridSearchCV(mp['model'], mp['params'], cv=kf, n_jobs=n_jobs)
292
+
293
+ for method in methods:
294
+ if variable_type == 'categorical':
295
+ if method == 'RandomForest':
296
+ model = RandomForestClassifier(random_state=42)
297
+ params = {'n_estimators': [100, 200, 300], 'max_depth': [10, 20, None]}
298
+ elif method == 'SVM':
299
+ model = SVC(random_state=42)
300
+ params = {'C': [0.1, 1, 10], 'kernel': ['rbf', 'poly']}
301
+ elif variable_type == 'numerical':
302
+ if method == 'RandomForest':
303
+ model = RandomForestRegressor(random_state=42)
304
+ params = {'n_estimators': [100, 200, 300], 'max_depth': [10, 20, None]}
305
+ elif method == 'SVM':
306
+ model = SVR()
307
+ params = {'C': [0.1, 1, 10], 'kernel': ['rbf', 'poly']}
308
+
309
+ grid_search = GridSearchCV(model, params, cv=kf, n_jobs=n_jobs)
330
310
  grid_search.fit(X_train, y_train)
331
311
  best_model = grid_search.best_estimator_
332
312
 
@@ -341,7 +321,7 @@ def evaluate_baseline_performance(train_dataset, test_dataset, variable_name, n_
341
321
 
342
322
  for metric, value in metrics.items():
343
323
  metrics_list.append({
344
- 'method': model_name,
324
+ 'method': method + ('Classifier' if variable_type == 'categorical' else 'Regressor'),
345
325
  'var': variable_name,
346
326
  'variable_type': variable_type,
347
327
  'metric': metric,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: flexynesis
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: A deep-learning based multi-omics bulk sequencing data integration suite with a focus on (pre-)clinical endpoint prediction.
5
5
  Author-email: Bora Uyar <bora.uyar@mdc-berlin.de>, Taras Savchyn <Taras.Savchyn@mdc-berlin.de>, Ricardo Wurmus <Ricardo.Wurmus@mdc-berlin.de>, Ahmet Sarigun <Ahmet.Sariguen@mdc-berlin.de>
6
6
  Project-URL: homepage, https://github.com/BIMSBbioinfo/flexynesis
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "flexynesis"
7
- version = "0.2.2"
7
+ version = "0.2.4"
8
8
  authors = [
9
9
  {name = "Bora Uyar", email = "bora.uyar@mdc-berlin.de"},
10
10
  {name = "Taras Savchyn", email = "Taras.Savchyn@mdc-berlin.de"},
File without changes
File without changes
File without changes
File without changes
File without changes