flexynesis 0.2.3__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {flexynesis-0.2.3 → flexynesis-0.2.4}/PKG-INFO +1 -1
  2. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/__main__.py +81 -43
  3. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/data.py +29 -18
  4. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/utils.py +24 -44
  5. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis.egg-info/PKG-INFO +1 -1
  6. {flexynesis-0.2.3 → flexynesis-0.2.4}/pyproject.toml +1 -1
  7. {flexynesis-0.2.3 → flexynesis-0.2.4}/LICENCE.md +0 -0
  8. {flexynesis-0.2.3 → flexynesis-0.2.4}/README.md +0 -0
  9. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/__init__.py +0 -0
  10. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/cli.py +0 -0
  11. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/config.py +0 -0
  12. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/feature_selection.py +0 -0
  13. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/main.py +0 -0
  14. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/models/__init__.py +0 -0
  15. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/models/crossmodal_pred.py +0 -0
  16. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/models/direct_pred.py +0 -0
  17. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/models/gnn_early.py +0 -0
  18. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/models/on_ice/direct_pred_cnn.py +0 -0
  19. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/models/on_ice/direct_pred_gcnn.py +0 -0
  20. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/models/on_ice/modules_on_ice.py +0 -0
  21. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/models/supervised_vae.py +0 -0
  22. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/models/triplet_encoder.py +0 -0
  23. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis/modules.py +0 -0
  24. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis.egg-info/SOURCES.txt +0 -0
  25. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis.egg-info/dependency_links.txt +0 -0
  26. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis.egg-info/entry_points.txt +0 -0
  27. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis.egg-info/requires.txt +0 -0
  28. {flexynesis-0.2.3 → flexynesis-0.2.4}/flexynesis.egg-info/top_level.txt +0 -0
  29. {flexynesis-0.2.3 → flexynesis-0.2.4}/setup.cfg +0 -0
  30. {flexynesis-0.2.3 → flexynesis-0.2.4}/tests/__init__.py +0 -0
  31. {flexynesis-0.2.3 → flexynesis-0.2.4}/tests/unit/__init__.py +0 -0
  32. {flexynesis-0.2.3 → flexynesis-0.2.4}/tests/unit/test_smoke.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: flexynesis
3
- Version: 0.2.3
3
+ Version: 0.2.4
4
4
  Summary: A deep-learning based multi-omics bulk sequencing data integration suite with a focus on (pre-)clinical endpoint prediction.
5
5
  Author-email: Bora Uyar <bora.uyar@mdc-berlin.de>, Taras Savchyn <Taras.Savchyn@mdc-berlin.de>, Ricardo Wurmus <Ricardo.Wurmus@mdc-berlin.de>, Ahmet Sarigun <Ahmet.Sariguen@mdc-berlin.de>
6
6
  Project-URL: homepage, https://github.com/BIMSBbioinfo/flexynesis
@@ -1,7 +1,7 @@
1
1
  from lightning import seed_everything
2
2
  import lightning as pl
3
3
  from typing import NamedTuple
4
- import os, yaml, torch, time, random, warnings, argparse
4
+ import os, yaml, torch, time, random, warnings, argparse, sys
5
5
  os.environ["OMP_NUM_THREADS"] = "1"
6
6
  import pandas as pd
7
7
  import flexynesis
@@ -18,7 +18,8 @@ def main():
18
18
 
19
19
  Args:
20
20
  --data_path (str): Path to the folder with train/test data files. (Required)
21
- --model_class (str): The kind of model class to instantiate. Choices are ["DirectPred", "GNN", "supervised_vae", "MultiTripletNetwork", "CrossModalPred"]. (Required)
21
+ --model_class (str): The kind of model class to instantiate. Choices are ["DirectPred", "GNN", "supervised_vae",
22
+ "MultiTripletNetwork", "CrossModalPred", "RandomForest", "SVM", "RandomSurvivalForest"]. (Required)
22
23
  --gnn_conv_type (str): If model_class is set to GNN, choose which graph convolution type to use. Choices are ["GC", "GCN", "SAGE"].
23
24
  --target_variables (str): Which variables in 'clin.csv' to use for predictions, comma-separated if multiple. Optional if survival variables are not set to None.
24
25
  --batch_variables (str): Which variables in 'clin.csv' to use for data integration/batch correction, comma-separated if multiple. Optional.
@@ -44,7 +45,7 @@ def main():
44
45
  --hpo_patience (int): How many hyperparameter optimisation iterations to wait for when no improvements are observed. Default is 10; set to 0 to disable early stopping.
45
46
  --use_cv (bool): If set, a 5-fold cross-validation training will be done. Otherwise, a single training on 80 percent of the dataset is done.
46
47
  --use_loss_weighting (str): Whether to apply loss-balancing using uncertainty weights method. Choices are ['True', 'False']. Default is 'True'.
47
- --evaluate_baseline_performance (str): Whether to run Random Forest + SVMs to see the performance of off-the-shelf tools on the same dataset. Choices are ['True', 'False']. Default is 'True'.
48
+ --evaluate_baseline_performance (bool): Enables modeling also with Random Forest + SVMs to see the performance of off-the-shelf tools on the same dataset.
48
49
  --threads (int): How many threads to use when using CPU. Default is 4.
49
50
  --num_workers (int): How many workers to use for model training. Default is 2
50
51
  --use_gpu (bool): If set, the system will attempt to use CUDA/GPU if available.
@@ -57,7 +58,7 @@ def main():
57
58
 
58
59
  parser.add_argument("--data_path", help="(Required) Path to the folder with train/test data files", type=str, required = True)
59
60
  parser.add_argument("--model_class", help="(Required) The kind of model class to instantiate", type=str,
60
- choices=["DirectPred", "supervised_vae", "MultiTripletNetwork", "CrossModalPred", "GNN"], required = True)
61
+ choices=["DirectPred", "supervised_vae", "MultiTripletNetwork", "CrossModalPred", "GNN", "RandomForest", "SVM", "RandomSurvivalForest"], required = True)
61
62
  parser.add_argument("--gnn_conv_type", help="If model_class is set to GNN, choose which graph convolution type to use", type=str,
62
63
  choices=["GC", "GCN", "SAGE"])
63
64
  parser.add_argument("--target_variables",
@@ -98,7 +99,8 @@ def main():
98
99
  parser.add_argument("--use_cv", action="store_true",
99
100
  help="(Optional) If set, the a 5-fold cross-validation training will be done. Otherwise, a single trainig on 80 percent of the dataset is done.")
100
101
  parser.add_argument("--use_loss_weighting", help="whether to apply loss-balancing using uncertainty weights method", type=str, choices=['True', 'False'], default = 'True')
101
- parser.add_argument("--evaluate_baseline_performance", help="whether to run Random Forest + SVMs to see the performance of off-the-shelf tools on the same dataset", type=str, choices=['True', 'False'], default = 'True')
102
+ parser.add_argument("--evaluate_baseline_performance", action="store_true",
103
+ help="whether to run Random Forest + SVMs to see the performance of off-the-shelf tools on the same dataset")
102
104
  parser.add_argument("--threads", help="(Optional) How many threads to use when using CPU (default is 4)", type=int, default = 4)
103
105
  parser.add_argument("--num_workers", help="(Optional) How many workers to use for model training (default is 2)", type=int, default = 2)
104
106
  parser.add_argument("--use_gpu", action="store_true",
@@ -186,20 +188,24 @@ def main():
186
188
  if not os.path.exists(args.outdir):
187
189
  raise FileNotFoundError(f"Path to --outdir doesn't exist at:", {args.outdir})
188
190
 
189
- class AvailableModels(NamedTuple):
190
- # type AvailableModel = ModelClass: Type, ModelConfig: str
191
- DirectPred: tuple[DirectPred, str] = DirectPred, "DirectPred"
192
- supervised_vae: tuple[supervised_vae, str] = supervised_vae, "supervised_vae"
193
- MultiTripletNetwork: tuple[MultiTripletNetwork, str] = MultiTripletNetwork, "MultiTripletNetwork"
194
- CrossModalPred: tuple[CrossModalPred, str] = CrossModalPred, "CrossModalPred"
195
- GNN: tuple[GNN, str] = GNN, "GNN"
196
-
197
- available_models = AvailableModels()
198
- model_class = getattr(available_models, args.model_class, None)
199
- if model_class is None:
200
- raise ValueError(f"Invalid model_class: {args.model_class}")
201
- else:
202
- model_class, config_name = model_class
191
+ available_models = {
192
+ "DirectPred": (DirectPred, "DirectPred"),
193
+ "supervised_vae": (supervised_vae, "supervised_vae"),
194
+ "MultiTripletNetwork": (MultiTripletNetwork, "MultiTripletNetwork"),
195
+ "CrossModalPred": (CrossModalPred, "CrossModalPred"),
196
+ "GNN": (GNN, "GNN"),
197
+ "RandomForest": ("RandomForest", None),
198
+ "SVM": ("SVM", None),
199
+ "RandomSurvivalForest": ("RandomSurvivalForest", None)
200
+ }
201
+
202
+ model_info = available_models.get(args.model_class)
203
+
204
+ if model_info is None:
205
+ raise ValueError(f"Unsupported model class {args.model_class}")
206
+
207
+ # Unpack the tuple into model class and config name
208
+ model_class, config_name = model_info
203
209
 
204
210
  # import assays and labels
205
211
  inputDir = args.data_path
@@ -221,6 +227,34 @@ def main():
221
227
  downsample = args.subsample)
222
228
  train_dataset, test_dataset = data_importer.import_data()
223
229
 
230
+ if args.model_class in ["RandomForest", "SVM"]:
231
+ if args.target_variables:
232
+ var = args.target_variables.strip().split(',')[0]
233
+ print(f"Training {args.model_class} on variable: {var}")
234
+ metrics = flexynesis.evaluate_baseline_performance(train_dataset, test_dataset, variable_name=var,
235
+ methods=[args.model_class], n_folds=5, n_jobs=args.threads)
236
+ metrics.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'stats.csv'])), header=True, index=False)
237
+ print(f"{args.model_class} evaluation complete. Results saved.")
238
+ # we skip everything related to deep learning models here
239
+ sys.exit(0)
240
+ else:
241
+ raise ValueError(f"At least one target variable is required to run RandomForest/SVM models. Set --target_variables argument")
242
+
243
+ if args.model_class == "RandomSurvivalForest":
244
+ if args.surv_event_var and args.surv_time_var:
245
+ print(f"Training {args.model_class} on survival variables: {args.surv_event_var} and {args.surv_time_var}")
246
+ metrics = flexynesis.evaluate_baseline_survival_performance(train_dataset, test_dataset,
247
+ args.surv_time_var,
248
+ args.surv_event_var,
249
+ n_folds = 5,
250
+ n_jobs = int(args.threads))
251
+ metrics.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'stats.csv'])), header=True, index=False)
252
+ print(f"{args.model_class} evaluation complete. Results saved.")
253
+ # we skip everything related to deep learning models here
254
+ sys.exit(0)
255
+ else:
256
+ raise ValueError(f"Missing survival variables. Set --surv_event_var --surv_time_var arguments")
257
+
224
258
  if args.model_class == 'GNN':
225
259
  # overlay datasets with network info
226
260
  # this is a temporary solution
@@ -282,20 +316,16 @@ def main():
282
316
  # update the test dataset to exclude finetuning samples
283
317
  test_dataset = holdout_dataset
284
318
 
319
+ # get sample embeddings and save
320
+ print("[INFO] Extracting sample embeddings")
321
+ embeddings_train = model.transform(train_dataset)
322
+ embeddings_test = model.transform(test_dataset)
323
+
324
+ embeddings_train.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'embeddings_train.csv'])), header=True)
325
+ embeddings_test.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'embeddings_test.csv'])), header=True)
326
+
285
327
  # evaluate predictions; (if any supervised learning happened)
286
328
  if any([args.target_variables, args.surv_event_var, args.batch_variables]):
287
- print("[INFO] Computing model evaluation metrics")
288
- metrics_df = flexynesis.evaluate_wrapper(model.predict(test_dataset), test_dataset,
289
- surv_event_var=model.surv_event_var,
290
- surv_time_var=model.surv_time_var)
291
- metrics_df.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'stats.csv'])), header=True, index=False)
292
-
293
- # print known/predicted labels
294
- predicted_labels = pd.concat([flexynesis.get_predicted_labels(model.predict(train_dataset), train_dataset, 'train'),
295
- flexynesis.get_predicted_labels(model.predict(test_dataset), test_dataset, 'test')],
296
- ignore_index=True)
297
- predicted_labels.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'predicted_labels.csv'])), header=True, index=False)
298
-
299
329
  if not args.disable_marker_finding: # unless marker discovery is disabled
300
330
  # compute feature importance values
301
331
  print("[INFO] Computing variable importance scores")
@@ -305,14 +335,19 @@ def main():
305
335
  ignore_index = True)
306
336
  df_imp.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'feature_importance.csv'])), header=True, index=False)
307
337
 
308
- # get sample embeddings and save
309
- print("[INFO] Extracting sample embeddings")
310
- embeddings_train = model.transform(train_dataset)
311
- embeddings_test = model.transform(test_dataset)
312
-
313
- embeddings_train.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'embeddings_train.csv'])), header=True)
314
- embeddings_test.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'embeddings_test.csv'])), header=True)
315
-
338
+ # print known/predicted labels
339
+ predicted_labels = pd.concat([flexynesis.get_predicted_labels(model.predict(train_dataset), train_dataset, 'train'),
340
+ flexynesis.get_predicted_labels(model.predict(test_dataset), test_dataset, 'test')],
341
+ ignore_index=True)
342
+ predicted_labels.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'predicted_labels.csv'])), header=True, index=False)
343
+
344
+ print("[INFO] Computing model evaluation metrics")
345
+ metrics_df = flexynesis.evaluate_wrapper(args.model_class, model.predict(test_dataset), test_dataset,
346
+ surv_event_var=model.surv_event_var,
347
+ surv_time_var=model.surv_time_var)
348
+ metrics_df.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'stats.csv'])), header=True, index=False)
349
+
350
+
316
351
  # also filter embeddings to remove batch-associated dims and only keep target-variable associated dims
317
352
  if args.batch_variables is not None:
318
353
  print("[INFO] Printing filtered embeddings")
@@ -339,7 +374,7 @@ def main():
339
374
 
340
375
 
341
376
  # evaluate off-the-shelf methods on the main target variable
342
- if args.evaluate_baseline_performance == 'True':
377
+ if args.evaluate_baseline_performance:
343
378
  print("[INFO] Computing off-the-shelf method performance on first target variable:",model.target_variables[0])
344
379
  var = model.target_variables[0]
345
380
  metrics = pd.DataFrame()
@@ -351,9 +386,10 @@ def main():
351
386
 
352
387
  if var != model.surv_event_var:
353
388
  metrics = flexynesis.evaluate_baseline_performance(train, test,
354
- variable_name = var,
355
- n_folds=5,
356
- n_jobs = int(args.threads))
389
+ variable_name = var,
390
+ methods = ['RandomForest', 'SVM'],
391
+ n_folds = 5,
392
+ n_jobs = int(args.threads))
357
393
  if model.surv_event_var and model.surv_time_var:
358
394
  print("[INFO] Computing off-the-shelf method performance on survival variable:",model.surv_time_var)
359
395
  metrics_baseline_survival = flexynesis.evaluate_baseline_survival_performance(train, test,
@@ -369,5 +405,7 @@ def main():
369
405
  # save the trained model in file
370
406
  torch.save(model, os.path.join(args.outdir, '.'.join([args.prefix, 'final_model.pth'])))
371
407
 
408
+
372
409
  if __name__ == "__main__":
373
410
  main()
411
+ print("[INFO] Finished the analysis!")
@@ -627,13 +627,14 @@ class TripletMultiOmicDataset(Dataset):
627
627
  for label in labels_set}
628
628
  return labels_set, label_to_indices
629
629
 
630
+
630
631
  class MultiOmicDatasetNW(Dataset):
631
632
  def __init__(self, multiomic_dataset, interaction_df):
632
633
  self.multiomic_dataset = multiomic_dataset
633
634
  self.interaction_df = interaction_df
634
635
 
635
- # Precompute common features and edge index
636
- self.common_features = self.find_common_features()
636
+ # Compute union of features in the data matrices that also appear in the network
637
+ self.common_features = self.find_union_features()
637
638
  self.gene_to_index = {gene: idx for idx, gene in enumerate(self.common_features)}
638
639
  self.edge_index = self.create_edge_index()
639
640
  self.samples = self.multiomic_dataset.samples
@@ -647,38 +648,48 @@ class MultiOmicDatasetNW(Dataset):
647
648
  # Store labels for all samples
648
649
  self.labels = {target_name: labels for target_name, labels in self.multiomic_dataset.ann.items()}
649
650
 
650
- def find_common_features(self):
651
- common_features = set.intersection(*(set(features) for features in self.multiomic_dataset.features.values()))
651
+ def find_union_features(self):
652
+ # Find the union of all features in the multiomic dataset
653
+ all_omic_features = set().union(*(set(features) for features in self.multiomic_dataset.features.values()))
654
+ # Find the union of proteins involved in interactions
652
655
  interaction_genes = set(self.interaction_df['protein1']).union(set(self.interaction_df['protein2']))
653
- return list(common_features.intersection(interaction_genes))
656
+ # Return the intersection of omic features and interaction genes
657
+ return list(all_omic_features.intersection(interaction_genes))
654
658
 
655
659
  def create_edge_index(self):
660
+ # Create edges only if both proteins are within the available features
656
661
  filtered_df = self.interaction_df[
657
662
  (self.interaction_df['protein1'].isin(self.common_features)) &
658
663
  (self.interaction_df['protein2'].isin(self.common_features))
659
664
  ]
660
665
  edge_list = [(self.gene_to_index[row['protein1']], self.gene_to_index[row['protein2']]) for index, row in filtered_df.iterrows()]
661
666
  return torch.tensor(edge_list, dtype=torch.long).t()
662
-
667
+
663
668
  def precompute_node_features(self):
664
- # Find indices of common features in each data matrix
665
- feature_indices = {data_type: [self.multiomic_dataset.features[data_type].get_loc(gene)
666
- for gene in self.common_features]
667
- for data_type in self.multiomic_dataset.dat}
668
- # Create a tensor to store all features [num_samples, num_nodes, num_data_types]
669
669
  num_samples = len(self.samples)
670
670
  num_nodes = len(self.common_features)
671
671
  num_data_types = len(self.multiomic_dataset.dat)
672
- all_features = torch.empty((num_samples, num_nodes, num_data_types), dtype=torch.float)
672
+ all_features = torch.full((num_samples, num_nodes, num_data_types), float('nan'), dtype=torch.float)
673
673
 
674
- # Extract features for each data type and place them in the tensor
675
674
  for i, data_type in enumerate(self.multiomic_dataset.dat):
676
- # Get the data matrix
677
675
  data_matrix = self.multiomic_dataset.dat[data_type]
678
- # Use advanced indexing to extract features for all samples at once
679
- indices = feature_indices[data_type]
680
- if indices: # Ensure there are common features in this data type
681
- all_features[:, :, i] = data_matrix[:, indices]
676
+ feature_indices = {
677
+ gene: self.multiomic_dataset.features[data_type].get_loc(gene)
678
+ for gene in self.common_features if gene in self.multiomic_dataset.features[data_type]
679
+ }
680
+ valid_indices = torch.tensor(list(feature_indices.values()))
681
+ feature_positions = torch.tensor([self.gene_to_index[gene] for gene in feature_indices.keys()])
682
+
683
+ # Fill in the available data
684
+ all_features[:, feature_positions, i] = data_matrix[:, valid_indices]
685
+
686
+ # Precompute medians for all data types, ignoring NaN values
687
+ medians = torch.nanmedian(all_features, dim=1, keepdim=True).values # Use .values to get the actual median tensor
688
+
689
+ # Replace all NaN values in all_features with their corresponding median values
690
+ isnan = torch.isnan(all_features)
691
+ all_features[isnan] = medians.expand_as(all_features)[isnan]
692
+
682
693
  return all_features
683
694
 
684
695
  def subset(self, indices):
@@ -218,7 +218,7 @@ def evaluate_regressor(y_true, y_pred):
218
218
  r2 = r_value**2
219
219
  return {"mse": mse, "r2": r2, "pearson_corr": r_value}
220
220
 
221
- def evaluate_wrapper(y_pred_dict, dataset, surv_event_var = None, surv_time_var = None):
221
+ def evaluate_wrapper(method, y_pred_dict, dataset, surv_event_var = None, surv_time_var = None):
222
222
  metrics_list = []
223
223
  for var in y_pred_dict.keys():
224
224
  if dataset.variable_types[var] == 'numerical':
@@ -235,6 +235,7 @@ def evaluate_wrapper(y_pred_dict, dataset, surv_event_var = None, surv_time_var
235
235
 
236
236
  for metric, value in metrics.items():
237
237
  metrics_list.append({
238
+ 'method': method,
238
239
  'var': var,
239
240
  'variable_type': dataset.variable_types[var],
240
241
  'metric': metric,
@@ -263,8 +264,7 @@ def get_predicted_labels(y_pred_dict, dataset, split):
263
264
  dfs.append(df)
264
265
  return pd.concat(dfs, ignore_index=True)
265
266
 
266
- # evaluate performance of off-the-shelf methods such as Random Forests and SVMs on regression/classification tasks
267
- def evaluate_baseline_performance(train_dataset, test_dataset, variable_name, n_folds=5, n_jobs = 4):
267
+ def evaluate_baseline_performance(train_dataset, test_dataset, variable_name, methods, n_folds=5, n_jobs=4):
268
268
  def prepare_data(data_object):
269
269
  # Concatenate Data Matrices
270
270
  X = np.concatenate([tensor for tensor in data_object.dat.values()], axis=1)
@@ -281,52 +281,32 @@ def evaluate_baseline_performance(train_dataset, test_dataset, variable_name, n_
281
281
  # Determine variable type
282
282
  variable_type = train_dataset.variable_types[variable_name]
283
283
 
284
- # Initialize models and parameter grids
285
- if variable_type == 'categorical':
286
- model_params = {
287
- 'RandomForestClassifier': {
288
- 'model': RandomForestClassifier(random_state=42),
289
- 'params': {
290
- 'n_estimators': [100, 200, 300],
291
- 'max_depth': [10, 20, None]
292
- }
293
- },
294
- 'SVC': {
295
- 'model': SVC(),
296
- 'params': {
297
- 'C': [0.1, 1, 10],
298
- 'kernel': ['rbf', 'poly']
299
- }
300
- }
301
- }
302
- elif variable_type == 'numerical':
303
- model_params = {
304
- 'RandomForestRegressor': {
305
- 'model': RandomForestRegressor(random_state=42),
306
- 'params': {
307
- 'n_estimators': [100, 200, 300],
308
- 'max_depth': [10, 20, None]
309
- }
310
- },
311
- 'SVR': {
312
- 'model': SVR(),
313
- 'params': {
314
- 'C': [0.1, 1, 10],
315
- 'kernel': ['rbf', 'poly']
316
- }
317
- }
318
- }
319
-
320
284
  # Cross-Validation and Training
321
285
  kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
322
286
  X_train, y_train = prepare_data(train_dataset)
323
- print("Train:",X_train.shape)
287
+ print("Train:", X_train.shape)
324
288
  X_test, y_test = prepare_data(test_dataset)
325
- print("Test:",X_test.shape)
289
+ print("Test:", X_test.shape)
326
290
 
327
291
  metrics_list = []
328
- for model_name, mp in model_params.items():
329
- grid_search = GridSearchCV(mp['model'], mp['params'], cv=kf, n_jobs=n_jobs)
292
+
293
+ for method in methods:
294
+ if variable_type == 'categorical':
295
+ if method == 'RandomForest':
296
+ model = RandomForestClassifier(random_state=42)
297
+ params = {'n_estimators': [100, 200, 300], 'max_depth': [10, 20, None]}
298
+ elif method == 'SVM':
299
+ model = SVC(random_state=42)
300
+ params = {'C': [0.1, 1, 10], 'kernel': ['rbf', 'poly']}
301
+ elif variable_type == 'numerical':
302
+ if method == 'RandomForest':
303
+ model = RandomForestRegressor(random_state=42)
304
+ params = {'n_estimators': [100, 200, 300], 'max_depth': [10, 20, None]}
305
+ elif method == 'SVM':
306
+ model = SVR()
307
+ params = {'C': [0.1, 1, 10], 'kernel': ['rbf', 'poly']}
308
+
309
+ grid_search = GridSearchCV(model, params, cv=kf, n_jobs=n_jobs)
330
310
  grid_search.fit(X_train, y_train)
331
311
  best_model = grid_search.best_estimator_
332
312
 
@@ -341,7 +321,7 @@ def evaluate_baseline_performance(train_dataset, test_dataset, variable_name, n_
341
321
 
342
322
  for metric, value in metrics.items():
343
323
  metrics_list.append({
344
- 'method': model_name,
324
+ 'method': method + ('Classifier' if variable_type == 'categorical' else 'Regressor'),
345
325
  'var': variable_name,
346
326
  'variable_type': variable_type,
347
327
  'metric': metric,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: flexynesis
3
- Version: 0.2.3
3
+ Version: 0.2.4
4
4
  Summary: A deep-learning based multi-omics bulk sequencing data integration suite with a focus on (pre-)clinical endpoint prediction.
5
5
  Author-email: Bora Uyar <bora.uyar@mdc-berlin.de>, Taras Savchyn <Taras.Savchyn@mdc-berlin.de>, Ricardo Wurmus <Ricardo.Wurmus@mdc-berlin.de>, Ahmet Sarigun <Ahmet.Sariguen@mdc-berlin.de>
6
6
  Project-URL: homepage, https://github.com/BIMSBbioinfo/flexynesis
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "flexynesis"
7
- version = "0.2.3"
7
+ version = "0.2.4"
8
8
  authors = [
9
9
  {name = "Bora Uyar", email = "bora.uyar@mdc-berlin.de"},
10
10
  {name = "Taras Savchyn", email = "Taras.Savchyn@mdc-berlin.de"},
File without changes
File without changes
File without changes
File without changes
File without changes