mediml-0.9.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. MEDiml/MEDscan.py +1696 -0
  2. MEDiml/__init__.py +21 -0
  3. MEDiml/biomarkers/BatchExtractor.py +806 -0
  4. MEDiml/biomarkers/BatchExtractorTexturalFilters.py +840 -0
  5. MEDiml/biomarkers/__init__.py +16 -0
  6. MEDiml/biomarkers/diagnostics.py +125 -0
  7. MEDiml/biomarkers/get_oriented_bound_box.py +158 -0
  8. MEDiml/biomarkers/glcm.py +1602 -0
  9. MEDiml/biomarkers/gldzm.py +523 -0
  10. MEDiml/biomarkers/glrlm.py +1315 -0
  11. MEDiml/biomarkers/glszm.py +555 -0
  12. MEDiml/biomarkers/int_vol_hist.py +527 -0
  13. MEDiml/biomarkers/intensity_histogram.py +615 -0
  14. MEDiml/biomarkers/local_intensity.py +89 -0
  15. MEDiml/biomarkers/morph.py +1756 -0
  16. MEDiml/biomarkers/ngldm.py +780 -0
  17. MEDiml/biomarkers/ngtdm.py +414 -0
  18. MEDiml/biomarkers/stats.py +373 -0
  19. MEDiml/biomarkers/utils.py +389 -0
  20. MEDiml/filters/TexturalFilter.py +299 -0
  21. MEDiml/filters/__init__.py +9 -0
  22. MEDiml/filters/apply_filter.py +134 -0
  23. MEDiml/filters/gabor.py +215 -0
  24. MEDiml/filters/laws.py +283 -0
  25. MEDiml/filters/log.py +147 -0
  26. MEDiml/filters/mean.py +121 -0
  27. MEDiml/filters/textural_filters_kernels.py +1738 -0
  28. MEDiml/filters/utils.py +107 -0
  29. MEDiml/filters/wavelet.py +237 -0
  30. MEDiml/learning/DataCleaner.py +198 -0
  31. MEDiml/learning/DesignExperiment.py +480 -0
  32. MEDiml/learning/FSR.py +667 -0
  33. MEDiml/learning/Normalization.py +112 -0
  34. MEDiml/learning/RadiomicsLearner.py +714 -0
  35. MEDiml/learning/Results.py +2237 -0
  36. MEDiml/learning/Stats.py +694 -0
  37. MEDiml/learning/__init__.py +10 -0
  38. MEDiml/learning/cleaning_utils.py +107 -0
  39. MEDiml/learning/ml_utils.py +1015 -0
  40. MEDiml/processing/__init__.py +6 -0
  41. MEDiml/processing/compute_suv_map.py +121 -0
  42. MEDiml/processing/discretisation.py +149 -0
  43. MEDiml/processing/interpolation.py +275 -0
  44. MEDiml/processing/resegmentation.py +66 -0
  45. MEDiml/processing/segmentation.py +912 -0
  46. MEDiml/utils/__init__.py +25 -0
  47. MEDiml/utils/batch_patients.py +45 -0
  48. MEDiml/utils/create_radiomics_table.py +131 -0
  49. MEDiml/utils/data_frame_export.py +42 -0
  50. MEDiml/utils/find_process_names.py +16 -0
  51. MEDiml/utils/get_file_paths.py +34 -0
  52. MEDiml/utils/get_full_rad_names.py +21 -0
  53. MEDiml/utils/get_institutions_from_ids.py +16 -0
  54. MEDiml/utils/get_patient_id_from_scan_name.py +22 -0
  55. MEDiml/utils/get_patient_names.py +26 -0
  56. MEDiml/utils/get_radiomic_names.py +27 -0
  57. MEDiml/utils/get_scan_name_from_rad_name.py +22 -0
  58. MEDiml/utils/image_reader_SITK.py +37 -0
  59. MEDiml/utils/image_volume_obj.py +22 -0
  60. MEDiml/utils/imref.py +340 -0
  61. MEDiml/utils/initialize_features_names.py +62 -0
  62. MEDiml/utils/inpolygon.py +159 -0
  63. MEDiml/utils/interp3.py +43 -0
  64. MEDiml/utils/json_utils.py +78 -0
  65. MEDiml/utils/mode.py +31 -0
  66. MEDiml/utils/parse_contour_string.py +58 -0
  67. MEDiml/utils/save_MEDscan.py +30 -0
  68. MEDiml/utils/strfind.py +32 -0
  69. MEDiml/utils/textureTools.py +188 -0
  70. MEDiml/utils/texture_features_names.py +115 -0
  71. MEDiml/utils/write_radiomics_csv.py +47 -0
  72. MEDiml/wrangling/DataManager.py +1724 -0
  73. MEDiml/wrangling/ProcessDICOM.py +512 -0
  74. MEDiml/wrangling/__init__.py +3 -0
  75. mediml-0.9.9.dist-info/LICENSE.md +674 -0
  76. mediml-0.9.9.dist-info/METADATA +232 -0
  77. mediml-0.9.9.dist-info/RECORD +78 -0
  78. mediml-0.9.9.dist-info/WHEEL +4 -0
@@ -0,0 +1,714 @@
+ import logging
+ import os
+ import time
+ from copy import deepcopy
+ from pathlib import Path
+ from typing import Dict, List, Tuple
+
+ import numpy as np
+ import pandas as pd
+ from numpyencoder import NumpyEncoder
+ from pycaret.classification import *
+ from sklearn import metrics
+ from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
+ from xgboost import XGBClassifier
+
+ from MEDiml.learning.DataCleaner import DataCleaner
+ from MEDiml.learning.DesignExperiment import DesignExperiment
+ from MEDiml.learning.FSR import FSR
+ from MEDiml.learning.ml_utils import (average_results, combine_rad_tables,
+                                       feature_imporance_analysis,
+                                       finalize_rad_table, get_ml_test_table,
+                                       get_radiomics_table, intersect,
+                                       intersect_var_tables, save_model)
+ from MEDiml.learning.Normalization import Normalization
+ from MEDiml.learning.Results import Results
+
+ from ..utils.json_utils import load_json, save_json
+
+
+ class RadiomicsLearner:
+     def __init__(self, path_study: Path, path_settings: Path, experiment_label: str) -> None:
+         """
+         Constructor of the class RadiomicsLearner.
+
+         Args:
+             path_study (Path): Path to the main study folder where the outcomes,
+                 learning patients and holdout patients dictionaries are found.
+             path_settings (Path): Path to the settings folder.
+             experiment_label (str): String specifying the label to attach to a given learning experiment in
+                 "path_experiments". This label will be attached to the ml__$experiment_label$.json file as well
+                 as the learn__$experiment_label$ folder. It is used to keep track of different experiments
+                 with different settings (e.g. radiomics, scans, machine learning algorithms, etc.).
+
+         Returns:
+             None
+         """
+         self.path_study = Path(path_study)
+         self.path_settings = Path(path_settings)
+         self.experiment_label = experiment_label
+
+     def __load_ml_info(self, ml_dict_paths: Dict) -> Dict:
+         """
+         Initializes the test dictionary information (training patients, test patients, ML dict, etc.).
+
+         Args:
+             ml_dict_paths (Dict): Dictionary containing the paths to the different files needed
+                 to run the machine learning experiment.
+
+         Returns:
+             Dict: Dictionary containing the information of the machine learning test.
+         """
+         ml_dict = dict()
+
+         # Training and test patients
+         ml_dict['patientsTrain'] = load_json(ml_dict_paths['patientsTrain'])
+         ml_dict['patientsTest'] = load_json(ml_dict_paths['patientsTest'])
+
+         # Outcome table for training and test patients
+         outcome_table = pd.read_csv(ml_dict_paths['outcomes'], index_col=0)
+         ml_dict['outcome_table_binary'] = outcome_table.iloc[:, [0]]
+         if outcome_table.shape[1] == 2:
+             ml_dict['outcome_table_time'] = outcome_table.iloc[:, [1]]
+
+         # Machine learning dictionary
+         ml_dict['ml'] = load_json(ml_dict_paths['ml'])
+         ml_dict['path_results'] = ml_dict_paths['results']
+
+         return ml_dict
+
+     def __find_balanced_threshold(
+         self,
+         model: XGBClassifier,
+         variable_table: pd.DataFrame,
+         outcome_table_binary: pd.DataFrame
+     ) -> float:
+         """
+         Finds the balanced threshold for the given machine learning test.
+
+         Args:
+             model (XGBClassifier): Trained XGBoost classifier for the given machine learning run.
+             variable_table (pd.DataFrame): Radiomics table.
+             outcome_table_binary (pd.DataFrame): Outcome table with binary labels.
+
+         Returns:
+             float: Balanced threshold for the given machine learning test.
+         """
+         # Check if there is a feature mismatch
+         if model.feature_names_in_.shape[0] != variable_table.columns.shape[0]:
+             variable_table = variable_table.loc[:, model.feature_names_in_]
+
+         # Getting the probability responses for each patient
+         prob_xgb = np.zeros((variable_table.index.shape[0], 1)) * np.nan
+         patient_ids = list(variable_table.index.values)
+         for p in range(variable_table.index.shape[0]):
+             prob_xgb[p] = self.predict_xgb(model, variable_table.loc[[patient_ids[p]], :])
+
+         # Calculating the ROC curve
+         fpr, tpr, thresholds = metrics.roc_curve(outcome_table_binary.iloc[:, 0], prob_xgb)
+
+         # Finding the optimal threshold: the point on the ROC curve closest to (0, 1), i.e.
+         # minimizing fpr (false positive rate) while maximizing tpr (true positive rate)
+         minimum = np.argmin(np.power(fpr, 2) + np.power(1 - tpr, 2))
+
+         return thresholds[minimum]
+
+     def get_hold_out_set_table(self, ml: Dict, var_id: str, patients_id: List) -> pd.DataFrame:
+         """
+         Loads and pre-processes different radiomics tables, then combines them to be used for hold-out testing.
+
+         Args:
+             ml (Dict): The machine learning dictionary containing the information of the machine learning test.
+             var_id (str): String specifying the ID of the radiomics variable in ml.
+                 --> Ex: var1
+             patients_id (List): List of patients of the hold-out set.
+
+         Returns:
+             pd.DataFrame: Radiomics table for the hold-out set.
+         """
+         # Loading and pre-processing
+         rad_var_struct = ml['variables'][var_id]
+         rad_tables_holdout = list()
+         for item in rad_var_struct['path'].values():
+             # Reading the table
+             path_radiomics_csv = item['csv']
+             path_radiomics_txt = item['txt']
+             image_type = item['type']
+             rad_table_holdout = get_radiomics_table(path_radiomics_csv, path_radiomics_txt, image_type, patients_id)
+             rad_tables_holdout.append(rad_table_holdout)
+
+         # Combine the tables
+         rad_tables_holdout = combine_rad_tables(rad_tables_holdout)
+         rad_tables_holdout.Properties['userData']['flags_processing'] = {}
+
+         return rad_tables_holdout
+
+     def pre_process_variables(self, ml: Dict, outcome_table_binary: pd.DataFrame) -> Tuple[Dict, Dict]:
+         """
+         Loads and pre-processes the radiomics tables of the different variable types
+         found in the ml dict.
+
+         Note:
+             Only patients of the training/learning set should be found in this outcome table.
+
+         Args:
+             ml (Dict): The machine learning dictionary containing the information of the machine learning test.
+             outcome_table_binary (pd.DataFrame): Outcome table with binary labels. This table may be used to
+                 pre-process some variables with the "FDA" feature set reduction algorithm.
+
+         Returns:
+             Tuple[Dict, Dict]: Two dicts of processed radiomics tables, one for training and one for
+                 testing (no feature set reduction).
+         """
+         # Get a list of unique variables found in the ml variables combinations dict
+         variables_id = [s.split('_') for s in ml['variables']['combinations']]
+         variables_id = list(set([x for sublist in variables_id for x in sublist]))
+
+         # For each variable, load the corresponding radiomics table and pre-process it.
+         # Per the note above, all patients of the outcome table belong to the learning set.
+         patients_train = list(outcome_table_binary.index)
+         processed_var_tables = dict()
+         processed_var_tables_test = dict()
+         for var_id in variables_id:
+             processed_var_tables[var_id], processed_var_tables_test[var_id] = self.pre_process_radiomics_table(
+                 ml,
+                 var_id,
+                 outcome_table_binary,
+                 patients_train
+             )
+
+         return processed_var_tables, processed_var_tables_test
+
+     def pre_process_radiomics_table(
+         self,
+         ml: Dict,
+         var_id: str,
+         outcome_table_binary: pd.DataFrame,
+         patients_train: list
+     ) -> Tuple[pd.DataFrame, pd.DataFrame]:
+         """
+         For the given variable, this function loads the corresponding radiomics tables and pre-processes them
+         (cleaning, normalization and feature set reduction).
+
+         Note:
+             Only patients of the training/learning set should be found in the given outcome table.
+
+         Args:
+             ml (Dict): The machine learning dictionary containing the information of the machine learning test
+                 (parameters, options, etc.).
+             var_id (str): String specifying the ID of the radiomics variable in ml. For example: 'var1'.
+             outcome_table_binary (pd.DataFrame): Outcome table with binary labels. This table may
+                 be used to pre-process some variables with the "FDA" feature set reduction algorithm.
+             patients_train (list): List of patients to use for training.
+
+         Returns:
+             Tuple[pd.DataFrame, pd.DataFrame]: Two dataframes of processed radiomics tables, one for training
+                 and one for testing (no feature set reduction).
+         """
+         # Initialization
+         patient_ids = list(outcome_table_binary.index)
+         outcome_table_binary_training = outcome_table_binary.loc[patients_train]
+         var_names = ['var_datacleaning', 'var_normalization', 'var_fSetReduction']
+         flags_preprocessing = {key: key in ml['variables'][var_id].keys() for key in var_names}
+         flags_preprocessing_test = flags_preprocessing.copy()
+         flags_preprocessing_test['var_fSetReduction'] = False
+
+         # Pre-processing
+         rad_var_struct = ml['variables'][var_id]
+         rad_tables_learning = list()
+         for item in rad_var_struct['path'].values():
+             # Loading the table
+             path_radiomics_csv = item['csv']
+             path_radiomics_txt = item['txt']
+             image_type = item['type']
+             rad_table_learning = get_radiomics_table(path_radiomics_csv, path_radiomics_txt, image_type, patient_ids)
+
+             # Data cleaning
+             if flags_preprocessing['var_datacleaning']:
+                 cleaning_dict = ml['datacleaning'][ml['variables'][var_id]['var_datacleaning']]['feature']['continuous']
+                 data_cleaner = DataCleaner(rad_table_learning)
+                 rad_table_learning = data_cleaner(cleaning_dict)
+                 if rad_table_learning is None:
+                     continue
+
+             # Normalization (ComBat)
+             if flags_preprocessing['var_normalization']:
+                 normalization_method = ml['variables'][var_id]['var_normalization']
+                 # Some information must be stored to re-apply ComBat on the testing data
+                 if 'combat' in normalization_method.lower():
+                     # Training data
+                     original_data = dict()
+                     original_data['path_radiomics_csv'] = path_radiomics_csv
+                     original_data['path_radiomics_txt'] = path_radiomics_txt
+                     original_data['image_type'] = image_type
+                     original_data['patient_ids'] = patient_ids
+                     if flags_preprocessing['var_datacleaning']:
+                         original_data['datacleaning_method'] = ml['variables'][var_id]['var_datacleaning']
+                     rad_table_learning.Properties['userData']['normalization'] = {'original_data': original_data}
+
+                     # Apply ComBat
+                     normalization = Normalization('combat')
+                     rad_table_learning = normalization.apply_combat(variable_table=rad_table_learning)  # Training data
+                 else:
+                     raise NotImplementedError(f'Normalization method: {normalization_method} not recognized.')
+
+             # Save the table
+             rad_tables_learning.append(rad_table_learning)
+
+         # Separate training and testing data before feature set reduction
+         rad_tables_testing = deepcopy(rad_tables_learning)
+         rad_tables_training = []
+         for rad_tab in rad_tables_learning:
+             patients_ids = intersect(patients_train, list(rad_tab.index))
+             rad_tables_training.append(deepcopy(rad_tab.loc[patients_ids]))
+
+         # Deepcopy properties
+         temp_properties = list()
+         for rad_tab in rad_tables_testing:
+             temp_properties.append(deepcopy(rad_tab.Properties))
+
+         # Feature set reduction (for training data only)
+         if flags_preprocessing['var_fSetReduction']:
+             f_set_reduction_method = ml['variables'][var_id]['var_fSetReduction']['method']
+             fsr = FSR(f_set_reduction_method)
+
+             # Apply feature set reduction
+             rad_tables_training = fsr.apply_fsr(
+                 ml,
+                 rad_tables_training,
+                 outcome_table_binary_training,
+                 path_save_logging=ml['path_results']
+             )
+
+         # Re-assign properties
+         for i in range(len(rad_tables_testing)):
+             rad_tables_testing[i].Properties = temp_properties[i]
+         del temp_properties
+
+         # Finalization steps (combine the training tables if feature set reduction did not already do so)
+         if isinstance(rad_tables_training, list):
+             rad_tables_training = combine_rad_tables(rad_tables_training)
+         rad_tables_training.Properties['userData']['flags_preprocessing'] = flags_preprocessing
+         rad_tables_testing = combine_rad_tables(rad_tables_testing)
+         rad_tables_testing.Properties['userData']['flags_processing'] = flags_preprocessing_test
+
+         return rad_tables_training, rad_tables_testing
+
+     def train_xgboost_model(
+         self,
+         var_table_train: pd.DataFrame,
+         outcome_table_binary_train: pd.DataFrame,
+         var_importance_threshold: float = 0.05,
+         optimal_threshold: float = None,
+         optimization_metric: str = 'MCC',
+         method: str = "pycaret",
+         use_gpu: bool = False,
+         seed: int = None,
+     ) -> Dict:
+         """
+         Trains an XGBoost model for the given machine learning test.
+
+         Args:
+             var_table_train (pd.DataFrame): Radiomics table for the training/learning set.
+             outcome_table_binary_train (pd.DataFrame): Outcome table with binary labels for the training/learning set.
+             var_importance_threshold (float): Threshold for the variable importance. Variables with importance below
+                 this threshold will be removed from the model.
+             optimal_threshold (float, optional): Optimal threshold for the XGBoost model. If not given, it will be
+                 computed using the training set.
+             optimization_metric (str, optional): String specifying the metric to use to optimize the ml model.
+             method (str, optional): String specifying the method to use to train the XGBoost model.
+                 - "pycaret": Use PyCaret to train the model (automatic).
+                 - "grid_search": Grid search with cross-validation to find the best parameters.
+                 - "random_search": Random search with cross-validation to find the best parameters.
+             use_gpu (bool, optional): Boolean specifying if the GPU should be used to train the model.
+                 Default is False.
+             seed (int, optional): Integer specifying the seed to use for the random number generator.
+
+         Returns:
+             Dict: Dictionary containing info about the trained XGBoost model.
+         """
+         # Safety check (make sure that the outcome table and the variable table have the same patients)
+         var_table_train, outcome_table_binary_train = intersect_var_tables(var_table_train, outcome_table_binary_train)
+
+         # Finalize the new radiomics table with the remaining variables
+         var_table_train = finalize_rad_table(var_table_train)
+
+         if method.lower() == "pycaret":
+             # Set up data for PyCaret
+             temp_data = pd.merge(var_table_train, outcome_table_binary_train, left_index=True, right_index=True)
+
+             # PyCaret setup
+             setup(
+                 data=temp_data,
+                 feature_selection=True,
+                 n_features_to_select=1 - var_importance_threshold,
+                 fold=5,
+                 target=temp_data.columns[-1],
+                 use_gpu=use_gpu,
+                 feature_selection_estimator="xgboost",
+                 session_id=seed
+             )
+
+             # Set seed
+             if seed is not None:
+                 set_config('seed', seed)
+
+             # Creating XGBoost model using PyCaret
+             classifier = create_model('xgboost', verbose=False)
+
+             # Tuning XGBoost model using PyCaret
+             classifier = tune_model(classifier, optimize=optimization_metric)
+
+         else:
+             # Initial training to filter features using variable importance
+             # XGB Classifier
+             classifier = XGBClassifier()
+             classifier.fit(var_table_train, outcome_table_binary_train)
+             var_importance = classifier.feature_importances_
+
+             # Normalize var_importance if necessary
+             if np.sum(var_importance) != 1:
+                 var_importance_threshold = var_importance_threshold / np.sum(var_importance)
+                 var_importance = var_importance / np.sum(var_importance)
+
+             # Filter variables
+             var_table_train = var_table_train.iloc[:, var_importance >= var_importance_threshold]
+
+             # Check if variable table is empty after filtering
+             if var_table_train.shape[1] == 0:
+                 raise ValueError('Variable table is empty after variable importance filtering. Use a smaller threshold.')
+
+             # Suggested scale_pos_weight: ratio of negative to positive cases
+             scale_pos_weight = (outcome_table_binary_train == 0).sum().values[0] \
+                 / (outcome_table_binary_train == 1).sum().values[0]
+
+             # XGB Classifier
+             classifier = XGBClassifier(scale_pos_weight=scale_pos_weight)
+
+             # Tune XGBoost parameters
+             params = {
+                 'max_depth': [3, 4, 5],
+                 'learning_rate': [0.1, 0.01, 0.001],
+                 'n_estimators': [50, 100, 200]
+             }
+
+             if method.lower() == "grid_search":
+                 # Set up grid search with cross-validation
+                 grid_search = GridSearchCV(
+                     estimator=classifier,
+                     param_grid=params,
+                     cv=5,
+                     n_jobs=-1,
+                     verbose=3,
+                     scoring='matthews_corrcoef'
+                 )
+             elif method.lower() == "random_search":
+                 # Set up random search with cross-validation
+                 grid_search = RandomizedSearchCV(
+                     estimator=classifier,
+                     param_distributions=params,
+                     cv=5,
+                     n_jobs=-1,
+                     verbose=3,
+                     scoring='matthews_corrcoef'
+                 )
+             else:
+                 raise NotImplementedError(f'Method: {method} not recognized. Use "grid_search", "random_search", "auto" or "pycaret".')
+
+             # Fit the grid search
+             grid_search.fit(var_table_train, outcome_table_binary_train)
+
+             # Get the best parameters
+             best_params = grid_search.best_params_
+
+             # Fit the XGB Classifier with the best parameters
+             classifier = XGBClassifier(**best_params)
+             classifier.fit(var_table_train, outcome_table_binary_train)
+
+         # Saving the information of the model in a dictionary
+         model_xgb = dict()
+         model_xgb['algo'] = 'xgb'
+         model_xgb['type'] = 'binary'
+         model_xgb['method'] = method
+         if optimal_threshold:
+             model_xgb['threshold'] = optimal_threshold
+         else:
+             try:
+                 model_xgb['threshold'] = self.__find_balanced_threshold(classifier, var_table_train, outcome_table_binary_train)
+             except Exception as e:
+                 print('Error in finding optimal threshold, it will be set to 0.5: ' + str(e))
+                 model_xgb['threshold'] = 0.5
+         model_xgb['model'] = classifier
+         model_xgb['var_names'] = list(classifier.feature_names_in_)
+         model_xgb['var_info'] = deepcopy(var_table_train.Properties['userData'])
+         if method == "auto":
+             model_xgb['optimization'] = "auto"
+         elif method == "pycaret":
+             model_xgb['optimization'] = classifier.get_params()
+         else:
+             model_xgb['optimization'] = best_params
+
+         return model_xgb
+
+     def test_xgb_model(self, model_dict: Dict, variable_table: pd.DataFrame, patient_list: List) -> List:
+         """
+         Tests the XGBoost model for the given dataset patients.
+
+         Args:
+             model_dict (Dict): Dictionary containing info about the trained XGBoost model.
+             variable_table (pd.DataFrame): Radiomics table for the test set (should not be normalized).
+             patient_list (List): List of patient sets (e.g. training and test) to compute predictions for.
+
+         Returns:
+             List: List of model responses, one per given patient set.
+         """
+         # Initialization
+         n_test = len(patient_list)
+         var_names = model_dict['var_names']
+         var_def = model_dict['var_info']['variables']['var_def']
+         model_response = list()
+
+         # Preparing the variable table
+         variable_table = get_ml_test_table(variable_table, var_names, var_def)
+
+         # Test the model
+         for i in range(n_test):
+             # Get the patient IDs
+             patient_ids = patient_list[i]
+
+             # Getting predictions for each patient
+             n_patients = len(patient_ids)
+             varargout = np.zeros((n_patients, 1)) * np.nan  # NaN if the computation fails
+             for p in range(n_patients):
+                 try:
+                     varargout[p] = self.predict_xgb(model_dict['model'], variable_table.loc[[patient_ids[p]], :])
+                 except Exception as e:
+                     print('Error in computing prediction for patient ' + str(patient_ids[p]) + ': ' + str(e))
+                     varargout[p] = np.nan
+
+             # Save the predictions
+             model_response.append(varargout)
+
+         return model_response
+
+     def predict_xgb(self, xgb_model: XGBClassifier, variable_table: pd.DataFrame) -> float:
+         """
+         Computes the prediction of the XGBoost model for the given variable table.
+
+         Args:
+             xgb_model (XGBClassifier): XGBClassifier model.
+             variable_table (pd.DataFrame): Variable table for the prediction.
+
+         Returns:
+             float: Prediction of the XGBoost model.
+         """
+         # Predictions
+         predictions = xgb_model.predict_proba(variable_table)
+
+         # Get the probability of the positive class
+         predictions = predictions[:, 1][0]
+
+         return predictions
+
+     def ml_run(self, path_ml: Path, holdout_test: bool = True, method: str = 'auto') -> None:
+         """
+         This function runs the machine learning test for the created experiment.
+
+         Args:
+             path_ml (Path): Path to the main dictionary containing info about the current ml experiment.
+             holdout_test (bool, optional): Boolean specifying if the hold-out test should be performed.
+             method (str, optional): String specifying the method to use to train the XGBoost model
+                 (see train_xgboost_model). Overridden by the "method" entry of the XGBoost settings, if present.
+
+         Returns:
+             None
+         """
+         # Set up logging file for the batch
+         log_file = os.path.dirname(path_ml) + '/batch.log'
+         logging.basicConfig(filename=log_file, level=logging.INFO, format='%(message)s', filemode='w')
+
+         # Start the timer
+         batch_start = time.time()
+
+         logging.info("\n\n********************MACHINE LEARNING RUN********************\n\n")
+
+         # --> A. Initialization phase
+         # Load the test dictionary and machine learning information
+         ml_dict_paths = load_json(path_ml)  # Test information dictionary
+         ml_info_dict = self.__load_ml_info(ml_dict_paths)  # Machine learning information dictionary
+
+         # Machine learning assets
+         patients_train = ml_info_dict['patientsTrain']
+         patients_test = ml_info_dict['patientsTest']
+         patients_holdout = load_json(self.path_study / 'patientsHoldOut.json') if holdout_test else None
+         outcome_table_binary = ml_info_dict['outcome_table_binary']
+         ml = ml_info_dict['ml']
+         path_results = ml_info_dict['path_results']
+         ml['path_results'] = path_results
+
+         # --> B. Machine Learning phase
+         # B.1. Pre-processing features
+         start = time.time()
+         logging.info("\n\n--> PRE-PROCESSING TRAINING VARIABLES")
+
+         # Not all variables will be used to train the model, only the user-selected variable
+         var_id = str(ml['variables']['varStudy'])
+
+         # Pre-processing of the radiomics tables/variables
+         processed_training_table, processed_testing_table = self.pre_process_radiomics_table(
+             ml,
+             var_id,
+             outcome_table_binary.copy(),
+             patients_train
+         )
+         logging.info(f"...Done in {time.time() - start} s")
+
+         # B.2. Pre-learning initialization
+         # Patient definitions (training and test sets)
+         patient_ids = list(outcome_table_binary.index)
+         patients_train = intersect(intersect(patient_ids, patients_train), processed_training_table.index)
+         patients_test = intersect(intersect(patient_ids, patients_test), processed_testing_table.index)
+         patients_holdout = intersect(patient_ids, patients_holdout) if holdout_test else None
+
+         # Initializing outcome tables for training and test sets
+         outcome_table_binary_train = outcome_table_binary.loc[patients_train, :]
+         outcome_table_binary_test = outcome_table_binary.loc[patients_test, :]
+         outcome_table_binary_holdout = outcome_table_binary.loc[patients_holdout, :] if holdout_test else None
+
+         # Separate variable table for the training set (repetitive, but double-checking)
+         var_table_train = processed_training_table.loc[patients_train, :]
+
+         # Initializing XGBoost model settings
+         var_importance_threshold = ml['algorithms']['XGBoost']['varImportanceThreshold']
+         optimal_threshold = ml['algorithms']['XGBoost']['optimalThreshold']
+         optimization_metric = ml['algorithms']['XGBoost']['optimizationMetric']
+         method = ml['algorithms']['XGBoost']['method'] if 'method' in ml['algorithms']['XGBoost'].keys() else method
+         use_gpu = ml['algorithms']['XGBoost']['useGPU'] if 'useGPU' in ml['algorithms']['XGBoost'].keys() else True
+         seed = ml['algorithms']['XGBoost']['seed'] if 'seed' in ml['algorithms']['XGBoost'].keys() else None
+
+         # B.3. Training the XGBoost model
+         tstart = time.time()
+         logging.info(f"\n\n--> TRAINING XGBOOST MODEL FOR VARIABLE {var_id}")
+
+         # Training the model
+         model = self.train_xgboost_model(
+             var_table_train,
+             outcome_table_binary_train,
+             var_importance_threshold,
+             optimal_threshold,
+             method=method,
+             use_gpu=use_gpu,
+             optimization_metric=optimization_metric,
+             seed=seed
+         )
+
+         # Saving the trained model using pickle
+         name_save_model = ml['algorithms']['XGBoost']['nameSave']
+         model_id = name_save_model + '_' + str(ml['variables']['varStudy'])
+         path_model = os.path.dirname(path_results) + '/' + (model_id + '.pickle')
+         model_dict = save_model(model, str(ml['variables']['varStudy']), path_model, ml=ml)
+
+         logging.info("{}--> DONE. TOTAL TIME OF LEARNING PROCESS: {:.2f} min".format(" " * 4, (time.time() - tstart) / 60))
+
+         # --> C. Testing phase
+         # C.1. Testing the XGBoost model and computing model response
+         tstart = time.time()
+         logging.info(f"\n\n--> TESTING XGBOOST MODEL FOR VARIABLE {var_id}")
+
+         response_train, response_test = self.test_xgb_model(
+             model,
+             processed_testing_table,
+             [patients_train, patients_test]
+         )
+
+         logging.info('{}--> DONE. TOTAL TIME OF TESTING PROCESS: {:.2f} min'.format(" " * 4, (time.time() - tstart) / 60))
+
+         if holdout_test:
+             # --> D. Hold-out set testing phase
+             # D.1. Prepare holdout test data
+             var_table_all_holdout = self.get_hold_out_set_table(ml, var_id, patients_holdout)
+
+             # D.2. Testing the XGBoost model and computing model response on the holdout set
+             tstart = time.time()
+             logging.info(f"\n\n--> TESTING XGBOOST MODEL FOR VARIABLE {var_id} ON THE HOLDOUT SET")
+
+             response_holdout = self.test_xgb_model(model, var_table_all_holdout, [patients_holdout])[0]
+
+             logging.info('{}--> DONE. TOTAL TIME OF HOLDOUT TESTING: {:.2f} min'.format(" " * 4, (time.time() - tstart) / 60))
+
+         # E. Computing performance metrics
+         tstart = time.time()
+
+         # Initialize the Results class
+         result = Results(model_dict, model_id)
+         if holdout_test:
+             run_results = result.to_json(
+                 response_train=response_train,
+                 response_test=response_test,
+                 response_holdout=response_holdout,
+                 patients_train=patients_train,
+                 patients_test=patients_test,
+                 patients_holdout=patients_holdout
+             )
+         else:
+             run_results = result.to_json(
+                 response_train=response_train,
+                 response_test=response_test,
+                 response_holdout=None,
+                 patients_train=patients_train,
+                 patients_test=patients_test,
+                 patients_holdout=None
+             )
+
+         # Calculating performance metrics for the training phase and saving the ROC curve
+         run_results[model_id]['train']['metrics'] = result.get_model_performance(
+             response_train,
+             outcome_table_binary_train,
+         )
+
+         # Calculating performance metrics for the testing phase and saving the ROC curve
+         run_results[model_id]['test']['metrics'] = result.get_model_performance(
+             response_test,
+             outcome_table_binary_test,
+         )
+
+         if holdout_test:
+             # Calculating performance metrics for the holdout phase and saving the ROC curve
+             run_results[model_id]['holdout']['metrics'] = result.get_model_performance(
+                 response_holdout,
+                 outcome_table_binary_holdout,
+             )
+
+         logging.info('\n\n--> COMPUTING PERFORMANCE METRICS ... Done in {:.2f} sec'.format(time.time() - tstart))
+
+         # F. Saving the results dictionary
+         save_json(path_results, run_results, cls=NumpyEncoder)
+
+         # Total computing time
+         logging.info("\n\n*********************************************************************")
+         logging.info('{} TOTAL COMPUTATION TIME: {:.2f} hours'.format(" " * 13, (time.time() - batch_start) / 3600))
+         logging.info("*********************************************************************")
+
+     def run_experiment(self, holdout_test: bool = True, method: str = "pycaret") -> None:
+         """
+         Runs the machine learning experiment for each split/run.
+
+         Args:
+             holdout_test (bool, optional): Boolean specifying if the hold-out test should be performed.
+             method (str, optional): String specifying the method to use to train the XGBoost model.
+                 - "pycaret": Use PyCaret to train the model (automatic).
+                 - "grid_search": Grid search with cross-validation to find the best parameters.
+                 - "random_search": Random search with cross-validation to find the best parameters.
+
+         Returns:
+             None
+         """
+         # Initialize the DesignExperiment class
+         experiment = DesignExperiment(self.path_study, self.path_settings, self.experiment_label)
+
+         # Generate the machine learning experiment
+         path_file_ml_paths = experiment.generate_experiment()
+
+         # Run the different machine learning tests for the experiment
+         tests_dict = load_json(path_file_ml_paths)  # Tests dictionary
+         for run in tests_dict.keys():
+             self.ml_run(tests_dict[run], holdout_test, method)
+
+         # Average the results of the different splits/runs
+         average_results(self.path_study / f'learn__{self.experiment_label}', save=True)
+
+         # Analyze feature importance across all runs
+         feature_imporance_analysis(self.path_study / f'learn__{self.experiment_label}')
+
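
The file above exposes RadiomicsLearner as the entry point of the learning pipeline. As a minimal usage sketch (not part of the package contents shown in the diff), assuming a study folder already prepared with the outcome table, patient split JSON files (e.g. patientsHoldOut.json) and the ml__*.json settings expected by DesignExperiment; the paths and the experiment label below are hypothetical placeholders:

    from pathlib import Path

    from MEDiml.learning.RadiomicsLearner import RadiomicsLearner

    # Hypothetical study layout; adapt the paths to your own experiment.
    path_study = Path('/data/my_study')              # outcome table and patient split JSON files
    path_settings = Path('/data/my_study/settings')  # ml__*.json experiment settings

    learner = RadiomicsLearner(path_study, path_settings, experiment_label='experiment1')

    # Designs the experiment, then trains, tests and (optionally) holdout-tests
    # one XGBoost model per split/run before averaging results across runs.
    learner.run_experiment(holdout_test=True, method='pycaret')

The balanced decision threshold stored in each model dictionary is the ROC operating point closest to (fpr, tpr) = (0, 1), as computed in __find_balanced_threshold. A self-contained illustration of that rule on toy data, using only NumPy and scikit-learn:

    import numpy as np
    from sklearn import metrics

    # Toy binary labels and predicted probabilities of the positive class
    y_true = np.array([0, 0, 0, 1, 0, 1, 1, 1])
    y_prob = np.array([0.10, 0.35, 0.40, 0.65, 0.50, 0.70, 0.80, 0.90])

    fpr, tpr, thresholds = metrics.roc_curve(y_true, y_prob)
    best = np.argmin(fpr ** 2 + (1 - tpr) ** 2)  # ROC point closest to (0, 1)
    print(thresholds[best])                      # balanced threshold for the toy data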