mediml 0.9.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- MEDiml/MEDscan.py +1696 -0
- MEDiml/__init__.py +21 -0
- MEDiml/biomarkers/BatchExtractor.py +806 -0
- MEDiml/biomarkers/BatchExtractorTexturalFilters.py +840 -0
- MEDiml/biomarkers/__init__.py +16 -0
- MEDiml/biomarkers/diagnostics.py +125 -0
- MEDiml/biomarkers/get_oriented_bound_box.py +158 -0
- MEDiml/biomarkers/glcm.py +1602 -0
- MEDiml/biomarkers/gldzm.py +523 -0
- MEDiml/biomarkers/glrlm.py +1315 -0
- MEDiml/biomarkers/glszm.py +555 -0
- MEDiml/biomarkers/int_vol_hist.py +527 -0
- MEDiml/biomarkers/intensity_histogram.py +615 -0
- MEDiml/biomarkers/local_intensity.py +89 -0
- MEDiml/biomarkers/morph.py +1756 -0
- MEDiml/biomarkers/ngldm.py +780 -0
- MEDiml/biomarkers/ngtdm.py +414 -0
- MEDiml/biomarkers/stats.py +373 -0
- MEDiml/biomarkers/utils.py +389 -0
- MEDiml/filters/TexturalFilter.py +299 -0
- MEDiml/filters/__init__.py +9 -0
- MEDiml/filters/apply_filter.py +134 -0
- MEDiml/filters/gabor.py +215 -0
- MEDiml/filters/laws.py +283 -0
- MEDiml/filters/log.py +147 -0
- MEDiml/filters/mean.py +121 -0
- MEDiml/filters/textural_filters_kernels.py +1738 -0
- MEDiml/filters/utils.py +107 -0
- MEDiml/filters/wavelet.py +237 -0
- MEDiml/learning/DataCleaner.py +198 -0
- MEDiml/learning/DesignExperiment.py +480 -0
- MEDiml/learning/FSR.py +667 -0
- MEDiml/learning/Normalization.py +112 -0
- MEDiml/learning/RadiomicsLearner.py +714 -0
- MEDiml/learning/Results.py +2237 -0
- MEDiml/learning/Stats.py +694 -0
- MEDiml/learning/__init__.py +10 -0
- MEDiml/learning/cleaning_utils.py +107 -0
- MEDiml/learning/ml_utils.py +1015 -0
- MEDiml/processing/__init__.py +6 -0
- MEDiml/processing/compute_suv_map.py +121 -0
- MEDiml/processing/discretisation.py +149 -0
- MEDiml/processing/interpolation.py +275 -0
- MEDiml/processing/resegmentation.py +66 -0
- MEDiml/processing/segmentation.py +912 -0
- MEDiml/utils/__init__.py +25 -0
- MEDiml/utils/batch_patients.py +45 -0
- MEDiml/utils/create_radiomics_table.py +131 -0
- MEDiml/utils/data_frame_export.py +42 -0
- MEDiml/utils/find_process_names.py +16 -0
- MEDiml/utils/get_file_paths.py +34 -0
- MEDiml/utils/get_full_rad_names.py +21 -0
- MEDiml/utils/get_institutions_from_ids.py +16 -0
- MEDiml/utils/get_patient_id_from_scan_name.py +22 -0
- MEDiml/utils/get_patient_names.py +26 -0
- MEDiml/utils/get_radiomic_names.py +27 -0
- MEDiml/utils/get_scan_name_from_rad_name.py +22 -0
- MEDiml/utils/image_reader_SITK.py +37 -0
- MEDiml/utils/image_volume_obj.py +22 -0
- MEDiml/utils/imref.py +340 -0
- MEDiml/utils/initialize_features_names.py +62 -0
- MEDiml/utils/inpolygon.py +159 -0
- MEDiml/utils/interp3.py +43 -0
- MEDiml/utils/json_utils.py +78 -0
- MEDiml/utils/mode.py +31 -0
- MEDiml/utils/parse_contour_string.py +58 -0
- MEDiml/utils/save_MEDscan.py +30 -0
- MEDiml/utils/strfind.py +32 -0
- MEDiml/utils/textureTools.py +188 -0
- MEDiml/utils/texture_features_names.py +115 -0
- MEDiml/utils/write_radiomics_csv.py +47 -0
- MEDiml/wrangling/DataManager.py +1724 -0
- MEDiml/wrangling/ProcessDICOM.py +512 -0
- MEDiml/wrangling/__init__.py +3 -0
- mediml-0.9.9.dist-info/LICENSE.md +674 -0
- mediml-0.9.9.dist-info/METADATA +232 -0
- mediml-0.9.9.dist-info/RECORD +78 -0
- mediml-0.9.9.dist-info/WHEEL +4 -0

MEDiml/learning/RadiomicsLearner.py
@@ -0,0 +1,714 @@
import logging
import os
import time
from copy import deepcopy
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
from numpyencoder import NumpyEncoder
from pycaret.classification import *
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier

from MEDiml.learning.DataCleaner import DataCleaner
from MEDiml.learning.DesignExperiment import DesignExperiment
from MEDiml.learning.FSR import FSR
from MEDiml.learning.ml_utils import (average_results, combine_rad_tables,
                                      feature_imporance_analysis,
                                      finalize_rad_table, get_ml_test_table,
                                      get_radiomics_table, intersect,
                                      intersect_var_tables, save_model)
from MEDiml.learning.Normalization import Normalization
from MEDiml.learning.Results import Results

from ..utils.json_utils import load_json, save_json


class RadiomicsLearner:
    def __init__(self, path_study: Path, path_settings: Path, experiment_label: str) -> None:
        """
        Constructor of the class RadiomicsLearner.

        Args:
            path_study (Path): Path to the main study folder where the outcomes,
                learning patients and holdout patients dictionaries are found.
            path_settings (Path): Path to the settings folder.
            experiment_label (str): String specifying the label to attach to a given learning experiment in
                "path_experiments". This label will be attached to the ml__$experiment_label$.json file as well
                as the learn__$experiment_label$ folder. This label is used to keep track of different experiments
                with different settings (e.g. radiomics, scans, machine learning algorithms, etc.).

        Returns:
            None
        """
        self.path_study = Path(path_study)
        self.path_settings = Path(path_settings)
        self.experiment_label = experiment_label

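    # The study folder passed above is expected to contain the outcome tables
    # and the patient dictionaries (e.g. patientsHoldOut.json, loaded by
    # ml_run() below).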
    def __load_ml_info(self, ml_dict_paths: Dict) -> Dict:
        """
        Initializes the test dictionary information (training patients, test patients, ML dict, etc.).

        Args:
            ml_dict_paths (Dict): Dictionary containing the paths to the different files needed
                to run the machine learning experiment.

        Returns:
            Dict: Dictionary containing the information of the machine learning test.
        """
        ml_dict = dict()

        # Training and test patients
        ml_dict['patientsTrain'] = load_json(ml_dict_paths['patientsTrain'])
        ml_dict['patientsTest'] = load_json(ml_dict_paths['patientsTest'])

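        # Assumed layout of the outcomes CSV (index = patient ID; first column
        # = binary outcome; optional second column = time to event; values
        # below are illustrative):
        #   PatientID,Outcome,Time
        #   Patient-001,1,24.5
        #   Patient-002,0,60.0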
        # Outcome table for training and test patients
        outcome_table = pd.read_csv(ml_dict_paths['outcomes'], index_col=0)
        ml_dict['outcome_table_binary'] = outcome_table.iloc[:, [0]]
        if outcome_table.shape[1] == 2:
            ml_dict['outcome_table_time'] = outcome_table.iloc[:, [1]]

        # Machine learning dictionary
        ml_dict['ml'] = load_json(ml_dict_paths['ml'])
        ml_dict['path_results'] = ml_dict_paths['results']

        return ml_dict

    def __find_balanced_threshold(
            self,
            model: XGBClassifier,
            variable_table: pd.DataFrame,
            outcome_table_binary: pd.DataFrame
    ) -> float:
        """
        Finds the balanced threshold for the given machine learning test.

        Args:
            model (XGBClassifier): Trained XGBoost classifier for the given machine learning run.
            variable_table (pd.DataFrame): Radiomics table.
            outcome_table_binary (pd.DataFrame): Outcome table with binary labels.

        Returns:
            float: Balanced threshold for the given machine learning test.
        """
        # Check if there is a feature mismatch
        if model.feature_names_in_.shape[0] != variable_table.columns.shape[0]:
            variable_table = variable_table.loc[:, model.feature_names_in_]

        # Getting the probability responses for each patient
        prob_xgb = np.zeros((variable_table.index.shape[0], 1)) * np.nan
        patient_ids = list(variable_table.index.values)
        for p in range(variable_table.index.shape[0]):
            prob_xgb[p] = self.predict_xgb(model, variable_table.loc[[patient_ids[p]], :])

        # Calculating the ROC curve
        fpr, tpr, thresholds = metrics.roc_curve(outcome_table_binary.iloc[:, 0], prob_xgb)

        # Calculating the optimal threshold by minimizing fpr (false positive rate)
        # and maximizing tpr (true positive rate)
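        # argmin(fpr^2 + (1 - tpr)^2) picks the ROC point closest to the ideal
        # corner (fpr=0, tpr=1); e.g. (fpr=0.2, tpr=0.9) scores
        # 0.2**2 + 0.1**2 = 0.05 (values illustrative).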
        minimum = np.argmin(np.power(fpr, 2) + np.power(1 - tpr, 2))

        return thresholds[minimum]

    def get_hold_out_set_table(self, ml: Dict, var_id: str, patients_id: List) -> pd.DataFrame:
        """
        Loads and pre-processes different radiomics tables, then combines them to be used for hold-out testing.

        Args:
            ml (Dict): The machine learning dictionary containing the information of the machine learning test.
            var_id (str): String specifying the ID of the radiomics variable in ml. For example: 'var1'.
            patients_id (List): List of patients of the hold-out set.

        Returns:
            pd.DataFrame: Radiomics table for the hold-out set.
        """
        # Loading and pre-processing
        rad_var_struct = ml['variables'][var_id]
        rad_tables_holdout = list()
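        # Assumed structure of ml['variables'][var_id]['path'] (keys and paths
        # below are illustrative): each entry maps to a dict such as
        #   {'csv': '/data/features.csv', 'txt': '/data/features.txt', 'type': 'CT'}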
        for item in rad_var_struct['path'].values():
            # Reading the table
            path_radiomics_csv = item['csv']
            path_radiomics_txt = item['txt']
            image_type = item['type']
            rad_table_holdout = get_radiomics_table(path_radiomics_csv, path_radiomics_txt, image_type, patients_id)
            rad_tables_holdout.append(rad_table_holdout)

        # Combine the tables
        rad_tables_holdout = combine_rad_tables(rad_tables_holdout)
        rad_tables_holdout.Properties['userData']['flags_processing'] = {}

        return rad_tables_holdout

    def pre_process_variables(self, ml: Dict, outcome_table_binary: pd.DataFrame) -> Tuple[Dict, Dict]:
        """
        Loads and pre-processes the different radiomics tables from the different variable types
        found in the ml dict.

        Note:
            Only patients of the training/learning set should be found in this outcome table.

        Args:
            ml (Dict): The machine learning dictionary containing the information of the machine learning test.
            outcome_table_binary (pd.DataFrame): Outcome table with binary labels. This table may be used to
                pre-process some variables with the "FDA" feature set reduction algorithm.

        Returns:
            Tuple: Two dicts of processed radiomics tables, one dict for training and one for
                testing (no feature set reduction).
        """
        # Get a list of unique variables found in the ml variables combinations dict
        variables_id = [s.split('_') for s in ml['variables']['combinations']]
        variables_id = list(set([x for sublist in variables_id for x in sublist]))

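        # e.g. combinations like ['var1_var2', 'var3'] yield the unique ids
        # ['var1', 'var2', 'var3'] (ids hypothetical)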
        # For each variable, load the corresponding radiomics table and pre-process it
        # (the outcome table is assumed to contain only training patients, per the
        # Note above, so its index is passed as the training patient list)
        processed_var_tables = dict()
        processed_var_tables_test = dict()
        for var_id in variables_id:
            processed_var_tables[var_id], processed_var_tables_test[var_id] = self.pre_process_radiomics_table(
                ml,
                var_id,
                outcome_table_binary,
                list(outcome_table_binary.index)
            )

        return processed_var_tables, processed_var_tables_test

    def pre_process_radiomics_table(
            self,
            ml: Dict,
            var_id: str,
            outcome_table_binary: pd.DataFrame,
            patients_train: list
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        For the given variable, this function loads the corresponding radiomics tables and pre-processes them
        (cleaning, normalization and feature set reduction).

        Note:
            Only patients of the training/learning set should be found in the given outcome table.

        Args:
            ml (Dict): The machine learning dictionary containing the information of the machine learning test
                (parameters, options, etc.).
            var_id (str): String specifying the ID of the radiomics variable in ml. For example: 'var1'.
            outcome_table_binary (pd.DataFrame): Outcome table with binary labels. This table may
                be used to pre-process some variables with the "FDA" feature set reduction algorithm.
            patients_train (list): List of patients to use for training.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: Two dataframes of processed radiomics tables, one for training
                and one for testing (no feature set reduction).
        """
        # Initialization
        patient_ids = list(outcome_table_binary.index)
        outcome_table_binary_training = outcome_table_binary.loc[patients_train]
        var_names = ['var_datacleaning', 'var_normalization', 'var_fSetReduction']
        flags_preprocessing = {key: key in ml['variables'][var_id].keys() for key in var_names}
        flags_preprocessing_test = flags_preprocessing.copy()
        flags_preprocessing_test['var_fSetReduction'] = False

        # Pre-processing
        rad_var_struct = ml['variables'][var_id]
        rad_tables_learning = list()
        for item in rad_var_struct['path'].values():
            # Loading the table
            path_radiomics_csv = item['csv']
            path_radiomics_txt = item['txt']
            image_type = item['type']
            rad_table_learning = get_radiomics_table(path_radiomics_csv, path_radiomics_txt, image_type, patient_ids)

            # Data cleaning
            if flags_preprocessing['var_datacleaning']:
                cleaning_dict = ml['datacleaning'][ml['variables'][var_id]['var_datacleaning']]['feature']['continuous']
                data_cleaner = DataCleaner(rad_table_learning)
                rad_table_learning = data_cleaner(cleaning_dict)
                if rad_table_learning is None:
                    continue

            # Normalization (ComBat)
            if flags_preprocessing['var_normalization']:
                normalization_method = ml['variables'][var_id]['var_normalization']
                # Some information must be stored to re-apply ComBat on the testing data
                if 'combat' in normalization_method.lower():
                    # Training data
                    rad_table_learning.Properties['userData']['normalization'] = dict()
                    rad_table_learning.Properties['userData']['normalization']['original_data'] = dict()
                    rad_table_learning.Properties['userData']['normalization']['original_data']['path_radiomics_csv'] = path_radiomics_csv
                    rad_table_learning.Properties['userData']['normalization']['original_data']['path_radiomics_txt'] = path_radiomics_txt
                    rad_table_learning.Properties['userData']['normalization']['original_data']['image_type'] = image_type
                    rad_table_learning.Properties['userData']['normalization']['original_data']['patient_ids'] = patient_ids
                    if flags_preprocessing['var_datacleaning']:
                        data_cln_method = ml['variables'][var_id]['var_datacleaning']
                        rad_table_learning.Properties['userData']['normalization']['original_data']['datacleaning_method'] = data_cln_method

                    # Apply ComBat
                    normalization = Normalization('combat')
                    rad_table_learning = normalization.apply_combat(variable_table=rad_table_learning)  # Training data
                else:
                    raise NotImplementedError(f'Normalization method: {normalization_method} not recognized.')

            # Save the table
            rad_tables_learning.append(rad_table_learning)

        # Separate training and testing data before feature set reduction
        # (the testing copy keeps all features; the reduced feature names are
        # selected later at prediction time)
        rad_tables_testing = deepcopy(rad_tables_learning)
        rad_tables_training = []
        for rad_tab in rad_tables_learning:
            patients_ids = intersect(patients_train, list(rad_tab.index))
            rad_tables_training.append(deepcopy(rad_tab.loc[patients_ids]))

        # Deepcopy properties
        temp_properties = list()
        for rad_tab in rad_tables_testing:
            temp_properties.append(deepcopy(rad_tab.Properties))

        # Feature set reduction (for training data only)
        if flags_preprocessing['var_fSetReduction']:
            f_set_reduction_method = ml['variables'][var_id]['var_fSetReduction']['method']
            fsr = FSR(f_set_reduction_method)

            # Apply FSR (e.g. FDA)
            rad_tables_training = fsr.apply_fsr(
                ml,
                rad_tables_training,
                outcome_table_binary_training,
                path_save_logging=ml['path_results']
            )
        else:
            # No feature set reduction: combine the training tables directly
            rad_tables_training = combine_rad_tables(rad_tables_training)

        # Re-assign properties
        for i in range(len(rad_tables_testing)):
            rad_tables_testing[i].Properties = temp_properties[i]
        del temp_properties

        # Finalization steps
        rad_tables_training.Properties['userData']['flags_preprocessing'] = flags_preprocessing
        rad_tables_testing = combine_rad_tables(rad_tables_testing)
        rad_tables_testing.Properties['userData']['flags_processing'] = flags_preprocessing_test

        return rad_tables_training, rad_tables_testing

    def train_xgboost_model(
            self,
            var_table_train: pd.DataFrame,
            outcome_table_binary_train: pd.DataFrame,
            var_importance_threshold: float = 0.05,
            optimal_threshold: float = None,
            optimization_metric: str = 'MCC',
            method: str = "pycaret",
            use_gpu: bool = False,
            seed: int = None,
    ) -> Dict:
        """
        Trains an XGBoost model for the given machine learning test.

        Args:
            var_table_train (pd.DataFrame): Radiomics table for the training/learning set.
            outcome_table_binary_train (pd.DataFrame): Outcome table with binary labels for the training/learning set.
            var_importance_threshold (float): Threshold for the variable importance. Variables with importance below
                this threshold will be removed from the model.
            optimal_threshold (float, optional): Optimal threshold for the XGBoost model. If not given, it will be
                computed using the training set.
            optimization_metric (str, optional): String specifying the metric to use to optimize the ml model.
            method (str, optional): String specifying the method to use to train the XGBoost model.
                - "pycaret": Use PyCaret to train the model (automatic).
                - "grid_search": Grid search with cross-validation to find the best parameters.
                - "random_search": Random search with cross-validation to find the best parameters.
            use_gpu (bool, optional): Boolean specifying if the GPU should be used to train the model. Default is False.
            seed (int, optional): Integer specifying the seed to use for the random number generator.

        Returns:
            Dict: Dictionary containing info about the trained XGBoost model.
        """

        # Safety check (make sure that the outcome table and the variable table have the same patients)
        var_table_train, outcome_table_binary_train = intersect_var_tables(var_table_train, outcome_table_binary_train)

        # Finalize the new radiomics table with the remaining variables
        var_table_train = finalize_rad_table(var_table_train)

        if method.lower() == "pycaret":
            # Set up data for PyCaret
            temp_data = pd.merge(var_table_train, outcome_table_binary_train, left_index=True, right_index=True)

            # PyCaret setup
            setup(
                data=temp_data,
                feature_selection=True,
                n_features_to_select=1 - var_importance_threshold,  # fraction of features to keep
                fold=5,
                target=temp_data.columns[-1],
                use_gpu=use_gpu,
                feature_selection_estimator="xgboost",
                session_id=seed
            )

            # Set seed
            if seed is not None:
                set_config('seed', seed)

            # Creating XGBoost model using PyCaret
            classifier = create_model('xgboost', verbose=False)

            # Tuning XGBoost model using PyCaret
            classifier = tune_model(classifier, optimize=optimization_metric)

        else:
            # Initial training to filter features using variable importance
            # XGB Classifier
            classifier = XGBClassifier()
            classifier.fit(var_table_train, outcome_table_binary_train)
            var_importance = classifier.feature_importances_

            # Normalize var_importance if necessary
            if np.sum(var_importance) != 1:
                var_importance_threshold = var_importance_threshold / np.sum(var_importance)
                var_importance = var_importance / np.sum(var_importance)

            # Filter variables
            var_table_train = var_table_train.iloc[:, var_importance >= var_importance_threshold]

            # Check if the variable table is empty after filtering
            if var_table_train.shape[1] == 0:
                raise ValueError('Variable table is empty after variable importance filtering. Use a smaller threshold.')

            # Suggested scale_pos_weight: count of negative instances over
            # count of positive instances (per the XGBoost documentation)
            scale_pos_weight = (outcome_table_binary_train == 0).sum().values[0] \
                / (outcome_table_binary_train == 1).sum().values[0]

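            # e.g. 30 negative vs 10 positive training patients gives
            # scale_pos_weight = 30 / 10 = 3.0 (counts hypothetical)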
            # XGB Classifier
            classifier = XGBClassifier(scale_pos_weight=scale_pos_weight)

            # Tune XGBoost parameters
            params = {
                'max_depth': [3, 4, 5],
                'learning_rate': [0.1, 0.01, 0.001],
                'n_estimators': [50, 100, 200]
            }

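            # 3 * 3 * 3 = 27 candidate settings: grid search fits all of them
            # across cv=5 folds (135 fits), while random search samples n_iter
            # of them (10 by default in scikit-learn)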
            if method.lower() == "grid_search":
                # Set up grid search with cross-validation
                grid_search = GridSearchCV(
                    estimator=classifier,
                    param_grid=params,
                    cv=5,
                    n_jobs=-1,
                    verbose=3,
                    scoring='matthews_corrcoef'
                )
            elif method.lower() == "random_search":
                # Set up random search with cross-validation
                grid_search = RandomizedSearchCV(
                    estimator=classifier,
                    param_distributions=params,
                    cv=5,
                    n_jobs=-1,
                    verbose=3,
                    scoring='matthews_corrcoef'
                )
            else:
                raise NotImplementedError(f'Method: {method} not recognized. Use "grid_search", "random_search", "auto" or "pycaret".')

            # Fit the grid search
            grid_search.fit(var_table_train, outcome_table_binary_train)

            # Get the best parameters
            best_params = grid_search.best_params_

            # Fit the XGB Classifier with the best parameters
            classifier = XGBClassifier(**best_params)
            classifier.fit(var_table_train, outcome_table_binary_train)

        # Saving the information of the model in a dictionary
        model_xgb = dict()
        model_xgb['algo'] = 'xgb'
        model_xgb['type'] = 'binary'
        model_xgb['method'] = method
        if optimal_threshold:
            model_xgb['threshold'] = optimal_threshold
        else:
            try:
                model_xgb['threshold'] = self.__find_balanced_threshold(classifier, var_table_train, outcome_table_binary_train)
            except Exception as e:
                print('Error in finding optimal threshold, it will be set to 0.5: ' + str(e))
                model_xgb['threshold'] = 0.5
        model_xgb['model'] = classifier
        model_xgb['var_names'] = list(classifier.feature_names_in_)
        model_xgb['var_info'] = deepcopy(var_table_train.Properties['userData'])
        if method == "auto":
            model_xgb['optimization'] = "auto"
        elif method == "pycaret":
            model_xgb['optimization'] = classifier.get_params()
        else:
            model_xgb['optimization'] = best_params

        return model_xgb

    def test_xgb_model(self, model_dict: Dict, variable_table: pd.DataFrame, patient_list: List) -> List:
        """
        Tests the XGBoost model for the given dataset patients.

        Args:
            model_dict (Dict): Dictionary containing info about the trained XGBoost model.
            variable_table (pd.DataFrame): Radiomics table for the test set (should not be normalized).
            patient_list (List): List of patient lists to test (e.g. training and test sets).

        Returns:
            List: One model-response array per patient list, in the same order.
        """
        # Initialization
        n_test = len(patient_list)
        var_names = model_dict['var_names']
        var_def = model_dict['var_info']['variables']['var_def']
        model_response = list()

        # Preparing the variable table
        variable_table = get_ml_test_table(variable_table, var_names, var_def)

        # Test the model
        for i in range(n_test):
            # Get the patient IDs
            patient_ids = patient_list[i]

            # Getting predictions for each patient
            n_patients = len(patient_ids)
            varargout = np.zeros((n_patients, 1)) * np.nan  # NaN if the computation fails
            for p in range(n_patients):
                try:
                    varargout[p] = self.predict_xgb(model_dict['model'], variable_table.loc[[patient_ids[p]], :])
                except Exception as e:
                    print('Error in computing prediction for patient ' + str(patient_ids[p]) + ': ' + str(e))
                    varargout[p] = np.nan

            # Save the predictions
            model_response.append(varargout)

        return model_response

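    # Illustrative call (patient id lists hypothetical):
    #   response_train, response_test = self.test_xgb_model(
    #       model_dict, variable_table, [patients_train, patients_test])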
    def predict_xgb(self, xgb_model: XGBClassifier, variable_table: pd.DataFrame) -> float:
        """
        Computes the prediction of the XGBoost model for the given single-row variable table.

        Args:
            xgb_model (XGBClassifier): XGBClassifier model.
            variable_table (pd.DataFrame): Single-row variable table for the prediction.

        Returns:
            float: Predicted probability of the positive class.
        """

        # Predictions
        predictions = xgb_model.predict_proba(variable_table)

        # Get the probability of the positive class (first and only row)
        predictions = predictions[:, 1][0]

        return predictions

    def ml_run(self, path_ml: Path, holdout_test: bool = True, method: str = 'auto') -> None:
        """
        This function runs the machine learning test for the created experiment.

        Args:
            path_ml (Path): Path to the main dictionary containing info about the current ml experiment.
            holdout_test (bool, optional): Boolean specifying if the hold-out test should be performed.
            method (str, optional): String specifying the method to use to train the XGBoost model
                (used when the ml settings do not specify one).

        Returns:
            None.
        """
        # Set up logging file for the batch
        log_file = os.path.dirname(path_ml) + '/batch.log'
        logging.basicConfig(filename=log_file, level=logging.INFO, format='%(message)s', filemode='w')

        # Start the timer
        batch_start = time.time()

        logging.info("\n\n********************MACHINE LEARNING RUN********************\n\n")

        # --> A. Initialization phase
        # Load the test dictionary and machine learning information
        ml_dict_paths = load_json(path_ml)  # Test information dictionary
        ml_info_dict = self.__load_ml_info(ml_dict_paths)  # Machine learning information dictionary

        # Machine learning assets
        patients_train = ml_info_dict['patientsTrain']
        patients_test = ml_info_dict['patientsTest']
        patients_holdout = load_json(self.path_study / 'patientsHoldOut.json') if holdout_test else None
        outcome_table_binary = ml_info_dict['outcome_table_binary']
        ml = ml_info_dict['ml']
        path_results = ml_info_dict['path_results']
        ml['path_results'] = path_results

        # --> B. Machine Learning phase
        # B.1. Pre-processing features
        start = time.time()
        logging.info("\n\n--> PRE-PROCESSING TRAINING VARIABLES")

        # Not all variables will be used to train the model, only the user-selected variable
        var_id = str(ml['variables']['varStudy'])

        # Pre-processing of the radiomics tables/variables
        processed_training_table, processed_testing_table = self.pre_process_radiomics_table(
            ml,
            var_id,
            outcome_table_binary.copy(),
            patients_train
        )
        logging.info(f"...Done in {time.time() - start} s")

        # B.2. Pre-learning initialization
        # Patient definitions (training and test sets)
        patient_ids = list(outcome_table_binary.index)
        patients_train = intersect(intersect(patient_ids, patients_train), processed_training_table.index)
        patients_test = intersect(intersect(patient_ids, patients_test), processed_testing_table.index)
        patients_holdout = intersect(patient_ids, patients_holdout) if holdout_test else None

        # Initializing outcome tables for training and test sets
        outcome_table_binary_train = outcome_table_binary.loc[patients_train, :]
        outcome_table_binary_test = outcome_table_binary.loc[patients_test, :]
        outcome_table_binary_holdout = outcome_table_binary.loc[patients_holdout, :] if holdout_test else None

        # Separate variable table for training sets (repetitive but double-checking)
        var_table_train = processed_training_table.loc[patients_train, :]

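        # Assumed ml['algorithms']['XGBoost'] keys (values illustrative):
        #   {'varImportanceThreshold': 0.05, 'optimalThreshold': None,
        #    'optimizationMetric': 'MCC', 'method': 'pycaret', 'useGPU': False,
        #    'seed': 42, 'nameSave': 'XGBoost'}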
        # Initializing XGBoost model settings
        var_importance_threshold = ml['algorithms']['XGBoost']['varImportanceThreshold']
        optimal_threshold = ml['algorithms']['XGBoost']['optimalThreshold']
        optimization_metric = ml['algorithms']['XGBoost']['optimizationMetric']
        method = ml['algorithms']['XGBoost']['method'] if 'method' in ml['algorithms']['XGBoost'].keys() else method
        use_gpu = ml['algorithms']['XGBoost']['useGPU'] if 'useGPU' in ml['algorithms']['XGBoost'].keys() else True
        seed = ml['algorithms']['XGBoost']['seed'] if 'seed' in ml['algorithms']['XGBoost'].keys() else None

        # B.3. Training the XGBoost model
        tstart = time.time()
        logging.info(f"\n\n--> TRAINING XGBOOST MODEL FOR VARIABLE {var_id}")

        # Training the model
        model = self.train_xgboost_model(
            var_table_train,
            outcome_table_binary_train,
            var_importance_threshold,
            optimal_threshold,
            method=method,
            use_gpu=use_gpu,
            optimization_metric=optimization_metric,
            seed=seed
        )

        # Saving the trained model using pickle
        name_save_model = ml['algorithms']['XGBoost']['nameSave']
        model_id = name_save_model + '_' + str(ml['variables']['varStudy'])
        path_model = os.path.dirname(path_results) + '/' + (model_id + '.pickle')
        model_dict = save_model(model, str(ml['variables']['varStudy']), path_model, ml=ml)

        logging.info("{}--> DONE. TOTAL TIME OF LEARNING PROCESS: {:.2f} min".format(" " * 4, (time.time() - tstart) / 60))

        # --> C. Testing phase
        # C.1. Testing the XGBoost model and computing model response
        tstart = time.time()
        logging.info(f"\n\n--> TESTING XGBOOST MODEL FOR VARIABLE {var_id}")

        response_train, response_test = self.test_xgb_model(
            model,
            processed_testing_table,
            [patients_train, patients_test]
        )

        logging.info('{}--> DONE. TOTAL TIME OF TESTING PROCESS: {:.2f} min'.format(" " * 4, (time.time() - tstart) / 60))

        if holdout_test:
            # --> D. Hold-out set testing phase
            # D.1. Prepare holdout test data
            var_table_all_holdout = self.get_hold_out_set_table(ml, var_id, patients_holdout)

            # D.2. Testing the XGBoost model and computing model response on the holdout set
            tstart = time.time()
            logging.info(f"\n\n--> TESTING XGBOOST MODEL FOR VARIABLE {var_id} ON THE HOLDOUT SET")

            response_holdout = self.test_xgb_model(model, var_table_all_holdout, [patients_holdout])[0]

            logging.info('{}--> DONE. TOTAL TIME OF HOLDOUT TESTING: {:.2f} min'.format(" " * 4, (time.time() - tstart) / 60))

        # E. Computing performance metrics
        tstart = time.time()

        # Initialize the Results class
        result = Results(model_dict, model_id)
        if holdout_test:
            run_results = result.to_json(
                response_train=response_train,
                response_test=response_test,
                response_holdout=response_holdout,
                patients_train=patients_train,
                patients_test=patients_test,
                patients_holdout=patients_holdout
            )
        else:
            run_results = result.to_json(
                response_train=response_train,
                response_test=response_test,
                response_holdout=None,
                patients_train=patients_train,
                patients_test=patients_test,
                patients_holdout=None
            )

        # Calculating performance metrics for training phase and saving the ROC curve
        run_results[model_id]['train']['metrics'] = result.get_model_performance(
            response_train,
            outcome_table_binary_train,
        )

        # Calculating performance metrics for testing phase and saving the ROC curve
        run_results[model_id]['test']['metrics'] = result.get_model_performance(
            response_test,
            outcome_table_binary_test,
        )

        if holdout_test:
            # Calculating performance metrics for holdout phase and saving the ROC curve
            run_results[model_id]['holdout']['metrics'] = result.get_model_performance(
                response_holdout,
                outcome_table_binary_holdout,
            )

        logging.info('\n\n--> COMPUTING PERFORMANCE METRICS ... Done in {:.2f} sec'.format(time.time() - tstart))

        # F. Saving the results dictionary
        save_json(path_results, run_results, cls=NumpyEncoder)

        # Total computing time
        logging.info("\n\n*********************************************************************")
        logging.info('{} TOTAL COMPUTATION TIME: {:.2f} hours'.format(" " * 13, (time.time() - batch_start) / 3600))
        logging.info("*********************************************************************")

    def run_experiment(self, holdout_test: bool = True, method: str = "pycaret") -> None:
        """
        Runs the machine learning experiment for each split/run.

        Args:
            holdout_test (bool, optional): Boolean specifying if the hold-out test should be performed.
            method (str, optional): String specifying the method to use to train the XGBoost model.
                - "pycaret": Use PyCaret to train the model (automatic).
                - "grid_search": Grid search with cross-validation to find the best parameters.
                - "random_search": Random search with cross-validation to find the best parameters.

        Returns:
            None
        """
        # Initialize the DesignExperiment class
        experiment = DesignExperiment(self.path_study, self.path_settings, self.experiment_label)

        # Generate the machine learning experiment
        path_file_ml_paths = experiment.generate_experiment()

        # Run the different machine learning tests for the experiment
        tests_dict = load_json(path_file_ml_paths)  # Tests dictionary
        for run in tests_dict.keys():
            self.ml_run(tests_dict[run], holdout_test, method)

        # Average results of the different splits/runs
        average_results(self.path_study / f'learn__{self.experiment_label}', save=True)

        # Analyze the feature importance for all the runs
        feature_imporance_analysis(self.path_study / f'learn__{self.experiment_label}')