mediml-0.9.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- MEDiml/MEDscan.py +1696 -0
- MEDiml/__init__.py +21 -0
- MEDiml/biomarkers/BatchExtractor.py +806 -0
- MEDiml/biomarkers/BatchExtractorTexturalFilters.py +840 -0
- MEDiml/biomarkers/__init__.py +16 -0
- MEDiml/biomarkers/diagnostics.py +125 -0
- MEDiml/biomarkers/get_oriented_bound_box.py +158 -0
- MEDiml/biomarkers/glcm.py +1602 -0
- MEDiml/biomarkers/gldzm.py +523 -0
- MEDiml/biomarkers/glrlm.py +1315 -0
- MEDiml/biomarkers/glszm.py +555 -0
- MEDiml/biomarkers/int_vol_hist.py +527 -0
- MEDiml/biomarkers/intensity_histogram.py +615 -0
- MEDiml/biomarkers/local_intensity.py +89 -0
- MEDiml/biomarkers/morph.py +1756 -0
- MEDiml/biomarkers/ngldm.py +780 -0
- MEDiml/biomarkers/ngtdm.py +414 -0
- MEDiml/biomarkers/stats.py +373 -0
- MEDiml/biomarkers/utils.py +389 -0
- MEDiml/filters/TexturalFilter.py +299 -0
- MEDiml/filters/__init__.py +9 -0
- MEDiml/filters/apply_filter.py +134 -0
- MEDiml/filters/gabor.py +215 -0
- MEDiml/filters/laws.py +283 -0
- MEDiml/filters/log.py +147 -0
- MEDiml/filters/mean.py +121 -0
- MEDiml/filters/textural_filters_kernels.py +1738 -0
- MEDiml/filters/utils.py +107 -0
- MEDiml/filters/wavelet.py +237 -0
- MEDiml/learning/DataCleaner.py +198 -0
- MEDiml/learning/DesignExperiment.py +480 -0
- MEDiml/learning/FSR.py +667 -0
- MEDiml/learning/Normalization.py +112 -0
- MEDiml/learning/RadiomicsLearner.py +714 -0
- MEDiml/learning/Results.py +2237 -0
- MEDiml/learning/Stats.py +694 -0
- MEDiml/learning/__init__.py +10 -0
- MEDiml/learning/cleaning_utils.py +107 -0
- MEDiml/learning/ml_utils.py +1015 -0
- MEDiml/processing/__init__.py +6 -0
- MEDiml/processing/compute_suv_map.py +121 -0
- MEDiml/processing/discretisation.py +149 -0
- MEDiml/processing/interpolation.py +275 -0
- MEDiml/processing/resegmentation.py +66 -0
- MEDiml/processing/segmentation.py +912 -0
- MEDiml/utils/__init__.py +25 -0
- MEDiml/utils/batch_patients.py +45 -0
- MEDiml/utils/create_radiomics_table.py +131 -0
- MEDiml/utils/data_frame_export.py +42 -0
- MEDiml/utils/find_process_names.py +16 -0
- MEDiml/utils/get_file_paths.py +34 -0
- MEDiml/utils/get_full_rad_names.py +21 -0
- MEDiml/utils/get_institutions_from_ids.py +16 -0
- MEDiml/utils/get_patient_id_from_scan_name.py +22 -0
- MEDiml/utils/get_patient_names.py +26 -0
- MEDiml/utils/get_radiomic_names.py +27 -0
- MEDiml/utils/get_scan_name_from_rad_name.py +22 -0
- MEDiml/utils/image_reader_SITK.py +37 -0
- MEDiml/utils/image_volume_obj.py +22 -0
- MEDiml/utils/imref.py +340 -0
- MEDiml/utils/initialize_features_names.py +62 -0
- MEDiml/utils/inpolygon.py +159 -0
- MEDiml/utils/interp3.py +43 -0
- MEDiml/utils/json_utils.py +78 -0
- MEDiml/utils/mode.py +31 -0
- MEDiml/utils/parse_contour_string.py +58 -0
- MEDiml/utils/save_MEDscan.py +30 -0
- MEDiml/utils/strfind.py +32 -0
- MEDiml/utils/textureTools.py +188 -0
- MEDiml/utils/texture_features_names.py +115 -0
- MEDiml/utils/write_radiomics_csv.py +47 -0
- MEDiml/wrangling/DataManager.py +1724 -0
- MEDiml/wrangling/ProcessDICOM.py +512 -0
- MEDiml/wrangling/__init__.py +3 -0
- mediml-0.9.9.dist-info/LICENSE.md +674 -0
- mediml-0.9.9.dist-info/METADATA +232 -0
- mediml-0.9.9.dist-info/RECORD +78 -0
- mediml-0.9.9.dist-info/WHEEL +4 -0
@@ -0,0 +1,1015 @@
import csv
import json
import os
import pickle
import re
import string
from copy import deepcopy
from pathlib import Path
from typing import Dict, List, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas
import pandas as pd
import seaborn as sns
from numpyencoder import NumpyEncoder
from sklearn.model_selection import StratifiedKFold

from MEDiml.utils import get_institutions_from_ids
from MEDiml.utils.get_full_rad_names import get_full_rad_names
from MEDiml.utils.json_utils import load_json, save_json


# Define useful constants
# Metrics to process
list_metrics = [
    'AUC', 'AUPRC', 'BAC', 'Sensitivity', 'Specificity',
    'Precision', 'NPV', 'F1_score', 'Accuracy', 'MCC',
    'TN', 'FP', 'FN', 'TP'
]

def average_results(path_results: Path, save: bool = False) -> Union[Path, Dict]:
    """
    Averages the results (AUC, BAC, Sensitivity, Specificity, etc.) of all the runs of the same
    experiment, for the training, testing and holdout sets.

    Args:
        path_results (Path): Path to the folder containing the results of the experiment.
        save (bool, optional): If True, saves the averaged results in the same folder as the results.

    Returns:
        Union[Path, Dict]: Path to the saved JSON file if ``save`` is True, otherwise the dictionary
        of averaged metrics.
    """
    # Get all tests paths
    list_path_tests = [path for path in path_results.iterdir() if path.is_dir()]

    # Initialize dictionaries
    results_avg = {
        'train': {},
        'test': {},
        'holdout': {}
    }

    # Metrics to process
    metrics = ['AUC', 'AUPRC', 'BAC', 'Sensitivity', 'Specificity',
               'Precision', 'NPV', 'F1_score', 'Accuracy', 'MCC',
               'TN', 'FP', 'FN', 'TP']

    # Process metrics
    for dataset in ['train', 'test', 'holdout']:
        dataset_dict = results_avg[dataset]
        for metric in metrics:
            metric_values = []
            for path_test in list_path_tests:
                results_dict = load_json(path_test / 'run_results.json')
                if dataset in results_dict[list(results_dict.keys())[0]].keys():
                    if 'metrics' in results_dict[list(results_dict.keys())[0]][dataset].keys():
                        metric_values.append(results_dict[list(results_dict.keys())[0]][dataset]['metrics'][metric])
                    else:
                        continue
                else:
                    continue

            # Fill the dictionary
            if metric_values:
                dataset_dict[f'{metric}_mean'] = np.nanmean(metric_values)
                dataset_dict[f'{metric}_std'] = np.nanstd(metric_values)
                dataset_dict[f'{metric}_max'] = np.nanmax(metric_values)
                dataset_dict[f'{metric}_min'] = np.nanmin(metric_values)
                dataset_dict[f'{metric}_2.5%'] = np.nanpercentile(metric_values, 2.5)
                dataset_dict[f'{metric}_97.5%'] = np.nanpercentile(metric_values, 97.5)

    # Save the results
    if save:
        save_json(path_results / 'results_avg.json', results_avg, cls=NumpyEncoder)
        return path_results / 'results_avg.json'

    return results_avg

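A minimal usage sketch for average_results(), assuming an experiment folder in which every run subfolder holds a 'run_results.json' file; the folder name below is hypothetical:

from pathlib import Path

from MEDiml.learning.ml_utils import average_results

path_results = Path('/home/myStudy/RESULTS/experiment_001')   # hypothetical results folder
results_avg = average_results(path_results)                   # dict of averaged metrics per set
path_json = average_results(path_results, save=True)          # also writes results_avg.json and returns its path
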
def combine_rad_tables(rad_tables: List) -> pd.DataFrame:
    """
    Combines a list of radiomics tables into one single table.

    Args:
        rad_tables (List): List of radiomics tables.

    Returns:
        pd.DataFrame: Single combined radiomics table.
    """
    # Initialization
    n_tables = len(rad_tables)

    base_idx = 0
    for idx, table in enumerate(rad_tables):
        if not table.empty:
            base_idx = idx
            break
    # Finding patient intersection
    for t in range(n_tables):
        if rad_tables[t].shape[1] > 0 and t != base_idx:
            rad_tables[base_idx], rad_tables[t] = intersect_var_tables(rad_tables[base_idx], rad_tables[t])

    # Check for NaNs
    '''for table in rad_tables:
        assert(table.isna().sum().sum() == 0)'''

    # Initializing the radiomics table template
    radiomics_table = pd.DataFrame()
    radiomics_table.Properties = {}
    radiomics_table._metadata += ['Properties']
    radiomics_table.Properties['userData'] = {}
    radiomics_table.Properties['VariableNames'] = []
    radiomics_table.Properties['userData']['normalization'] = {}

    # Combining the radiomics tables one by one
    count = 0
    continuous = []
    str_names = '||'
    for t in range(n_tables):
        rad_table_id = 'radTab' + str(t+1)
        if rad_tables[t].shape[1] > 0 and rad_tables[t].shape[0] > 0:
            features = rad_tables[t].columns.values
            description = rad_tables[t].Properties['Description']
            full_rad_names = get_full_rad_names(rad_tables[t].Properties['userData']['variables']['var_def'],
                                                features)
            if 'normalization' in rad_tables[t].Properties['userData']:
                radiomics_table.Properties['userData']['normalization'][rad_table_id] = rad_tables[t].Properties[
                    'userData']['normalization']
            for f, feature in enumerate(features):
                count += 1
                var_name = 'radVar' + str(count)
                radiomics_table[var_name] = rad_tables[t][feature]
                radiomics_table.Properties['VariableNames'].append(var_name)
                continuous.append(var_name)
                if description:
                    str_names += 'radVar' + str(count) + ':' + description + '___' + full_rad_names[f] + '||'
                else:
                    str_names += 'radVar' + str(count) + ':' + full_rad_names[f] + '||'

    # Updating the radiomics table properties
    radiomics_table.Properties['Description'] = ''
    radiomics_table.Properties['DimensionNames'] = ['PatientID']
    radiomics_table.Properties['userData']['variables'] = {}
    radiomics_table.Properties['userData']['variables']['var_def'] = str_names
    radiomics_table.Properties['userData']['variables']['continuous'] = continuous

    return radiomics_table

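An illustrative sketch of how combine_rad_tables() might be fed, assuming feature tables written in the CSV/TXT pair format expected by get_radiomics_table() (defined further below); the file paths are hypothetical:

from pathlib import Path

from MEDiml.learning.ml_utils import combine_rad_tables, get_radiomics_table

features = Path('/home/myStudy/FEATURES')                     # hypothetical feature folder
table_pet = get_radiomics_table(features / 'radiomics__PET(GTV)__image.csv',
                                features / 'radiomics__PET(GTV)__image.txt',
                                image_type='PET(GTV)__image')
table_ct = get_radiomics_table(features / 'radiomics__CT(GTV)__image.csv',
                               features / 'radiomics__CT(GTV)__image.txt',
                               image_type='CT(GTV)__image')
combined = combine_rad_tables([table_pet, table_ct])          # columns renamed radVar1..radVarN
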
def combine_tables_from_list(var_list: List, combination: List) -> pd.DataFrame:
    """
    Concatenates the variable tables of ``var_list`` selected by ``combination``.

    Unlike ``combine_rad_tables``, this method concatenates variable tables instead of creating a new
    table from the intersection of the tables.

    Args:
        var_list (List): Collection of variable tables, indexed by var_id.
            --> Ex: .var1: variable table 1
                    .var2: variable table 2
                    .var3: variable table 3
        combination (List): List of strings identifying the tables of ``var_list`` to combine.
            --> Ex: ['var1', 'var3']

    Returns:
        pd.DataFrame: variable_table: Combined radiomics table.
    """
    def concatenate_varid(var_names, var_id):
        return np.asarray([var_id + "__" + var_name for var_name in var_names.tolist()])

    # Initialization
    variables = dict()
    variables['continuous'] = np.array([])
    variable_tables = list()

    # Using the first table as template
    var_id = combination[0]
    variable_table = deepcopy(var_list[var_id])  # first table from the list
    variable_table.Properties = deepcopy(var_list[var_id].Properties)
    new_columns = [var_id + '__' + col for col in variable_table.columns]
    variable_table.columns = new_columns
    variable_table.Properties['VariableNames'] = new_columns
    variable_table.Properties['userData'] = dict()  # Re-initializing
    variable_table.Properties['userData'][var_id] = deepcopy(var_list[var_id].Properties['userData'])
    variables['continuous'] = np.concatenate((variables['continuous'], var_list[var_id].Properties[
        'userData']['variables']['continuous']))
    variable_tables.append(variable_table)

    # Concatenating all other tables
    for var_id in combination[1:]:
        variable_table.Properties['userData'][var_id] = var_list[var_id].Properties['userData']
        patient_ids = intersect(list(variable_table.index), (var_list[var_id].index))
        var_list[var_id] = var_list[var_id].loc[patient_ids]
        variable_table = variable_table.loc[patient_ids]
        old_columns = list(variable_table.columns)
        old_properties = deepcopy(variable_table.Properties)  # for unknown reason Properties are erased after concat
        variable_table = pd.concat([variable_table, var_list[var_id]], axis=1)
        variable_table.columns = old_columns + [var_id + "__" + col for col in var_list[var_id].columns]
        variable_table.Properties = old_properties
        variable_table.Properties['VariableNames'] = list(variable_table.columns)
        variables['continuous'] = np.concatenate((variables['continuous'], var_list[var_id].Properties['userData']['variables']['continuous']))

    # Updating the radiomics table properties
    variable_table.Properties['Description'] = "Data table"
    variables['continuous'] = concatenate_varid(variables['continuous'], var_id)
    variable_table.Properties['userData']['variables'] = variables

    return variable_table

def convert_comibnations_to_list(combinations_string: str) -> Tuple[List, List]:
    """
    Converts a list of strings specifying variable id combinations into
    a list of lists of strings.

    Args:
        combinations_string (str): List of strings specifying var_id combinations
            separated by underscores.
            --> Ex: ['var1_var2', 'var2_var3', 'var1_var2_var3']

    Returns:
        - List: List of lists of the separated var_ids.
            --> Ex: [['var1', 'var2'], ['var2', 'var3'], ['var1', 'var2', 'var3']]
        - List: List of strings specifying the "alphabetical" IDs of the combined variables
            in ``combinations``. var1 --> A, var2 --> B, etc.
            --> Ex: ['model_AB', 'model_BC', 'model_ABC']
    """
    # Building combinations
    combinations = [s.split('_') for s in combinations_string]

    # Building model_ids
    alphabet = string.ascii_uppercase
    model_ids = list()
    for combination in combinations:
        model_ids.append('model_' + ''.join([alphabet[int(var[3:])-1] for var in combination]))

    return combinations, model_ids

def count_class_imbalance(path_csv_outcomes: Path) -> Dict:
    """
    Counts the class imbalance in a given outcome table.

    Args:
        path_csv_outcomes (Path): Path to the outcome table.

    Returns:
        Dict: Dictionary containing the proportion of each class.
    """
    # Initialization
    outcomes = pandas.read_csv(path_csv_outcomes, sep=',')
    outcomes.dropna(inplace=True)
    outcomes.reset_index(inplace=True, drop=True)
    name_outcome = outcomes.columns[-1]

    # Computing the proportion of each class
    class_0_perc = np.sum(outcomes[name_outcome] == 0) / len(outcomes)
    class_1_perc = np.sum(outcomes[name_outcome] == 1) / len(outcomes)

    return {'class_0': class_0_perc, 'class_1': class_1_perc}

def create_experiment_folder(path_outcome_folder: str, method: str = 'Random') -> str:
    """
    Creates the experiment folder where the hold-out splits will be saved and returns the path
    to the folder.

    Args:
        path_outcome_folder (str): Full path to the outcome folder (folder containing the outcome table, etc.).
        method (str): String specifying the split type. Default is 'Random'.

    Returns:
        str: Full path to the experiment folder.
    """

    # Creating the outcome folder if it does not exist
    if not os.path.isdir(path_outcome_folder):
        os.makedirs(path_outcome_folder)

    # Counting the experiment folders that already exist for this split method
    list_outcome = os.listdir(path_outcome_folder)
    if not list_outcome:
        flag_exist_split = False
    else:
        n_exist = 0
        flag_exist_split = False
        for i in range(len(list_outcome)):
            if 'holdOut__' + method + '__' in list_outcome[i]:
                n_exist = n_exist + 1
                flag_exist_split = True

    # If experiment folders already exist, create the next one in the sequence
    if not flag_exist_split:
        path_split = str(path_outcome_folder) + '/holdOut__' + method + '__001'
    else:
        path_split = str(path_outcome_folder) + '/holdOut__' + method + '__' + \
            str(n_exist+1).zfill(3)

    os.mkdir(path_split)
    return path_split

def create_holdout_set(
        path_outcome_file: Union[str, Path],
        outcome_name: str,
        path_save_experiments: Union[str, Path] = None,
        method: str = 'random',
        percentage: float = 0.2,
        n_split: int = 1,
        seed: int = 1) -> Tuple[str, str]:
    """
    Creates a hold-out patient set to be used for final independent testing after a final
    model is chosen. All the information is saved in a JSON file.

    Args:
        path_outcome_file (str): Full path to where the outcome CSV file is stored.
        outcome_name (str): Name of the outcome. For example, 'OS' for overall survival.
        path_save_experiments (str): Full path to the folder where the experiments
            will be saved.
        method (str): Method to use for creating the hold-out set. Options are:
            - 'random': Randomly selects patients for the hold-out set.
            - 'all_learn': No hold-out set is created. All patients are used for learning.
            - 'institution': TODO.
        percentage (float): Percentage of patients to use for the hold-out set. Default is 0.2.
        n_split (int): Number of splits to create. Default is 1.
        seed (int): Seed to use for the random split. Default is 1.

    Returns:
        Tuple[str, str]: Path to the experiment (split) folder and path to the saved
        ``paths_exp.json`` file.
    """
    # Initialization
    outcome_name = outcome_name.upper()
    outcome_table = pandas.read_csv(path_outcome_file, sep=',')
    outcome_table.dropna(inplace=True)
    outcome_table.reset_index(inplace=True, drop=True)
    patient_ids = outcome_table['PatientID']

    # Creating experiment folders and patient test split(s)
    outcome_name = re.sub(r'\W', "", outcome_name)
    path_outcome = str(path_save_experiments) + '/' + outcome_name
    name_outcome_in_table_binary = outcome_name + '_binary'

    # Column names in the outcome table
    with open(path_outcome_file, 'r') as infile:
        reader = csv.DictReader(infile, delimiter=',')
        var_names = reader.fieldnames

    # Include time to event if it exists
    flag_time = False
    if outcome_name + '_eventFreeTime' in str(var_names):
        name_outcome_in_table_time = outcome_name + '_eventFreeTime'
        flag_time = True

    # Check if the outcome name for binary is correct
    if name_outcome_in_table_binary not in outcome_table.columns:
        name_outcome_in_table_binary = var_names[-1]

    # Run the split
    # Random
    if 'random' in method.lower():
        # Creating the experiment folder
        path_split = create_experiment_folder(path_outcome, 'random')

        # Getting the random split
        patients_learn_temp, patients_hold_out_temp = get_stratified_splits(
            outcome_table[['PatientID', name_outcome_in_table_binary]],
            n_split, percentage, seed, False)

        # Getting the patient IDs in the learning and hold-out sets
        if n_split > 1:
            patients_learn = np.empty((n_split, len(patients_learn_temp[0])), dtype=object)
            patients_hold_out = np.empty((n_split, len(patients_hold_out_temp[0])), dtype=object)
            for s in range(n_split):
                patients_learn[s] = patient_ids[patients_learn_temp[s]]
                patients_hold_out[s] = patient_ids[patients_hold_out_temp[s]]
        else:
            patients_learn = patient_ids[patients_learn_temp.values.tolist()]
            patients_learn.reset_index(inplace=True, drop=True)
            patients_hold_out = patient_ids[patients_hold_out_temp.values.tolist()]
            patients_hold_out.reset_index(inplace=True, drop=True)

    # All learn
    elif 'all_learn' in method.lower():
        # Creating the experiment folder
        path_split = create_experiment_folder(path_outcome, 'all_learn')

        # Getting the split (all learn, so no hold-out)
        patients_learn = patient_ids
        patients_hold_out = []
    else:
        raise ValueError('Method not recognized. Use "random" or "all_learn".')

    # Creating final outcome table and saving it
    if flag_time:
        outcomes = outcome_table[
            ['PatientID', name_outcome_in_table_binary, name_outcome_in_table_time]]
    else:
        outcomes = outcome_table[['PatientID', name_outcome_in_table_binary]]

    # Finalize the outcome table
    outcomes = outcomes.dropna(inplace=False)  # Drop NaNs
    outcomes.reset_index(inplace=True, drop=True)  # Reset index

    # Save the outcome table
    paths_exp_outcomes = str(path_split + '/outcomes.csv')
    outcomes.to_csv(paths_exp_outcomes, index=False)

    # Save dict of patientsLearn
    paths_exp_patientsLearn = str(path_split) + '/patientsLearn.json'
    patients_learn.to_json(paths_exp_patientsLearn, orient='values', indent=4)

    # Save dict of patientsHoldOut
    if method == 'random':
        paths_exp_patients_hold_out = str(path_split) + '/patientsHoldOut.json'
        patients_hold_out.to_json(paths_exp_patients_hold_out, orient='values', indent=4)

        # Save dict of all the paths
        data = {
            "outcomes": paths_exp_outcomes,
            "patientsLearn": paths_exp_patientsLearn,
            "patientsHoldOut": paths_exp_patients_hold_out,
            "pathWORK": path_split
        }
    else:
        data = {
            "outcomes": paths_exp_outcomes,
            "patientsLearn": paths_exp_patientsLearn,
            "pathWORK": path_split
        }
    paths_exp = str(path_split + '/paths_exp.json')
    with open(paths_exp, 'w') as f:
        json.dump(data, f, indent=4)

    # Return the path to the experiment folder and the path to the split description
    return path_split, paths_exp

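A hedged example of create_holdout_set(); it assumes a hypothetical outcomes CSV with a 'PatientID' column and an 'OS_binary' column (plus optionally 'OS_eventFreeTime'):

from MEDiml.learning.ml_utils import create_holdout_set

path_split, paths_exp = create_holdout_set(
    path_outcome_file='/home/myStudy/outcomes.csv',       # hypothetical outcome table
    outcome_name='OS',
    path_save_experiments='/home/myStudy/EXPERIMENTS',    # hypothetical experiments folder
    method='random',
    percentage=0.2,
    n_split=1,
    seed=42)
# path_split points to .../OS/holdOut__random__001, paths_exp to its paths_exp.json
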
def cross_validation_split(
        outcome: pd.Series,
        n_splits: int = 5,
        seed: int = None
        ) -> Tuple[np.ndarray, np.ndarray]:
    """
    Performs a stratified cross-validation split.

    Args:
        outcome (pd.Series): Binary outcome variable, indexed by patient ID.
        n_splits (int, optional): Number of folds. Default is 5.
        seed (int or None, optional): Random seed for reproducibility. Default is None.

    Returns:
        train_data_array (np.ndarray): Array of training patient IDs for each fold.
        test_data_array (np.ndarray): Array of testing patient IDs for each fold.
    """

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    train_data_list = []
    test_data_list = []
    patient_ids = pd.Series(outcome.index)

    for train_indices, test_indices in skf.split(X=outcome, y=outcome):
        train_data_list.append(patient_ids[train_indices])
        test_data_list.append(patient_ids[test_indices])

    train_data_array = np.array(train_data_list, dtype=object)
    test_data_array = np.array(test_data_list, dtype=object)

    return train_data_array, test_data_array

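A self-contained sketch of cross_validation_split() on a toy outcome Series; the patient IDs are made up for illustration:

import pandas as pd

from MEDiml.learning.ml_utils import cross_validation_split

outcome = pd.Series([0, 1, 0, 1, 1, 0, 1, 0, 1, 0],
                    index=[f'Study-00{i}' for i in range(10)])   # hypothetical patient IDs
train_folds, test_folds = cross_validation_split(outcome, n_splits=5, seed=42)
# train_folds[k] and test_folds[k] hold the patient IDs of fold k
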
def find_best_model(path_results: Path, metric: str = 'AUC', second_metric: str = 'AUC') -> Tuple[Dict, Dict]:
    """
    Finds the model with the highest performance on the test set
    in a given path, based on a given metric.

    Args:
        path_results (Path): Path to the results folder.
        metric (str): Metric used to rank the models. Default is 'AUC'.
        second_metric (str): Metric used to break ties on ``metric``. Default is 'AUC'.

    Returns:
        Tuple[Dict, Dict]: Tuple containing the best (unpickled) model and its result dict.
    """
    list_metrics = [
        'AUC', 'Sensitivity', 'Specificity',
        'BAC', 'AUPRC', 'Precision',
        'NPV', 'Accuracy', 'F1_score', 'MCC',
        'TP', 'TN', 'FP', 'FN'
    ]
    assert metric in list_metrics, f'Given metric {metric} is not in the list of metrics. Please choose from {list_metrics}'

    # Get all tests paths
    list_path_tests = [path for path in path_results.iterdir() if path.is_dir()]

    # Initialization
    metric_best = -1
    second_metric_best = -1
    path_result_best = None

    # Get all models and their metrics (AUC especially)
    for path_test in list_path_tests:
        if not (path_test / 'run_results.json').exists():
            continue
        results_dict = load_json(path_test / 'run_results.json')
        metric_test = results_dict[list(results_dict.keys())[0]]['test']['metrics'][metric]
        if metric_test > metric_best:
            metric_best = metric_test
            path_result_best = path_test
        elif metric_test == metric_best:
            second_metric_test = results_dict[list(results_dict.keys())[0]]['test']['metrics'][second_metric]
            if second_metric_test > second_metric_best:
                second_metric_best = second_metric_test
                path_result_best = path_test

    # Load best model result dict
    results_dict_best = load_json(path_result_best / 'run_results.json')

    # Load model
    model_name = list(results_dict_best.keys())[0]
    with open(path_result_best / f'{model_name}.pickle', 'rb') as file:
        model = pickle.load(file)

    return model, results_dict_best

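An illustrative call to find_best_model(), assuming the folder layout produced by the training pipeline (each run subfolder containing 'run_results.json' and a pickled model named after the run); the path is hypothetical:

from pathlib import Path

from MEDiml.learning.ml_utils import find_best_model

best_model, best_results = find_best_model(
    Path('/home/myStudy/RESULTS/experiment_001'),   # hypothetical results folder
    metric='AUC',
    second_metric='BAC')
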
def feature_imporance_analysis(path_results: Path):
    """
    Averages the feature importances of the models across all the runs of the same experiment
    and counts how many times each feature was selected. The result is saved as
    'feature_importance_analysis.json' in the results folder.

    Args:
        path_results (Path): Path to the folder containing the results of the experiment.

    Returns:
        None.
    """
    # Get all tests paths
    list_path_tests = [path for path in path_results.iterdir() if path.is_dir()]

    # Initialization
    results_avg_temp = {}
    results_avg = {}

    # Process the model of each run
    for path_test in list_path_tests:
        variables = []
        list_models = list(path_test.glob('*.pickle'))
        if len(list_models) == 0 or len(list_models) > 1:
            raise ValueError(f'Path {path_test} does not contain a single model.')
        model_obj = list_models[0]
        with open(model_obj, "rb") as f:
            model_dict = pickle.load(f)
        if model_dict["var_names"]:
            variables = get_full_rad_names(model_dict['var_info']['variables']['var_def'], model_dict["var_names"])
        for index, var in enumerate(variables):
            var = var.split("\\")[-1]  # Remove the path for Windows
            var = var.split("/")[-1]  # Remove the path for Linux
            if var not in results_avg_temp:
                results_avg_temp[var] = {
                    'importance_mean': [],
                    'times_selected': 0
                }

            results_avg_temp[var]['importance_mean'].append(model_dict['model'].feature_importances_[index])
            results_avg_temp[var]['times_selected'] += 1

    # Average over all runs
    for var in results_avg_temp:
        results_avg[var] = {
            'importance_mean': np.sum(results_avg_temp[var]['importance_mean']) / len(list_path_tests),
            'times_selected': results_avg_temp[var]['times_selected']
        }

    del results_avg_temp

    save_json(path_results / 'feature_importance_analysis.json', results_avg, cls=NumpyEncoder)

def get_ml_test_table(variable_table: pd.DataFrame, var_names: List, var_def: str) -> pd.DataFrame:
    """
    Gets the test table with the variables that are present in the training table.

    Args:
        variable_table (pd.DataFrame): Table with the variables to use for the ML model that
            will be matched with the training table.
        var_names (List): List of variable names used for the ML model.
        var_def (str): String of the full variable names used for the ML model.

    Returns:
        pd.DataFrame: Table with the variables that are present in the training table.
    """

    # Get the full variable names for training
    full_radvar_names_trained = get_full_rad_names(var_def, var_names).tolist()

    # Get the full variable names for testing
    full_rad_var_names_test = get_full_rad_names(
        variable_table.Properties['userData']['variables']['var_def'],
        variable_table.columns.values
    ).tolist()

    # Get the indexes of the variables that are present in the training table
    indexes = []
    for radvar in full_radvar_names_trained:
        try:
            indexes.append(full_rad_var_names_test.index(radvar))
        except ValueError as e:
            print(e)
            raise ValueError('The variable ' + radvar + ' is not present in the test table.')

    # Get the test table with the variables that are present in the training table
    variable_table = variable_table.iloc[:, indexes]

    # User data - var_def
    str_names = '||'
    for v in range(len(var_names)):
        str_names += var_names[v] + ':' + full_radvar_names_trained[v] + '||'

    # Update metadata and variable names
    variable_table.columns = var_names
    variable_table.Properties['VariableNames'] = var_names
    variable_table.Properties['userData']['variables']['var_def'] = str_names
    variable_table.Properties['userData']['variables']['continuous'] = var_names

    # Columns now carry the training variable names again
    return variable_table

def finalize_rad_table(rad_table: pd.DataFrame) -> pd.DataFrame:
    """
    Finalizes the variable names and the associated metadata. Used to obtain sequential variable
    names and userData containing only the variable names present in the table.

    Args:
        rad_table (pd.DataFrame): Radiomics table to be finalized.

    Returns:
        pd.DataFrame: Finalized radiomics table.
    """

    # Initialization
    var_names = rad_table.columns.values
    full_rad_names = get_full_rad_names(rad_table.Properties['userData']['variables']['var_def'], var_names)

    # User data - var_def
    str_names = '||'
    for v in range(var_names.size):
        var_names[v] = 'radVar' + str(v+1)
        str_names = str_names + var_names[v] + ':' + full_rad_names[v] + '||'

    # Update metadata and variable names
    rad_table.columns = var_names
    rad_table.Properties['VariableNames'] = var_names
    rad_table.Properties['userData']['variables']['var_def'] = str_names
    rad_table.Properties['userData']['variables']['continuous'] = var_names

    return rad_table

def get_radiomics_table(
        path_radiomics_csv: Path,
        path_radiomics_txt: Path,
        image_type: str,
        patients_ids: List = None
        ) -> pd.DataFrame:
    """
    Loads the radiomics table from the .csv file and the associated metadata.

    Args:
        path_radiomics_csv (Path): Full path to the csv file of the radiomics table.
            --> Ex: /home/myStudy/FEATURES/radiomics__PET(GTV)__image.csv
        path_radiomics_txt (Path): Full path to the radiomics variable definitions in text format
            (associated to path_radiomics_csv).
            --> Ex: /home/myStudy/FEATURES/radiomics__PET(GTV)__image.txt
        image_type (str): String specifying the type of image on which the radiomics
            features were computed.
            --> Format: $scan$($roiType$)__$imSpace$
            --> Ex: PET(tumor)__HHH_coif1
        patients_ids (List, optional): List of strings specifying the patient IDs of
            patients to fetch from the radiomics table. If this
            argument is not present, all patients are fetched.
            --> Ex: ['Cervix-UCSF-001', 'Cervix-McGill-004']

    Returns:
        pd.DataFrame: Radiomics table.
    """
    # Read CSV table
    radiomics_table = pd.read_csv(path_radiomics_csv, index_col=0)
    if patients_ids is not None:
        patients_ids = intersect(patients_ids, list(radiomics_table.index))
        radiomics_table = radiomics_table.loc[patients_ids]

    # Read the associated TXT file
    with open(path_radiomics_txt, 'r') as f:
        user_data = f.read()

    # Grouping the information
    radiomics_table._metadata += ["Properties"]
    radiomics_table.Properties = dict()
    radiomics_table.Properties['userData'] = dict()
    radiomics_table.Properties['userData']['variables'] = dict()
    radiomics_table.Properties['userData']['variables']['var_def'] = user_data
    radiomics_table.Properties['Description'] = image_type

    # Only continuous variables are used for now, but this design will facilitate the use of
    # other categories in the future.
    # radiomics = continuous.
    radiomics_table.Properties['userData']['variables']['continuous'] = np.asarray(list(radiomics_table.columns.values))

    return radiomics_table

def get_splits(outcome: pd.DataFrame, n_split: int, test_split_proportion: float) -> Tuple[List, List]:
    """
    Splits the given outcome table in two sets.

    Args:
        outcome (pd.DataFrame): Table with a single outcome column of 0's and 1's.
        n_split (int): Integer specifying the number of splits to create.
        test_split_proportion (float): Float between 0 and 1 specifying the proportion
            of patients to include in the test set.

    Returns:
        train_sets: List of indexes for the train sets.
        test_sets: List of indexes for the test sets.
    """

    ind_neg = np.where(outcome == 0)
    n_neg = len(ind_neg[0])
    ind_pos = np.where(outcome == 1)
    n_pos = len(ind_pos[0])
    n_neg_test = round(test_split_proportion * n_neg)
    n_pos_test = round(test_split_proportion * n_pos)

    n_inst = len(outcome)
    n_test = n_pos_test + n_neg_test
    n_train = n_inst - n_test

    if n_split == 1:
        train_sets = np.zeros(n_train)
        test_sets = np.zeros(n_test)
    else:
        train_sets = np.zeros((n_split, n_train))
        test_sets = np.zeros((n_split, n_test))

    for s in range(n_split):
        ind_pos_test = np.random.choice(ind_pos[0], n_pos_test, replace=False)
        ind_neg_test = np.random.choice(ind_neg[0], n_neg_test, replace=False)

        ind_test = np.concatenate((ind_pos_test, ind_neg_test))
        ind_test.sort()

        ind_train = np.arange(n_inst)
        ind_train = np.delete(ind_train, ind_test)
        ind_train.sort()

        if n_split > 1:
            train_sets[s] = ind_train
            test_sets[s] = ind_test
        else:
            train_sets = ind_train
            test_sets = ind_test

    return train_sets, test_sets

def get_stratified_splits(
        outcome_table: pd.DataFrame,
        n_splits: int,
        test_split_proportion: float,
        seed: int,
        flag_by_cat: bool = False
        ) -> Tuple[List, List]:
    """
    Sub-divides a given outcome dataset into multiple stratified patient splits.
    The stratification is performed per class proportion (or by institution).

    Args:
        outcome_table: Table with a single outcome column of 0's and 1's.
            The rows of the table must define the patient IDs: $Cancer-$Institution-$Number.
        n_splits: Integer specifying the number of splits to create.
        test_split_proportion: Float between 0 and 1 specifying the proportion
            of patients to include in the test set.
        seed: Integer specifying the random generator seed to use for random splitting.
        flag_by_cat (optional): Logical flag specifying whether the split should take into
            account the institutions in the outcome table.
            If True, patients in the training and testing splits have the same proportion
            of events per institution as originally found in the initial data. Default: False.

    Returns:
        List: patients_train_splits, list of size nTrain x nSplit, where each entry
            is a string specifying a "training" patient.
        List: patients_test_splits, list of size nTest x nSplit, where each entry
            is a string specifying a "testing" patient.
    """
    patient_ids = pd.Series(outcome_table.index)
    patients_train_splits = []
    patients_test_splits = []

    # Take into account the institutions in the outcome table
    if flag_by_cat:
        institution_cat_vector = get_institutions_from_ids(patient_ids)
        all_categories = np.unique(institution_cat_vector)
        n_cat = len(all_categories)
        # Split for each institution
        for i in range(n_cat):
            np.random.seed(seed)
            cat = all_categories[i]
            flag_cat = institution_cat_vector == cat
            patient_ids_cat = patient_ids[flag_cat]
            patient_ids_cat.reset_index(inplace=True, drop=True)

            # Split train and test sets
            train_sets, test_sets = get_splits(outcome_table[flag_cat.values], n_splits, test_split_proportion)

            if n_splits > 1:
                temp_patients_train = np.empty((n_splits, len(train_sets[0])), dtype=object)
                temp_patientsTest = np.empty((n_splits, len(test_sets[0])), dtype=object)
                for s in range(n_splits):
                    temp_patients_train[s] = patient_ids_cat[train_sets[s]]
                    temp_patientsTest[s] = patient_ids_cat[test_sets[s]]
            else:
                temp_patients_train = patient_ids_cat[train_sets]
                temp_patients_train.reset_index(inplace=True, drop=True)
                temp_patientsTest = patient_ids_cat[test_sets]
                temp_patientsTest.reset_index(inplace=True, drop=True)

            # Initialize the train and test patients list (1st iteration)
            if i == 0:
                patients_train_splits = temp_patients_train
                patients_test_splits = temp_patientsTest

            # Add new patients to the train and test patients list (other iterations)
            if i > 0:
                if n_splits > 1:
                    patients_train_splits = np.append(patients_train_splits, temp_patients_train, axis=1)
                    patients_test_splits = np.append(patients_test_splits, temp_patientsTest, axis=1)

                else:
                    patients_train_splits = np.append(patients_train_splits, temp_patients_train)
                    patients_test_splits = np.append(patients_test_splits, temp_patientsTest)

    # Do not take into account the institutions in the outcome table
    else:
        # Split train and test sets
        train_sets, test_sets = get_splits(outcome_table, n_splits, test_split_proportion)
        if n_splits > 1:
            patients_train_splits = np.empty((n_splits, len(train_sets[0])), dtype=object)
            patients_test_splits = np.empty((n_splits, len(test_sets[0])), dtype=object)
            for s in range(n_splits):
                patients_train_splits[s] = patient_ids[train_sets[s]]
                patients_test_splits[s] = patient_ids[test_sets[s]]
        else:
            patients_train_splits = patient_ids[train_sets]
            patients_train_splits.reset_index(inplace=True, drop=True)
            patients_test_splits = patient_ids[test_sets]
            patients_test_splits.reset_index(inplace=True, drop=True)

    return patients_train_splits, patients_test_splits

def get_patient_id_classes(outcome_table: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Yields the patients from the majority class and the minority class in the given outcome table.
    Only supports binary classes.

    Args:
        outcome_table (pd.DataFrame): Outcome table with binary labels.

    Returns:
        pd.DataFrame: Majority class patient IDs.
        pd.DataFrame: Minority class patient IDs.
    """
    ones = outcome_table.loc[outcome_table.iloc[0:].values == 1].index
    zeros = outcome_table.loc[outcome_table.iloc[0:].values == 0].index
    if ones.size > zeros.size:
        return ones, zeros

    return zeros, ones

def intersect(list1: List, list2: List, sort: bool = False) -> List:
    """
    Returns the intersection of two lists.

    Args:
        list1 (List): The first list.
        list2 (List): The second list.
        sort (bool): If True, the intersection is sorted.

    Returns:
        List: The intersection of the two lists.
    """

    intersection = list(filter(lambda x: x in list1, list2))
    if sort:
        return sorted(intersection)
    return intersection

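A small sketch of intersect() on two hypothetical patient ID lists:

from MEDiml.learning.ml_utils import intersect

ids_a = ['Study-001', 'Study-002', 'Study-003']
ids_b = ['Study-003', 'Study-001', 'Study-004']
common = intersect(ids_a, ids_b, sort=True)   # ['Study-001', 'Study-003']
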
def intersect_var_tables(var_table1: pd.DataFrame, var_table2: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Takes two variable tables, compares their indexes, drops the entries that are not
    present in both, and returns the two tables.

    Args:
        var_table1 (pd.DataFrame): First variable table.
        var_table2 (pd.DataFrame): Second variable table.

    Returns:
        pd.DataFrame: First variable table with the same indexes as the second.
        pd.DataFrame: Second variable table with the same indexes as the first.
    """
    # Find the unique values in var_table1 that are not in var_table2
    missing = np.setdiff1d(var_table1.index.to_numpy(), var_table2.index.to_numpy())
    if missing.size > 0:
        var_table1 = var_table1.drop(missing)

    # Find the unique values in var_table2 that are not in var_table1
    missing = np.setdiff1d(var_table2.index.to_numpy(), var_table1.index.to_numpy())
    if missing.size > 0:
        var_table2 = var_table2.drop(missing)

    return var_table1, var_table2

def under_sample(outcome_table_binary: pd.DataFrame) -> pd.DataFrame:
    """
    Performs under-sampling to obtain an equal number of outcomes in the binary outcome table.

    Args:
        outcome_table_binary (pd.DataFrame): Outcome table with binary labels.

    Returns:
        pd.DataFrame: Outcome table with balanced binary labels.
    """

    # Tentatively assign the class counts to majority/minority; swapped below if needed
    n_maj = (outcome_table_binary == 0).sum().values[0]
    n_min = (outcome_table_binary == 1).sum().values[0]
    if n_maj == n_min:
        return outcome_table_binary
    elif n_min > n_maj:
        n_min, n_maj = n_maj, n_min

    # Sample the patients from the majority class
    patient_ids_maj, patient_ids_min = get_patient_id_classes(outcome_table_binary)
    patient_ids_min = list(patient_ids_min)
    patient_ids_numpy = patient_ids_maj.to_numpy()
    np.random.shuffle(patient_ids_numpy)
    patient_ids_sample = list(patient_ids_numpy[0:n_min])
    new_ids = patient_ids_min + patient_ids_sample

    return outcome_table_binary.loc[new_ids, :]

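A toy example of under_sample() on a hypothetical binary outcome table indexed by patient ID:

import pandas as pd

from MEDiml.learning.ml_utils import under_sample

outcome_table = pd.DataFrame({'OS_binary': [1, 0, 0, 0, 1, 0]},
                             index=['P1', 'P2', 'P3', 'P4', 'P5', 'P6'])
balanced = under_sample(outcome_table)
# keeps both positive patients plus 2 randomly drawn negative patients
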
def save_model(model: Dict, var_id: str, path_model: Path, ml: Dict = None, name_type: str = "") -> Dict:
    """
    Saves a given model locally as a pickle object and outputs a dictionary
    containing the model's information.

    Args:
        model (Dict): The model dict to save.
        var_id (str): The studied variable. For example: 'var3'.
        path_model (Path): The path to save the model.
        ml (Dict, optional): Dictionary containing the settings of the machine learning experiment.
        name_type (str, optional): String specifying the type of the variable. For example: "RadiomicsIntensity". Default is "".

    Returns:
        Dict: A dictionary containing the model's information.
    """
    # Saving model
    with open(path_model, "wb") as f:
        pickle.dump(model, f)

    # Getting the "var_names" string
    if ml is not None:
        var_names = ml['variables'][var_id]['nameType']
    elif name_type != "":
        var_names = name_type
    else:
        var_names = [var_id]

    # Recording model info
    model_info = dict()
    model_info['path'] = path_model
    model_info['var_ids'] = var_id
    model_info['var_type'] = var_names

    try:  # This part may fail if model training failed.
        model_info['var_names'] = model['var_names']
        model_info['var_info'] = model['var_info']
        if 'normalization' in model_info['var_info'].keys():
            if 'normalization_table' in model_info['var_info']['normalization'].keys():
                normalization_struct = write_table_structure(model_info['var_info']['normalization']['normalization_table'])
                model_info['var_info']['normalization']['normalization_table'] = normalization_struct
        model_info['threshold'] = model['threshold']
    except Exception as e:
        print("Failed to create the full model info")
        print(e)

    return model_info

def write_table_structure(data_table: pd.DataFrame) -> Dict:
    """
    Writes the structure of a table in a dictionary.

    Args:
        data_table (pd.DataFrame): A table.

    Returns:
        Dict: A dictionary containing the table's structure.
    """
    # Initialization
    data_struct = dict()

    if len(data_table.index) != 0:
        data_struct['index'] = list(data_table.index)

    # Creating the structure
    for column in data_table.columns:
        data_struct[column] = data_table[column]

    return data_struct