mediml 0.9.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- MEDiml/MEDscan.py +1696 -0
- MEDiml/__init__.py +21 -0
- MEDiml/biomarkers/BatchExtractor.py +806 -0
- MEDiml/biomarkers/BatchExtractorTexturalFilters.py +840 -0
- MEDiml/biomarkers/__init__.py +16 -0
- MEDiml/biomarkers/diagnostics.py +125 -0
- MEDiml/biomarkers/get_oriented_bound_box.py +158 -0
- MEDiml/biomarkers/glcm.py +1602 -0
- MEDiml/biomarkers/gldzm.py +523 -0
- MEDiml/biomarkers/glrlm.py +1315 -0
- MEDiml/biomarkers/glszm.py +555 -0
- MEDiml/biomarkers/int_vol_hist.py +527 -0
- MEDiml/biomarkers/intensity_histogram.py +615 -0
- MEDiml/biomarkers/local_intensity.py +89 -0
- MEDiml/biomarkers/morph.py +1756 -0
- MEDiml/biomarkers/ngldm.py +780 -0
- MEDiml/biomarkers/ngtdm.py +414 -0
- MEDiml/biomarkers/stats.py +373 -0
- MEDiml/biomarkers/utils.py +389 -0
- MEDiml/filters/TexturalFilter.py +299 -0
- MEDiml/filters/__init__.py +9 -0
- MEDiml/filters/apply_filter.py +134 -0
- MEDiml/filters/gabor.py +215 -0
- MEDiml/filters/laws.py +283 -0
- MEDiml/filters/log.py +147 -0
- MEDiml/filters/mean.py +121 -0
- MEDiml/filters/textural_filters_kernels.py +1738 -0
- MEDiml/filters/utils.py +107 -0
- MEDiml/filters/wavelet.py +237 -0
- MEDiml/learning/DataCleaner.py +198 -0
- MEDiml/learning/DesignExperiment.py +480 -0
- MEDiml/learning/FSR.py +667 -0
- MEDiml/learning/Normalization.py +112 -0
- MEDiml/learning/RadiomicsLearner.py +714 -0
- MEDiml/learning/Results.py +2237 -0
- MEDiml/learning/Stats.py +694 -0
- MEDiml/learning/__init__.py +10 -0
- MEDiml/learning/cleaning_utils.py +107 -0
- MEDiml/learning/ml_utils.py +1015 -0
- MEDiml/processing/__init__.py +6 -0
- MEDiml/processing/compute_suv_map.py +121 -0
- MEDiml/processing/discretisation.py +149 -0
- MEDiml/processing/interpolation.py +275 -0
- MEDiml/processing/resegmentation.py +66 -0
- MEDiml/processing/segmentation.py +912 -0
- MEDiml/utils/__init__.py +25 -0
- MEDiml/utils/batch_patients.py +45 -0
- MEDiml/utils/create_radiomics_table.py +131 -0
- MEDiml/utils/data_frame_export.py +42 -0
- MEDiml/utils/find_process_names.py +16 -0
- MEDiml/utils/get_file_paths.py +34 -0
- MEDiml/utils/get_full_rad_names.py +21 -0
- MEDiml/utils/get_institutions_from_ids.py +16 -0
- MEDiml/utils/get_patient_id_from_scan_name.py +22 -0
- MEDiml/utils/get_patient_names.py +26 -0
- MEDiml/utils/get_radiomic_names.py +27 -0
- MEDiml/utils/get_scan_name_from_rad_name.py +22 -0
- MEDiml/utils/image_reader_SITK.py +37 -0
- MEDiml/utils/image_volume_obj.py +22 -0
- MEDiml/utils/imref.py +340 -0
- MEDiml/utils/initialize_features_names.py +62 -0
- MEDiml/utils/inpolygon.py +159 -0
- MEDiml/utils/interp3.py +43 -0
- MEDiml/utils/json_utils.py +78 -0
- MEDiml/utils/mode.py +31 -0
- MEDiml/utils/parse_contour_string.py +58 -0
- MEDiml/utils/save_MEDscan.py +30 -0
- MEDiml/utils/strfind.py +32 -0
- MEDiml/utils/textureTools.py +188 -0
- MEDiml/utils/texture_features_names.py +115 -0
- MEDiml/utils/write_radiomics_csv.py +47 -0
- MEDiml/wrangling/DataManager.py +1724 -0
- MEDiml/wrangling/ProcessDICOM.py +512 -0
- MEDiml/wrangling/__init__.py +3 -0
- mediml-0.9.9.dist-info/LICENSE.md +674 -0
- mediml-0.9.9.dist-info/METADATA +232 -0
- mediml-0.9.9.dist-info/RECORD +78 -0
- mediml-0.9.9.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,480 @@
|
|
|
1
|
+
import platform
|
|
2
|
+
import re
|
|
3
|
+
from itertools import combinations, product
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Dict, List
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from ..utils.get_institutions_from_ids import get_institutions_from_ids
|
|
10
|
+
from ..utils.json_utils import load_json, posix_to_string, save_json
|
|
11
|
+
from .ml_utils import cross_validation_split, get_stratified_splits
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DesignExperiment:
    """Builds and organizes the json configuration files of a machine learning experiment."""

    def __init__(self, path_study: Path, path_settings: Path, experiment_label: str) -> None:
        """
        Constructor of the class DesignExperiment.

        Args:
            path_study (Path): Path to the main study folder where the outcomes,
                learning patients and holdout patients dictionaries are found.
            path_settings (Path): Path to the settings folder.
            experiment_label (str): String specifying the label to attach to a given learning experiment in
                "path_experiments". This label will be attached to the ml__$experiments_label$.json file as well
                as the learn__$experiment_label$ folder. This label is used to keep track of different experiments
                with different settings (e.g. radiomics, scans, machine learning algorithms, etc.).

        Returns:
            None
        """
        # Coerce the inputs so downstream code can rely on the expected types.
        self.path_study = Path(path_study)
        self.path_settings = Path(path_settings)
        self.experiment_label = str(experiment_label)
        # Set later (by generate_experiment) to the path of the ml__<label>.json file.
        self.path_ml_object = None
def __create_folder_and_content(
|
|
37
|
+
self,
|
|
38
|
+
path_learn: Path,
|
|
39
|
+
run_name: str,
|
|
40
|
+
patients_train: List,
|
|
41
|
+
patients_test: List,
|
|
42
|
+
ml_path: Path
|
|
43
|
+
) -> List:
|
|
44
|
+
"""
|
|
45
|
+
Creates json files needed for a given run
|
|
46
|
+
|
|
47
|
+
Args:
|
|
48
|
+
path_learn (Path): path to the main learning folder containing information about the training and test set.
|
|
49
|
+
run_name (str): name for a given run.
|
|
50
|
+
patients_train (List): list of patients in the training set.
|
|
51
|
+
patients_test (List): list of patients in the test set.
|
|
52
|
+
ml_path (Path): path to the given run.
|
|
53
|
+
|
|
54
|
+
Returns:
|
|
55
|
+
List: list of paths to the given run.
|
|
56
|
+
"""
|
|
57
|
+
paths_ml = dict()
|
|
58
|
+
path_run = path_learn / run_name
|
|
59
|
+
Path.mkdir(path_run, exist_ok=True)
|
|
60
|
+
path_train = path_run / 'patientsTrain.json'
|
|
61
|
+
path_test = path_run / 'patientsTest.json'
|
|
62
|
+
save_json(path_train, sorted(patients_train))
|
|
63
|
+
save_json(path_test, sorted(patients_test))
|
|
64
|
+
paths_ml['patientsTrain'] = path_train
|
|
65
|
+
paths_ml['patientsTest'] = path_test
|
|
66
|
+
paths_ml['outcomes'] = self.path_study / 'outcomes.csv'
|
|
67
|
+
paths_ml['ml'] = self.path_ml_object
|
|
68
|
+
paths_ml['results'] = path_run / 'run_results.json'
|
|
69
|
+
path_file = path_run / 'paths_ml.json'
|
|
70
|
+
paths_ml = posix_to_string(paths_ml)
|
|
71
|
+
ml_path.append(path_file)
|
|
72
|
+
save_json(path_file, paths_ml)
|
|
73
|
+
|
|
74
|
+
return ml_path
|
|
75
|
+
|
|
76
|
+
def generate_learner_dict(self) -> dict:
|
|
77
|
+
"""
|
|
78
|
+
Generates a dictionary containing all the settings for the learning experiment.
|
|
79
|
+
|
|
80
|
+
Returns:
|
|
81
|
+
dict: Dictionary containing all the settings for the learning experiment.
|
|
82
|
+
"""
|
|
83
|
+
ml_options = dict()
|
|
84
|
+
|
|
85
|
+
# operating system
|
|
86
|
+
ml_options['os'] = platform.system()
|
|
87
|
+
|
|
88
|
+
# design experiment settings
|
|
89
|
+
ml_options['design'] = self.path_settings / 'ml_design.json'
|
|
90
|
+
# check if file exist:
|
|
91
|
+
if not ml_options['design'].exists():
|
|
92
|
+
raise FileNotFoundError(f"File {ml_options['design']} does not exist.")
|
|
93
|
+
|
|
94
|
+
# ML run settings
|
|
95
|
+
run = dict()
|
|
96
|
+
ml_options['run'] = run
|
|
97
|
+
|
|
98
|
+
# Machine learning settings
|
|
99
|
+
ml_options['settings'] = self.path_settings / 'ml_settings.json'
|
|
100
|
+
# check if file exist:
|
|
101
|
+
if not ml_options['settings'].exists():
|
|
102
|
+
raise FileNotFoundError(f"File {ml_options['settings']} does not exist.")
|
|
103
|
+
|
|
104
|
+
# variables settings
|
|
105
|
+
ml_options['variables'] = self.path_settings / 'ml_variables.json'
|
|
106
|
+
# check if file exist:
|
|
107
|
+
if not ml_options['variables'].exists():
|
|
108
|
+
raise FileNotFoundError(f"File {ml_options['variables']} does not exist.")
|
|
109
|
+
|
|
110
|
+
# ML algorithms settings
|
|
111
|
+
ml_options['algorithms'] = self.path_settings / 'ml_algorithms.json'
|
|
112
|
+
# check if file exist:
|
|
113
|
+
if not ml_options['algorithms'].exists():
|
|
114
|
+
raise FileNotFoundError(f"File {ml_options['algorithms']} does not exist.")
|
|
115
|
+
|
|
116
|
+
# Data cleaning settings
|
|
117
|
+
ml_options['datacleaning'] = self.path_settings / 'ml_datacleaning.json'
|
|
118
|
+
# check if file exist:
|
|
119
|
+
if not ml_options['datacleaning'].exists():
|
|
120
|
+
raise FileNotFoundError(f"File {ml_options['datacleaning']} does not exist.")
|
|
121
|
+
|
|
122
|
+
# Normalization settings
|
|
123
|
+
ml_options['normalization'] = self.path_settings / 'ml_normalization.json'
|
|
124
|
+
# check if file exist:
|
|
125
|
+
if not ml_options['normalization'].exists():
|
|
126
|
+
raise FileNotFoundError(f"File {ml_options['normalization']} does not exist.")
|
|
127
|
+
|
|
128
|
+
# Feature set reduction settings
|
|
129
|
+
ml_options['fSetReduction'] = self.path_settings / 'ml_fset_reduction.json'
|
|
130
|
+
# check if file exist:
|
|
131
|
+
if not ml_options['fSetReduction'].exists():
|
|
132
|
+
raise FileNotFoundError(f"File {ml_options['fSetReduction']} does not exist.")
|
|
133
|
+
|
|
134
|
+
# Experiment label check
|
|
135
|
+
if self.experiment_label == "":
|
|
136
|
+
raise ValueError("Experiment label is empty. Class was not initialized properly.")
|
|
137
|
+
|
|
138
|
+
# save all the ml options and return the path to the saved file
|
|
139
|
+
name_save_options = 'ml_options_' + self.experiment_label + '.json'
|
|
140
|
+
path_ml_options = self.path_settings / name_save_options
|
|
141
|
+
ml_options = posix_to_string(ml_options)
|
|
142
|
+
save_json(path_ml_options, ml_options)
|
|
143
|
+
|
|
144
|
+
return path_ml_options
|
|
145
|
+
|
|
146
|
+
    def fill_learner_dict(self, path_ml_options: Path) -> Path:
        """
        Fills the main experiment dictionary from the settings in the different json files.
        This main dictionary will hold all the settings for the data processing and learning experiment.

        Args:
            path_ml_options (Path): Path to the ml_options json file for the experiment.

        Returns:
            Path: Path to the learner object (the saved ``ml__<experiment_label>.json`` file).

        Raises:
            FileNotFoundError: If an expected radiomics csv/txt file does not exist.
            ValueError: If the experiment label is empty.
        """
        # Initialization: accumulators for the data-processing methods requested per variable.
        all_datacleaning = list()
        all_normalization = list()
        all_fset_reduction = list()

        # Load ml options dict (produced by generate_learner_dict)
        ml_options = load_json(path_ml_options)
        options = ml_options.keys()

        # Design options
        ml = dict()
        ml['design'] = load_json(ml_options['design'])

        # ML run options
        ml['run'] = ml_options['run']

        # Machine learning options
        if 'settings' in options:
            ml['settings'] = load_json(ml_options['settings'])

        # Machine learning variables
        if 'variables' in options:
            ml['variables'] = dict()
            var_options = load_json(ml_options['variables'])
            fields = list(var_options.keys())
            # NOTE(review): `vars` shadows the builtin of the same name (local only).
            # Keys named var1, var2, ... identify the experiment variables.
            vars = [(idx, s) for idx, s in enumerate(fields) if re.match(r"^var[0-9]{1,}$", s)]
            var_names = [var[1] for var in vars]  # list of var names

            # For each variable, organize the option in the ML dictionary
            for (idx, var) in vars:
                vars_dict = dict()
                vars_dict[var] = var_options[var]
                # var_struct aliases the same dict as vars_dict[var]; mutations below
                # (e.g. setting 'path') are visible through both references.
                var_struct = var_options[var]

                # Radiomics variables
                if 'radiomics' in var_struct['nameType'].lower():
                    # Get radiomics features in workspace
                    # assumes var_struct['path'] is a string here (e.g. "setTo<folder>inWorkspace") — TODO confirm
                    if 'settofeatures' in var_struct['path'].lower():
                        name_folder = re.match(r"setTo(.*)inWorkspace", var_struct['path']).group(1)
                        path_features = self.path_study / name_folder
                    # Get radiomics features in path provided in the dictionary by the user
                    else:
                        path_features = var_struct['path']
                    scans = var_struct['scans']  # list of imaging sequences
                    rois = var_struct['rois']  # list of roi labels
                    im_spaces = var_struct['imSpaces']  # list of image spaces (filterd and original)
                    use_combinations = var_struct['use_combinations']  # boolean to use combinations of scans and im_spaces
                    if use_combinations:
                        # Per-scan image spaces taken from the 'combinations' mapping;
                        # `scans` and `im_spaces` are deliberately overwritten here.
                        all_combinations = []
                        scans = list(var_struct['combinations'].keys())
                        for scan in scans:
                            im_spaces = list(var_struct['combinations'][scan])
                            all_combinations += list(product([scan], rois, im_spaces))
                    else:
                        # Full cartesian product of scans x rois x image spaces.
                        all_combinations = list(product(scans, rois, im_spaces))

                    # Initialize dict to hold all paths to radiomics features (csv and txt files)
                    path = dict()
                    # NOTE(review): this loop rebinds `idx` from the enclosing `for (idx, var)`
                    # loop; harmless today because the outer `idx` is not read afterwards.
                    for idx, (scan, roi, im_space) in enumerate(all_combinations):
                        rad_tab_x = {}
                        name_tab = 'radTab' + str(idx+1)
                        radiomics_table_name = 'radiomics__' + scan + '(' + roi + ')__' + im_space
                        rad_tab_x['csv'] = path_features / (radiomics_table_name + '.csv')
                        rad_tab_x['txt'] = path_features / (radiomics_table_name + '.txt')
                        rad_tab_x['type'] = path_features / (scan + '(' + roi + ')__' + im_space)

                        # check if file exist
                        if not rad_tab_x['csv'].exists():
                            raise FileNotFoundError(f"File {rad_tab_x['csv']} does not exist.")
                        if not rad_tab_x['txt'].exists():
                            raise FileNotFoundError(f"File {rad_tab_x['txt']} does not exist.")

                        path[name_tab] = rad_tab_x

                    # Add path to ml dict for the current variable
                    vars_dict[var]['path'] = path

                    # Add to ml dict for the current variable
                    ml['variables'].update(vars_dict)

                # Clinical or other variables (For ex: Volume)
                else:
                    # get path to csv file of features
                    if not var_struct['path']:
                        if var_options['pathCSV'] == 'setToCSVinWorkspace':
                            path_csv = self.path_study / 'CSV'
                        else:
                            # presumably a str path from the user's json; the `/` below then
                            # requires it to be a Path — TODO confirm with callers
                            path_csv = var_options['pathCSV']
                        var_struct['path'] = path_csv / var_struct['nameFile']

                    # Add to ml dict for the current variable
                    ml['variables'].update(vars_dict)

                # Initialize data processing methods requested for this variable (if any)
                if 'var_datacleaning' in var_struct.keys():
                    all_datacleaning.append(var_struct['var_datacleaning'])
                if 'var_normalization' in var_struct.keys():
                    all_normalization.append((var_struct['var_normalization']))
                if 'var_fSetReduction' in var_struct.keys():
                    all_fset_reduction.append(var_struct['var_fSetReduction']['method'])

            # Combinations of variables
            if 'combinations' in var_options.keys():
                if var_options['combinations'] == ['all']:  # Combine all variables
                    # Every non-empty subset of the variables, joined by '_' into a label.
                    combs = [comb for i in range(len(vars)) for comb in combinations(var_names, i+1)]
                    combstrings = ['_'.join(elt) for elt in combs]
                    ml['variables']['combinations'] = combstrings
                else:
                    ml['variables']['combinations'] = var_options['combinations']

            # Varibles to use for ML
            ml['variables']['varStudy'] = var_options['varStudy']

        # ML algorithms: keep only the algorithm selected in ml['settings']
        if 'algorithms' in options:
            algorithm = ml['settings']['algorithm']
            algorithms = load_json(ml_options['algorithms'])
            ml['algorithms'] = {}
            ml['algorithms'][algorithm] = algorithms[algorithm]

        # ML data processing methods and its options
        for (method, method_list) in [
            ('datacleaning', all_datacleaning),
            ('normalization', all_normalization),
            ('fSetReduction', all_fset_reduction)
        ]:
            # Skip if no method is selected
            if all(v == "" for v in method_list):
                continue
            if method in options:
                # Add algorithm specific methods
                if method in ml['settings'].keys():
                    method_list.append(ml['settings'][method])
                method_list = list(set(method_list))  # to only get unique values of all_datacleaning
                method_options = load_json(ml_options[method])  # load json file of each method
                # Combat normalization short-circuits: stored as a plain marker string.
                if method == 'normalization' and 'combat' in method_list:
                    ml[method] = 'combat'
                    continue
                ml[method] = dict()
                for name in list(set(method_list)):
                    if name != "":
                        ml[method][name] = method_options[name]

        # Save the ML dictionary
        if self.experiment_label == "":
            raise ValueError("Experiment label is empty. Class was not initialized properly.")
        path_ml_object = self.path_study / f'ml__{self.experiment_label}.json'
        ml = posix_to_string(ml)  # Convert all paths to string
        save_json(path_ml_object, ml)

        # return ml
        return path_ml_object
    def create_experiment(self, ml: dict = None) -> Dict:
        """
        Create the machine learning experiment dictionary, organizes each test/split information in a separate folder.

        Args:
            ml (dict, optional): Dictionary containing all the machine learning settings. Defaults to None,
                in which case it is loaded from ``self.path_ml_object``.

        Returns:
            Dict: Dictionary containing all the organized machine learning settings
                (mapping ``run1``, ``run2``, ... to the per-run ``paths_ml.json`` paths).
        """
        # Initialization
        ml_path = list()
        ml = load_json(self.path_ml_object) if ml is None else ml

        # Learning set
        patients_learn = load_json(self.path_study / 'patientsLearn.json')

        # Outcomes table (patient ids are the index)
        outcomes_table = pd.read_csv(self.path_study / 'outcomes.csv', index_col=0)

        # keep only patients in learn set and outcomes table
        patients_to_keep = list(filter(lambda x: x in patients_learn, outcomes_table.index.values.tolist()))
        outcomes_table = outcomes_table.loc[patients_to_keep]

        # Get the "experiment label" from ml__$experiment_label$.json
        if self.experiment_label:
            experiment_label = self.experiment_label
        else:
            # Strip the 'ml__' prefix and '.json' suffix from the file name.
            experiment_label = Path(self.path_ml_object).name[4:-5]

        # Create the folder for the training and testing sets (machine learning) information
        name_learn = 'learn__' + experiment_label
        path_learn = Path(self.path_study) / name_learn
        Path.mkdir(path_learn, exist_ok=True)

        # Getting the type of test_sets
        test_sets_types = ml['design']['testSets']

        # Creating the sets for the different machine learning runs
        for type_set in test_sets_types:
            # Random splits
            if type_set.lower() == 'random':
                # Get the experiment options for the sets
                random_info = ml['design'][type_set]
                method = random_info['method']
                n_splits = random_info['nSplits']
                stratify_institutions = random_info['stratifyInstitutions']
                test_proportion = random_info['testProportion']
                seed = random_info['seed']
                # NOTE(review): if method != 'SubSampling', patients_train/patients_test are
                # never assigned and the code below raises NameError — confirm intended methods.
                if method == 'SubSampling':
                    # Get the training and testing sets
                    patients_train, patients_test = get_stratified_splits(
                        outcomes_table, n_splits,
                        test_proportion, seed,
                        stratify_institutions
                    )

                # If patients are not in a list, wrap the single split in lists
                if type(patients_train) != list and not hasattr((patients_train), "__len__"):
                    patients_train = [patients_train]
                    patients_test = [patients_test]

                for i in range(n_splits):
                    # Create a folder for each split/run (test__001, test__002, ...)
                    run_name = "test__{0:03}".format(i+1)
                    ml_path = self.__create_folder_and_content(
                        path_learn,
                        run_name,
                        patients_train[i],
                        patients_test[i],
                        ml_path
                    )
            # Institutions-based splits (leave-one-institution-out)
            elif type_set.lower() == 'institutions':
                # Get institutions run info
                patient_ids = pd.Series(outcomes_table.index)
                institution_cat_vector = get_institutions_from_ids(patient_ids)
                institution_cats = list(set(institution_cat_vector))
                n_institution = len(institution_cats)
                # The 'Institutions' argument only make sense if n_institutions > 1
                if n_institution > 1:
                    for i in range(n_institution):
                        cat = institution_cats[i]
                        # Train on all institutions except `cat`, test on `cat`.
                        # NOTE(review): membership is substring-based (`cat in elt`) — assumes
                        # the institution code is embedded in each patient id; verify format.
                        patients_train = [elt for elt in patient_ids if cat not in elt]
                        patients_test = [elt for elt in patient_ids if cat in elt]
                        run_name = f"test__{cat}"
                        # Create a folder for each split/run
                        ml_path = self.__create_folder_and_content(
                            path_learn,
                            run_name,
                            patients_train,
                            patients_test,
                            ml_path
                        )
                    if n_institution > 2:
                        # Extra run: train on the largest institution, test on all others.
                        size_inst = list()
                        for i in range(n_institution):
                            cat = institution_cats[i]
                            size_inst.append(sum([1 if cat in elt else 0 for elt in institution_cat_vector]))
                        ind_max = size_inst.index(max(size_inst))
                        str_test = list()
                        for i in range(n_institution):
                            if i != ind_max:
                                cat = institution_cats[i]
                                str_test.append(cat)
                        cat = institution_cats[ind_max]
                        patients_train = [elt for elt in patient_ids if cat in elt]
                        patients_test = [elt for elt in patient_ids if cat not in elt]
                        run_name = f"test__{'_'.join(str_test)}"
                        # Create a folder for each split/run
                        ml_path = self.__create_folder_and_content(
                            path_learn,
                            run_name,
                            patients_train,
                            patients_test,
                            ml_path
                        )
            # Cross-validation splits
            elif type_set.lower() == 'cv':
                # Get the experiment options for the sets
                cv_info = ml['design'][type_set]
                n_splits = cv_info['nSplits']
                seed = cv_info['seed']

                # Get the training and testing sets
                patients_train, patients_test = cross_validation_split(
                    outcomes_table,
                    n_splits,
                    seed=seed
                )

                # If patients are not in a list, wrap the single split in lists
                if type(patients_train) != list and not hasattr((patients_train), "__len__"):
                    patients_train = [patients_train]
                    patients_test = [patients_test]

                for i in range(n_splits):
                    # Create a folder for each split/run (test__001, test__002, ...)
                    run_name = "test__{0:03}".format(i+1)
                    ml_path = self.__create_folder_and_content(
                        path_learn,
                        run_name,
                        patients_train[i],
                        patients_test[i],
                        ml_path
                    )
            else:
                # NOTE(review): message is stale — 'cv' is also accepted above.
                raise ValueError("The type of test set is not recognized. Must be 'random' or 'institutions'.")

        # Make ml_path a dictionary to easily save it in json
        return {f"run{idx+1}": value for idx, value in enumerate(ml_path)}
def generate_experiment(self):
|
|
462
|
+
"""
|
|
463
|
+
Generate the json files containing all the options the experiment.
|
|
464
|
+
The json files will then be used in machine learning.
|
|
465
|
+
"""
|
|
466
|
+
# Generate the ml options dictionary
|
|
467
|
+
path_ml_options = self.generate_learner_dict()
|
|
468
|
+
|
|
469
|
+
# Fill the ml options dictionary
|
|
470
|
+
self.path_ml_object = self.fill_learner_dict(path_ml_options)
|
|
471
|
+
|
|
472
|
+
# Generate the experiment dictionary
|
|
473
|
+
experiment_dict = self.create_experiment()
|
|
474
|
+
|
|
475
|
+
# Saving the final experiment dictionary
|
|
476
|
+
path_file = self.path_study / f'path_file_ml_paths__{self.experiment_label}.json'
|
|
477
|
+
experiment_dict = posix_to_string(experiment_dict) # Convert all paths to string
|
|
478
|
+
save_json(path_file, experiment_dict)
|
|
479
|
+
|
|
480
|
+
return path_file
|