mediml 0.9.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. MEDiml/MEDscan.py +1696 -0
  2. MEDiml/__init__.py +21 -0
  3. MEDiml/biomarkers/BatchExtractor.py +806 -0
  4. MEDiml/biomarkers/BatchExtractorTexturalFilters.py +840 -0
  5. MEDiml/biomarkers/__init__.py +16 -0
  6. MEDiml/biomarkers/diagnostics.py +125 -0
  7. MEDiml/biomarkers/get_oriented_bound_box.py +158 -0
  8. MEDiml/biomarkers/glcm.py +1602 -0
  9. MEDiml/biomarkers/gldzm.py +523 -0
  10. MEDiml/biomarkers/glrlm.py +1315 -0
  11. MEDiml/biomarkers/glszm.py +555 -0
  12. MEDiml/biomarkers/int_vol_hist.py +527 -0
  13. MEDiml/biomarkers/intensity_histogram.py +615 -0
  14. MEDiml/biomarkers/local_intensity.py +89 -0
  15. MEDiml/biomarkers/morph.py +1756 -0
  16. MEDiml/biomarkers/ngldm.py +780 -0
  17. MEDiml/biomarkers/ngtdm.py +414 -0
  18. MEDiml/biomarkers/stats.py +373 -0
  19. MEDiml/biomarkers/utils.py +389 -0
  20. MEDiml/filters/TexturalFilter.py +299 -0
  21. MEDiml/filters/__init__.py +9 -0
  22. MEDiml/filters/apply_filter.py +134 -0
  23. MEDiml/filters/gabor.py +215 -0
  24. MEDiml/filters/laws.py +283 -0
  25. MEDiml/filters/log.py +147 -0
  26. MEDiml/filters/mean.py +121 -0
  27. MEDiml/filters/textural_filters_kernels.py +1738 -0
  28. MEDiml/filters/utils.py +107 -0
  29. MEDiml/filters/wavelet.py +237 -0
  30. MEDiml/learning/DataCleaner.py +198 -0
  31. MEDiml/learning/DesignExperiment.py +480 -0
  32. MEDiml/learning/FSR.py +667 -0
  33. MEDiml/learning/Normalization.py +112 -0
  34. MEDiml/learning/RadiomicsLearner.py +714 -0
  35. MEDiml/learning/Results.py +2237 -0
  36. MEDiml/learning/Stats.py +694 -0
  37. MEDiml/learning/__init__.py +10 -0
  38. MEDiml/learning/cleaning_utils.py +107 -0
  39. MEDiml/learning/ml_utils.py +1015 -0
  40. MEDiml/processing/__init__.py +6 -0
  41. MEDiml/processing/compute_suv_map.py +121 -0
  42. MEDiml/processing/discretisation.py +149 -0
  43. MEDiml/processing/interpolation.py +275 -0
  44. MEDiml/processing/resegmentation.py +66 -0
  45. MEDiml/processing/segmentation.py +912 -0
  46. MEDiml/utils/__init__.py +25 -0
  47. MEDiml/utils/batch_patients.py +45 -0
  48. MEDiml/utils/create_radiomics_table.py +131 -0
  49. MEDiml/utils/data_frame_export.py +42 -0
  50. MEDiml/utils/find_process_names.py +16 -0
  51. MEDiml/utils/get_file_paths.py +34 -0
  52. MEDiml/utils/get_full_rad_names.py +21 -0
  53. MEDiml/utils/get_institutions_from_ids.py +16 -0
  54. MEDiml/utils/get_patient_id_from_scan_name.py +22 -0
  55. MEDiml/utils/get_patient_names.py +26 -0
  56. MEDiml/utils/get_radiomic_names.py +27 -0
  57. MEDiml/utils/get_scan_name_from_rad_name.py +22 -0
  58. MEDiml/utils/image_reader_SITK.py +37 -0
  59. MEDiml/utils/image_volume_obj.py +22 -0
  60. MEDiml/utils/imref.py +340 -0
  61. MEDiml/utils/initialize_features_names.py +62 -0
  62. MEDiml/utils/inpolygon.py +159 -0
  63. MEDiml/utils/interp3.py +43 -0
  64. MEDiml/utils/json_utils.py +78 -0
  65. MEDiml/utils/mode.py +31 -0
  66. MEDiml/utils/parse_contour_string.py +58 -0
  67. MEDiml/utils/save_MEDscan.py +30 -0
  68. MEDiml/utils/strfind.py +32 -0
  69. MEDiml/utils/textureTools.py +188 -0
  70. MEDiml/utils/texture_features_names.py +115 -0
  71. MEDiml/utils/write_radiomics_csv.py +47 -0
  72. MEDiml/wrangling/DataManager.py +1724 -0
  73. MEDiml/wrangling/ProcessDICOM.py +512 -0
  74. MEDiml/wrangling/__init__.py +3 -0
  75. mediml-0.9.9.dist-info/LICENSE.md +674 -0
  76. mediml-0.9.9.dist-info/METADATA +232 -0
  77. mediml-0.9.9.dist-info/RECORD +78 -0
  78. mediml-0.9.9.dist-info/WHEEL +4 -0
MEDiml/learning/ml_utils.py
@@ -0,0 +1,1015 @@
+ import csv
+ import json
+ import os
+ import pickle
+ import re
+ import string
+ from copy import deepcopy
+ from pathlib import Path
+ from typing import Dict, List, Tuple, Union
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ import seaborn as sns
+ from numpyencoder import NumpyEncoder
+ from sklearn.model_selection import StratifiedKFold
+
+ from MEDiml.utils import get_institutions_from_ids
+ from MEDiml.utils.get_full_rad_names import get_full_rad_names
+ from MEDiml.utils.json_utils import load_json, save_json
+
+
+ # Useful constants: metrics to process
+ list_metrics = [
+     'AUC', 'AUPRC', 'BAC', 'Sensitivity', 'Specificity',
+     'Precision', 'NPV', 'F1_score', 'Accuracy', 'MCC',
+     'TN', 'FP', 'FN', 'TP'
+ ]
+
+ def average_results(path_results: Path, save: bool = False) -> Union[dict, Path]:
+     """
+     Averages the results (AUC, BAC, Sensitivity, Specificity, etc.) of all the runs of
+     the same experiment, for the training, testing and holdout sets.
+
+     Args:
+         path_results (Path): Path to the folder containing the results of the experiment.
+         save (bool, optional): If True, saves the results in the same folder as the model.
+
+     Returns:
+         Union[dict, Path]: Dictionary of averaged metrics, or the path to the saved
+         JSON file if ``save`` is True.
+     """
+     # Get all test paths
+     list_path_tests = [path for path in path_results.iterdir() if path.is_dir()]
+
+     # Initialize dictionaries
+     results_avg = {
+         'train': {},
+         'test': {},
+         'holdout': {}
+     }
+
+     # Metrics to process (module-level constant)
+     metrics = list_metrics
+
+     # Process metrics
+     for dataset in ['train', 'test', 'holdout']:
+         dataset_dict = results_avg[dataset]
+         for metric in metrics:
+             metric_values = []
+             for path_test in list_path_tests:
+                 results_dict = load_json(path_test / 'run_results.json')
+                 run_key = list(results_dict.keys())[0]
+                 if dataset in results_dict[run_key] and 'metrics' in results_dict[run_key][dataset]:
+                     metric_values.append(results_dict[run_key][dataset]['metrics'][metric])
+
+             # Fill the dictionary
+             if metric_values:
+                 dataset_dict[f'{metric}_mean'] = np.nanmean(metric_values)
+                 dataset_dict[f'{metric}_std'] = np.nanstd(metric_values)
+                 dataset_dict[f'{metric}_max'] = np.nanmax(metric_values)
+                 dataset_dict[f'{metric}_min'] = np.nanmin(metric_values)
+                 dataset_dict[f'{metric}_2.5%'] = np.nanpercentile(metric_values, 2.5)
+                 dataset_dict[f'{metric}_97.5%'] = np.nanpercentile(metric_values, 97.5)
+
+     # Save the results
+     if save:
+         save_json(path_results / 'results_avg.json', results_avg, cls=NumpyEncoder)
+         return path_results / 'results_avg.json'
+
+     return results_avg
+
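For orientation, a minimal usage sketch (the experiment path is hypothetical; the function assumes each run sub-folder under it contains a run_results.json):

from pathlib import Path

path_results = Path('/home/myStudy/EXPERIMENTS/OS/holdOut__random__001')  # hypothetical layout
results_avg = average_results(path_results, save=False)
print(results_avg['test'].get('AUC_mean'))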
+ def combine_rad_tables(rad_tables: List) -> pd.DataFrame:
+     """
+     Combines a list of radiomics tables into a single table.
+
+     Args:
+         rad_tables (List): List of radiomics tables.
+
+     Returns:
+         pd.DataFrame: Single combined radiomics table.
+     """
+     # Initialization
+     n_tables = len(rad_tables)
+
+     # Use the first non-empty table as the base
+     base_idx = 0
+     for idx, table in enumerate(rad_tables):
+         if not table.empty:
+             base_idx = idx
+             break
+
+     # Finding patient intersection
+     for t in range(n_tables):
+         if rad_tables[t].shape[1] > 0 and t != base_idx:
+             rad_tables[base_idx], rad_tables[t] = intersect_var_tables(rad_tables[base_idx], rad_tables[t])
+
+     # Check for NaNs (disabled)
+     # for table in rad_tables:
+     #     assert table.isna().sum().sum() == 0
+
+     # Initializing the radiomics table template
+     radiomics_table = pd.DataFrame()
+     radiomics_table.Properties = {}
+     radiomics_table._metadata += ['Properties']
+     radiomics_table.Properties['userData'] = {}
+     radiomics_table.Properties['VariableNames'] = []
+     radiomics_table.Properties['userData']['normalization'] = {}
+
+     # Combining the radiomics tables one by one
+     count = 0
+     continuous = []
+     str_names = '||'
+     for t in range(n_tables):
+         rad_table_id = 'radTab' + str(t+1)
+         if rad_tables[t].shape[1] > 0 and rad_tables[t].shape[0] > 0:
+             features = rad_tables[t].columns.values
+             description = rad_tables[t].Properties['Description']
+             full_rad_names = get_full_rad_names(rad_tables[t].Properties['userData']['variables']['var_def'],
+                                                 features)
+             if 'normalization' in rad_tables[t].Properties['userData']:
+                 radiomics_table.Properties['userData']['normalization'][rad_table_id] = rad_tables[t].Properties[
+                     'userData']['normalization']
+             for f, feature in enumerate(features):
+                 count += 1
+                 var_name = 'radVar' + str(count)
+                 radiomics_table[var_name] = rad_tables[t][feature]
+                 radiomics_table.Properties['VariableNames'].append(var_name)
+                 continuous.append(var_name)
+                 if description:
+                     str_names += var_name + ':' + description + '___' + full_rad_names[f] + '||'
+                 else:
+                     str_names += var_name + ':' + full_rad_names[f] + '||'
+
+     # Updating the radiomics table properties
+     radiomics_table.Properties['Description'] = ''
+     radiomics_table.Properties['DimensionNames'] = ['PatientID']
+     radiomics_table.Properties['userData']['variables'] = {}
+     radiomics_table.Properties['userData']['variables']['var_def'] = str_names
+     radiomics_table.Properties['userData']['variables']['continuous'] = continuous
+
+     return radiomics_table
+
+ def combine_tables_from_list(var_list: Dict, combination: List) -> pd.DataFrame:
+     """
+     Concatenates all variable tables in ``var_list`` that are listed in ``combination``.
+
+     Unlike ``combine_rad_tables``, this function concatenates variable tables instead of
+     creating a new table from the intersection of the tables.
+
+     Args:
+         var_list (Dict): Dictionary of tables. Each key is a given var_id and holds a radiomics table.
+             --> Ex: 'var1': variable table 1
+                     'var2': variable table 2
+                     'var3': variable table 3
+         combination (List): List of strings identifying the tables of ``var_list`` to combine.
+             --> Ex: ['var1', 'var3']
+
+     Returns:
+         pd.DataFrame: variable_table, the combined radiomics table.
+     """
+     def concatenate_varid(var_names, var_id):
+         return np.asarray([var_id + "__" + var_name for var_name in list(var_names)])
+
+     # Initialization
+     variables = dict()
+     variables['continuous'] = np.array([])
+     variable_tables = list()
+
+     # Using the first table as template
+     var_id = combination[0]
+     variable_table = deepcopy(var_list[var_id])  # first table from the list
+     variable_table.Properties = deepcopy(var_list[var_id].Properties)
+     new_columns = [var_id + '__' + col for col in variable_table.columns]
+     variable_table.columns = new_columns
+     variable_table.Properties['VariableNames'] = new_columns
+     variable_table.Properties['userData'] = dict()  # Re-initializing
+     variable_table.Properties['userData'][var_id] = deepcopy(var_list[var_id].Properties['userData'])
+     # Prefix the continuous variable names with their table id as they are accumulated
+     variables['continuous'] = np.concatenate((
+         variables['continuous'],
+         concatenate_varid(var_list[var_id].Properties['userData']['variables']['continuous'], var_id)))
+     variable_tables.append(variable_table)
+
+     # Concatenating all other tables
+     for var_id in combination[1:]:
+         variable_table.Properties['userData'][var_id] = var_list[var_id].Properties['userData']
+         patient_ids = intersect(list(variable_table.index), list(var_list[var_id].index))
+         var_list[var_id] = var_list[var_id].loc[patient_ids]
+         variable_table = variable_table.loc[patient_ids]
+         old_columns = list(variable_table.columns)
+         old_properties = deepcopy(variable_table.Properties)  # for an unknown reason, Properties are erased after concat
+         variable_table = pd.concat([variable_table, var_list[var_id]], axis=1)
+         variable_table.columns = old_columns + [var_id + "__" + col for col in var_list[var_id].columns]
+         variable_table.Properties = old_properties
+         variable_table.Properties['VariableNames'] = list(variable_table.columns)
+         variables['continuous'] = np.concatenate((
+             variables['continuous'],
+             concatenate_varid(var_list[var_id].Properties['userData']['variables']['continuous'], var_id)))
+
+     # Updating the radiomics table properties
+     variable_table.Properties['Description'] = "Data table"
+     variable_table.Properties['userData']['variables'] = variables
+
+     return variable_table
+
+ def convert_comibnations_to_list(combinations_string: List[str]) -> Tuple[List, List]:
+     """
+     Converts a list of strings specifying variable id combinations into a list of
+     lists of strings.
+
+     Args:
+         combinations_string (List[str]): List of strings specifying var_id combinations
+             separated by underscores.
+             --> Ex: ['var1_var2', 'var2_var3', 'var1_var2_var3']
+
+     Returns:
+         - List: List of lists of the separated var_ids.
+           --> Ex: [['var1', 'var2'], ['var2', 'var3'], ['var1', 'var2', 'var3']]
+         - List: List of strings specifying the "alphabetical" IDs of the combined
+           variables in ``combinations``: var1 --> A, var2 --> B, etc.
+           --> Ex: ['model_AB', 'model_BC', 'model_ABC']
+     """
+     # Building combinations
+     combinations = [s.split('_') for s in combinations_string]
+
+     # Building model_ids
+     alphabet = string.ascii_uppercase
+     model_ids = list()
+     for combination in combinations:
+         model_ids.append('model_' + ''.join([alphabet[int(var[3:])-1] for var in combination]))
+
+     return combinations, model_ids
+
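A quick illustration of the mapping this function performs:

combinations, model_ids = convert_comibnations_to_list(['var1_var2', 'var2_var3'])
# combinations == [['var1', 'var2'], ['var2', 'var3']]
# model_ids == ['model_AB', 'model_BC']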
+ def count_class_imbalance(path_csv_outcomes: Path) -> Dict:
+     """
+     Computes the class imbalance in a given outcome table.
+
+     Args:
+         path_csv_outcomes (Path): Path to the outcome table.
+
+     Returns:
+         Dict: Dictionary containing the proportion of each class.
+     """
+     # Initialization
+     outcomes = pd.read_csv(path_csv_outcomes, sep=',')
+     outcomes.dropna(inplace=True)
+     outcomes.reset_index(inplace=True, drop=True)
+     name_outcome = outcomes.columns[-1]
+
+     # Computing the proportion of each class
+     class_0_perc = np.sum(outcomes[name_outcome] == 0) / len(outcomes)
+     class_1_perc = np.sum(outcomes[name_outcome] == 1) / len(outcomes)
+
+     return {'class_0': class_0_perc, 'class_1': class_1_perc}
+
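A self-contained sketch (file name and column names are arbitrary; the last column of the CSV is treated as the outcome):

import pandas as pd
from pathlib import Path

pd.DataFrame({'PatientID': ['P1', 'P2', 'P3', 'P4'],
              'OS_binary': [0, 0, 1, 0]}).to_csv('outcomes_demo.csv', index=False)
print(count_class_imbalance(Path('outcomes_demo.csv')))
# --> class_0: 0.75, class_1: 0.25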
+ def create_experiment_folder(path_outcome_folder: str, method: str = 'Random') -> str:
+     """
+     Creates the experiment folder where the hold-out splits will be saved and returns
+     the path to that folder.
+
+     Args:
+         path_outcome_folder (str): Full path to the outcome folder (the folder containing
+             the outcome table, etc.).
+         method (str): String specifying the split type. Default is 'Random'.
+
+     Returns:
+         str: Full path to the experiment folder.
+     """
+
+     # Creating the outcome folder if it does not exist
+     if not os.path.isdir(path_outcome_folder):
+         os.makedirs(path_outcome_folder)
+
+     # Counting existing experiment folders for this split method
+     list_outcome = os.listdir(path_outcome_folder)
+     if not list_outcome:
+         flag_exist_split = False
+     else:
+         n_exist = 0
+         flag_exist_split = False
+         for name in list_outcome:
+             if 'holdOut__' + method + '__' in name:
+                 n_exist += 1
+                 flag_exist_split = True
+
+     # If an experiment folder already exists, create a new one (sequentially numbered)
+     if not flag_exist_split:
+         path_split = str(path_outcome_folder) + '/holdOut__' + method + '__001'
+     else:
+         path_split = str(path_outcome_folder) + '/holdOut__' + method + '__' + \
+                      str(n_exist+1).zfill(3)
+
+     os.mkdir(path_split)
+     return path_split
+
+ def create_holdout_set(
+         path_outcome_file: Union[str, Path],
+         outcome_name: str,
+         path_save_experiments: Union[str, Path] = None,
+         method: str = 'random',
+         percentage: float = 0.2,
+         n_split: int = 1,
+         seed: int = 1) -> Tuple[str, str]:
+     """
+     Creates a hold-out patient set to be used for final independent testing after a final
+     model is chosen. All the information is saved in a JSON file.
+
+     Args:
+         path_outcome_file (str): Full path to where the outcome CSV file is stored.
+         outcome_name (str): Name of the outcome. For example, 'OS' for overall survival.
+         path_save_experiments (str): Full path to the folder where the experiments
+             will be saved.
+         method (str): Method to use for creating the hold-out set. Options are:
+             - 'random': Randomly selects patients for the hold-out set.
+             - 'all_learn': No hold-out set is created. All patients are used for learning.
+             - 'institution': TODO.
+         percentage (float): Percentage of patients to use for the hold-out set. Default is 0.2.
+         n_split (int): Number of splits to create. Default is 1.
+         seed (int): Seed to use for the random split. Default is 1.
+
+     Returns:
+         Tuple[str, str]: Path to the experiment split folder and path to the JSON file
+         listing the experiment paths.
+     """
+     # Initialization
+     outcome_name = outcome_name.upper()
+     outcome_table = pd.read_csv(path_outcome_file, sep=',')
+     outcome_table.dropna(inplace=True)
+     outcome_table.reset_index(inplace=True, drop=True)
+     patient_ids = outcome_table['PatientID']
+
+     # Creating experiment folders and patient test split(s)
+     outcome_name = re.sub(r'\W', "", outcome_name)
+     path_outcome = str(path_save_experiments) + '/' + outcome_name
+     name_outcome_in_table_binary = outcome_name + '_binary'
+
+     # Column names in the outcome table
+     with open(path_outcome_file, 'r') as infile:
+         reader = csv.DictReader(infile, delimiter=',')
+         var_names = reader.fieldnames
+
+     # Include time to event if it exists
+     flag_time = False
+     if outcome_name + '_eventFreeTime' in var_names:
+         name_outcome_in_table_time = outcome_name + '_eventFreeTime'
+         flag_time = True
+
+     # Check if the outcome name for binary is correct
+     if name_outcome_in_table_binary not in outcome_table.columns:
+         name_outcome_in_table_binary = var_names[-1]
+
+     # Run the split
+     # Random
+     if 'random' in method.lower():
+         # Creating the experiment folder
+         path_split = create_experiment_folder(path_outcome, 'random')
+
+         # Getting the random split
+         patients_learn_temp, patients_hold_out_temp = get_stratified_splits(
+             outcome_table[['PatientID', name_outcome_in_table_binary]],
+             n_split, percentage, seed, False)
+
+         # Getting the patient IDs in the learning and hold-out sets
+         if n_split > 1:
+             patients_learn = np.empty((n_split, len(patients_learn_temp[0])), dtype=object)
+             patients_hold_out = np.empty((n_split, len(patients_hold_out_temp[0])), dtype=object)
+             for s in range(n_split):
+                 patients_learn[s] = patient_ids[patients_learn_temp[s]]
+                 patients_hold_out[s] = patient_ids[patients_hold_out_temp[s]]
+         else:
+             patients_learn = patient_ids[patients_learn_temp.values.tolist()]
+             patients_learn.reset_index(inplace=True, drop=True)
+             patients_hold_out = patient_ids[patients_hold_out_temp.values.tolist()]
+             patients_hold_out.reset_index(inplace=True, drop=True)
+
+     # All Learn
+     elif 'all_learn' in method.lower():
+         # Creating the experiment folder
+         path_split = create_experiment_folder(path_outcome, 'all_learn')
+
+         # Getting the split (all learn, so no hold-out)
+         patients_learn = patient_ids
+         patients_hold_out = []
+     else:
+         raise ValueError('Method not recognized. Use "random" or "all_learn".')
+
+     # Creating the final outcome table
+     if flag_time:
+         outcomes = outcome_table[
+             ['PatientID', name_outcome_in_table_binary, name_outcome_in_table_time]]
+     else:
+         outcomes = outcome_table[['PatientID', name_outcome_in_table_binary]]
+
+     # Finalize the outcome table
+     outcomes = outcomes.dropna(inplace=False)  # Drop NaNs
+     outcomes.reset_index(inplace=True, drop=True)  # Reset index
+
+     # Save the outcome table
+     paths_exp_outcomes = str(path_split) + '/outcomes.csv'
+     outcomes.to_csv(paths_exp_outcomes, index=False)
+
+     # Save dict of patientsLearn
+     paths_exp_patientsLearn = str(path_split) + '/patientsLearn.json'
+     patients_learn.to_json(paths_exp_patientsLearn, orient='values', indent=4)
+
+     # Save dict of patientsHoldOut
+     if 'random' in method.lower():
+         paths_exp_patients_hold_out = str(path_split) + '/patientsHoldOut.json'
+         patients_hold_out.to_json(paths_exp_patients_hold_out, orient='values', indent=4)
+
+         # Save dict of all the paths
+         data = {
+             "outcomes": paths_exp_outcomes,
+             "patientsLearn": paths_exp_patientsLearn,
+             "patientsHoldOut": paths_exp_patients_hold_out,
+             "pathWORK": path_split
+         }
+     else:
+         data = {
+             "outcomes": paths_exp_outcomes,
+             "patientsLearn": paths_exp_patientsLearn,
+             "pathWORK": path_split
+         }
+     paths_exp = str(path_split) + '/paths_exp.json'
+     with open(paths_exp, 'w') as f:
+         json.dump(data, f, indent=4)
+
+     # Return the path to the split folder and the paths JSON
+     return path_split, paths_exp
+
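An end-to-end sketch (all paths and the outcome name are hypothetical; the CSV is expected to contain a 'PatientID' column and a binary outcome column such as 'OS_binary'):

path_split, paths_exp = create_holdout_set(
    path_outcome_file='/home/myStudy/OUTCOMES/outcomes.csv',  # hypothetical
    outcome_name='OS',
    path_save_experiments='/home/myStudy/EXPERIMENTS',        # hypothetical
    method='random',
    percentage=0.2,
    n_split=1,
    seed=42)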
+ def cross_validation_split(
+         outcome: pd.Series,
+         n_splits: int = 5,
+         seed: int = None
+     ) -> Tuple[np.ndarray, np.ndarray]:
+     """
+     Performs a stratified cross-validation split.
+
+     Args:
+         outcome (pd.Series): Binary outcome variable, indexed by patient ID.
+         n_splits (int, optional): Number of folds. Default is 5.
+         seed (int or None, optional): Random seed for reproducibility. Default is None.
+
+     Returns:
+         train_data_array (np.ndarray): Array of training patient IDs for each fold.
+         test_data_array (np.ndarray): Array of testing patient IDs for each fold.
+     """
+
+     skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
+     train_data_list = []
+     test_data_list = []
+     patient_ids = pd.Series(outcome.index)
+
+     for train_indices, test_indices in skf.split(X=outcome, y=outcome):
+         train_data_list.append(patient_ids[train_indices])
+         test_data_list.append(patient_ids[test_indices])
+
+     train_data_array = np.array(train_data_list, dtype=object)
+     test_data_array = np.array(test_data_list, dtype=object)
+
+     return train_data_array, test_data_array
+
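For illustration, assuming a binary outcome Series indexed by patient ID (IDs are made up):

import pandas as pd

outcome = pd.Series([0, 1] * 5, index=[f'Sarcoma-MGH-{i:03d}' for i in range(10)])
train_folds, test_folds = cross_validation_split(outcome, n_splits=5, seed=42)
# Each entry of train_folds / test_folds holds the patient IDs of one fold.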
+ def find_best_model(path_results: Path, metric: str = 'AUC', second_metric: str = 'AUC') -> Tuple[Dict, Dict]:
+     """
+     Finds the model with the highest performance on the test set in a given path,
+     based on a given metric.
+
+     Args:
+         path_results (Path): Path to the results folder.
+         metric (str): Metric used to rank the models. Default is 'AUC'.
+         second_metric (str): Metric used to break ties. Default is 'AUC'.
+
+     Returns:
+         Tuple[Dict, Dict]: Tuple containing the loaded best model and its result dict.
+     """
+     assert metric in list_metrics, f'Given metric {metric} is not in the list of metrics. Please choose from {list_metrics}'
+
+     # Get all test paths
+     list_path_tests = [path for path in path_results.iterdir() if path.is_dir()]
+
+     # Initialization
+     metric_best = -1
+     second_metric_best = -1
+     path_result_best = None
+
+     # Get all models and their metrics (AUC especially)
+     for path_test in list_path_tests:
+         if not (path_test / 'run_results.json').exists():
+             continue
+         results_dict = load_json(path_test / 'run_results.json')
+         metric_test = results_dict[list(results_dict.keys())[0]]['test']['metrics'][metric]
+         if metric_test > metric_best:
+             metric_best = metric_test
+             path_result_best = path_test
+         elif metric_test == metric_best:
+             second_metric_test = results_dict[list(results_dict.keys())[0]]['test']['metrics'][second_metric]
+             if second_metric_test > second_metric_best:
+                 second_metric_best = second_metric_test
+                 path_result_best = path_test
+
+     # Load the best model's result dict
+     results_dict_best = load_json(path_result_best / 'run_results.json')
+
+     # Load the model
+     model_name = list(results_dict_best.keys())[0]
+     with open(path_result_best / f'{model_name}.pickle', 'rb') as file:
+         model = pickle.load(file)
+
+     return model, results_dict_best
+
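Usage sketch, under the same hypothetical folder layout as above (each run folder holds a run_results.json plus a pickled model named after the run):

from pathlib import Path

model, best_results = find_best_model(
    Path('/home/myStudy/EXPERIMENTS/OS/holdOut__random__001'),  # hypothetical
    metric='AUC', second_metric='BAC')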
+ def feature_imporance_analysis(path_results: Path):
+     """
+     Averages the feature importance values over all the runs of the same experiment and
+     counts how many times each feature was selected. The results are saved as
+     'feature_importance_analysis.json' in the experiment folder.
+
+     Args:
+         path_results (Path): Path to the folder containing the results of the experiment.
+
+     Returns:
+         None.
+     """
+     # Get all test paths
+     list_path_tests = [path for path in path_results.iterdir() if path.is_dir()]
+
+     # Initialization
+     results_avg_temp = {}
+     results_avg = {}
+
+     # Accumulate the feature importances of each run
+     for path_test in list_path_tests:
+         variables = []
+         list_models = list(path_test.glob('*.pickle'))
+         if len(list_models) != 1:
+             raise ValueError(f'Path {path_test} does not contain a single model.')
+         model_obj = list_models[0]
+         with open(model_obj, "rb") as f:
+             model_dict = pickle.load(f)
+         if model_dict["var_names"]:
+             variables = get_full_rad_names(model_dict['var_info']['variables']['var_def'], model_dict["var_names"])
+         for index, var in enumerate(variables):
+             var = var.split("\\")[-1]  # Remove the path for Windows
+             var = var.split("/")[-1]  # Remove the path for Linux
+             if var not in results_avg_temp:
+                 results_avg_temp[var] = {
+                     'importance_mean': [],
+                     'times_selected': 0
+                 }
+
+             results_avg_temp[var]['importance_mean'].append(model_dict['model'].feature_importances_[index])
+             results_avg_temp[var]['times_selected'] += 1
+
+     # Average over all runs (a feature not selected in a run contributes 0)
+     for var in results_avg_temp:
+         results_avg[var] = {
+             'importance_mean': np.sum(results_avg_temp[var]['importance_mean']) / len(list_path_tests),
+             'times_selected': results_avg_temp[var]['times_selected']
+         }
+
+     del results_avg_temp
+
+     save_json(path_results / 'feature_importance_analysis.json', results_avg, cls=NumpyEncoder)
+
+ def get_ml_test_table(variable_table: pd.DataFrame, var_names: List, var_def: str) -> pd.DataFrame:
+     """
+     Gets the test table with the variables that are present in the training table.
+
+     Args:
+         variable_table (pd.DataFrame): Table with the variables to use for the ML model that
+             will be matched with the training table.
+         var_names (List): List of variable names used for the ML model.
+         var_def (str): String of the full variable names used for the ML model.
+
+     Returns:
+         pd.DataFrame: Table with the variables that are present in the training table.
+     """
+
+     # Get the full variable names for training
+     full_radvar_names_trained = get_full_rad_names(var_def, var_names).tolist()
+
+     # Get the full variable names for testing
+     full_rad_var_names_test = get_full_rad_names(
+         variable_table.Properties['userData']['variables']['var_def'],
+         variable_table.columns.values
+     ).tolist()
+
+     # Get the indexes of the variables that are present in the training table
+     indexes = []
+     for radvar in full_radvar_names_trained:
+         try:
+             indexes.append(full_rad_var_names_test.index(radvar))
+         except ValueError as e:
+             print(e)
+             raise ValueError('The variable ' + radvar + ' is not present in the test table.')
+
+     # Keep only the variables that are present in the training table
+     variable_table = variable_table.iloc[:, indexes]
+
+     # User data - var_def
+     str_names = '||'
+     for v in range(len(var_names)):
+         str_names += var_names[v] + ':' + full_radvar_names_trained[v] + '||'
+
+     # Update metadata and variable names
+     variable_table.columns = var_names
+     variable_table.Properties['VariableNames'] = var_names
+     variable_table.Properties['userData']['variables']['var_def'] = str_names
+     variable_table.Properties['userData']['variables']['continuous'] = var_names
+
+     # Return the table with sequential variable names restored
+     return variable_table
+
+ def finalize_rad_table(rad_table: pd.DataFrame) -> pd.DataFrame:
+     """
+     Finalizes the variable names and the associated metadata. Used to obtain sequential
+     variable names and userData containing only the variable names present in the table.
+
+     Args:
+         rad_table (pd.DataFrame): Radiomics table to be finalized.
+
+     Returns:
+         pd.DataFrame: Finalized radiomics table.
+     """
+
+     # Initialization
+     var_names = rad_table.columns.values
+     full_rad_names = get_full_rad_names(rad_table.Properties['userData']['variables']['var_def'], var_names)
+
+     # User data - var_def
+     str_names = '||'
+     for v in range(var_names.size):
+         var_names[v] = 'radVar' + str(v+1)
+         str_names = str_names + var_names[v] + ':' + full_rad_names[v] + '||'
+
+     # Update metadata and variable names
+     rad_table.columns = var_names
+     rad_table.Properties['VariableNames'] = var_names
+     rad_table.Properties['userData']['variables']['var_def'] = str_names
+     rad_table.Properties['userData']['variables']['continuous'] = var_names
+
+     return rad_table
+
+ def get_radiomics_table(
+         path_radiomics_csv: Path,
+         path_radiomics_txt: Path,
+         image_type: str,
+         patients_ids: List = None
+     ) -> pd.DataFrame:
+     """
+     Loads the radiomics table from the .csv file and the associated metadata.
+
+     Args:
+         path_radiomics_csv (Path): Full path to the csv file of the radiomics table.
+             --> Ex: /home/myStudy/FEATURES/radiomics__PET(GTV)__image.csv
+         path_radiomics_txt (Path): Full path to the radiomics variable definitions in
+             text format (associated with path_radiomics_csv).
+             --> Ex: /home/myStudy/FEATURES/radiomics__PET(GTV)__image.txt
+         image_type (str): String specifying the type of image on which the radiomics
+             features were computed.
+             --> Format: $scan$($roiType$)__$imSpace$
+             --> Ex: PET(tumor)__HHH_coif1
+         patients_ids (List, optional): List of strings specifying the patient IDs of the
+             patients to fetch from the radiomics table. If this argument is not present,
+             all patients are fetched.
+             --> Ex: ['Cervix-UCSF-001', 'Cervix-McGill-004']
+
+     Returns:
+         pd.DataFrame: Radiomics table.
+     """
+     # Read CSV table
+     radiomics_table = pd.read_csv(path_radiomics_csv, index_col=0)
+     if patients_ids is not None:
+         patients_ids = intersect(patients_ids, list(radiomics_table.index))
+         radiomics_table = radiomics_table.loc[patients_ids]
+
+     # Read the associated TXT file
+     with open(path_radiomics_txt, 'r') as f:
+         user_data = f.read()
+
+     # Grouping the information
+     radiomics_table._metadata += ["Properties"]
+     radiomics_table.Properties = dict()
+     radiomics_table.Properties['userData'] = dict()
+     radiomics_table.Properties['userData']['variables'] = dict()
+     radiomics_table.Properties['userData']['variables']['var_def'] = user_data
+     radiomics_table.Properties['Description'] = image_type
+
+     # Only continuous variables are used for now (radiomics = continuous), but this
+     # design will facilitate the use of other categories in the future.
+     radiomics_table.Properties['userData']['variables']['continuous'] = np.asarray(list(radiomics_table.columns.values))
+
+     return radiomics_table
+
+ def get_splits(outcome: pd.DataFrame, n_split: int, test_split_proportion: float) -> Tuple[List, List]:
+     """
+     Splits the given outcome table in two sets.
+
+     Args:
+         outcome (pd.DataFrame): Table with a single outcome column of 0's and 1's.
+         n_split (int): Integer specifying the number of splits to create.
+         test_split_proportion (float): Float between 0 and 1 specifying the proportion
+             of patients to include in the test set.
+
+     Returns:
+         train_sets: List of indexes for the train sets.
+         test_sets: List of indexes for the test sets.
+     """
+     # NOTE: sampling draws from the global NumPy random state; seed it upstream
+     # (e.g. with np.random.seed) for reproducibility.
+     ind_neg = np.where(outcome == 0)
+     n_neg = len(ind_neg[0])
+     ind_pos = np.where(outcome == 1)
+     n_pos = len(ind_pos[0])
+     n_neg_test = round(test_split_proportion * n_neg)
+     n_pos_test = round(test_split_proportion * n_pos)
+
+     n_inst = len(outcome)
+     n_test = n_pos_test + n_neg_test
+     n_train = n_inst - n_test
+
+     if n_split == 1:
+         train_sets = np.zeros(n_train)
+         test_sets = np.zeros(n_test)
+     else:
+         train_sets = np.zeros((n_split, n_train))
+         test_sets = np.zeros((n_split, n_test))
+
+     for s in range(n_split):
+         # Stratified sampling: draw test patients from each class separately
+         ind_pos_test = np.random.choice(ind_pos[0], n_pos_test, replace=False)
+         ind_neg_test = np.random.choice(ind_neg[0], n_neg_test, replace=False)
+
+         ind_test = np.concatenate((ind_pos_test, ind_neg_test))
+         ind_test.sort()
+
+         ind_train = np.arange(n_inst)
+         ind_train = np.delete(ind_train, ind_test)
+         ind_train.sort()
+
+         if n_split > 1:
+             train_sets[s] = ind_train
+             test_sets[s] = ind_test
+         else:
+             train_sets = ind_train
+             test_sets = ind_test
+
+     return train_sets, test_sets
+
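A small sketch (get_splits draws from the global NumPy random state, so it is seeded here explicitly):

import numpy as np
import pandas as pd

np.random.seed(42)
outcome = pd.Series([0, 0, 0, 0, 0, 0, 1, 1, 1, 1])
train_idx, test_idx = get_splits(outcome, n_split=1, test_split_proportion=0.3)
# train_idx / test_idx are integer positions, stratified by class.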
+ def get_stratified_splits(
+         outcome_table: pd.DataFrame,
+         n_splits: int,
+         test_split_proportion: float,
+         seed: int,
+         flag_by_cat: bool = False
+     ) -> Tuple[List, List]:
+     """
+     Sub-divides a given outcome dataset into multiple stratified patient splits.
+     The stratification is performed per class proportion (or by institution).
+
+     Args:
+         outcome_table: Table with a single outcome column of 0's and 1's.
+             The rows of the table must define the patient IDs: $Cancer-$Institution-$Number.
+         n_splits: Integer specifying the number of splits to create.
+         test_split_proportion: Float between 0 and 1 specifying the proportion
+             of patients to include in the test set.
+         seed: Integer specifying the random generator seed to use for random splitting.
+         flag_by_cat (optional): Logical flag specifying whether to produce the split by
+             taking into account the institutions in the outcome table. If True, patients
+             in the training and testing splits have the same proportion of events per
+             institution as originally found in the initial data. Default: False.
+
+     Returns:
+         List: patients_train_splits, list of size n_train x n_splits, where each entry
+             is a string specifying a "training" patient.
+         List: patients_test_splits, list of size n_test x n_splits, where each entry
+             is a string specifying a "testing" patient.
+     """
+     patient_ids = pd.Series(outcome_table.index)
+     patients_train_splits = []
+     patients_test_splits = []
+
+     # Take into account the institutions in the outcome table
+     if flag_by_cat:
+         institution_cat_vector = get_institutions_from_ids(patient_ids)
+         all_categories = np.unique(institution_cat_vector)
+         n_cat = len(all_categories)
+         # Split for each institution
+         for i in range(n_cat):
+             np.random.seed(seed)
+             cat = all_categories[i]
+             flag_cat = institution_cat_vector == cat
+             patient_ids_cat = patient_ids[flag_cat]
+             patient_ids_cat.reset_index(inplace=True, drop=True)
+
+             # Split train and test sets
+             train_sets, test_sets = get_splits(outcome_table[flag_cat.values], n_splits, test_split_proportion)
+
+             if n_splits > 1:
+                 temp_patients_train = np.empty((n_splits, len(train_sets[0])), dtype=object)
+                 temp_patients_test = np.empty((n_splits, len(test_sets[0])), dtype=object)
+                 for s in range(n_splits):
+                     temp_patients_train[s] = patient_ids_cat[train_sets[s]]
+                     temp_patients_test[s] = patient_ids_cat[test_sets[s]]
+             else:
+                 temp_patients_train = patient_ids_cat[train_sets]
+                 temp_patients_train.reset_index(inplace=True, drop=True)
+                 temp_patients_test = patient_ids_cat[test_sets]
+                 temp_patients_test.reset_index(inplace=True, drop=True)
+
+             # Initialize the train and test patient lists (1st iteration)
+             if i == 0:
+                 patients_train_splits = temp_patients_train
+                 patients_test_splits = temp_patients_test
+
+             # Add new patients to the train and test patient lists (other iterations)
+             else:
+                 if n_splits > 1:
+                     patients_train_splits = np.append(patients_train_splits, temp_patients_train, axis=1)
+                     patients_test_splits = np.append(patients_test_splits, temp_patients_test, axis=1)
+                 else:
+                     patients_train_splits = np.append(patients_train_splits, temp_patients_train)
+                     patients_test_splits = np.append(patients_test_splits, temp_patients_test)
+
+     # Do not take into account the institutions in the outcome table
+     else:
+         # Split train and test sets
+         np.random.seed(seed)  # seed the global random state used by get_splits
+         train_sets, test_sets = get_splits(outcome_table, n_splits, test_split_proportion)
+         if n_splits > 1:
+             patients_train_splits = np.empty((n_splits, len(train_sets[0])), dtype=object)
+             patients_test_splits = np.empty((n_splits, len(test_sets[0])), dtype=object)
+             for s in range(n_splits):
+                 patients_train_splits[s] = patient_ids[train_sets[s]]
+                 patients_test_splits[s] = patient_ids[test_sets[s]]
+         else:
+             patients_train_splits = patient_ids[train_sets]
+             patients_train_splits.reset_index(inplace=True, drop=True)
+             patients_test_splits = patient_ids[test_sets]
+             patients_test_splits.reset_index(inplace=True, drop=True)
+
+     return patients_train_splits, patients_test_splits
+
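A sketch with made-up patient IDs following the $Cancer-$Institution-$Number convention:

import pandas as pd

outcome_table = pd.DataFrame({'OS_binary': [0, 1] * 10},
                             index=[f'Cervix-UCSF-{i:03d}' for i in range(1, 21)])
train_ids, test_ids = get_stratified_splits(outcome_table, n_splits=1,
                                            test_split_proportion=0.2,
                                            seed=42, flag_by_cat=False)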
+ def get_patient_id_classes(outcome_table: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
+     """
+     Yields the patients from the majority class and the minority class in the given
+     outcome table. Only supports binary classes.
+
+     Args:
+         outcome_table (pd.DataFrame): Outcome table with binary labels.
+
+     Returns:
+         pd.DataFrame: Majority class patient IDs.
+         pd.DataFrame: Minority class patient IDs.
+     """
+     labels = outcome_table.squeeze()  # works for a Series or a single-column table
+     ones = labels.loc[labels == 1].index
+     zeros = labels.loc[labels == 0].index
+     if ones.size > zeros.size:
+         return ones, zeros
+
+     return zeros, ones
+
+ def intersect(list1: List, list2: List, sort: bool = False) -> List:
+     """
+     Returns the intersection of two lists.
+
+     Args:
+         list1 (List): The first list.
+         list2 (List): The second list.
+         sort (bool): If True, the intersection is sorted.
+
+     Returns:
+         List: The intersection of the two lists (in the order of ``list2`` unless sorted).
+     """
+
+     intersection = list(filter(lambda x: x in list1, list2))
+     if sort:
+         return sorted(intersection)
+     return intersection
+
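A quick example of the order-preserving behaviour:

overlap = intersect(['P3', 'P1', 'P2'], ['P2', 'P4', 'P1'], sort=True)
# ['P1', 'P2']  (elements of list2 that also appear in list1, sorted)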
+ def intersect_var_tables(var_table1: pd.DataFrame, var_table2: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
+     """
+     Takes two variable tables, compares their indexes, drops the rows that are not
+     present in both, and returns the two tables.
+
+     Args:
+         var_table1 (pd.DataFrame): First variable table.
+         var_table2 (pd.DataFrame): Second variable table.
+
+     Returns:
+         pd.DataFrame: First variable table with the same indexes as the second.
+         pd.DataFrame: Second variable table with the same indexes as the first.
+     """
+     # Find the values in var_table1 that are not in var_table2
+     missing = np.setdiff1d(var_table1.index.to_numpy(), var_table2.index.to_numpy())
+     if missing.size > 0:
+         var_table1 = var_table1.drop(missing)
+
+     # Find the values in var_table2 that are not in var_table1
+     missing = np.setdiff1d(var_table2.index.to_numpy(), var_table1.index.to_numpy())
+     if missing.size > 0:
+         var_table2 = var_table2.drop(missing)
+
+     return var_table1, var_table2
+
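A minimal example:

import pandas as pd

t1 = pd.DataFrame({'f1': [1, 2, 3]}, index=['P1', 'P2', 'P3'])
t2 = pd.DataFrame({'f2': [4, 5]}, index=['P2', 'P3'])
t1, t2 = intersect_var_tables(t1, t2)
# Both tables now contain only patients P2 and P3.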
+ def under_sample(outcome_table_binary: pd.DataFrame) -> pd.DataFrame:
+     """
+     Performs under-sampling to obtain an equal number of outcomes in the binary outcome table.
+
+     Args:
+         outcome_table_binary (pd.DataFrame): Outcome table with binary labels.
+
+     Returns:
+         pd.DataFrame: Outcome table with balanced binary labels.
+     """
+
+     # Tentatively treat class 0 as the majority class; swap afterwards if needed
+     n_maj = (outcome_table_binary == 0).sum().values[0]
+     n_min = (outcome_table_binary == 1).sum().values[0]
+     if n_maj == n_min:
+         return outcome_table_binary
+     elif n_min > n_maj:
+         n_min, n_maj = n_maj, n_min
+
+     # Sample the patients from the majority class
+     patient_ids_maj, patient_ids_min = get_patient_id_classes(outcome_table_binary)
+     patient_ids_min = list(patient_ids_min)
+     patient_ids_numpy = patient_ids_maj.to_numpy()
+     np.random.shuffle(patient_ids_numpy)
+     patient_ids_sample = list(patient_ids_numpy[0:n_min])
+     new_ids = patient_ids_min + patient_ids_sample
+
+     return outcome_table_binary.loc[new_ids, :]
+
+ def save_model(model: Dict, var_id: str, path_model: Path, ml: Dict = None, name_type: str = "") -> Dict:
949
+ """
950
+ Saves a given model locally as a pickle object and outputs a dictionary
951
+ containing the model's information.
952
+
953
+ Args:
954
+ model (Dict): The model dict to save.
955
+ var_id (str): The stduied variable. For ex: 'var3'.
956
+ path_model (str): The path to save the model.
957
+ ml (Dict, optional): Dicionary containing the settings of the machine learning experiment.
958
+ name_type (str, optional): String specifying the type of the variable. For examlpe: "RadiomicsIntensity". Default is "".
959
+
960
+ Returns:
961
+ Dict: A dictionary containing the model's information.
962
+ """
963
+ # Saving model
964
+ with open(path_model, "wb") as f:
965
+ pickle.dump(model, f)
966
+
967
+ # Getting the "var_names" string
968
+ if ml is not None:
969
+ var_names = ml['variables'][var_id]['nameType']
970
+ elif name_type != "":
971
+ var_names = name_type
972
+ else:
973
+ var_names = [var_id]
974
+
975
+ # Recording model info
976
+ model_info = dict()
977
+ model_info['path'] = path_model
978
+ model_info['var_ids'] = var_id
979
+ model_info['var_type'] = var_names
980
+
981
+ try: # This part may fail if model training failed.
982
+ model_info['var_names'] = model['var_names']
983
+ model_info['var_info'] = model['var_info']
984
+ if 'normalization' in model_info['var_info'].keys():
985
+ if 'normalization_table' in model_info['var_info']['normalization'].keys():
986
+ normalization_struct = write_table_structure(model_info['var_info']['normalization']['normalization_table'])
987
+ model_info['var_info']['normalization']['normalization_table'] = normalization_struct
988
+ model_info['threshold'] = model['threshold']
989
+ except Exception as e:
990
+ print("Failed to create a fully model info")
991
+ print(e)
992
+
993
+ return model_info
994
+
995
+ def write_table_structure(data_table: pd.DataFrame) -> Dict:
+     """
+     Writes the structure of a table into a dictionary.
+
+     Args:
+         data_table (pd.DataFrame): A table.
+
+     Returns:
+         Dict: A dictionary containing the table's structure.
+     """
+     # Initialization
+     data_struct = dict()
+
+     if len(data_table.index) != 0:
+         data_struct['index'] = list(data_table.index)
+
+     # Creating the structure
+     for column in data_table.columns:
+         data_struct[column] = data_table[column]
+
+     return data_struct
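For reference, a tiny sketch of the resulting structure (note that each column is stored as a pandas Series):

import pandas as pd

table = pd.DataFrame({'mean': [0.1, 0.2], 'std': [1.0, 1.1]}, index=['P1', 'P2'])
struct = write_table_structure(table)
# {'index': ['P1', 'P2'], 'mean': <'mean' column as a Series>, 'std': <'std' column as a Series>}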