mediml 0.9.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. MEDiml/MEDscan.py +1696 -0
  2. MEDiml/__init__.py +21 -0
  3. MEDiml/biomarkers/BatchExtractor.py +806 -0
  4. MEDiml/biomarkers/BatchExtractorTexturalFilters.py +840 -0
  5. MEDiml/biomarkers/__init__.py +16 -0
  6. MEDiml/biomarkers/diagnostics.py +125 -0
  7. MEDiml/biomarkers/get_oriented_bound_box.py +158 -0
  8. MEDiml/biomarkers/glcm.py +1602 -0
  9. MEDiml/biomarkers/gldzm.py +523 -0
  10. MEDiml/biomarkers/glrlm.py +1315 -0
  11. MEDiml/biomarkers/glszm.py +555 -0
  12. MEDiml/biomarkers/int_vol_hist.py +527 -0
  13. MEDiml/biomarkers/intensity_histogram.py +615 -0
  14. MEDiml/biomarkers/local_intensity.py +89 -0
  15. MEDiml/biomarkers/morph.py +1756 -0
  16. MEDiml/biomarkers/ngldm.py +780 -0
  17. MEDiml/biomarkers/ngtdm.py +414 -0
  18. MEDiml/biomarkers/stats.py +373 -0
  19. MEDiml/biomarkers/utils.py +389 -0
  20. MEDiml/filters/TexturalFilter.py +299 -0
  21. MEDiml/filters/__init__.py +9 -0
  22. MEDiml/filters/apply_filter.py +134 -0
  23. MEDiml/filters/gabor.py +215 -0
  24. MEDiml/filters/laws.py +283 -0
  25. MEDiml/filters/log.py +147 -0
  26. MEDiml/filters/mean.py +121 -0
  27. MEDiml/filters/textural_filters_kernels.py +1738 -0
  28. MEDiml/filters/utils.py +107 -0
  29. MEDiml/filters/wavelet.py +237 -0
  30. MEDiml/learning/DataCleaner.py +198 -0
  31. MEDiml/learning/DesignExperiment.py +480 -0
  32. MEDiml/learning/FSR.py +667 -0
  33. MEDiml/learning/Normalization.py +112 -0
  34. MEDiml/learning/RadiomicsLearner.py +714 -0
  35. MEDiml/learning/Results.py +2237 -0
  36. MEDiml/learning/Stats.py +694 -0
  37. MEDiml/learning/__init__.py +10 -0
  38. MEDiml/learning/cleaning_utils.py +107 -0
  39. MEDiml/learning/ml_utils.py +1015 -0
  40. MEDiml/processing/__init__.py +6 -0
  41. MEDiml/processing/compute_suv_map.py +121 -0
  42. MEDiml/processing/discretisation.py +149 -0
  43. MEDiml/processing/interpolation.py +275 -0
  44. MEDiml/processing/resegmentation.py +66 -0
  45. MEDiml/processing/segmentation.py +912 -0
  46. MEDiml/utils/__init__.py +25 -0
  47. MEDiml/utils/batch_patients.py +45 -0
  48. MEDiml/utils/create_radiomics_table.py +131 -0
  49. MEDiml/utils/data_frame_export.py +42 -0
  50. MEDiml/utils/find_process_names.py +16 -0
  51. MEDiml/utils/get_file_paths.py +34 -0
  52. MEDiml/utils/get_full_rad_names.py +21 -0
  53. MEDiml/utils/get_institutions_from_ids.py +16 -0
  54. MEDiml/utils/get_patient_id_from_scan_name.py +22 -0
  55. MEDiml/utils/get_patient_names.py +26 -0
  56. MEDiml/utils/get_radiomic_names.py +27 -0
  57. MEDiml/utils/get_scan_name_from_rad_name.py +22 -0
  58. MEDiml/utils/image_reader_SITK.py +37 -0
  59. MEDiml/utils/image_volume_obj.py +22 -0
  60. MEDiml/utils/imref.py +340 -0
  61. MEDiml/utils/initialize_features_names.py +62 -0
  62. MEDiml/utils/inpolygon.py +159 -0
  63. MEDiml/utils/interp3.py +43 -0
  64. MEDiml/utils/json_utils.py +78 -0
  65. MEDiml/utils/mode.py +31 -0
  66. MEDiml/utils/parse_contour_string.py +58 -0
  67. MEDiml/utils/save_MEDscan.py +30 -0
  68. MEDiml/utils/strfind.py +32 -0
  69. MEDiml/utils/textureTools.py +188 -0
  70. MEDiml/utils/texture_features_names.py +115 -0
  71. MEDiml/utils/write_radiomics_csv.py +47 -0
  72. MEDiml/wrangling/DataManager.py +1724 -0
  73. MEDiml/wrangling/ProcessDICOM.py +512 -0
  74. MEDiml/wrangling/__init__.py +3 -0
  75. mediml-0.9.9.dist-info/LICENSE.md +674 -0
  76. mediml-0.9.9.dist-info/METADATA +232 -0
  77. mediml-0.9.9.dist-info/RECORD +78 -0
  78. mediml-0.9.9.dist-info/WHEEL +4 -0
MEDiml/learning/FSR.py ADDED
@@ -0,0 +1,667 @@
1
+ from pathlib import Path
2
+ from typing import Dict, List, Tuple
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+ from numpyencoder import NumpyEncoder
7
+
8
+ from MEDiml.learning.ml_utils import (combine_rad_tables, finalize_rad_table,
9
+ get_stratified_splits,
10
+ intersect_var_tables)
11
+ from MEDiml.utils.get_full_rad_names import get_full_rad_names
12
+ from MEDiml.utils.json_utils import save_json
13
+
14
+
15
class FSR:
    """Feature set reduction (FSR) for radiomics variable tables.

    Wraps several reduction strategies (FDA, balanced FDA, random selection);
    LASSO and mRMR are declared but not implemented yet.
    """

    def __init__(self, method: str = 'fda') -> None:
        """
        Feature set reduction class constructor.

        Args:
            method (str): Method of feature set reduction. Can be "FDA", "LASSO" or "mRMR".
        """
        # Selected reduction strategy; dispatched later by apply_fsr().
        self.method = method
25
+ def __get_fda_corr_table(
26
+ self,
27
+ variable_table: pd.DataFrame,
28
+ outcome_table_binary: pd.DataFrame,
29
+ n_splits: int,
30
+ corr_type: str,
31
+ seed: int
32
+ ) -> pd.DataFrame:
33
+ """
34
+ Calculates the correlation table of the FDA algorithm.
35
+
36
+ Args:
37
+ variable_table (pd.DataFrame): variable table to check for stability.
38
+ outcome_table_binary (pd.DataFrame): outcome table with binary labels.
39
+ n_splits (int): Number of splits in the FDA algorithm (Ex: 100).
40
+ corr_type: String specifying the correlation type that we are investigating.
41
+ Must be either 'Pearson' or 'Spearman'.
42
+ seed (int): Random generator seed.
43
+
44
+ Returns:
45
+ pd.DataFrame: Correlation table of the FDA algorithm. Rows are splits, columns are features.
46
+ """
47
+ # Setting the seed
48
+ np.random.seed(seed)
49
+
50
+ # Initialization
51
+ row_names = []
52
+ corr_table = pd.DataFrame()
53
+ fraction_for_splits = 1/3
54
+ number_of_splits = 1
55
+
56
+ # For each split, we calculate the correlation table
57
+ for s in range(n_splits):
58
+ row_names.append("Split_{0:03}".format(s))
59
+
60
+ # Keep only variables that are in both tables
61
+ _, outcome_table_binary = intersect_var_tables(variable_table, outcome_table_binary)
62
+
63
+ # Under-sample the outcome table to equalize the number of positive and negative outcomes
64
+ #outcome_table_binary_balanced = under_sample(outcome_table_binary)
65
+
66
+ # Get the patient teach split
67
+ patients_teach_splits = get_stratified_splits(
68
+ outcome_table_binary,
69
+ number_of_splits,
70
+ fraction_for_splits,
71
+ seed,
72
+ flag_by_cat=True
73
+ )[0]
74
+
75
+ # Creating a table with both the variables and the outcome with
76
+ # only the patient teach splits, ranked for spearman and not for pearson
77
+ if corr_type == 'Spearman':
78
+ full_table = pd.concat([variable_table.loc[patients_teach_splits, :].rank(),
79
+ outcome_table_binary.loc[patients_teach_splits,
80
+ outcome_table_binary.columns.values[-1]]], axis=1)
81
+
82
+ elif corr_type == 'Pearson':
83
+ # Pearson is the base method used by numpy, so we dont have to do any
84
+ # manipulations to the data like with spearman.
85
+ full_table = pd.concat([variable_table.loc[patients_teach_splits, :],
86
+ outcome_table_binary.loc[patients_teach_splits,
87
+ outcome_table_binary.columns.values[-1]]], axis=1)
88
+ else:
89
+ raise ValueError("Correlation type not recognized. Please use 'Pearson' or 'Spearman'")
90
+
91
+ # calculate the whole correlation table for all variables.
92
+ full_table = np.corrcoef(full_table, rowvar=False)[-1][:-1].reshape((1, -1))
93
+ corr_table = corr_table.append(pd.DataFrame(full_table))
94
+
95
+ # Add the metadata to the correlation table
96
+ corr_table.columns = list(variable_table.columns.values)
97
+ corr_table = corr_table.fillna(0)
98
+ corr_table.index = row_names
99
+ corr_table.Properties = {}
100
+ corr_table._metadata += ['Properties']
101
+ corr_table.Properties['description'] = variable_table.Properties['Description']
102
+ corr_table.Properties['userData'] = variable_table.Properties['userData']
103
+
104
+ return corr_table
105
+
106
+ def __find_fda_best_mean(self, corr_tables: pd.DataFrame, min_n_feat_stable: int) -> Tuple[Dict, pd.DataFrame]:
107
+ """
108
+ Finds the best mean correlation of all the stable variables in the table.
109
+
110
+ Args:
111
+ corr_tables (Dict): dictionary containing the correlation tables of
112
+ dimension : [n_splits,n_features] for each table.
113
+ min_n_feat_stable (int): minimal number of stable features.
114
+
115
+ Returns:
116
+ Tuple[Dict, pd.DataFrame]: Dict containing the name of each stable variables in every table and
117
+ pd.DataFrame containing the mean correlation of all the stable variables in the table.
118
+ """
119
+ # Initialization
120
+ var_names_stable = {}
121
+ corr_mean_stable = corr_tables
122
+ n_features = 0
123
+ corr_table = corr_tables
124
+ corr_table = corr_table.fillna(0)
125
+
126
+ # Calculation of the mean correlation among the n splits (R mean)
127
+ var_names_stable = corr_table.index
128
+
129
+ # Calculating the total number of features
130
+ n_features += var_names_stable.size
131
+
132
+ # Getting absolute values of the mean correlation
133
+ corr_mean_stable_abs = corr_mean_stable.abs()
134
+
135
+ # Keeping only the best features if there are more than min_n_feat_stable features
136
+ if n_features > min_n_feat_stable:
137
+ # Get min_n_feat_stable highest correlations
138
+ best_features = corr_mean_stable_abs.sort_values(ascending=False)[0:min_n_feat_stable]
139
+ var_names_stable = best_features.index.values
140
+ corr_mean_stable = best_features
141
+
142
+ return var_names_stable, corr_mean_stable
143
+
144
+ def __find_fda_stable(self, corr_table: pd.DataFrame, thresh_stable: float) -> Tuple[Dict, pd.DataFrame]:
145
+ """
146
+ Finds the stable features in each correlation table
147
+ and the mean correlation of all the stable variables in the table.
148
+
149
+ Args:
150
+ corr_tables (Dict): dictionary containing the correlation tables of
151
+ dimension : [n_splits,n_features] for each table.
152
+ thresh_stable (float): the threshold deciding if a feature is stable.
153
+
154
+ Returns:
155
+ Tuple[Dict, pd.DataFrame]: dictionary containing the name of each stable variables in every tables
156
+ and table containing the mean correlation of all the stable variables in the table.
157
+ (The keys are the table names and the values are pd.Series).
158
+ """
159
+
160
+ # Initialization
161
+ corr_table.fillna(0, inplace=True)
162
+
163
+ # Calculation of R mean
164
+ corr_mean_stable = corr_table.mean()
165
+ mean_r = corr_mean_stable
166
+
167
+ # Calculation of min and max
168
+ min_r = corr_table.quantile(0.05)
169
+ max_r = corr_table.quantile(0.95)
170
+
171
+ # Calculation of unstable features
172
+ unstable = (min_r < thresh_stable) & (mean_r > 0) | (max_r > -thresh_stable) & (mean_r < 0)
173
+ ind_unstable = unstable.index[unstable]
174
+
175
+ # Stable variables
176
+ var_names_stable = unstable.index[~unstable].values
177
+ corr_mean_stable = mean_r.drop(ind_unstable)
178
+
179
+ return var_names_stable, corr_mean_stable
180
+
181
+ def __keep_best_text_param(
182
+ self,
183
+ corr_table: pd.DataFrame,
184
+ var_names_stable: List,
185
+ corr_mean_stable: pd.DataFrame
186
+ ) -> Tuple[List, pd.DataFrame]:
187
+ """
188
+ Keeps the best texture features extraction parameters in the correlation tables
189
+ by dropping the variants of a given feature.
190
+
191
+ Args:
192
+ corr_table (pd.DataFrame): Correlation table of dimension : [n_splits,n_features].
193
+ var_names_stable (List): List of the stable variables in the table.
194
+ corr_mean_stable (pd.DataFrame): Table of the mean correlation of the stable variables in the variables table.
195
+
196
+ Returns:
197
+ Tuple[List, pd.DataFrame]: list of the stable variables in the tables and table containing the mean
198
+ correlation of all the stable variables.
199
+ """
200
+
201
+ # If no stable features for the currect field, continue
202
+ if var_names_stable.size == 0:
203
+ return var_names_stable, corr_mean_stable
204
+
205
+ # Get the actual radiomics features names from the sequential names
206
+ full_rad_names = get_full_rad_names(
207
+ corr_table.Properties['userData']['variables']['var_def'],
208
+ var_names_stable)
209
+
210
+ # Now parsing the full names to get only the rad names and not the variant
211
+ rad_names = np.array([])
212
+ for n in range(full_rad_names.size):
213
+ rad_names = np.append(rad_names, full_rad_names[n].split('__')[1:2])
214
+
215
+ # Verifying if two features are the same variant and keeping the best one
216
+ n_var = rad_names.size
217
+ var_to_drop = []
218
+ for rad_name in rad_names:
219
+ # If all the features are unique, break
220
+ if np.unique(rad_names).size == n_var:
221
+ break
222
+ else:
223
+ ind_same = np.where(rad_names == rad_name)[0]
224
+ n_same = ind_same.size
225
+ if n_same > 1:
226
+ var_to_drop.append(list(corr_mean_stable.iloc[ind_same].sort_values().index[1:].values))
227
+
228
+ # Dropping the variants
229
+ if len(var_to_drop) > 0:
230
+ # convert to list of lists to list
231
+ var_to_drop = [item for sublist in var_to_drop for item in sublist]
232
+
233
+ # From the unique values of var_to_drop, drop the variants
234
+ for var in set(var_to_drop):
235
+ var_names_stable = np.delete(var_names_stable, np.where(var_names_stable == var))
236
+ corr_mean_stable = corr_mean_stable.drop(var)
237
+
238
+ return var_names_stable, corr_mean_stable
239
+
240
+ def __remove_correlated_variables(
241
+ self,
242
+ variable_table: pd.DataFrame,
243
+ rank: pd.Series,
244
+ corr_type: str,
245
+ thresh_inter_corr: float,
246
+ min_n_feat_total: int
247
+ ) -> pd.DataFrame:
248
+ """
249
+ Removes inter-correlated variables given a certain threshold.
250
+
251
+ Args:
252
+ variable_table (pd.DataFrame): variable table for which we want to remove intercorrelated variables.
253
+ Size: N X M (observations, features).
254
+ rank (pd.Series): Vector of correlation values per feature (of size 1 X M).
255
+ corr_type (str): String specifying the correlation type that we are investigating.
256
+ Must be 'Pearson' or 'Spearman'.
257
+ thresh_inter_corr (float): Numerical value specifying the threshold above which two variables are
258
+ considered to be correlated.
259
+ min_n_feat_total (int): Minimum number of features to keep in the table.
260
+
261
+ Returns:
262
+ pd.DataFrame: Final variable table with the least correlated variables that are kept.
263
+ """
264
+ # Initialization
265
+ n_features = variable_table.shape[1]
266
+
267
+ # Compute correlation matrix
268
+ if corr_type == 'Spearman':
269
+ corr_mat = abs(np.corrcoef(variable_table.rank(), rowvar=False))
270
+ elif corr_type == 'Pearson':
271
+ corr_mat = abs(np.corrcoef(variable_table, rowvar=False))
272
+ else:
273
+ raise ValueError('corr_type must be either "Pearson" or "Spearman"')
274
+
275
+ # Set diagonal elements to Nans
276
+ np.fill_diagonal(corr_mat, val=np.nan)
277
+
278
+ # Calculate mean inter-variable correlation
279
+ mean_corr = np.nanmean(corr_mat, axis=1)
280
+
281
+ # Looping over all features once
282
+ # rank variables once, for meaningful variable loop.
283
+ ind_loop = pd.Series(mean_corr).rank(method="first") - 1
284
+ # Create a copy of the correlation matrix (to be modified)
285
+ corr_mat_temp = corr_mat.copy()
286
+ while True:
287
+ for f in range(n_features):
288
+ # Use index loop if not NaN
289
+ try:
290
+ i = int(ind_loop[f])
291
+ except:
292
+ i = 0
293
+ # Select the row of the current feature
294
+ row = corr_mat_temp[i][:]
295
+ correlated = 1*(row > thresh_inter_corr) # to turn into integers
296
+
297
+ # While the correlations are above the threshold for the select row, we select another row
298
+ while sum(correlated) > 0 and np.isnan(row).sum != len(row):
299
+ # Find the variable with the highest correlation and drop the one with the lowest rank
300
+ ind_max = np.nanargmax(row)
301
+ ind_min = np.nanargmin(np.array([rank[i], rank[ind_max]]))
302
+ if ind_min == 0:
303
+ # Drop the current row if the current feature has the lowest correlation with outcome
304
+ corr_mat_temp[i][:] = np.nan
305
+ corr_mat_temp[:][i] = np.nan
306
+ row[:] = np.nan
307
+ else:
308
+ # Drop the feature with the highest correlation to the current feature with the lowest correlation with outcome
309
+ corr_mat_temp[ind_max][:] = np.nan
310
+ corr_mat_temp[:][ind_max] = np.nan
311
+ row[ind_max] = np.nan
312
+
313
+ # Update the correlated vector
314
+ correlated = row > thresh_inter_corr
315
+
316
+ # If all the rows are NaN, we keep the variable with the highest rank
317
+ if (1*np.isnan(corr_mat_temp)).sum() == corr_mat_temp.size:
318
+ ind_keep = np.nanargmax(rank)
319
+ else:
320
+ ind_keep = list()
321
+ for row in range(corr_mat_temp.shape[0]):
322
+ if 1*np.isnan(corr_mat_temp[row][:]).sum() < corr_mat_temp.shape[1]:
323
+ ind_keep.append(row)
324
+
325
+ # if ind_keep happens to be a numpy type convert it to list for better subscripting
326
+ if isinstance(ind_keep, np.int64):
327
+ ind_keep = [ind_keep.tolist()] # work around
328
+ elif isinstance(ind_keep, np.ndarray):
329
+ ind_keep = ind_keep.tolist()
330
+
331
+ # Update threshold if the number of variables is too small or too large
332
+ if len(ind_keep) < min_n_feat_total:
333
+ # Increase the threshold (less stringent)
334
+ thresh_inter_corr = thresh_inter_corr + 0.05
335
+ corr_mat_temp = corr_mat.copy() # reset the correlation matrix
336
+ else:
337
+ break
338
+
339
+ # Make sure we have the best
340
+ if len(ind_keep) != min_n_feat_total:
341
+ # Take the features with the highest rank
342
+ ind_keep = sorted(ind_keep)[:min_n_feat_total]
343
+
344
+ # Creating new variable_table
345
+ columns = [variable_table.columns[idx] for idx in ind_keep]
346
+ variable_table = variable_table.loc[:, columns]
347
+
348
+ return variable_table
349
+
350
+ def apply_fda_one_space(
351
+ self,
352
+ ml: Dict,
353
+ variable_table: List,
354
+ outcome_table_binary: pd.DataFrame,
355
+ del_variants: bool = True,
356
+ logging_dict: Dict = None
357
+ ) -> List:
358
+ """
359
+ Applies false discovery avoidance method.
360
+
361
+ Args:
362
+ ml (dict): Machine learning dictionary containing the learning options.
363
+ variable_table (List): Table of variables.
364
+ outcome_table_binary (pd.DataFrame): Table of binary outcomes.
365
+ del_variants (bool, optional): If True, will delete the variants of the same feature. Defaults to True.
366
+
367
+ Returns:
368
+ List: Table of variables after feature set reduction.
369
+ """
370
+ # Initilization
371
+ n_splits = ml['fSetReduction']['FDA']['nSplits']
372
+ corr_type = ml['fSetReduction']['FDA']['corrType']
373
+ thresh_stable_start = ml['fSetReduction']['FDA']['threshStableStart']
374
+ thresh_inter_corr = ml['fSetReduction']['FDA']['threshInterCorr']
375
+ min_n_feat_stable = ml['fSetReduction']['FDA']['minNfeatStable']
376
+ min_n_feat_total = ml['fSetReduction']['FDA']['minNfeat']
377
+ seed = ml['fSetReduction']['FDA']['seed']
378
+
379
+ # Initialization - logging
380
+ if logging_dict is not None:
381
+ table_level = variable_table.Properties['Description'].split('__')[-1]
382
+ logging_dict['one_space']['unstable'][table_level] = {}
383
+ logging_dict['one_space']['inter_corr'][table_level] = {}
384
+
385
+ # Getting the correlation table for the radiomics table
386
+ radiomics_table_temp = variable_table.copy()
387
+ outcome_table_binary_temp = outcome_table_binary.copy()
388
+
389
+ # Get the correlation table
390
+ corr_table = self.__get_fda_corr_table(
391
+ radiomics_table_temp,
392
+ outcome_table_binary_temp,
393
+ n_splits,
394
+ corr_type,
395
+ seed
396
+ )
397
+
398
+ # Calculating the total numbers of features
399
+ feature_total = radiomics_table_temp.shape[1]
400
+
401
+ # Cut unstable features (Rmin cut)
402
+ if feature_total > min_n_feat_stable:
403
+ # starting threshold (set by user)
404
+ thresh_stable = thresh_stable_start
405
+ while True:
406
+ # find which features are stable
407
+ var_names_stable, corrs_stable = self.__find_fda_stable(corr_table, thresh_stable)
408
+
409
+ # Keep the best textural parameters per image space (deleting variants)
410
+ if del_variants:
411
+ var_names_stable, corrs_stable = self.__keep_best_text_param(corr_table, var_names_stable, corrs_stable)
412
+
413
+ # count the number of stable features
414
+ n_stable = var_names_stable.size
415
+
416
+ # stop if the minimum number of stable features is reached, if not, lower the threshold.
417
+ if n_stable >= min_n_feat_stable:
418
+ break
419
+ else:
420
+ thresh_stable = thresh_stable - 0.05
421
+
422
+ # stop if the threshold is zero or below
423
+ if thresh_stable <= 0:
424
+ break
425
+
426
+ # take the best mean correlation
427
+ if n_stable > min_n_feat_stable:
428
+ var_names_stable, corr_mean_stable = self.__find_fda_best_mean(corrs_stable, min_n_feat_stable)
429
+ else:
430
+ # Compute mean correlation
431
+ corr_mean_stable = corr_table.mean()
432
+
433
+ # Finalize radiomics tables before inter-correlation cut
434
+ if len(var_names_stable) > 0:
435
+ var_names = var_names_stable
436
+ if isinstance(radiomics_table_temp, pd.Series):
437
+ radiomics_table_temp = radiomics_table_temp[[var_names]]
438
+ else:
439
+ radiomics_table_temp = radiomics_table_temp[var_names]
440
+ radiomics_table_temp = finalize_rad_table(radiomics_table_temp)
441
+ else:
442
+ radiomics_table_temp = pd.DataFrame()
443
+ else:
444
+ # if there is less features than the minimal number, take them all
445
+ n_stable = feature_total
446
+
447
+ # Compute mean correlation
448
+ corr_mean_stable = corr_table.mean()
449
+
450
+ # Update logging
451
+ if logging_dict is not None:
452
+ logging_dict['one_space']['unstable'][table_level] = radiomics_table_temp.columns.shape[0]
453
+
454
+ # Inter-Correlation Cut
455
+ if radiomics_table_temp.shape[1] > 1 and n_stable > min_n_feat_total:
456
+ radiomics_table_temp = self.__remove_correlated_variables(
457
+ radiomics_table_temp,
458
+ corr_mean_stable.abs(),
459
+ corr_type,
460
+ thresh_inter_corr,
461
+ min_n_feat_total
462
+ )
463
+
464
+ # Finalize radiomics table
465
+ radiomics_table_temp = finalize_rad_table(radiomics_table_temp)
466
+
467
+ # Update logging
468
+ if logging_dict is not None:
469
+ logging_dict['one_space']['inter_corr'][table_level] = get_full_rad_names(
470
+ radiomics_table_temp.Properties['userData']['variables']['var_def'],
471
+ radiomics_table_temp.columns.values
472
+ ).tolist()
473
+
474
+ return radiomics_table_temp
475
+
476
+ def apply_fda(
477
+ self,
478
+ ml: Dict,
479
+ variable_table: List,
480
+ outcome_table_binary: pd.DataFrame,
481
+ logging: bool = True,
482
+ path_save_logging: Path = None
483
+ ) -> List:
484
+ """
485
+ Applies false discovery avoidance method.
486
+
487
+ Args:
488
+ ml (dict): Machine learning dictionary containing the learning options.
489
+ variable_table (List): Table of variables.
490
+ outcome_table_binary (pd.DataFrame): Table of binary outcomes.
491
+ logging (bool, optional): If True, will save a dict that tracks features selsected for each level. Defaults to True.
492
+ path_save_logging (Path, optional): Path to save the logging dict. Defaults to None.
493
+
494
+ Returns:
495
+ List: Table of variables after feature set reduction.
496
+ """
497
+ # Initialization
498
+ rad_tables = variable_table.copy()
499
+ n_rad_tables = len(rad_tables)
500
+ variable_tables = []
501
+ logging_dict = {'one_space': {'unstable': {}, 'inter_corr': {}}, 'final': {}}
502
+
503
+ # Apply FDA for each image space/radiomics table
504
+ for r in range(n_rad_tables):
505
+ if logging:
506
+ variable_tables.append(self.apply_fda_one_space(ml, rad_tables[r], outcome_table_binary, logging_dict=logging_dict))
507
+ else:
508
+ variable_tables.append(self.apply_fda_one_space(ml, rad_tables[r], outcome_table_binary))
509
+
510
+ # Combine radiomics tables
511
+ variable_table = combine_rad_tables(variable_tables)
512
+
513
+ # Apply FDA again on the combined radiomics table
514
+ variable_table = self.apply_fda_one_space(ml, variable_table, outcome_table_binary, del_variants=False)
515
+
516
+ # Update logging dict
517
+ if logging:
518
+ logging_dict['final'] = get_full_rad_names(variable_table.Properties['userData']['variables']['var_def'],
519
+ variable_table.columns.values).tolist()
520
+ if path_save_logging is not None:
521
+ path_save_logging = Path(path_save_logging).parent / 'fda_logging_dict.json'
522
+ save_json(path_save_logging, logging_dict, cls=NumpyEncoder)
523
+
524
+ return variable_table
525
+
526
+ def apply_fda_balanced(
527
+ self,
528
+ ml: Dict,
529
+ variable_table: List,
530
+ outcome_table_binary: pd.DataFrame,
531
+ ) -> List:
532
+ """
533
+ Applies false discovery avoidance method but balances the number of features on each level.
534
+
535
+ Args:
536
+ ml (dict): Machine learning dictionary containing the learning options.
537
+ variable_table (List): Table of variables.
538
+ outcome_table_binary (pd.DataFrame): Table of binary outcomes.
539
+ logging (bool, optional): If True, will save a dict that tracks features selsected for each level. Defaults to True.
540
+ path_save_logging (Path, optional): Path to save the logging dict. Defaults to None.
541
+
542
+ Returns:
543
+ List: Table of variables after feature set reduction.
544
+ """
545
+ # Initilization
546
+ rad_tables = variable_table.copy()
547
+ n_rad_tables = len(rad_tables)
548
+ variable_tables_all_levels = []
549
+ levels = [[], [], []]
550
+
551
+ # Organize the tables by level
552
+ for r in range(n_rad_tables):
553
+ if 'morph' in rad_tables[r].Properties['Description'].lower():
554
+ levels[0].append(rad_tables[r])
555
+ elif 'intensity' in rad_tables[r].Properties['Description'].lower():
556
+ levels[0].append(rad_tables[r])
557
+ elif 'texture' in rad_tables[r].Properties['Description'].lower():
558
+ levels[0].append(rad_tables[r])
559
+ elif 'mean' in rad_tables[r].Properties['Description'].lower() or \
560
+ 'laws' in rad_tables[r].Properties['Description'].lower() or \
561
+ 'log' in rad_tables[r].Properties['Description'].lower() or \
562
+ 'gabor' in rad_tables[r].Properties['Description'].lower() or \
563
+ 'coif' in rad_tables[r].Properties['Description'].lower() or \
564
+ 'wavelet' in rad_tables[r].Properties['Description'].lower():
565
+ levels[1].append(rad_tables[r])
566
+ elif 'glcm' in rad_tables[r].Properties['Description'].lower():
567
+ levels[2].append(rad_tables[r])
568
+
569
+ # Apply FDA for each image space/radiomics table for each level
570
+ for level in levels:
571
+ variable_tables = []
572
+ if len(level) == 0:
573
+ continue
574
+ for r in range(len(level)):
575
+ variable_tables.append(self.apply_fda_one_space(ml, level[r], outcome_table_binary))
576
+
577
+ # Combine radiomics tables
578
+ variable_table = combine_rad_tables(variable_tables)
579
+
580
+ # Apply FDA again on the combined radiomics table
581
+ variable_table = self.apply_fda_one_space(ml, variable_table, outcome_table_binary, del_variants=False)
582
+
583
+ # Add-up the tables
584
+ variable_tables_all_levels.append(variable_table)
585
+
586
+ # Combine radiomics tables of all 3 major levels (original, linear filters and textures)
587
+ variable_table_all_levels = combine_rad_tables(variable_tables_all_levels)
588
+
589
+ # Apply FDA again on the combined radiomics table
590
+ variable_table_all_levels = self.apply_fda_one_space(ml, variable_table_all_levels, outcome_table_binary, del_variants=False)
591
+
592
+ return variable_table_all_levels
593
+
594
+ def apply_random_fsr_one_space(
595
+ self,
596
+ ml: Dict,
597
+ variable_table: pd.DataFrame,
598
+ ) -> List:
599
+ seed = ml['fSetReduction']['FDA']['seed']
600
+
601
+ # Setting the seed
602
+ np.random.seed(seed)
603
+
604
+ # Random select 10 columns (features)
605
+ random_df = np.random.choice(variable_table.columns.values.tolist(), 10, replace=False)
606
+ random_df = variable_table[random_df]
607
+
608
+ return finalize_rad_table(random_df)
609
+
610
+ def apply_random_fsr(
611
+ self,
612
+ ml: Dict,
613
+ variable_table: List,
614
+ ) -> List:
615
+ """
616
+ Applies random feature set reduction by choosing a random number of features.
617
+
618
+ Args:
619
+ ml (dict): Machine learning dictionary containing the learning options.
620
+ variable_table (List): Table of variables.
621
+ outcome_table_binary (pd.DataFrame): Table of binary outcomes.
622
+
623
+ Returns:
624
+ List: Table of variables after feature set reduction.
625
+ """
626
+ # Iinitilization
627
+ rad_tables = variable_table.copy()
628
+ n_rad_tables = len(rad_tables)
629
+ variable_tables = []
630
+
631
+ # Apply FDA for each image space/radiomics table
632
+ for r in range(n_rad_tables):
633
+ variable_tables.append(self.apply_random_fsr_one_space(ml, rad_tables[r]))
634
+
635
+ # Combine radiomics tables
636
+ variable_table = combine_rad_tables(variable_tables)
637
+
638
+ # Apply FDA again on the combined radiomics table
639
+ variable_table = self.apply_random_fsr_one_space(ml, variable_table)
640
+
641
+ return variable_table
642
+
643
+ def apply_fsr(self, ml: Dict, variable_table: List, outcome_table_binary: pd.DataFrame, path_save_logging: Path = None) -> List:
644
+ """
645
+ Applies feature set reduction method.
646
+
647
+ Args:
648
+ ml (dict): Machine learning dictionary containing the learning options.
649
+ variable_table (List): Table of variables.
650
+ outcome_table_binary (pd.DataFrame): Table of binary outcomes.
651
+
652
+ Returns:
653
+ List: Table of variables after feature set reduction.
654
+ """
655
+ if self.method.lower() == "fda":
656
+ variable_table = self.apply_fda(ml, variable_table, outcome_table_binary, path_save_logging=path_save_logging)
657
+ elif self.method.lower() == "fdabalanced":
658
+ variable_table = self.apply_fda_balanced(ml, variable_table, outcome_table_binary)
659
+ elif self.method.lower() == "random":
660
+ variable_table = self.apply_random_fsr(ml, variable_table)
661
+ elif self.method == "LASSO":
662
+ raise NotImplementedError("LASSO not implemented yet.")
663
+ elif self.method == "mRMR":
664
+ raise NotImplementedError("mRMR not implemented yet.")
665
+ else:
666
+ raise ValueError("FSR method is None or unknown: " + self.method)
667
+ return variable_table