mediml 0.9.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- MEDiml/MEDscan.py +1696 -0
- MEDiml/__init__.py +21 -0
- MEDiml/biomarkers/BatchExtractor.py +806 -0
- MEDiml/biomarkers/BatchExtractorTexturalFilters.py +840 -0
- MEDiml/biomarkers/__init__.py +16 -0
- MEDiml/biomarkers/diagnostics.py +125 -0
- MEDiml/biomarkers/get_oriented_bound_box.py +158 -0
- MEDiml/biomarkers/glcm.py +1602 -0
- MEDiml/biomarkers/gldzm.py +523 -0
- MEDiml/biomarkers/glrlm.py +1315 -0
- MEDiml/biomarkers/glszm.py +555 -0
- MEDiml/biomarkers/int_vol_hist.py +527 -0
- MEDiml/biomarkers/intensity_histogram.py +615 -0
- MEDiml/biomarkers/local_intensity.py +89 -0
- MEDiml/biomarkers/morph.py +1756 -0
- MEDiml/biomarkers/ngldm.py +780 -0
- MEDiml/biomarkers/ngtdm.py +414 -0
- MEDiml/biomarkers/stats.py +373 -0
- MEDiml/biomarkers/utils.py +389 -0
- MEDiml/filters/TexturalFilter.py +299 -0
- MEDiml/filters/__init__.py +9 -0
- MEDiml/filters/apply_filter.py +134 -0
- MEDiml/filters/gabor.py +215 -0
- MEDiml/filters/laws.py +283 -0
- MEDiml/filters/log.py +147 -0
- MEDiml/filters/mean.py +121 -0
- MEDiml/filters/textural_filters_kernels.py +1738 -0
- MEDiml/filters/utils.py +107 -0
- MEDiml/filters/wavelet.py +237 -0
- MEDiml/learning/DataCleaner.py +198 -0
- MEDiml/learning/DesignExperiment.py +480 -0
- MEDiml/learning/FSR.py +667 -0
- MEDiml/learning/Normalization.py +112 -0
- MEDiml/learning/RadiomicsLearner.py +714 -0
- MEDiml/learning/Results.py +2237 -0
- MEDiml/learning/Stats.py +694 -0
- MEDiml/learning/__init__.py +10 -0
- MEDiml/learning/cleaning_utils.py +107 -0
- MEDiml/learning/ml_utils.py +1015 -0
- MEDiml/processing/__init__.py +6 -0
- MEDiml/processing/compute_suv_map.py +121 -0
- MEDiml/processing/discretisation.py +149 -0
- MEDiml/processing/interpolation.py +275 -0
- MEDiml/processing/resegmentation.py +66 -0
- MEDiml/processing/segmentation.py +912 -0
- MEDiml/utils/__init__.py +25 -0
- MEDiml/utils/batch_patients.py +45 -0
- MEDiml/utils/create_radiomics_table.py +131 -0
- MEDiml/utils/data_frame_export.py +42 -0
- MEDiml/utils/find_process_names.py +16 -0
- MEDiml/utils/get_file_paths.py +34 -0
- MEDiml/utils/get_full_rad_names.py +21 -0
- MEDiml/utils/get_institutions_from_ids.py +16 -0
- MEDiml/utils/get_patient_id_from_scan_name.py +22 -0
- MEDiml/utils/get_patient_names.py +26 -0
- MEDiml/utils/get_radiomic_names.py +27 -0
- MEDiml/utils/get_scan_name_from_rad_name.py +22 -0
- MEDiml/utils/image_reader_SITK.py +37 -0
- MEDiml/utils/image_volume_obj.py +22 -0
- MEDiml/utils/imref.py +340 -0
- MEDiml/utils/initialize_features_names.py +62 -0
- MEDiml/utils/inpolygon.py +159 -0
- MEDiml/utils/interp3.py +43 -0
- MEDiml/utils/json_utils.py +78 -0
- MEDiml/utils/mode.py +31 -0
- MEDiml/utils/parse_contour_string.py +58 -0
- MEDiml/utils/save_MEDscan.py +30 -0
- MEDiml/utils/strfind.py +32 -0
- MEDiml/utils/textureTools.py +188 -0
- MEDiml/utils/texture_features_names.py +115 -0
- MEDiml/utils/write_radiomics_csv.py +47 -0
- MEDiml/wrangling/DataManager.py +1724 -0
- MEDiml/wrangling/ProcessDICOM.py +512 -0
- MEDiml/wrangling/__init__.py +3 -0
- mediml-0.9.9.dist-info/LICENSE.md +674 -0
- mediml-0.9.9.dist-info/METADATA +232 -0
- mediml-0.9.9.dist-info/RECORD +78 -0
- mediml-0.9.9.dist-info/WHEEL +4 -0
MEDiml/learning/FSR.py
ADDED
|
@@ -0,0 +1,667 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Dict, List, Tuple
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from numpyencoder import NumpyEncoder
|
|
7
|
+
|
|
8
|
+
from MEDiml.learning.ml_utils import (combine_rad_tables, finalize_rad_table,
|
|
9
|
+
get_stratified_splits,
|
|
10
|
+
intersect_var_tables)
|
|
11
|
+
from MEDiml.utils.get_full_rad_names import get_full_rad_names
|
|
12
|
+
from MEDiml.utils.json_utils import save_json
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class FSR:
|
|
16
|
+
def __init__(self, method: str = 'fda') -> None:
|
|
17
|
+
"""
|
|
18
|
+
Feature set reduction class constructor.
|
|
19
|
+
|
|
20
|
+
Args:
|
|
21
|
+
method (str): Method of feature set reduction. Can be "FDA", "LASSO" or "mRMR".
|
|
22
|
+
"""
|
|
23
|
+
self.method = method
|
|
24
|
+
|
|
25
|
+
def __get_fda_corr_table(
|
|
26
|
+
self,
|
|
27
|
+
variable_table: pd.DataFrame,
|
|
28
|
+
outcome_table_binary: pd.DataFrame,
|
|
29
|
+
n_splits: int,
|
|
30
|
+
corr_type: str,
|
|
31
|
+
seed: int
|
|
32
|
+
) -> pd.DataFrame:
|
|
33
|
+
"""
|
|
34
|
+
Calculates the correlation table of the FDA algorithm.
|
|
35
|
+
|
|
36
|
+
Args:
|
|
37
|
+
variable_table (pd.DataFrame): variable table to check for stability.
|
|
38
|
+
outcome_table_binary (pd.DataFrame): outcome table with binary labels.
|
|
39
|
+
n_splits (int): Number of splits in the FDA algorithm (Ex: 100).
|
|
40
|
+
corr_type: String specifying the correlation type that we are investigating.
|
|
41
|
+
Must be either 'Pearson' or 'Spearman'.
|
|
42
|
+
seed (int): Random generator seed.
|
|
43
|
+
|
|
44
|
+
Returns:
|
|
45
|
+
pd.DataFrame: Correlation table of the FDA algorithm. Rows are splits, columns are features.
|
|
46
|
+
"""
|
|
47
|
+
# Setting the seed
|
|
48
|
+
np.random.seed(seed)
|
|
49
|
+
|
|
50
|
+
# Initialization
|
|
51
|
+
row_names = []
|
|
52
|
+
corr_table = pd.DataFrame()
|
|
53
|
+
fraction_for_splits = 1/3
|
|
54
|
+
number_of_splits = 1
|
|
55
|
+
|
|
56
|
+
# For each split, we calculate the correlation table
|
|
57
|
+
for s in range(n_splits):
|
|
58
|
+
row_names.append("Split_{0:03}".format(s))
|
|
59
|
+
|
|
60
|
+
# Keep only variables that are in both tables
|
|
61
|
+
_, outcome_table_binary = intersect_var_tables(variable_table, outcome_table_binary)
|
|
62
|
+
|
|
63
|
+
# Under-sample the outcome table to equalize the number of positive and negative outcomes
|
|
64
|
+
#outcome_table_binary_balanced = under_sample(outcome_table_binary)
|
|
65
|
+
|
|
66
|
+
# Get the patient teach split
|
|
67
|
+
patients_teach_splits = get_stratified_splits(
|
|
68
|
+
outcome_table_binary,
|
|
69
|
+
number_of_splits,
|
|
70
|
+
fraction_for_splits,
|
|
71
|
+
seed,
|
|
72
|
+
flag_by_cat=True
|
|
73
|
+
)[0]
|
|
74
|
+
|
|
75
|
+
# Creating a table with both the variables and the outcome with
|
|
76
|
+
# only the patient teach splits, ranked for spearman and not for pearson
|
|
77
|
+
if corr_type == 'Spearman':
|
|
78
|
+
full_table = pd.concat([variable_table.loc[patients_teach_splits, :].rank(),
|
|
79
|
+
outcome_table_binary.loc[patients_teach_splits,
|
|
80
|
+
outcome_table_binary.columns.values[-1]]], axis=1)
|
|
81
|
+
|
|
82
|
+
elif corr_type == 'Pearson':
|
|
83
|
+
# Pearson is the base method used by numpy, so we dont have to do any
|
|
84
|
+
# manipulations to the data like with spearman.
|
|
85
|
+
full_table = pd.concat([variable_table.loc[patients_teach_splits, :],
|
|
86
|
+
outcome_table_binary.loc[patients_teach_splits,
|
|
87
|
+
outcome_table_binary.columns.values[-1]]], axis=1)
|
|
88
|
+
else:
|
|
89
|
+
raise ValueError("Correlation type not recognized. Please use 'Pearson' or 'Spearman'")
|
|
90
|
+
|
|
91
|
+
# calculate the whole correlation table for all variables.
|
|
92
|
+
full_table = np.corrcoef(full_table, rowvar=False)[-1][:-1].reshape((1, -1))
|
|
93
|
+
corr_table = corr_table.append(pd.DataFrame(full_table))
|
|
94
|
+
|
|
95
|
+
# Add the metadata to the correlation table
|
|
96
|
+
corr_table.columns = list(variable_table.columns.values)
|
|
97
|
+
corr_table = corr_table.fillna(0)
|
|
98
|
+
corr_table.index = row_names
|
|
99
|
+
corr_table.Properties = {}
|
|
100
|
+
corr_table._metadata += ['Properties']
|
|
101
|
+
corr_table.Properties['description'] = variable_table.Properties['Description']
|
|
102
|
+
corr_table.Properties['userData'] = variable_table.Properties['userData']
|
|
103
|
+
|
|
104
|
+
return corr_table
|
|
105
|
+
|
|
106
|
+
def __find_fda_best_mean(self, corr_tables: pd.DataFrame, min_n_feat_stable: int) -> Tuple[Dict, pd.DataFrame]:
|
|
107
|
+
"""
|
|
108
|
+
Finds the best mean correlation of all the stable variables in the table.
|
|
109
|
+
|
|
110
|
+
Args:
|
|
111
|
+
corr_tables (Dict): dictionary containing the correlation tables of
|
|
112
|
+
dimension : [n_splits,n_features] for each table.
|
|
113
|
+
min_n_feat_stable (int): minimal number of stable features.
|
|
114
|
+
|
|
115
|
+
Returns:
|
|
116
|
+
Tuple[Dict, pd.DataFrame]: Dict containing the name of each stable variables in every table and
|
|
117
|
+
pd.DataFrame containing the mean correlation of all the stable variables in the table.
|
|
118
|
+
"""
|
|
119
|
+
# Initialization
|
|
120
|
+
var_names_stable = {}
|
|
121
|
+
corr_mean_stable = corr_tables
|
|
122
|
+
n_features = 0
|
|
123
|
+
corr_table = corr_tables
|
|
124
|
+
corr_table = corr_table.fillna(0)
|
|
125
|
+
|
|
126
|
+
# Calculation of the mean correlation among the n splits (R mean)
|
|
127
|
+
var_names_stable = corr_table.index
|
|
128
|
+
|
|
129
|
+
# Calculating the total number of features
|
|
130
|
+
n_features += var_names_stable.size
|
|
131
|
+
|
|
132
|
+
# Getting absolute values of the mean correlation
|
|
133
|
+
corr_mean_stable_abs = corr_mean_stable.abs()
|
|
134
|
+
|
|
135
|
+
# Keeping only the best features if there are more than min_n_feat_stable features
|
|
136
|
+
if n_features > min_n_feat_stable:
|
|
137
|
+
# Get min_n_feat_stable highest correlations
|
|
138
|
+
best_features = corr_mean_stable_abs.sort_values(ascending=False)[0:min_n_feat_stable]
|
|
139
|
+
var_names_stable = best_features.index.values
|
|
140
|
+
corr_mean_stable = best_features
|
|
141
|
+
|
|
142
|
+
return var_names_stable, corr_mean_stable
|
|
143
|
+
|
|
144
|
+
def __find_fda_stable(self, corr_table: pd.DataFrame, thresh_stable: float) -> Tuple[Dict, pd.DataFrame]:
|
|
145
|
+
"""
|
|
146
|
+
Finds the stable features in each correlation table
|
|
147
|
+
and the mean correlation of all the stable variables in the table.
|
|
148
|
+
|
|
149
|
+
Args:
|
|
150
|
+
corr_tables (Dict): dictionary containing the correlation tables of
|
|
151
|
+
dimension : [n_splits,n_features] for each table.
|
|
152
|
+
thresh_stable (float): the threshold deciding if a feature is stable.
|
|
153
|
+
|
|
154
|
+
Returns:
|
|
155
|
+
Tuple[Dict, pd.DataFrame]: dictionary containing the name of each stable variables in every tables
|
|
156
|
+
and table containing the mean correlation of all the stable variables in the table.
|
|
157
|
+
(The keys are the table names and the values are pd.Series).
|
|
158
|
+
"""
|
|
159
|
+
|
|
160
|
+
# Initialization
|
|
161
|
+
corr_table.fillna(0, inplace=True)
|
|
162
|
+
|
|
163
|
+
# Calculation of R mean
|
|
164
|
+
corr_mean_stable = corr_table.mean()
|
|
165
|
+
mean_r = corr_mean_stable
|
|
166
|
+
|
|
167
|
+
# Calculation of min and max
|
|
168
|
+
min_r = corr_table.quantile(0.05)
|
|
169
|
+
max_r = corr_table.quantile(0.95)
|
|
170
|
+
|
|
171
|
+
# Calculation of unstable features
|
|
172
|
+
unstable = (min_r < thresh_stable) & (mean_r > 0) | (max_r > -thresh_stable) & (mean_r < 0)
|
|
173
|
+
ind_unstable = unstable.index[unstable]
|
|
174
|
+
|
|
175
|
+
# Stable variables
|
|
176
|
+
var_names_stable = unstable.index[~unstable].values
|
|
177
|
+
corr_mean_stable = mean_r.drop(ind_unstable)
|
|
178
|
+
|
|
179
|
+
return var_names_stable, corr_mean_stable
|
|
180
|
+
|
|
181
|
+
def __keep_best_text_param(
|
|
182
|
+
self,
|
|
183
|
+
corr_table: pd.DataFrame,
|
|
184
|
+
var_names_stable: List,
|
|
185
|
+
corr_mean_stable: pd.DataFrame
|
|
186
|
+
) -> Tuple[List, pd.DataFrame]:
|
|
187
|
+
"""
|
|
188
|
+
Keeps the best texture features extraction parameters in the correlation tables
|
|
189
|
+
by dropping the variants of a given feature.
|
|
190
|
+
|
|
191
|
+
Args:
|
|
192
|
+
corr_table (pd.DataFrame): Correlation table of dimension : [n_splits,n_features].
|
|
193
|
+
var_names_stable (List): List of the stable variables in the table.
|
|
194
|
+
corr_mean_stable (pd.DataFrame): Table of the mean correlation of the stable variables in the variables table.
|
|
195
|
+
|
|
196
|
+
Returns:
|
|
197
|
+
Tuple[List, pd.DataFrame]: list of the stable variables in the tables and table containing the mean
|
|
198
|
+
correlation of all the stable variables.
|
|
199
|
+
"""
|
|
200
|
+
|
|
201
|
+
# If no stable features for the currect field, continue
|
|
202
|
+
if var_names_stable.size == 0:
|
|
203
|
+
return var_names_stable, corr_mean_stable
|
|
204
|
+
|
|
205
|
+
# Get the actual radiomics features names from the sequential names
|
|
206
|
+
full_rad_names = get_full_rad_names(
|
|
207
|
+
corr_table.Properties['userData']['variables']['var_def'],
|
|
208
|
+
var_names_stable)
|
|
209
|
+
|
|
210
|
+
# Now parsing the full names to get only the rad names and not the variant
|
|
211
|
+
rad_names = np.array([])
|
|
212
|
+
for n in range(full_rad_names.size):
|
|
213
|
+
rad_names = np.append(rad_names, full_rad_names[n].split('__')[1:2])
|
|
214
|
+
|
|
215
|
+
# Verifying if two features are the same variant and keeping the best one
|
|
216
|
+
n_var = rad_names.size
|
|
217
|
+
var_to_drop = []
|
|
218
|
+
for rad_name in rad_names:
|
|
219
|
+
# If all the features are unique, break
|
|
220
|
+
if np.unique(rad_names).size == n_var:
|
|
221
|
+
break
|
|
222
|
+
else:
|
|
223
|
+
ind_same = np.where(rad_names == rad_name)[0]
|
|
224
|
+
n_same = ind_same.size
|
|
225
|
+
if n_same > 1:
|
|
226
|
+
var_to_drop.append(list(corr_mean_stable.iloc[ind_same].sort_values().index[1:].values))
|
|
227
|
+
|
|
228
|
+
# Dropping the variants
|
|
229
|
+
if len(var_to_drop) > 0:
|
|
230
|
+
# convert to list of lists to list
|
|
231
|
+
var_to_drop = [item for sublist in var_to_drop for item in sublist]
|
|
232
|
+
|
|
233
|
+
# From the unique values of var_to_drop, drop the variants
|
|
234
|
+
for var in set(var_to_drop):
|
|
235
|
+
var_names_stable = np.delete(var_names_stable, np.where(var_names_stable == var))
|
|
236
|
+
corr_mean_stable = corr_mean_stable.drop(var)
|
|
237
|
+
|
|
238
|
+
return var_names_stable, corr_mean_stable
|
|
239
|
+
|
|
240
|
+
def __remove_correlated_variables(
|
|
241
|
+
self,
|
|
242
|
+
variable_table: pd.DataFrame,
|
|
243
|
+
rank: pd.Series,
|
|
244
|
+
corr_type: str,
|
|
245
|
+
thresh_inter_corr: float,
|
|
246
|
+
min_n_feat_total: int
|
|
247
|
+
) -> pd.DataFrame:
|
|
248
|
+
"""
|
|
249
|
+
Removes inter-correlated variables given a certain threshold.
|
|
250
|
+
|
|
251
|
+
Args:
|
|
252
|
+
variable_table (pd.DataFrame): variable table for which we want to remove intercorrelated variables.
|
|
253
|
+
Size: N X M (observations, features).
|
|
254
|
+
rank (pd.Series): Vector of correlation values per feature (of size 1 X M).
|
|
255
|
+
corr_type (str): String specifying the correlation type that we are investigating.
|
|
256
|
+
Must be 'Pearson' or 'Spearman'.
|
|
257
|
+
thresh_inter_corr (float): Numerical value specifying the threshold above which two variables are
|
|
258
|
+
considered to be correlated.
|
|
259
|
+
min_n_feat_total (int): Minimum number of features to keep in the table.
|
|
260
|
+
|
|
261
|
+
Returns:
|
|
262
|
+
pd.DataFrame: Final variable table with the least correlated variables that are kept.
|
|
263
|
+
"""
|
|
264
|
+
# Initialization
|
|
265
|
+
n_features = variable_table.shape[1]
|
|
266
|
+
|
|
267
|
+
# Compute correlation matrix
|
|
268
|
+
if corr_type == 'Spearman':
|
|
269
|
+
corr_mat = abs(np.corrcoef(variable_table.rank(), rowvar=False))
|
|
270
|
+
elif corr_type == 'Pearson':
|
|
271
|
+
corr_mat = abs(np.corrcoef(variable_table, rowvar=False))
|
|
272
|
+
else:
|
|
273
|
+
raise ValueError('corr_type must be either "Pearson" or "Spearman"')
|
|
274
|
+
|
|
275
|
+
# Set diagonal elements to Nans
|
|
276
|
+
np.fill_diagonal(corr_mat, val=np.nan)
|
|
277
|
+
|
|
278
|
+
# Calculate mean inter-variable correlation
|
|
279
|
+
mean_corr = np.nanmean(corr_mat, axis=1)
|
|
280
|
+
|
|
281
|
+
# Looping over all features once
|
|
282
|
+
# rank variables once, for meaningful variable loop.
|
|
283
|
+
ind_loop = pd.Series(mean_corr).rank(method="first") - 1
|
|
284
|
+
# Create a copy of the correlation matrix (to be modified)
|
|
285
|
+
corr_mat_temp = corr_mat.copy()
|
|
286
|
+
while True:
|
|
287
|
+
for f in range(n_features):
|
|
288
|
+
# Use index loop if not NaN
|
|
289
|
+
try:
|
|
290
|
+
i = int(ind_loop[f])
|
|
291
|
+
except:
|
|
292
|
+
i = 0
|
|
293
|
+
# Select the row of the current feature
|
|
294
|
+
row = corr_mat_temp[i][:]
|
|
295
|
+
correlated = 1*(row > thresh_inter_corr) # to turn into integers
|
|
296
|
+
|
|
297
|
+
# While the correlations are above the threshold for the select row, we select another row
|
|
298
|
+
while sum(correlated) > 0 and np.isnan(row).sum != len(row):
|
|
299
|
+
# Find the variable with the highest correlation and drop the one with the lowest rank
|
|
300
|
+
ind_max = np.nanargmax(row)
|
|
301
|
+
ind_min = np.nanargmin(np.array([rank[i], rank[ind_max]]))
|
|
302
|
+
if ind_min == 0:
|
|
303
|
+
# Drop the current row if the current feature has the lowest correlation with outcome
|
|
304
|
+
corr_mat_temp[i][:] = np.nan
|
|
305
|
+
corr_mat_temp[:][i] = np.nan
|
|
306
|
+
row[:] = np.nan
|
|
307
|
+
else:
|
|
308
|
+
# Drop the feature with the highest correlation to the current feature with the lowest correlation with outcome
|
|
309
|
+
corr_mat_temp[ind_max][:] = np.nan
|
|
310
|
+
corr_mat_temp[:][ind_max] = np.nan
|
|
311
|
+
row[ind_max] = np.nan
|
|
312
|
+
|
|
313
|
+
# Update the correlated vector
|
|
314
|
+
correlated = row > thresh_inter_corr
|
|
315
|
+
|
|
316
|
+
# If all the rows are NaN, we keep the variable with the highest rank
|
|
317
|
+
if (1*np.isnan(corr_mat_temp)).sum() == corr_mat_temp.size:
|
|
318
|
+
ind_keep = np.nanargmax(rank)
|
|
319
|
+
else:
|
|
320
|
+
ind_keep = list()
|
|
321
|
+
for row in range(corr_mat_temp.shape[0]):
|
|
322
|
+
if 1*np.isnan(corr_mat_temp[row][:]).sum() < corr_mat_temp.shape[1]:
|
|
323
|
+
ind_keep.append(row)
|
|
324
|
+
|
|
325
|
+
# if ind_keep happens to be a numpy type convert it to list for better subscripting
|
|
326
|
+
if isinstance(ind_keep, np.int64):
|
|
327
|
+
ind_keep = [ind_keep.tolist()] # work around
|
|
328
|
+
elif isinstance(ind_keep, np.ndarray):
|
|
329
|
+
ind_keep = ind_keep.tolist()
|
|
330
|
+
|
|
331
|
+
# Update threshold if the number of variables is too small or too large
|
|
332
|
+
if len(ind_keep) < min_n_feat_total:
|
|
333
|
+
# Increase the threshold (less stringent)
|
|
334
|
+
thresh_inter_corr = thresh_inter_corr + 0.05
|
|
335
|
+
corr_mat_temp = corr_mat.copy() # reset the correlation matrix
|
|
336
|
+
else:
|
|
337
|
+
break
|
|
338
|
+
|
|
339
|
+
# Make sure we have the best
|
|
340
|
+
if len(ind_keep) != min_n_feat_total:
|
|
341
|
+
# Take the features with the highest rank
|
|
342
|
+
ind_keep = sorted(ind_keep)[:min_n_feat_total]
|
|
343
|
+
|
|
344
|
+
# Creating new variable_table
|
|
345
|
+
columns = [variable_table.columns[idx] for idx in ind_keep]
|
|
346
|
+
variable_table = variable_table.loc[:, columns]
|
|
347
|
+
|
|
348
|
+
return variable_table
|
|
349
|
+
|
|
350
|
+
    def apply_fda_one_space(
        self,
        ml: Dict,
        variable_table: List,
        outcome_table_binary: pd.DataFrame,
        del_variants: bool = True,
        logging_dict: Dict = None
    ) -> List:
        """
        Applies false discovery avoidance method to a single image space.

        The pipeline is: (1) build the per-split feature/outcome correlation
        table; (2) lower the stability threshold until at least
        ``minNfeatStable`` stable features remain (optionally dropping variants
        of the same texture feature); (3) remove inter-correlated features.

        Args:
            ml (dict): Machine learning dictionary containing the learning options
                (the ``fSetReduction.FDA`` sub-dict is read here).
            variable_table (List): Table of variables.
            outcome_table_binary (pd.DataFrame): Table of binary outcomes.
            del_variants (bool, optional): If True, will delete the variants of the same feature. Defaults to True.
            logging_dict (Dict, optional): If provided, per-level counts/names of the
                selected features are recorded in it (mutated in place). Defaults to None.

        Returns:
            List: Table of variables after feature set reduction.
        """
        # Initialization: unpack the FDA hyper-parameters
        n_splits = ml['fSetReduction']['FDA']['nSplits']
        corr_type = ml['fSetReduction']['FDA']['corrType']
        thresh_stable_start = ml['fSetReduction']['FDA']['threshStableStart']
        thresh_inter_corr = ml['fSetReduction']['FDA']['threshInterCorr']
        min_n_feat_stable = ml['fSetReduction']['FDA']['minNfeatStable']
        min_n_feat_total = ml['fSetReduction']['FDA']['minNfeat']
        seed = ml['fSetReduction']['FDA']['seed']

        # Initialization - logging (table_level is the last '__'-separated token
        # of the table description)
        if logging_dict is not None:
            table_level = variable_table.Properties['Description'].split('__')[-1]
            logging_dict['one_space']['unstable'][table_level] = {}
            logging_dict['one_space']['inter_corr'][table_level] = {}

        # Work on copies so the caller's tables are not mutated here
        radiomics_table_temp = variable_table.copy()
        outcome_table_binary_temp = outcome_table_binary.copy()

        # Get the correlation table (rows = splits, columns = features)
        corr_table = self.__get_fda_corr_table(
            radiomics_table_temp,
            outcome_table_binary_temp,
            n_splits,
            corr_type,
            seed
        )

        # Calculating the total numbers of features
        feature_total = radiomics_table_temp.shape[1]

        # Cut unstable features (Rmin cut)
        if feature_total > min_n_feat_stable:
            # starting threshold (set by user)
            thresh_stable = thresh_stable_start
            while True:
                # find which features are stable at the current threshold
                var_names_stable, corrs_stable = self.__find_fda_stable(corr_table, thresh_stable)

                # Keep the best textural parameters per image space (deleting variants)
                if del_variants:
                    var_names_stable, corrs_stable = self.__keep_best_text_param(corr_table, var_names_stable, corrs_stable)

                # count the number of stable features
                n_stable = var_names_stable.size

                # stop if the minimum number of stable features is reached, if not, lower the threshold.
                if n_stable >= min_n_feat_stable:
                    break
                else:
                    thresh_stable = thresh_stable - 0.05

                    # stop if the threshold is zero or below
                    if thresh_stable <= 0:
                        break

            # take the best mean correlation when too many features are stable
            if n_stable > min_n_feat_stable:
                var_names_stable, corr_mean_stable = self.__find_fda_best_mean(corrs_stable, min_n_feat_stable)
            else:
                # Compute mean correlation across splits
                corr_mean_stable = corr_table.mean()

            # Finalize radiomics tables before inter-correlation cut
            if len(var_names_stable) > 0:
                var_names = var_names_stable
                # Series needs a nested list to select by a list of labels
                if isinstance(radiomics_table_temp, pd.Series):
                    radiomics_table_temp = radiomics_table_temp[[var_names]]
                else:
                    radiomics_table_temp = radiomics_table_temp[var_names]
                radiomics_table_temp = finalize_rad_table(radiomics_table_temp)
            else:
                # No stable features survived: empty table
                # NOTE(review): the logging/finalization below assumes a table with
                # Properties metadata; an empty DataFrame here may break the final
                # logging step — confirm upstream always yields >=1 stable feature.
                radiomics_table_temp = pd.DataFrame()
        else:
            # if there is less features than the minimal number, take them all
            n_stable = feature_total

            # Compute mean correlation across splits
            corr_mean_stable = corr_table.mean()

        # Update logging with the number of features surviving the stability cut
        if logging_dict is not None:
            logging_dict['one_space']['unstable'][table_level] = radiomics_table_temp.columns.shape[0]

        # Inter-Correlation Cut (only meaningful with >1 feature)
        if radiomics_table_temp.shape[1] > 1 and n_stable > min_n_feat_total:
            radiomics_table_temp = self.__remove_correlated_variables(
                radiomics_table_temp,
                corr_mean_stable.abs(),
                corr_type,
                thresh_inter_corr,
                min_n_feat_total
            )

            # Finalize radiomics table (refresh its metadata)
            radiomics_table_temp = finalize_rad_table(radiomics_table_temp)

        # Update logging with the full names of the finally selected features
        if logging_dict is not None:
            logging_dict['one_space']['inter_corr'][table_level] = get_full_rad_names(
                radiomics_table_temp.Properties['userData']['variables']['var_def'],
                radiomics_table_temp.columns.values
            ).tolist()

        return radiomics_table_temp
|
|
475
|
+
|
|
476
|
+
def apply_fda(
|
|
477
|
+
self,
|
|
478
|
+
ml: Dict,
|
|
479
|
+
variable_table: List,
|
|
480
|
+
outcome_table_binary: pd.DataFrame,
|
|
481
|
+
logging: bool = True,
|
|
482
|
+
path_save_logging: Path = None
|
|
483
|
+
) -> List:
|
|
484
|
+
"""
|
|
485
|
+
Applies false discovery avoidance method.
|
|
486
|
+
|
|
487
|
+
Args:
|
|
488
|
+
ml (dict): Machine learning dictionary containing the learning options.
|
|
489
|
+
variable_table (List): Table of variables.
|
|
490
|
+
outcome_table_binary (pd.DataFrame): Table of binary outcomes.
|
|
491
|
+
logging (bool, optional): If True, will save a dict that tracks features selsected for each level. Defaults to True.
|
|
492
|
+
path_save_logging (Path, optional): Path to save the logging dict. Defaults to None.
|
|
493
|
+
|
|
494
|
+
Returns:
|
|
495
|
+
List: Table of variables after feature set reduction.
|
|
496
|
+
"""
|
|
497
|
+
# Initialization
|
|
498
|
+
rad_tables = variable_table.copy()
|
|
499
|
+
n_rad_tables = len(rad_tables)
|
|
500
|
+
variable_tables = []
|
|
501
|
+
logging_dict = {'one_space': {'unstable': {}, 'inter_corr': {}}, 'final': {}}
|
|
502
|
+
|
|
503
|
+
# Apply FDA for each image space/radiomics table
|
|
504
|
+
for r in range(n_rad_tables):
|
|
505
|
+
if logging:
|
|
506
|
+
variable_tables.append(self.apply_fda_one_space(ml, rad_tables[r], outcome_table_binary, logging_dict=logging_dict))
|
|
507
|
+
else:
|
|
508
|
+
variable_tables.append(self.apply_fda_one_space(ml, rad_tables[r], outcome_table_binary))
|
|
509
|
+
|
|
510
|
+
# Combine radiomics tables
|
|
511
|
+
variable_table = combine_rad_tables(variable_tables)
|
|
512
|
+
|
|
513
|
+
# Apply FDA again on the combined radiomics table
|
|
514
|
+
variable_table = self.apply_fda_one_space(ml, variable_table, outcome_table_binary, del_variants=False)
|
|
515
|
+
|
|
516
|
+
# Update logging dict
|
|
517
|
+
if logging:
|
|
518
|
+
logging_dict['final'] = get_full_rad_names(variable_table.Properties['userData']['variables']['var_def'],
|
|
519
|
+
variable_table.columns.values).tolist()
|
|
520
|
+
if path_save_logging is not None:
|
|
521
|
+
path_save_logging = Path(path_save_logging).parent / 'fda_logging_dict.json'
|
|
522
|
+
save_json(path_save_logging, logging_dict, cls=NumpyEncoder)
|
|
523
|
+
|
|
524
|
+
return variable_table
|
|
525
|
+
|
|
526
|
+
def apply_fda_balanced(
|
|
527
|
+
self,
|
|
528
|
+
ml: Dict,
|
|
529
|
+
variable_table: List,
|
|
530
|
+
outcome_table_binary: pd.DataFrame,
|
|
531
|
+
) -> List:
|
|
532
|
+
"""
|
|
533
|
+
Applies false discovery avoidance method but balances the number of features on each level.
|
|
534
|
+
|
|
535
|
+
Args:
|
|
536
|
+
ml (dict): Machine learning dictionary containing the learning options.
|
|
537
|
+
variable_table (List): Table of variables.
|
|
538
|
+
outcome_table_binary (pd.DataFrame): Table of binary outcomes.
|
|
539
|
+
logging (bool, optional): If True, will save a dict that tracks features selsected for each level. Defaults to True.
|
|
540
|
+
path_save_logging (Path, optional): Path to save the logging dict. Defaults to None.
|
|
541
|
+
|
|
542
|
+
Returns:
|
|
543
|
+
List: Table of variables after feature set reduction.
|
|
544
|
+
"""
|
|
545
|
+
# Initilization
|
|
546
|
+
rad_tables = variable_table.copy()
|
|
547
|
+
n_rad_tables = len(rad_tables)
|
|
548
|
+
variable_tables_all_levels = []
|
|
549
|
+
levels = [[], [], []]
|
|
550
|
+
|
|
551
|
+
# Organize the tables by level
|
|
552
|
+
for r in range(n_rad_tables):
|
|
553
|
+
if 'morph' in rad_tables[r].Properties['Description'].lower():
|
|
554
|
+
levels[0].append(rad_tables[r])
|
|
555
|
+
elif 'intensity' in rad_tables[r].Properties['Description'].lower():
|
|
556
|
+
levels[0].append(rad_tables[r])
|
|
557
|
+
elif 'texture' in rad_tables[r].Properties['Description'].lower():
|
|
558
|
+
levels[0].append(rad_tables[r])
|
|
559
|
+
elif 'mean' in rad_tables[r].Properties['Description'].lower() or \
|
|
560
|
+
'laws' in rad_tables[r].Properties['Description'].lower() or \
|
|
561
|
+
'log' in rad_tables[r].Properties['Description'].lower() or \
|
|
562
|
+
'gabor' in rad_tables[r].Properties['Description'].lower() or \
|
|
563
|
+
'coif' in rad_tables[r].Properties['Description'].lower() or \
|
|
564
|
+
'wavelet' in rad_tables[r].Properties['Description'].lower():
|
|
565
|
+
levels[1].append(rad_tables[r])
|
|
566
|
+
elif 'glcm' in rad_tables[r].Properties['Description'].lower():
|
|
567
|
+
levels[2].append(rad_tables[r])
|
|
568
|
+
|
|
569
|
+
# Apply FDA for each image space/radiomics table for each level
|
|
570
|
+
for level in levels:
|
|
571
|
+
variable_tables = []
|
|
572
|
+
if len(level) == 0:
|
|
573
|
+
continue
|
|
574
|
+
for r in range(len(level)):
|
|
575
|
+
variable_tables.append(self.apply_fda_one_space(ml, level[r], outcome_table_binary))
|
|
576
|
+
|
|
577
|
+
# Combine radiomics tables
|
|
578
|
+
variable_table = combine_rad_tables(variable_tables)
|
|
579
|
+
|
|
580
|
+
# Apply FDA again on the combined radiomics table
|
|
581
|
+
variable_table = self.apply_fda_one_space(ml, variable_table, outcome_table_binary, del_variants=False)
|
|
582
|
+
|
|
583
|
+
# Add-up the tables
|
|
584
|
+
variable_tables_all_levels.append(variable_table)
|
|
585
|
+
|
|
586
|
+
# Combine radiomics tables of all 3 major levels (original, linear filters and textures)
|
|
587
|
+
variable_table_all_levels = combine_rad_tables(variable_tables_all_levels)
|
|
588
|
+
|
|
589
|
+
# Apply FDA again on the combined radiomics table
|
|
590
|
+
variable_table_all_levels = self.apply_fda_one_space(ml, variable_table_all_levels, outcome_table_binary, del_variants=False)
|
|
591
|
+
|
|
592
|
+
return variable_table_all_levels
|
|
593
|
+
|
|
594
|
+
def apply_random_fsr_one_space(
|
|
595
|
+
self,
|
|
596
|
+
ml: Dict,
|
|
597
|
+
variable_table: pd.DataFrame,
|
|
598
|
+
) -> List:
|
|
599
|
+
seed = ml['fSetReduction']['FDA']['seed']
|
|
600
|
+
|
|
601
|
+
# Setting the seed
|
|
602
|
+
np.random.seed(seed)
|
|
603
|
+
|
|
604
|
+
# Random select 10 columns (features)
|
|
605
|
+
random_df = np.random.choice(variable_table.columns.values.tolist(), 10, replace=False)
|
|
606
|
+
random_df = variable_table[random_df]
|
|
607
|
+
|
|
608
|
+
return finalize_rad_table(random_df)
|
|
609
|
+
|
|
610
|
+
def apply_random_fsr(
|
|
611
|
+
self,
|
|
612
|
+
ml: Dict,
|
|
613
|
+
variable_table: List,
|
|
614
|
+
) -> List:
|
|
615
|
+
"""
|
|
616
|
+
Applies random feature set reduction by choosing a random number of features.
|
|
617
|
+
|
|
618
|
+
Args:
|
|
619
|
+
ml (dict): Machine learning dictionary containing the learning options.
|
|
620
|
+
variable_table (List): Table of variables.
|
|
621
|
+
outcome_table_binary (pd.DataFrame): Table of binary outcomes.
|
|
622
|
+
|
|
623
|
+
Returns:
|
|
624
|
+
List: Table of variables after feature set reduction.
|
|
625
|
+
"""
|
|
626
|
+
# Iinitilization
|
|
627
|
+
rad_tables = variable_table.copy()
|
|
628
|
+
n_rad_tables = len(rad_tables)
|
|
629
|
+
variable_tables = []
|
|
630
|
+
|
|
631
|
+
# Apply FDA for each image space/radiomics table
|
|
632
|
+
for r in range(n_rad_tables):
|
|
633
|
+
variable_tables.append(self.apply_random_fsr_one_space(ml, rad_tables[r]))
|
|
634
|
+
|
|
635
|
+
# Combine radiomics tables
|
|
636
|
+
variable_table = combine_rad_tables(variable_tables)
|
|
637
|
+
|
|
638
|
+
# Apply FDA again on the combined radiomics table
|
|
639
|
+
variable_table = self.apply_random_fsr_one_space(ml, variable_table)
|
|
640
|
+
|
|
641
|
+
return variable_table
|
|
642
|
+
|
|
643
|
+
def apply_fsr(self, ml: Dict, variable_table: List, outcome_table_binary: pd.DataFrame, path_save_logging: Path = None) -> List:
|
|
644
|
+
"""
|
|
645
|
+
Applies feature set reduction method.
|
|
646
|
+
|
|
647
|
+
Args:
|
|
648
|
+
ml (dict): Machine learning dictionary containing the learning options.
|
|
649
|
+
variable_table (List): Table of variables.
|
|
650
|
+
outcome_table_binary (pd.DataFrame): Table of binary outcomes.
|
|
651
|
+
|
|
652
|
+
Returns:
|
|
653
|
+
List: Table of variables after feature set reduction.
|
|
654
|
+
"""
|
|
655
|
+
if self.method.lower() == "fda":
|
|
656
|
+
variable_table = self.apply_fda(ml, variable_table, outcome_table_binary, path_save_logging=path_save_logging)
|
|
657
|
+
elif self.method.lower() == "fdabalanced":
|
|
658
|
+
variable_table = self.apply_fda_balanced(ml, variable_table, outcome_table_binary)
|
|
659
|
+
elif self.method.lower() == "random":
|
|
660
|
+
variable_table = self.apply_random_fsr(ml, variable_table)
|
|
661
|
+
elif self.method == "LASSO":
|
|
662
|
+
raise NotImplementedError("LASSO not implemented yet.")
|
|
663
|
+
elif self.method == "mRMR":
|
|
664
|
+
raise NotImplementedError("mRMR not implemented yet.")
|
|
665
|
+
else:
|
|
666
|
+
raise ValueError("FSR method is None or unknown: " + self.method)
|
|
667
|
+
return variable_table
|