mediml-0.9.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- MEDiml/MEDscan.py +1696 -0
- MEDiml/__init__.py +21 -0
- MEDiml/biomarkers/BatchExtractor.py +806 -0
- MEDiml/biomarkers/BatchExtractorTexturalFilters.py +840 -0
- MEDiml/biomarkers/__init__.py +16 -0
- MEDiml/biomarkers/diagnostics.py +125 -0
- MEDiml/biomarkers/get_oriented_bound_box.py +158 -0
- MEDiml/biomarkers/glcm.py +1602 -0
- MEDiml/biomarkers/gldzm.py +523 -0
- MEDiml/biomarkers/glrlm.py +1315 -0
- MEDiml/biomarkers/glszm.py +555 -0
- MEDiml/biomarkers/int_vol_hist.py +527 -0
- MEDiml/biomarkers/intensity_histogram.py +615 -0
- MEDiml/biomarkers/local_intensity.py +89 -0
- MEDiml/biomarkers/morph.py +1756 -0
- MEDiml/biomarkers/ngldm.py +780 -0
- MEDiml/biomarkers/ngtdm.py +414 -0
- MEDiml/biomarkers/stats.py +373 -0
- MEDiml/biomarkers/utils.py +389 -0
- MEDiml/filters/TexturalFilter.py +299 -0
- MEDiml/filters/__init__.py +9 -0
- MEDiml/filters/apply_filter.py +134 -0
- MEDiml/filters/gabor.py +215 -0
- MEDiml/filters/laws.py +283 -0
- MEDiml/filters/log.py +147 -0
- MEDiml/filters/mean.py +121 -0
- MEDiml/filters/textural_filters_kernels.py +1738 -0
- MEDiml/filters/utils.py +107 -0
- MEDiml/filters/wavelet.py +237 -0
- MEDiml/learning/DataCleaner.py +198 -0
- MEDiml/learning/DesignExperiment.py +480 -0
- MEDiml/learning/FSR.py +667 -0
- MEDiml/learning/Normalization.py +112 -0
- MEDiml/learning/RadiomicsLearner.py +714 -0
- MEDiml/learning/Results.py +2237 -0
- MEDiml/learning/Stats.py +694 -0
- MEDiml/learning/__init__.py +10 -0
- MEDiml/learning/cleaning_utils.py +107 -0
- MEDiml/learning/ml_utils.py +1015 -0
- MEDiml/processing/__init__.py +6 -0
- MEDiml/processing/compute_suv_map.py +121 -0
- MEDiml/processing/discretisation.py +149 -0
- MEDiml/processing/interpolation.py +275 -0
- MEDiml/processing/resegmentation.py +66 -0
- MEDiml/processing/segmentation.py +912 -0
- MEDiml/utils/__init__.py +25 -0
- MEDiml/utils/batch_patients.py +45 -0
- MEDiml/utils/create_radiomics_table.py +131 -0
- MEDiml/utils/data_frame_export.py +42 -0
- MEDiml/utils/find_process_names.py +16 -0
- MEDiml/utils/get_file_paths.py +34 -0
- MEDiml/utils/get_full_rad_names.py +21 -0
- MEDiml/utils/get_institutions_from_ids.py +16 -0
- MEDiml/utils/get_patient_id_from_scan_name.py +22 -0
- MEDiml/utils/get_patient_names.py +26 -0
- MEDiml/utils/get_radiomic_names.py +27 -0
- MEDiml/utils/get_scan_name_from_rad_name.py +22 -0
- MEDiml/utils/image_reader_SITK.py +37 -0
- MEDiml/utils/image_volume_obj.py +22 -0
- MEDiml/utils/imref.py +340 -0
- MEDiml/utils/initialize_features_names.py +62 -0
- MEDiml/utils/inpolygon.py +159 -0
- MEDiml/utils/interp3.py +43 -0
- MEDiml/utils/json_utils.py +78 -0
- MEDiml/utils/mode.py +31 -0
- MEDiml/utils/parse_contour_string.py +58 -0
- MEDiml/utils/save_MEDscan.py +30 -0
- MEDiml/utils/strfind.py +32 -0
- MEDiml/utils/textureTools.py +188 -0
- MEDiml/utils/texture_features_names.py +115 -0
- MEDiml/utils/write_radiomics_csv.py +47 -0
- MEDiml/wrangling/DataManager.py +1724 -0
- MEDiml/wrangling/ProcessDICOM.py +512 -0
- MEDiml/wrangling/__init__.py +3 -0
- mediml-0.9.9.dist-info/LICENSE.md +674 -0
- mediml-0.9.9.dist-info/METADATA +232 -0
- mediml-0.9.9.dist-info/RECORD +78 -0
- mediml-0.9.9.dist-info/WHEEL +4 -0
MEDiml/learning/Stats.py
ADDED
@@ -0,0 +1,694 @@
+# Description: All the functions related to statistics (p-values, metrics, etc.)
+
+import os
+from pathlib import Path
+from typing import List, Tuple
+import warnings
+
+import numpy as np
+import pandas as pd
+import scipy.stats
+from sklearn import metrics
+
+from MEDiml.utils.json_utils import load_json
+
+
+class Stats:
+    """
+    A class to perform statistical analysis on experiment results.
+
+    This class provides methods to retrieve patient IDs, predictions, and metrics from experiment data,
+    as well as to compute p-values for model comparison using various methods.
+
+    Args:
+        path_experiment (Path): Path to the folder containing the experiment data.
+        experiment (str): Name of the experiment.
+        levels (List): List of radiomics levels to analyze.
+        modalities (List): List of modalities to analyze.
+
+    Attributes:
+        path_experiment (Path): Path to the folder containing the experiment data.
+        experiment (str): Name of the experiment.
+        levels (List): List of radiomics levels to analyze.
+        modalities (List): List of modalities to analyze.
+    """
+    def __init__(self, path_experiment: Path, experiment: str = "", levels: List = [], modalities: List = []):
+        # Initialization
+        self.path_experiment = path_experiment
+        self.experiment = experiment
+        self.levels = levels
+        self.modalities = modalities
+
+        # Safety assertion
+        self.__safety_assertion()
+
+    def __get_models_dicts(self, split_idx: int) -> Tuple[Path, Path]:
+        """
+        Retrieves the paths to both models' results dictionaries for a given split.
+
+        Args:
+            split_idx (int): Index of the split.
+
+        Returns:
+            Tuple[Path, Path]: Paths to the two models' results dictionaries.
+        """
+        # Get level and modality
+        if len(self.modalities) == 1:
+            # Two levels compared within a single modality
+            path_json_1 = self.__get_path_json(self.levels[0], self.modalities[0], split_idx)
+            path_json_2 = self.__get_path_json(self.levels[1], self.modalities[0], split_idx)
+        else:
+            # Two modalities compared at a single level
+            path_json_1 = self.__get_path_json(self.levels[0], self.modalities[0], split_idx)
+            path_json_2 = self.__get_path_json(self.levels[0], self.modalities[1], split_idx)
+        return path_json_1, path_json_2
+
+    def __safety_assertion(self):
+        """
+        Asserts that the input parameters are correct.
+        """
+        if len(self.modalities) == 1:
+            assert len(self.levels) == 2, \
+                "For statistical analysis, the number of levels must be 2 for a single modality, or 1 for two modalities"
+        elif len(self.modalities) == 2:
+            assert len(self.levels) == 1, \
+                "For statistical analysis, the number of levels must be 1 for two modalities, or 2 for a single modality"
+        else:
+            raise ValueError("The number of modalities must be 1 or 2")
+
+    def __get_path_json(self, level: str, modality: str, split_idx: int) -> Path:
+        """
+        Retrieves the path to the model's results dictionary for a given split.
+
+        Args:
+            level (str): Radiomics level.
+            modality (str): Modality.
+            split_idx (int): Index of the split.
+
+        Returns:
+            Path: Path to the model's results dictionary.
+        """
+        return self.path_experiment / f'learn__{self.experiment}_{level}_{modality}' / f'test__{split_idx:03d}' / 'run_results.json'
+
+    def __get_patients_and_predictions(
+        self,
+        split_idx: int
+    ) -> tuple:
+        """
+        Retrieves the patient IDs and the predictions of both models for a given split.
+
+        Args:
+            split_idx (int): Index of the split.
+
+        Returns:
+            tuple: Tuple containing the patient IDs, the predictions of the first model and the predictions of the second model.
+        """
+        # Get models dicts
+        path_json_1, path_json_2 = self.__get_models_dicts(split_idx)
+
+        # Load models dicts
+        model_one = load_json(path_json_1)
+        model_two = load_json(path_json_2)
+
+        # Get models names
+        name_model_one = list(model_one.keys())[0]
+        name_model_two = list(model_two.keys())[0]
+
+        # Get predictions
+        predictions_one = np.array(model_one[name_model_one]['test']['response'])
+        predictions_one = np.reshape(predictions_one, (predictions_one.shape[0])).tolist()
+        predictions_two = np.array(model_two[name_model_two]['test']['response'])
+        predictions_two = np.reshape(predictions_two, (predictions_two.shape[0])).tolist()
+
+        # Get patients ids
+        patients_ids_one = model_one[name_model_one]['test']['patients']
+        patients_ids_two = model_two[name_model_two]['test']['patients']
+
+        # Check if the number of patients is the same; if not, drop the patients missing
+        # from the other model (prediction and ID removed in lockstep so indices stay aligned)
+        if len(patients_ids_one) > len(patients_ids_two):
+            # Warn the user
+            warnings.warn("The number of patients is different for both models. Patients will be deleted to match the number of patients.")
+
+            # Delete patients
+            for patient_id in list(patients_ids_one):
+                if patient_id not in patients_ids_two:
+                    predictions_one.pop(patients_ids_one.index(patient_id))
+                    patients_ids_one.remove(patient_id)
+        elif len(patients_ids_one) < len(patients_ids_two):
+            # Warn the user
+            warnings.warn("The number of patients is different for both models. Patients will be deleted to match the number of patients.")
+
+            # Delete patients
+            for patient_id in list(patients_ids_two):
+                if patient_id not in patients_ids_one:
+                    predictions_two.pop(patients_ids_two.index(patient_id))
+                    patients_ids_two.remove(patient_id)
+
+        # Check if the patient IDs are the same
+        if patients_ids_one != patients_ids_two:
+            raise ValueError("The patient IDs must be the same for both models")
+
+        # Check if the number of predictions is the same
+        if len(predictions_one) != len(predictions_two):
+            raise ValueError("The number of predictions must be the same for both models")
+
+        return patients_ids_one, predictions_one, predictions_two
+
+    def __calc_pvalue(self, aucs: np.ndarray, sigma: np.ndarray) -> float:
+        """
+        Computes the p-value of the AUCs distribution.
+
+        Args:
+            aucs (np.ndarray): 1D array of AUCs.
+            sigma (np.ndarray): DeLong covariance matrix of the AUCs.
+
+        Returns:
+            float: p-value of the AUCs.
+        """
+        l = np.array([[1, -1]])
+        z = np.abs(np.diff(aucs)) / np.sqrt(np.dot(np.dot(l, sigma), l.T))
+        p_value = 2 * scipy.stats.norm.sf(z, loc=0, scale=1)
+        return p_value
+
+    def __corrected_std(self, differences: np.ndarray, n_train: int, n_test: int) -> float:
+        """
+        Corrects standard deviation using Nadeau and Bengio's approach.
+
+        Args:
+            differences (np.ndarray): Vector containing the differences in the score metrics of two models.
+            n_train (int): Number of samples in the training set.
+            n_test (int): Number of samples in the testing set.
+
+        Returns:
+            float: Variance-corrected standard deviation of the set of differences.
+
+        Reference:
+            `Statistical comparison of models
+            <https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_stats.html#comparing-two-models-frequentist-approach>`_
+        """
+        # kr = k times r, r-times repeated k-fold cross-validation;
+        # kr equals the number of times the model was evaluated
+        kr = len(differences)
+        corrected_var = np.var(differences, ddof=1) * (1 / kr + n_test / n_train)
+        corrected_std = np.sqrt(corrected_var)
+        return corrected_std
+
+    def __compute_midrank(self, x: np.ndarray) -> np.ndarray:
+        """
+        Computes midranks for the DeLong p-value.
+
+        Args:
+            x (np.ndarray): 1D array of probabilities.
+
+        Returns:
+            np.ndarray: Midranks.
+        """
+        J = np.argsort(x)
+        Z = x[J]
+        N = len(x)
+        T = np.zeros(N, dtype=float)
+        i = 0
+        while i < N:
+            j = i
+            while j < N and Z[j] == Z[i]:
+                j += 1
+            T[i:j] = 0.5 * (i + j - 1)
+            i = j
+        T2 = np.empty(N, dtype=float)
+        # Note(kazeevn) +1 is due to Python using 0-based indexing
+        # instead of 1-based in the AUC formula in the paper
+        T2[J] = T + 1
+        return T2
+
+    def __fast_delong(self, predictions_sorted_transposed: np.ndarray, label_1_count: int) -> Tuple[np.ndarray, np.ndarray]:
+        """
+        Computes the empirical AUC and its covariance using the fast version of DeLong's method.
+
+        Args:
+            predictions_sorted_transposed (np.ndarray): 2D array [n_classifiers, n_examples]
+                sorted such that the examples with label "1" come first.
+            label_1_count (int): Number of examples with label "1".
+
+        Returns:
+            Tuple[np.ndarray, np.ndarray]: (AUC values, DeLong covariance matrix)
+
+        Reference:
+            `Python fast DeLong implementation <https://github.com/yandexdataschool/roc_comparison/tree/master>`_
+
+            @article{sun2014fast,
+                title={Fast Implementation of DeLong's Algorithm for
+                       Comparing the Areas Under Correlated Receiver Operating Characteristic Curves},
+                author={Xu Sun and Weichao Xu},
+                journal={IEEE Signal Processing Letters},
+                volume={21},
+                number={11},
+                pages={1389--1393},
+                year={2014},
+                publisher={IEEE}
+            }
+        """
+        # Short variables are named as they are in the paper
+        m = label_1_count
+        n = predictions_sorted_transposed.shape[1] - m
+        positive_examples = predictions_sorted_transposed[:, :m]
+        negative_examples = predictions_sorted_transposed[:, m:]
+        k = predictions_sorted_transposed.shape[0]
+
+        tx = np.empty([k, m], dtype=float)
+        ty = np.empty([k, n], dtype=float)
+        tz = np.empty([k, m + n], dtype=float)
+        for r in range(k):
+            tx[r, :] = self.__compute_midrank(positive_examples[r, :])
+            ty[r, :] = self.__compute_midrank(negative_examples[r, :])
+            tz[r, :] = self.__compute_midrank(predictions_sorted_transposed[r, :])
+        aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
+        v01 = (tz[:, :m] - tx[:, :]) / n
+        v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
+        sx = np.cov(v01)
+        sy = np.cov(v10)
+        delongcov = sx / m + sy / n
+
+        return aucs, delongcov
+
+    def __compute_ground_truth_statistics(self, ground_truth: np.ndarray) -> Tuple[np.ndarray, int]:
+        """
+        Computes the ordering of the ground truth and the number of positive examples.
+
+        Args:
+            ground_truth (np.ndarray): Array of 0s and 1s.
+
+        Returns:
+            Tuple[np.ndarray, int]: Ordering that puts the positive examples first, and the number of positive examples.
+        """
+        assert np.array_equal(np.unique(ground_truth), [0, 1])
+        order = (-ground_truth).argsort()
+        label_1_count = int(ground_truth.sum())
+        return order, label_1_count
+
+    def __get_metrics(self, metric: str, split_idx: int) -> tuple:
+        """
+        Retrieves the value of the given metric for both models for a given split.
+
+        Args:
+            metric (str): Metric to retrieve.
+            split_idx (int): Index of the split.
+
+        Returns:
+            tuple: Tuple containing the metric of the first model and the metric of the second model.
+        """
+        # Get models dicts
+        path_json_1, path_json_2 = self.__get_models_dicts(split_idx)
+
+        # Load models dicts
+        model_one = load_json(path_json_1)
+        model_two = load_json(path_json_2)
+
+        # Get models names
+        name_model_one = list(model_one.keys())[0]
+        name_model_two = list(model_two.keys())[0]
+
+        # Get metrics
+        metric_one = model_one[name_model_one]['test']['metrics'][metric]
+        metric_two = model_two[name_model_two]['test']['metrics'][metric]
+
+        return metric_one, metric_two
+
+    def __delong_roc_test(self, ground_truth: np.ndarray, predictions_one: list, predictions_two: list) -> float:
+        """
+        Computes the p-value for the hypothesis that two ROC AUCs are different.
+
+        Args:
+            ground_truth (np.ndarray): Array of 0s and 1s.
+            predictions_one (list): Probabilities of being class 1 for the first model.
+            predictions_two (list): Probabilities of being class 1 for the second model.
+
+        Returns:
+            float: p-value of the AUCs.
+        """
+        order, label_1_count = self.__compute_ground_truth_statistics(ground_truth)
+        predictions_sorted_transposed = np.vstack((predictions_one, predictions_two))[:, order]
+        aucs, delongcov = self.__fast_delong(predictions_sorted_transposed, label_1_count)
+        return self.__calc_pvalue(aucs, delongcov)
+
+    @staticmethod
+    def get_aggregated_metric(
+        path_experiment: Path,
+        experiment: str,
+        level: str,
+        modality: str,
+        metric: str
+    ) -> float:
+        """
+        Computes the given metric aggregated over all the splits of the given experiment.
+
+        Args:
+            path_experiment (Path): Path to the folder containing the experiment.
+            experiment (str): Name of the experiment.
+            level (str): Radiomics level. For example: 'morph'.
+            modality (str): Modality to analyze.
+            metric (str): Metric to analyze.
+
+        Returns:
+            float: Aggregated value of the metric.
+        """
+        # Load outcomes dataframe
+        try:
+            outcomes = pd.read_csv(path_experiment / "outcomes.csv", sep=',')
+        except FileNotFoundError:
+            outcomes = pd.read_csv(path_experiment.parent / "outcomes.csv", sep=',')
+
+        # Initialization
+        predictions_all = list()
+        patients_ids_all = list()
+        nb_split = len([x[0] for x in os.walk(path_experiment / f'learn__{experiment}_{level}_{modality}')]) - 1  # number of split (test__XXX) subfolders
+
+        # For each split
+        for i in range(1, nb_split + 1):
+            # Load ground truths and predictions
+            path_json = path_experiment / f'learn__{experiment}_{level}_{modality}' / f'test__{i:03d}' / 'run_results.json'
+
+            # Load model dict
+            model = load_json(path_json)
+
+            # Get model name
+            name_model = list(model.keys())[0]
+
+            # Get model's threshold
+            thresh = model[name_model]['threshold']
+
+            # Get predictions
+            predictions = np.array(model[name_model]['test']['response'])
+            predictions = np.reshape(predictions, (predictions.shape[0])).tolist()
+
+            # Shift all predictions so the decision threshold sits at 0.5
+            predictions = [prediction - thresh + 0.5 for prediction in predictions]
+            predictions_all.extend(predictions)
+
+            # Get patients ids
+            patients_ids = model[name_model]['test']['patients']
+
+            # After verification, add up patients IDs
+            patients_ids_all.extend(patients_ids)
+
+        # Get ground truth for selected patients
+        ground_truth = []
+        for patient in patients_ids_all:
+            ground_truth.append(outcomes[outcomes['PatientID'] == patient][outcomes.columns[-1]].values[0])
+
+        # To numpy array
+        ground_truth = np.array(ground_truth)
+
+        # Get aggregated metric
+        # AUC
+        if metric == 'AUC':
+            auc = metrics.roc_auc_score(ground_truth, predictions_all)
+            return auc
+
+        # AUPRC
+        elif metric == 'AUPRC':
+            auprc = metrics.average_precision_score(ground_truth, predictions_all)
+            return auprc
+
+        # Confusion matrix-based metrics
+        else:
+            TP = ((np.array(predictions_all) >= 0.5) & (ground_truth == 1)).sum()
+            TN = ((np.array(predictions_all) < 0.5) & (ground_truth == 0)).sum()
+            FP = ((np.array(predictions_all) >= 0.5) & (ground_truth == 0)).sum()
+            FN = ((np.array(predictions_all) < 0.5) & (ground_truth == 1)).sum()
+
+            # Asserts
+            assert TP + FN != 0, "TP + FN = 0, division by 0"
+            assert TN + FP != 0, "TN + FP = 0, division by 0"
+
+            # Sensitivity
+            if metric == 'Sensitivity':
+                sensitivity = TP / (TP + FN)
+                return sensitivity
+
+            # Specificity
+            elif metric == 'Specificity':
+                specificity = TN / (TN + FP)
+                return specificity
+
+            else:
+                raise ValueError(f"Metric {metric} not supported. Supported metrics: AUC, AUPRC, "
+                                 "Sensitivity, Specificity. Update file Stats.py to add the new metric.")
+
+    def get_aggregated_delong_p_value(self) -> float:
+        """
+        Calculates the p-value of the DeLong test over the predictions of all the splits aggregated together.
+
+        Returns:
+            float: p-value of the DeLong test.
+        """
+        # Load outcomes dataframe
+        try:
+            outcomes = pd.read_csv(self.path_experiment / "outcomes.csv", sep=',')
+        except FileNotFoundError:
+            outcomes = pd.read_csv(self.path_experiment.parent / "outcomes.csv", sep=',')
+
+        # Initialization
+        predictions_one_all = list()
+        predictions_two_all = list()
+        patients_ids_all = list()
+        nb_split = len([x[0] for x in os.walk(self.path_experiment / f'learn__{self.experiment}_{self.levels[0]}_{self.modalities[0]}')]) - 1  # number of split (test__XXX) subfolders
+
+        # For each split
+        for i in range(1, nb_split + 1):
+            # Get predictions and patients ids
+            patients_ids, predictions_one, predictions_two = self.__get_patients_and_predictions(i)
+
+            # Add up all information
+            predictions_one_all.extend(predictions_one)
+            predictions_two_all.extend(predictions_two)
+            patients_ids_all.extend(patients_ids)
+
+        # Get ground truth for selected patients
+        ground_truth = []
+        for patient in patients_ids_all:
+            ground_truth.append(outcomes[outcomes['PatientID'] == patient][outcomes.columns[-1]].values[0])
+
+        # To numpy array
+        ground_truth = np.array(ground_truth)
+
+        # Get p-value over the aggregated predictions
+        pvalue = self.__delong_roc_test(ground_truth, predictions_one_all, predictions_two_all).item()
+
+        return pvalue
+
+    def get_bengio_p_value(self) -> float:
+        """
+        Computes Nadeau and Bengio's right-tailed paired t-test with corrected variance.
+
+        Returns:
+            float: p-value of the corrected t-test.
+        """
+        # Initialization
+        metrics_one_all = list()
+        metrics_two_all = list()
+        nb_split = len([x[0] for x in os.walk(self.path_experiment / f'learn__{self.experiment}_{self.levels[0]}_{self.modalities[0]}')]) - 1  # number of split (test__XXX) subfolders
+
+        # For each split
+        for i in range(1, nb_split + 1):
+            # Get models dicts
+            path_json_1, path_json_2 = self.__get_models_dicts(i)
+
+            # Load patients train and test lists
+            patients_train = load_json(path_json_1.parent / 'patientsTrain.json')
+            patients_test = load_json(path_json_1.parent / 'patientsTest.json')
+            n_train = len(patients_train)
+            n_test = len(patients_test)
+
+            # Load models dicts
+            model_one = load_json(path_json_1)
+            model_two = load_json(path_json_2)
+
+            # Get models names
+            name_model_one = list(model_one.keys())[0]
+            name_model_two = list(model_two.keys())[0]
+
+            # Get AUCs
+            metric_one = model_one[name_model_one]['test']['metrics']['AUC']
+            metric_two = model_two[name_model_two]['test']['metrics']['AUC']
+
+            # Add up all information
+            metrics_one_all.append(metric_one)
+            metrics_two_all.append(metric_two)
+
+        # Check if the number of metrics is the same
+        if len(metrics_one_all) != len(metrics_two_all):
+            raise ValueError("The number of metrics must be the same for both models")
+
+        # Get differences
+        differences = np.array(metrics_one_all) - np.array(metrics_two_all)
+        df = differences.shape[0] - 1
+
+        # Get corrected std
+        mean = np.mean(differences)
+        std = self.__corrected_std(differences, n_train, n_test)
+
+        # Get p-value
+        t_stat = mean / std
+        p_val = scipy.stats.t.sf(np.abs(t_stat), df)  # right-tailed t-test
+
+        return p_val
+
+    def get_delong_p_value(
+        self,
+        aggregate: bool = False
+    ) -> float:
+        """
+        Calculates the p-value of the DeLong test for the given experiment.
+
+        Args:
+            aggregate (bool, optional): If True, aggregates the results of all the splits and computes one final p-value.
+
+        Returns:
+            float: p-value of the DeLong test (median over all splits, or one aggregated p-value if ``aggregate`` is True).
+        """
+        # Check if aggregation is needed
+        if aggregate:
+            return self.get_aggregated_delong_p_value()
+
+        # Load outcomes dataframe
+        try:
+            outcomes = pd.read_csv(self.path_experiment / "outcomes.csv", sep=',')
+        except FileNotFoundError:
+            outcomes = pd.read_csv(self.path_experiment.parent / "outcomes.csv", sep=',')
+
+        # Initialization
+        nb_split = len([x[0] for x in os.walk(self.path_experiment / f'learn__{self.experiment}_{self.levels[0]}_{self.modalities[0]}')]) - 1  # number of split (test__XXX) subfolders
+        list_p_values_temp = list()
+
+        # For each split
+        for i in range(1, nb_split + 1):
+            # Get predictions and patients ids
+            patients_ids, predictions_one, predictions_two = self.__get_patients_and_predictions(i)
+
+            # Get ground truth for selected patients
+            ground_truth = []
+            for patient in patients_ids:
+                ground_truth.append(outcomes[outcomes['PatientID'] == patient][outcomes.columns[-1]].values[0])
+
+            # To numpy array
+            ground_truth = np.array(ground_truth)
+
+            # Get p-value
+            pvalue = self.__delong_roc_test(ground_truth, predictions_one, predictions_two).item()
+
+            list_p_values_temp.append(pvalue)
+
+        # Compute the median p-value of all splits
+        return np.median(list_p_values_temp)
+
+    def get_ttest_p_value(self, metric: str = 'AUC') -> float:
+        """
+        Calculates the p-value using the t-test for two related samples of scores.
+
+        Args:
+            metric (str, optional): Metric to use for comparison. Defaults to 'AUC'.
+
+        Returns:
+            float: p-value of the paired t-test.
+        """
+        # Initialization
+        metric = metric.split('_')[0] if '_' in metric else metric
+        metrics_one_all = list()
+        metrics_two_all = list()
+        nb_split = len([x[0] for x in os.walk(self.path_experiment / f'learn__{self.experiment}_{self.levels[0]}_{self.modalities[0]}')]) - 1  # number of split (test__XXX) subfolders
+
+        # For each split
+        for i in range(1, nb_split + 1):
+            # Get metrics of the first and second model
+            metric_one, metric_two = self.__get_metrics(metric, i)
+
+            # Add up all information
+            metrics_one_all.append(metric_one)
+            metrics_two_all.append(metric_two)
+
+        # Check if the number of metrics is the same
+        if len(metrics_one_all) != len(metrics_two_all):
+            raise ValueError("The number of metrics must be the same for both models")
+
+        # Compute p-value by performing a paired t-test
+        _, p_value = scipy.stats.ttest_rel(metrics_one_all, metrics_two_all)
+
+        return p_value
+
+    def get_wilcoxin_p_value(self, metric: str = 'AUC') -> float:
+        """
+        Calculates the p-value using the Wilcoxon signed-rank test for two related samples of scores.
+
+        Args:
+            metric (str, optional): Metric to analyze. Defaults to 'AUC'.
+
+        Returns:
+            float: p-value of the Wilcoxon signed-rank test.
+        """
+        # Initialization
+        metric = metric.split('_')[0] if '_' in metric else metric
+        metrics_one_all = list()
+        metrics_two_all = list()
+        nb_split = len([x[0] for x in os.walk(self.path_experiment / f'learn__{self.experiment}_{self.levels[0]}_{self.modalities[0]}')]) - 1  # number of split (test__XXX) subfolders
+
+        # For each split
+        for i in range(1, nb_split + 1):
+            # Get metrics of the first and second model
+            metric_one, metric_two = self.__get_metrics(metric, i)
+
+            # Add up all information
+            metrics_one_all.append(metric_one)
+            metrics_two_all.append(metric_two)
+
+        # Check if the number of metrics is the same
+        if len(metrics_one_all) != len(metrics_two_all):
+            raise ValueError("The number of metrics must be the same for both models")
+
+        # Compute p-value by performing the Wilcoxon signed-rank test
+        _, p_value = scipy.stats.wilcoxon(metrics_one_all, metrics_two_all)
+
+        return p_value
+
+    def get_p_value(
+        self,
+        method: str,
+        metric: str = 'AUC',
+        aggregate: bool = False
+    ) -> float:
+        """
+        Calculates the p-value using the given method.
+
+        Args:
+            method (str): Method to use to calculate the p-value. Available options:
+
+                - 'delong': DeLong test.
+                - 'ttest': Paired t-test.
+                - 'wilcoxon': Wilcoxon signed-rank test.
+                - 'bengio': Nadeau and Bengio corrected t-test.
+            metric (str, optional): Metric to analyze. Defaults to 'AUC'.
+            aggregate (bool, optional): If True, aggregates the results of all the splits and computes one final p-value.
+
+        Returns:
+            float: p-value computed with the chosen method.
+        """
+        # Assertions
+        assert method in ['delong', 'ttest', 'wilcoxon', 'bengio'], \
+            f'method must be either "delong", "ttest", "wilcoxon" or "bengio". Given: {method}'
+
+        # Get p-value
+        if method == 'delong':
+            return self.get_delong_p_value(aggregate)
+        elif method == 'ttest':
+            return self.get_ttest_p_value(metric)
+        elif method == 'wilcoxon':
+            return self.get_wilcoxin_p_value(metric)
+        elif method == 'bengio':
+            return self.get_bengio_p_value()
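For reference, the corrected resampled t-test that `__corrected_std` and `get_bengio_p_value` implement reduces to a few lines. The sketch below reproduces the computation on made-up per-split AUCs; the values and split sizes are hypothetical, not taken from the package:

import numpy as np
import scipy.stats

# Hypothetical per-split test AUCs of two models over 10 train/test splits
aucs_one = np.array([0.78, 0.81, 0.75, 0.80, 0.79, 0.77, 0.82, 0.76, 0.80, 0.78])
aucs_two = np.array([0.74, 0.79, 0.73, 0.77, 0.78, 0.72, 0.80, 0.75, 0.76, 0.74])
n_train, n_test = 80, 20  # hypothetical split sizes

differences = aucs_one - aucs_two
kr = len(differences)  # number of evaluations (splits)
# Nadeau & Bengio: inflate the variance by n_test / n_train to account for
# the overlap between training sets across splits
corrected_var = np.var(differences, ddof=1) * (1 / kr + n_test / n_train)
t_stat = np.mean(differences) / np.sqrt(corrected_var)
p_val = scipy.stats.t.sf(np.abs(t_stat), df=kr - 1)  # right-tailed test
print(f"t = {t_stat:.3f}, p = {p_val:.4f}")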
MEDiml/learning/__init__.py
ADDED
@@ -0,0 +1,10 @@
+from . import *
+from .cleaning_utils import *
+from .DataCleaner import DataCleaner
+from .DesignExperiment import DesignExperiment
+from .FSR import FSR
+from .ml_utils import *
+from .Normalization import Normalization
+from .RadiomicsLearner import RadiomicsLearner
+from .Results import Results
+from .Stats import Stats
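Since `MEDiml/learning/__init__.py` re-exports `Stats`, comparing two radiomics levels might look like the sketch below; the experiment folder, experiment name, levels, and modality are illustrative placeholders, not data shipped with the package:

from pathlib import Path
from MEDiml.learning import Stats

stats = Stats(
    path_experiment=Path("/data/experiments/my_experiment"),  # hypothetical path
    experiment="OutcomeA",                                    # hypothetical name
    levels=["morph", "intensity"],  # two levels -> a single modality is expected
    modalities=["CT"],
)

# One aggregated DeLong p-value over the pooled predictions of all splits
p_delong = stats.get_p_value(method="delong", aggregate=True)

# Nadeau-Bengio corrected t-test on per-split AUCs
p_bengio = stats.get_p_value(method="bengio")
print(p_delong, p_bengio)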