mediml-0.9.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- MEDiml/MEDscan.py +1696 -0
- MEDiml/__init__.py +21 -0
- MEDiml/biomarkers/BatchExtractor.py +806 -0
- MEDiml/biomarkers/BatchExtractorTexturalFilters.py +840 -0
- MEDiml/biomarkers/__init__.py +16 -0
- MEDiml/biomarkers/diagnostics.py +125 -0
- MEDiml/biomarkers/get_oriented_bound_box.py +158 -0
- MEDiml/biomarkers/glcm.py +1602 -0
- MEDiml/biomarkers/gldzm.py +523 -0
- MEDiml/biomarkers/glrlm.py +1315 -0
- MEDiml/biomarkers/glszm.py +555 -0
- MEDiml/biomarkers/int_vol_hist.py +527 -0
- MEDiml/biomarkers/intensity_histogram.py +615 -0
- MEDiml/biomarkers/local_intensity.py +89 -0
- MEDiml/biomarkers/morph.py +1756 -0
- MEDiml/biomarkers/ngldm.py +780 -0
- MEDiml/biomarkers/ngtdm.py +414 -0
- MEDiml/biomarkers/stats.py +373 -0
- MEDiml/biomarkers/utils.py +389 -0
- MEDiml/filters/TexturalFilter.py +299 -0
- MEDiml/filters/__init__.py +9 -0
- MEDiml/filters/apply_filter.py +134 -0
- MEDiml/filters/gabor.py +215 -0
- MEDiml/filters/laws.py +283 -0
- MEDiml/filters/log.py +147 -0
- MEDiml/filters/mean.py +121 -0
- MEDiml/filters/textural_filters_kernels.py +1738 -0
- MEDiml/filters/utils.py +107 -0
- MEDiml/filters/wavelet.py +237 -0
- MEDiml/learning/DataCleaner.py +198 -0
- MEDiml/learning/DesignExperiment.py +480 -0
- MEDiml/learning/FSR.py +667 -0
- MEDiml/learning/Normalization.py +112 -0
- MEDiml/learning/RadiomicsLearner.py +714 -0
- MEDiml/learning/Results.py +2237 -0
- MEDiml/learning/Stats.py +694 -0
- MEDiml/learning/__init__.py +10 -0
- MEDiml/learning/cleaning_utils.py +107 -0
- MEDiml/learning/ml_utils.py +1015 -0
- MEDiml/processing/__init__.py +6 -0
- MEDiml/processing/compute_suv_map.py +121 -0
- MEDiml/processing/discretisation.py +149 -0
- MEDiml/processing/interpolation.py +275 -0
- MEDiml/processing/resegmentation.py +66 -0
- MEDiml/processing/segmentation.py +912 -0
- MEDiml/utils/__init__.py +25 -0
- MEDiml/utils/batch_patients.py +45 -0
- MEDiml/utils/create_radiomics_table.py +131 -0
- MEDiml/utils/data_frame_export.py +42 -0
- MEDiml/utils/find_process_names.py +16 -0
- MEDiml/utils/get_file_paths.py +34 -0
- MEDiml/utils/get_full_rad_names.py +21 -0
- MEDiml/utils/get_institutions_from_ids.py +16 -0
- MEDiml/utils/get_patient_id_from_scan_name.py +22 -0
- MEDiml/utils/get_patient_names.py +26 -0
- MEDiml/utils/get_radiomic_names.py +27 -0
- MEDiml/utils/get_scan_name_from_rad_name.py +22 -0
- MEDiml/utils/image_reader_SITK.py +37 -0
- MEDiml/utils/image_volume_obj.py +22 -0
- MEDiml/utils/imref.py +340 -0
- MEDiml/utils/initialize_features_names.py +62 -0
- MEDiml/utils/inpolygon.py +159 -0
- MEDiml/utils/interp3.py +43 -0
- MEDiml/utils/json_utils.py +78 -0
- MEDiml/utils/mode.py +31 -0
- MEDiml/utils/parse_contour_string.py +58 -0
- MEDiml/utils/save_MEDscan.py +30 -0
- MEDiml/utils/strfind.py +32 -0
- MEDiml/utils/textureTools.py +188 -0
- MEDiml/utils/texture_features_names.py +115 -0
- MEDiml/utils/write_radiomics_csv.py +47 -0
- MEDiml/wrangling/DataManager.py +1724 -0
- MEDiml/wrangling/ProcessDICOM.py +512 -0
- MEDiml/wrangling/__init__.py +3 -0
- mediml-0.9.9.dist-info/LICENSE.md +674 -0
- mediml-0.9.9.dist-info/METADATA +232 -0
- mediml-0.9.9.dist-info/RECORD +78 -0
- mediml-0.9.9.dist-info/WHEEL +4 -0
MEDiml/filters/utils.py
ADDED
@@ -0,0 +1,107 @@
from typing import List

import numpy as np
from scipy.signal import fftconvolve


def pad_imgs(
        images: np.ndarray,
        padding_length: List,
        axis: List,
        mode: str) -> np.ndarray:
    """Apply padding on 3D images using a 2D padding pattern.

    Args:
        images (ndarray): A numpy array that represents the image.
        padding_length (List): The padding length applied on each side of each axis.
        axis (List): A list of axes on which the padding will be done.
        mode (str): The padding mode. Check options here: `numpy.pad
            <https://numpy.org/doc/stable/reference/generated/numpy.pad.html>`__.

    Returns:
        ndarray: A numpy array that represents the padded image.
    """
    pad_tuple = ()
    j = 1

    for i in range(np.ndim(images)):
        if i in axis:
            pad_tuple += ((padding_length[-j], padding_length[-j]),)
            j += 1
        else:
            pad_tuple += ((0, 0),)

    return np.pad(images, pad_tuple, mode=mode)


def convolve(
        dim: int,
        kernel: np.ndarray,
        images: np.ndarray,
        orthogonal_rot: bool = False,
        mode: str = "symmetric") -> np.ndarray:
    """Convolve a given n-dimensional array with the kernel to generate a filtered image.

    Args:
        dim (int): The dimension of the images.
        kernel (ndarray): The kernel to use for the convolution.
        images (ndarray): An n-dimensional numpy array that represents a batch of images to filter.
        orthogonal_rot (bool, optional): If True, the 3D images will be rotated over the coronal,
            axial and sagittal axes.
        mode (str, optional): The padding mode. Check options here: `numpy.pad
            <https://numpy.org/doc/stable/reference/generated/numpy.pad.html>`__.

    Returns:
        ndarray: The filtered image.
    """
    in_size = np.shape(images)

    # We only handle 2D or 3D images.
    assert len(in_size) == 3 or len(in_size) == 4, \
        "The tensor should have the following shape: (B, H, W) or (B, D, H, W)"

    if not orthogonal_rot:
        # If we have a 2D kernel but 3D images, we squeeze the tensor
        if dim < len(in_size) - 1:
            images = images.reshape((in_size[0] * in_size[1], in_size[2], in_size[3]))

        # We compute the padding size along each dimension
        padding = [int((kernel.shape[-1] - 1) / 2) for _ in range(dim)]
        pad_axis_list = [i for i in range(1, dim + 1)]

        # We pad the images and we add the channel axis.
        padded_imgs = pad_imgs(images, padding, pad_axis_list, mode)
        new_imgs = np.expand_dims(padded_imgs, axis=1)

        # Operate the convolution
        if dim < len(in_size) - 1:
            # If we have a 2D kernel but 3D images, we convolve slice by slice
            result_list = [fftconvolve(np.expand_dims(new_imgs[i], axis=0), kernel, mode='valid')
                           for i in range(len(images))]
            result = np.squeeze(np.stack(result_list), axis=2)
        else:
            result = fftconvolve(new_imgs, kernel, mode='valid')

        # Reshape the data to retrieve the following format: (B, C, D, H, W)
        if dim < len(in_size) - 1:
            result = result.reshape(
                (in_size[0], in_size[1], result.shape[1], in_size[2], in_size[3])
            ).transpose(0, 2, 1, 3, 4)

    # If we want orthogonal rotation
    else:
        coronal_imgs = images
        axial_imgs, sagittal_imgs = np.rot90(images, 1, (1, 2)), np.rot90(images, 1, (1, 3))

        result_coronal = convolve(dim, kernel, coronal_imgs, False, mode)
        result_axial = convolve(dim, kernel, axial_imgs, False, mode)
        result_sagittal = convolve(dim, kernel, sagittal_imgs, False, mode)

        # Unrotate the axial and sagittal results, then stack all three on a new axis
        result_axial = np.rot90(result_axial, 1, (3, 2))
        result_sagittal = np.rot90(result_sagittal, 1, (4, 2))

        result = np.stack([result_coronal, result_axial, result_sagittal])

    return result
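A minimal usage sketch for the two helpers above (illustration only, not part of the released wheel; the import path simply follows the package layout listed at the top). The leading singleton batch/channel axes on the kernel match the (B, C, D, H, W) layout that convolve() pads and convolves in internally:

import numpy as np

from MEDiml.filters.utils import convolve

# A batch of one 3D image, laid out as (B, D, H, W)
images = np.random.rand(1, 8, 16, 16)

# 5x5x5 mean kernel with singleton batch and channel axes
kernel = np.ones((1, 1, 5, 5, 5)) / 5 ** 3

filtered = convolve(dim=3, kernel=kernel, images=images, mode="symmetric")
print(filtered.shape)  # (1, 1, 8, 16, 16), i.e. (B, C, D, H, W)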
MEDiml/filters/wavelet.py
ADDED
@@ -0,0 +1,237 @@
import math
from itertools import combinations, permutations
from typing import List, Union

import numpy as np
import pywt

from ..MEDscan import MEDscan
from ..utils.image_volume_obj import image_volume_obj


class Wavelet():
    """
    The wavelet filter class.
    """

    def __init__(
            self,
            ndims: int,
            wavelet_name="haar",
            padding="symmetric",
            rot_invariance=False):
        """The constructor of the wavelet filter.

        Args:
            ndims (int): The number of dimensions of the images that will be filtered.
            wavelet_name (str): The name of the wavelet kernel.
            padding (str): The padding type that will be used to produce the convolution.
            rot_invariance (bool): If True, rotation invariance will be applied to the images.

        Returns:
            None
        """
        self.dim = ndims
        self.padding = padding
        self.rot = rot_invariance
        self.wavelet = None
        self.kernel_length = None
        self.create_kernel(wavelet_name)

    def create_kernel(self, wavelet_name: str):
        """Get the wavelet object and its kernel length.

        Args:
            wavelet_name (str): The name of the wavelet used to create the kernel.

        Returns:
            None
        """
        self.wavelet = pywt.Wavelet(wavelet_name)
        self.kernel_length = max(self.wavelet.rec_len, self.wavelet.dec_len)

    def __unpad(self,
                images: np.ndarray,
                padding: List) -> np.ndarray:
        """Unpad a batch of images.

        Args:
            images: A numpy nd-array or a list that represents the batch of padded images.
                The shape should be (B, H, W) or (B, H, W, D).
            padding: A list of length 2*self.dim that gives the length of padding on each
                side of each axis.

        Returns:
            ndarray: A numpy nd-array or a list that represents the batch of unpadded images.
        """
        if self.dim == 2:
            return images[:, padding[0]:-padding[1], padding[2]:-padding[3]]
        elif self.dim == 3:
            return images[:, padding[0]:-padding[1], padding[2]:-padding[3], padding[4]:-padding[5]]
        else:
            raise NotImplementedError

    def __get_pad_length(self,
                         image_shape: List,
                         level: int) -> np.ndarray:
        """Compute the padding length needed so that the padded image has a length
        along each axis that is a multiple of 2^level.

        Args:
            image_shape (List): A list of integers that describes the length of the image along each axis.
            level (int): The level of the wavelet transform.

        Returns:
            ndarray: An integer list of length 2*self.dim that gives the length of padding
            on each side of each axis.
        """
        padding = []
        ker_length = self.kernel_length * level
        for l in image_shape:
            padded_length = math.ceil((l + 2 * (ker_length - 1)) / 2**level) * 2**level - l
            padding.extend([math.floor(padded_length / 2), math.ceil(padded_length / 2)])

        return padding

    def _pad_imgs(self,
                  images: np.ndarray,
                  padding,
                  axis: List):
        """Apply padding on 3D images using a 2D padding pattern (special for wavelet).

        Args:
            images: A numpy array that represents the image.
            padding: The padding length applied on each side of each axis.
            axis: A list of axes on which the padding will be done.

        Returns:
            ndarray: A numpy array that represents the padded image.
        """
        pad_tuple = ()
        j = 0

        for i in range(np.ndim(images)):
            if i in axis:
                pad_tuple += ((padding[j], padding[j + 1]),)
                j += 2
            else:
                pad_tuple += ((0, 0),)

        return np.pad(images, pad_tuple, mode=self.padding)

    def convolve(self,
                 images: np.ndarray,
                 _filter="LHL",
                 level=1) -> np.ndarray:
        """Filter a given batch of images using pywavelets.

        Args:
            images (ndarray): An n-dimensional numpy array that represents the images to filter.
            _filter (str): The subband filter to use.
            level (int): The number of decomposition steps to perform.

        Returns:
            ndarray: The filtered image as a numpy nd-array.
        """
        # We pad the images
        padding = self.__get_pad_length(np.shape(images[0]), level)
        axis_list = [i for i in range(0, self.dim)]
        images = np.expand_dims(self._pad_imgs(images[0], padding, axis_list), axis=0)

        # We generate the key used to collect the result from the pywavelets coefficient dictionary
        _index = str().join(['a' if _filter[i] == 'L' else 'd' for i in range(len(_filter))])

        if self.rot:
            result = []
            _index_list = np.unique([str().join(perm) for perm in permutations(_index, self.dim)])

            # For each image, we flip each combination of axes.
            for image in images:
                axis_rot = [comb for j in range(self.dim + 1) for comb in combinations(np.arange(self.dim), j)]
                images_rot = [np.flip(image, axis) for axis in axis_rot]

                res_rot = []
                for i in range(len(images_rot)):
                    filtered_image = pywt.swtn(images_rot[i], self.wavelet, level=level)[0]
                    res_rot.extend([np.flip(filtered_image[j], axis=axis_rot[i]) for j in _index_list])

                result.extend([np.mean(res_rot, axis=0)])
        else:
            result = []
            for i in range(len(images)):
                result.extend([pywt.swtn(images[i], self.wavelet, level=level)[level - 1][_index]])

        return self.__unpad(np.array(result), padding)


def apply_wavelet(
        input_images: Union[np.ndarray, image_volume_obj],
        medscan: MEDscan = None,
        ndims: int = 3,
        wavelet_name: str = "haar",
        subband: str = "LHL",
        level: int = 1,
        padding: str = "symmetric",
        rot_invariance: bool = False) -> np.ndarray:
    """Apply the wavelet filter to the input image.

    Args:
        input_images (ndarray): The image to filter.
        medscan (MEDscan, optional): The MEDscan object that will provide the filter parameters.
        ndims (int, optional): The number of dimensions of the input image.
        wavelet_name (str): The name of the wavelet kernel.
        subband (str, optional): String of the 1D wavelet kernels ("H" for high-pass
            filter or "L" for low-pass filter). Must have a size of ``ndims``.
        level (int, optional): The number of decomposition steps to perform.
        padding (str, optional): The padding type that will be used to produce the convolution. Check options
            here: `numpy.pad <https://numpy.org/doc/stable/reference/generated/numpy.pad.html>`__.
        rot_invariance (bool, optional): If True, rotation invariance will be applied to the kernel.

    Returns:
        ndarray: The filtered image.
    """
    # Check if the input is a numpy array or an image volume object
    spatial_ref = None
    if type(input_images) == image_volume_obj:
        spatial_ref = input_images.spatialRef
        input_images = input_images.data

    # Convert to shape: (B, W, H, D)
    input_images = np.expand_dims(input_images.astype(np.float64), axis=0)

    if medscan:
        # Initialize filter class instance
        _filter = Wavelet(
            ndims=medscan.params.filter.wavelet.ndims,
            wavelet_name=medscan.params.filter.wavelet.basis_function,
            rot_invariance=medscan.params.filter.wavelet.rot_invariance,
            padding=medscan.params.filter.wavelet.padding
        )
        # Run convolution
        result = _filter.convolve(
            input_images,
            _filter=medscan.params.filter.wavelet.subband,
            level=medscan.params.filter.wavelet.level
        )
    else:
        # Initialize filter class instance
        _filter = Wavelet(
            ndims=ndims,
            wavelet_name=wavelet_name,
            rot_invariance=rot_invariance,
            padding=padding
        )
        # Run convolution
        result = _filter.convolve(
            input_images,
            _filter=subband,
            level=level
        )

    if spatial_ref:
        return image_volume_obj(np.squeeze(result), spatial_ref)
    else:
        return np.squeeze(result)
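A rough sketch of calling apply_wavelet directly on a numpy volume (illustration only, not part of the diff; the "haar"/"LHL"/level values are just the defaults from the signature above). The function adds the batch axis itself, pads each axis to a multiple of 2^level, and strips the padding back off:

import numpy as np

from MEDiml.filters.wavelet import apply_wavelet

# A 3D volume, shape (W, H, D)
volume = np.random.rand(32, 32, 32)

# "LHL" = low-pass, high-pass, low-pass 1D kernels along the three axes
filtered = apply_wavelet(volume, ndims=3, wavelet_name="haar", subband="LHL", level=1)
print(filtered.shape)  # (32, 32, 32): unpadded back to the input shape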
MEDiml/learning/DataCleaner.py
ADDED
@@ -0,0 +1,198 @@
import random
from typing import Dict, List

import numpy as np
import pandas as pd


class DataCleaner:
    """
    Class that cleans a table of features by removing features with too many missing
    values or too little variation, removing samples with too many missing features,
    and imputing the remaining missing values.
    """
    def __init__(self, df_features: pd.DataFrame, type: str = "continuous"):
        """
        Constructor of the class DataCleaner.

        Args:
            df_features (pd.DataFrame): Table of features.
            type (str): Type of variable: "continuous", "hcategorical" or "icategorical". Defaults to "continuous".
        """
        self.df_features = df_features
        self.type = type

    def __update_df_features(self, var_of_type: List[str], flag_var_out: List[bool]) -> List[str]:
        """
        Updates the feature table by dropping the flagged variables.

        Args:
            var_of_type (List[str]): List of variable names.
            flag_var_out (List[bool]): Flags marking the variables to drop.

        Returns:
            List[str]: List of variable names that were not flagged out.
        """
        var_to_delete = np.delete(var_of_type, [i for i, v in enumerate(flag_var_out) if not v])
        var_of_type = np.delete(var_of_type, [i for i, v in enumerate(flag_var_out) if v])
        self.df_features = self.df_features.drop(var_to_delete, axis=1)
        return var_of_type

    def cut_off_missing_per_sample(self, var_of_type: List[str], missing_cutoff: float = 0.25) -> None:
        """
        Removes observations/samples with more than ``missing_cutoff`` missing features.

        Args:
            var_of_type (List[str]): List of variable names.
            missing_cutoff (float): Maximum fraction of missing features per sample. Defaults to 0.25 (25%).

        Returns:
            None.
        """
        # Initialization
        n_observation, n_features = self.df_features.shape
        empty_vec = np.zeros(n_observation, dtype=int)
        data = self.df_features[var_of_type]
        empty_vec += data.isna().sum(axis=1).values

        # Gathering results
        ind_obs_out = np.where((empty_vec / n_features) > missing_cutoff)
        self.df_features = self.df_features.drop(self.df_features.index[ind_obs_out])

    def cut_off_missing_per_feature(self, var_of_type: List[str], missing_cutoff: float = 0.1) -> List[str]:
        """
        Removes features with more than ``missing_cutoff`` missing patient samples.

        Args:
            var_of_type (list): List of variable names.
            missing_cutoff (float): Maximum fraction of missing patient samples per variable.

        Returns:
            List[str]: List of variable names that were not flagged out.
        """
        flag_var_out = (self.df_features[var_of_type].isna().sum() / self.df_features.shape[0]) > missing_cutoff
        return self.__update_df_features(var_of_type, flag_var_out)

    def cut_off_variation(self, var_of_type: List[str], cov_cutoff: float = 0.1) -> List[str]:
        """
        Removes features with a coefficient of variation (cov) less than ``cov_cutoff``.

        Args:
            var_of_type (list): List of variable names.
            cov_cutoff (float): Minimal coefficient of variation over samples per variable. Defaults to 0.1 (10%).

        Returns:
            List[str]: List of variable names that were not flagged out.
        """
        eps = np.finfo(np.float32).eps
        cov_df_features = self.df_features[var_of_type].std(skipna=True) / self.df_features[var_of_type].mean(skipna=True)
        flag_var_out = cov_df_features.abs().add(eps) < cov_cutoff
        return self.__update_df_features(var_of_type, flag_var_out)

    def impute_missing(self, var_of_type: List[str], imputation_method: str = "mean") -> None:
        """
        Imputes the missing values of the features of the given type.

        Args:
            var_of_type (list): List of variable names.
            imputation_method (str): Method of imputation. Can be "mean", "median", "mode" or "random".
                For "random" imputation, a seed can be provided by appending the seed value to the
                method name, for example "random42".

        Returns:
            None.
        """
        if self.type in ['continuous', 'hcategorical']:
            # Random imputation
            if 'random' in imputation_method:
                if len(imputation_method) > 6:
                    try:
                        seed = int(imputation_method[6:])  # e.g. "random42" -> seed 42
                        random.seed(seed)
                    except Exception as e:
                        print(f"Warning: Seed must be an integer. Random seed will be set to None. {e}")
                        random.seed(a=None)
                else:
                    random.seed(a=None)
                self.df_features[var_of_type] = self.df_features[var_of_type].apply(
                    lambda x: x.fillna(random.choice(list(x.dropna(axis=0)))))

            # Imputation with median
            elif 'median' in imputation_method:
                self.df_features[var_of_type] = self.df_features[var_of_type].fillna(
                    self.df_features[var_of_type].median())

            # Imputation with mean
            elif 'mean' in imputation_method:
                self.df_features[var_of_type] = self.df_features[var_of_type].fillna(
                    self.df_features[var_of_type].mean())

            else:
                raise ValueError(
                    "Imputation method for continuous and hcategorical features must be 'random', 'median' or 'mean'.")

        elif self.type in ['icategorical']:
            if 'random' in imputation_method:
                if len(imputation_method) > 6:
                    seed = int(imputation_method[6:])
                    random.seed(seed)
                else:
                    random.seed(a=None)

                self.df_features[var_of_type] = self.df_features[var_of_type].apply(
                    lambda x: x.fillna(random.choice(list(x.dropna(axis=0)))))

            if 'mode' in imputation_method:
                self.df_features[var_of_type] = self.df_features[var_of_type].fillna(
                    self.df_features[var_of_type].mode().max())
        else:
            raise ValueError("Variable type must be 'continuous', 'hcategorical' or 'icategorical'.")

    def __call__(self, cleaning_dict: Dict, imputation_method: str = "mean",
                 missing_cutoff_ps: float = 0.25, missing_cutoff_pf: float = 0.1,
                 cov_cutoff: float = 0.1) -> pd.DataFrame:
        """
        Applies data cleaning to the features of the given type.

        Args:
            cleaning_dict (dict): Dictionary of cleaning parameters (missing cutoffs, coefficient
                of variation cutoff, etc.).
            imputation_method (str): Method of imputation. Can be "mean", "median", "mode" or "random".
                For "random" imputation, a seed can be provided by appending the seed value to the
                method name, for example "random42".
            missing_cutoff_ps (float, optional): Maximum fraction of missing features per sample.
            missing_cutoff_pf (float, optional): Maximum fraction of missing samples per variable.
            cov_cutoff (float, optional): Minimal coefficient of variation over samples per variable.

        Returns:
            pd.DataFrame: Cleaned table of features.
        """
        # Initialization
        var_of_type = self.df_features.Properties['userData']['variables']['continuous']

        # Retrieve thresholds from cleaning_dict if not None
        if cleaning_dict is not None:
            missing_cutoff_pf = cleaning_dict['missingCutoffpf']
            missing_cutoff_ps = cleaning_dict['missingCutoffps']
            cov_cutoff = cleaning_dict['covCutoff']
            imputation_method = cleaning_dict['imputation']

        # Replace infinite values with NaNs
        self.df_features = self.df_features.replace([np.inf, -np.inf], np.nan)

        # Remove features with more than missing_cutoff_pf missing samples (NaNs)
        var_of_type = self.cut_off_missing_per_feature(var_of_type, missing_cutoff_pf)

        # Check
        if len(var_of_type) == 0:
            return None

        # Remove features with a coefficient of variation less than cov_cutoff
        var_of_type = self.cut_off_variation(var_of_type, cov_cutoff)

        # Check
        if len(var_of_type) == 0:
            return None

        # Remove scans with more than missing_cutoff_ps missing features
        self.cut_off_missing_per_sample(var_of_type, missing_cutoff_ps)

        # Impute missing values
        self.impute_missing(var_of_type, imputation_method)

        return self.df_features
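A small end-to-end sketch of the cleaning steps, calling the individual methods rather than __call__ (which expects MEDiml's Properties['userData'] metadata on the DataFrame). The toy column names and cutoffs are made up for illustration and are not from the package:

import numpy as np
import pandas as pd

from MEDiml.learning.DataCleaner import DataCleaner

df = pd.DataFrame({
    "feat_a": [1.0, 2.0, np.nan, 4.0],        # usable feature, one missing value
    "feat_b": [5.0, 5.0, 5.0, 5.0],           # no variation -> dropped by cut_off_variation
    "feat_c": [np.nan, np.nan, np.nan, 1.0],  # mostly missing -> dropped per feature
})

cleaner = DataCleaner(df, type="continuous")
features = list(df.columns)
features = cleaner.cut_off_missing_per_feature(features, missing_cutoff=0.5)
features = cleaner.cut_off_variation(features, cov_cutoff=0.1)
cleaner.cut_off_missing_per_sample(features, missing_cutoff=0.25)
cleaner.impute_missing(features, imputation_method="mean")
print(cleaner.df_features)  # cleaned, imputed table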