mediml 0.9.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. MEDiml/MEDscan.py +1696 -0
  2. MEDiml/__init__.py +21 -0
  3. MEDiml/biomarkers/BatchExtractor.py +806 -0
  4. MEDiml/biomarkers/BatchExtractorTexturalFilters.py +840 -0
  5. MEDiml/biomarkers/__init__.py +16 -0
  6. MEDiml/biomarkers/diagnostics.py +125 -0
  7. MEDiml/biomarkers/get_oriented_bound_box.py +158 -0
  8. MEDiml/biomarkers/glcm.py +1602 -0
  9. MEDiml/biomarkers/gldzm.py +523 -0
  10. MEDiml/biomarkers/glrlm.py +1315 -0
  11. MEDiml/biomarkers/glszm.py +555 -0
  12. MEDiml/biomarkers/int_vol_hist.py +527 -0
  13. MEDiml/biomarkers/intensity_histogram.py +615 -0
  14. MEDiml/biomarkers/local_intensity.py +89 -0
  15. MEDiml/biomarkers/morph.py +1756 -0
  16. MEDiml/biomarkers/ngldm.py +780 -0
  17. MEDiml/biomarkers/ngtdm.py +414 -0
  18. MEDiml/biomarkers/stats.py +373 -0
  19. MEDiml/biomarkers/utils.py +389 -0
  20. MEDiml/filters/TexturalFilter.py +299 -0
  21. MEDiml/filters/__init__.py +9 -0
  22. MEDiml/filters/apply_filter.py +134 -0
  23. MEDiml/filters/gabor.py +215 -0
  24. MEDiml/filters/laws.py +283 -0
  25. MEDiml/filters/log.py +147 -0
  26. MEDiml/filters/mean.py +121 -0
  27. MEDiml/filters/textural_filters_kernels.py +1738 -0
  28. MEDiml/filters/utils.py +107 -0
  29. MEDiml/filters/wavelet.py +237 -0
  30. MEDiml/learning/DataCleaner.py +198 -0
  31. MEDiml/learning/DesignExperiment.py +480 -0
  32. MEDiml/learning/FSR.py +667 -0
  33. MEDiml/learning/Normalization.py +112 -0
  34. MEDiml/learning/RadiomicsLearner.py +714 -0
  35. MEDiml/learning/Results.py +2237 -0
  36. MEDiml/learning/Stats.py +694 -0
  37. MEDiml/learning/__init__.py +10 -0
  38. MEDiml/learning/cleaning_utils.py +107 -0
  39. MEDiml/learning/ml_utils.py +1015 -0
  40. MEDiml/processing/__init__.py +6 -0
  41. MEDiml/processing/compute_suv_map.py +121 -0
  42. MEDiml/processing/discretisation.py +149 -0
  43. MEDiml/processing/interpolation.py +275 -0
  44. MEDiml/processing/resegmentation.py +66 -0
  45. MEDiml/processing/segmentation.py +912 -0
  46. MEDiml/utils/__init__.py +25 -0
  47. MEDiml/utils/batch_patients.py +45 -0
  48. MEDiml/utils/create_radiomics_table.py +131 -0
  49. MEDiml/utils/data_frame_export.py +42 -0
  50. MEDiml/utils/find_process_names.py +16 -0
  51. MEDiml/utils/get_file_paths.py +34 -0
  52. MEDiml/utils/get_full_rad_names.py +21 -0
  53. MEDiml/utils/get_institutions_from_ids.py +16 -0
  54. MEDiml/utils/get_patient_id_from_scan_name.py +22 -0
  55. MEDiml/utils/get_patient_names.py +26 -0
  56. MEDiml/utils/get_radiomic_names.py +27 -0
  57. MEDiml/utils/get_scan_name_from_rad_name.py +22 -0
  58. MEDiml/utils/image_reader_SITK.py +37 -0
  59. MEDiml/utils/image_volume_obj.py +22 -0
  60. MEDiml/utils/imref.py +340 -0
  61. MEDiml/utils/initialize_features_names.py +62 -0
  62. MEDiml/utils/inpolygon.py +159 -0
  63. MEDiml/utils/interp3.py +43 -0
  64. MEDiml/utils/json_utils.py +78 -0
  65. MEDiml/utils/mode.py +31 -0
  66. MEDiml/utils/parse_contour_string.py +58 -0
  67. MEDiml/utils/save_MEDscan.py +30 -0
  68. MEDiml/utils/strfind.py +32 -0
  69. MEDiml/utils/textureTools.py +188 -0
  70. MEDiml/utils/texture_features_names.py +115 -0
  71. MEDiml/utils/write_radiomics_csv.py +47 -0
  72. MEDiml/wrangling/DataManager.py +1724 -0
  73. MEDiml/wrangling/ProcessDICOM.py +512 -0
  74. MEDiml/wrangling/__init__.py +3 -0
  75. mediml-0.9.9.dist-info/LICENSE.md +674 -0
  76. mediml-0.9.9.dist-info/METADATA +232 -0
  77. mediml-0.9.9.dist-info/RECORD +78 -0
  78. mediml-0.9.9.dist-info/WHEEL +4 -0
MEDiml/filters/utils.py
@@ -0,0 +1,107 @@
+ from typing import List
+
+ import numpy as np
+ from scipy.signal import fftconvolve
+
+
+ def pad_imgs(
+     images: np.ndarray,
+     padding_length: List,
+     axis: List,
+     mode: str
+ ) -> np.ndarray:
+     """Apply padding to a batch of 3D images using a 2D padding pattern.
+
+     Args:
+         images (ndarray): A numpy array that represents the image.
+         padding_length (List): The padding length applied on each side of each axis.
+         axis (List): A list of axes on which the padding will be done.
+         mode (str): The padding mode. Check the options here: `numpy.pad
+             <https://numpy.org/doc/stable/reference/generated/numpy.pad.html>`__.
+
+     Returns:
+         ndarray: A numpy array that represents the padded image.
+     """
+     pad_tuple = ()
+     j = 1
+
+     for i in range(np.ndim(images)):
+         if i in axis:
+             pad_tuple += ((padding_length[-j], padding_length[-j]),)
+             j += 1
+         else:
+             pad_tuple += ((0, 0),)
+
+     return np.pad(images, pad_tuple, mode=mode)
+
+
+ def convolve(
+     dim: int,
+     kernel: np.ndarray,
+     images: np.ndarray,
+     orthogonal_rot: bool = False,
+     mode: str = "symmetric"
+ ) -> np.ndarray:
+     """Convolve a given n-dimensional array with the kernel to generate a filtered image.
+
+     Args:
+         dim (int): The dimension of the images.
+         kernel (ndarray): The kernel to use for the convolution.
+         images (ndarray): An n-dimensional numpy array that represents a batch of images to filter.
+         orthogonal_rot (bool, optional): If True, the 3D images will be rotated over the coronal,
+             axial and sagittal axes.
+         mode (str, optional): The padding mode. Check the options here: `numpy.pad
+             <https://numpy.org/doc/stable/reference/generated/numpy.pad.html>`__.
+
+     Returns:
+         ndarray: The filtered image.
+     """
+     in_size = np.shape(images)
+
+     # We only handle 2D or 3D images.
+     assert len(in_size) == 3 or len(in_size) == 4, \
+         "The tensor should have the following shape: (B, H, W) or (B, D, H, W)"
+
+     if not orthogonal_rot:
+         # If we have a 2D kernel but 3D images, we squeeze the batch and depth axes together
+         if dim < len(in_size) - 1:
+             images = images.reshape((in_size[0] * in_size[1], in_size[2], in_size[3]))
+
+         # We compute the padding size along each dimension
+         padding = [int((kernel.shape[-1] - 1) / 2) for _ in range(dim)]
+         pad_axis_list = [i for i in range(1, dim + 1)]
+
+         # We pad the images and we add the channel axis.
+         padded_imgs = pad_imgs(images, padding, pad_axis_list, mode)
+         new_imgs = np.expand_dims(padded_imgs, axis=1)
+
+         # Operate the convolution
+         if dim < len(in_size) - 1:
+             # If we have a 2D kernel but 3D images, we convolve slice by slice
+             result_list = [fftconvolve(np.expand_dims(new_imgs[i], axis=0), kernel, mode='valid')
+                            for i in range(len(images))]
+             result = np.squeeze(np.stack(result_list), axis=2)
+         else:
+             result = fftconvolve(new_imgs, kernel, mode='valid')
+
+         # Reshape the data to retrieve the following format: (B, C, D, H, W)
+         if dim < len(in_size) - 1:
+             result = result.reshape(
+                 (in_size[0], in_size[1], result.shape[1], in_size[2], in_size[3])
+             ).transpose(0, 2, 1, 3, 4)
+
+     # If we want orthogonal rotation
+     else:
+         coronal_imgs = images
+         axial_imgs, sagittal_imgs = np.rot90(images, 1, (1, 2)), np.rot90(images, 1, (1, 3))
+
+         result_coronal = convolve(dim, kernel, coronal_imgs, False, mode)
+         result_axial = convolve(dim, kernel, axial_imgs, False, mode)
+         result_sagittal = convolve(dim, kernel, sagittal_imgs, False, mode)
+
+         # Unflip the axial and sagittal results and stack all three on a new axis
+         result_axial = np.rot90(result_axial, 1, (3, 2))
+         result_sagittal = np.rot90(result_sagittal, 1, (4, 2))
+
+         result = np.stack([result_coronal, result_axial, result_sagittal])
+
+     return result
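For orientation, here is a minimal usage sketch of `convolve`. The random volume and the averaging kernel are illustrative assumptions, not data from the package; the kernel carries two leading singleton axes because `fftconvolve` requires both inputs to have the same number of dimensions as the padded (B, C, D, H, W) tensor built inside the function.

import numpy as np

# Hypothetical inputs: one 3D volume in the asserted (B, D, H, W) layout
# and a 5x5x5 averaging kernel shaped to match the padded 5D tensor.
volume = np.random.rand(1, 16, 32, 32)        # (B, D, H, W)
kernel = np.ones((1, 1, 5, 5, 5)) / 5 ** 3    # (1, 1, k, k, k)

filtered = convolve(dim=3, kernel=kernel, images=volume, mode="symmetric")
print(filtered.shape)  # (1, 1, 16, 32, 32), i.e. (B, C, D, H, W)

With 'valid' convolution, the symmetric padding of (k - 1) / 2 voxels per side restores the original spatial extent, so the filtered volume matches the input shape up to the added channel axis.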
MEDiml/filters/wavelet.py
@@ -0,0 +1,237 @@
+ import math
+ from itertools import combinations, permutations
+ from typing import List, Union
+
+ import numpy as np
+ import pywt
+
+ from ..MEDscan import MEDscan
+ from ..utils.image_volume_obj import image_volume_obj
+
+
+ class Wavelet():
+     """
+     The wavelet filter class.
+     """
+
+     def __init__(
+             self,
+             ndims: int,
+             wavelet_name="haar",
+             padding="symmetric",
+             rot_invariance=False):
+         """The constructor of the wavelet filter.
+
+         Args:
+             ndims (int): The number of dimensions of the images that will be filtered.
+             wavelet_name (str): The name of the wavelet kernel.
+             padding (str): The padding type that will be used to produce the convolution.
+             rot_invariance (bool): If True, rotation invariance will be enforced on the images.
+
+         Returns:
+             None
+         """
+         self.dim = ndims
+         self.padding = padding
+         self.rot = rot_invariance
+         self.wavelet = None
+         self.kernel_length = None
+         self.create_kernel(wavelet_name)
+
+     def create_kernel(self,
+                       wavelet_name: str):
+         """Get the wavelet object and its kernel length.
+
+         Args:
+             wavelet_name (str): The wavelet name that will be used to create the kernel.
+
+         Returns:
+             None
+         """
+         self.wavelet = pywt.Wavelet(wavelet_name)
+         self.kernel_length = max(self.wavelet.rec_len, self.wavelet.dec_len)
+
+     def __unpad(self,
+                 images: np.ndarray,
+                 padding: List) -> np.ndarray:
+         """Unpad a batch of images.
+
+         Args:
+             images: A numpy nd-array that represents the batch of padded images.
+                 The shape should be (B, H, W) or (B, H, W, D).
+             padding: A list of length 2*self.dim that gives the length of padding on each side of each axis.
+
+         Returns:
+             ndarray: A numpy nd-array that represents the batch of unpadded images.
+         """
+         if self.dim == 2:
+             return images[:, padding[0]:-padding[1], padding[2]:-padding[3]]
+         elif self.dim == 3:
+             return images[:, padding[0]:-padding[1], padding[2]:-padding[3], padding[4]:-padding[5]]
+         else:
+             raise NotImplementedError
+
+     def __get_pad_length(self,
+                          image_shape: List,
+                          level: int) -> List:
+         """Compute the padding length needed to obtain a padded image whose length
+         along each axis is a multiple of 2^level.
+
+         Args:
+             image_shape (List): A list of integers that describes the length of the image along each axis.
+             level (int): The level of the wavelet transform.
+
+         Returns:
+             List: An integer list of length 2*self.dim that gives the length of padding on each side of each axis.
+         """
+         padding = []
+         ker_length = self.kernel_length * level
+         for l in image_shape:
+             padded_length = math.ceil((l + 2 * (ker_length - 1)) / 2**level) * 2**level - l
+             padding.extend([math.floor(padded_length / 2), math.ceil(padded_length / 2)])
+
+         return padding
+
+     def _pad_imgs(self,
+                   images: np.ndarray,
+                   padding,
+                   axis: List):
+         """Apply padding to a batch of 3D images using a 2D padding pattern (special for wavelet).
+
+         Args:
+             images: A numpy array that represents the image.
+             padding: The padding length applied on each side of each axis.
+             axis: A list of axes on which the padding will be done.
+
+         Returns:
+             ndarray: A numpy array that represents the padded image.
+         """
+         pad_tuple = ()
+         j = 0
+
+         for i in range(np.ndim(images)):
+             if i in axis:
+                 pad_tuple += ((padding[j], padding[j + 1]),)
+                 j += 2
+             else:
+                 pad_tuple += ((0, 0),)
+
+         return np.pad(images, pad_tuple, mode=self.padding)
+
+     def convolve(self,
+                  images: np.ndarray,
+                  _filter="LHL",
+                  level=1) -> np.ndarray:
+         """Filter a given batch of images using pywavelet.
+
+         Args:
+             images (ndarray): An n-dimensional numpy array that represents the images to filter.
+             _filter (str): The subband filter to use.
+             level (int): The number of decomposition steps to perform.
+
+         Returns:
+             ndarray: The filtered image as a numpy nd-array.
+         """
+         # We pad the images (the batch is reduced to its first image)
+         padding = self.__get_pad_length(np.shape(images[0]), level)
+         axis_list = [i for i in range(0, self.dim)]
+         images = np.expand_dims(self._pad_imgs(images[0], padding, axis_list), axis=0)
+
+         # Build the key used to collect the result from the pywavelet coefficient
+         # dictionary: 'L' maps to an approximation ('a') and 'H' to a detail ('d')
+         _index = str().join(['a' if _filter[i] == 'L' else 'd' for i in range(len(_filter))])
+
+         if self.rot:
+             result = []
+             _index_list = np.unique([str().join(perm) for perm in permutations(_index, self.dim)])
+
+             # For each image, we flip every combination of axes.
+             for image in images:
+                 axis_rot = [comb for j in range(self.dim + 1) for comb in combinations(np.arange(self.dim), j)]
+                 images_rot = [np.flip(image, axis) for axis in axis_rot]
+
+                 res_rot = []
+                 for i in range(len(images_rot)):
+                     filtered_image = pywt.swtn(images_rot[i], self.wavelet, level=level)[0]
+                     res_rot.extend([np.flip(filtered_image[j], axis=axis_rot[i]) for j in _index_list])
+
+                 result.extend([np.mean(res_rot, axis=0)])
+         else:
+             result = []
+             for i in range(len(images)):
+                 result.extend([pywt.swtn(images[i], self.wavelet, level=level)[level - 1][_index]])
+
+         return self.__unpad(np.array(result), padding)
+
+
+ def apply_wavelet(
+         input_images: Union[np.ndarray, image_volume_obj],
+         medscan: MEDscan = None,
+         ndims: int = 3,
+         wavelet_name: str = "haar",
+         subband: str = "LHL",
+         level: int = 1,
+         padding: str = "symmetric",
+         rot_invariance: bool = False
+ ) -> np.ndarray:
+     """Apply the wavelet filter to the input image.
+
+     Args:
+         input_images (ndarray): The image to filter.
+         medscan (MEDscan, optional): The MEDscan object that will provide the filter parameters.
+         ndims (int, optional): The number of dimensions of the input image.
+         wavelet_name (str): The name of the wavelet kernel.
+         subband (str, optional): String of the 1D wavelet kernels ("H" for high-pass
+             filter or "L" for low-pass filter). Must have a size of ``ndims``.
+         level (int, optional): The number of decomposition steps to perform.
+         padding (str, optional): The padding type that will be used to produce the convolution. Check
+             the options here: `numpy.pad <https://numpy.org/doc/stable/reference/generated/numpy.pad.html>`__.
+         rot_invariance (bool, optional): If True, rotation invariance will be enforced on the kernel.
+
+     Returns:
+         ndarray: The filtered image.
+     """
+     # Check if the input is a numpy array or an image volume object
+     spatial_ref = None
+     if type(input_images) == image_volume_obj:
+         spatial_ref = input_images.spatialRef
+         input_images = input_images.data
+
+     # Convert to shape: (B, W, H, D)
+     input_images = np.expand_dims(input_images.astype(np.float64), axis=0)
+
+     if medscan:
+         # Initialize the filter class instance from the MEDscan parameters
+         _filter = Wavelet(
+             ndims=medscan.params.filter.wavelet.ndims,
+             wavelet_name=medscan.params.filter.wavelet.basis_function,
+             rot_invariance=medscan.params.filter.wavelet.rot_invariance,
+             padding=medscan.params.filter.wavelet.padding
+         )
+         # Run the convolution
+         result = _filter.convolve(
+             input_images,
+             _filter=medscan.params.filter.wavelet.subband,
+             level=medscan.params.filter.wavelet.level
+         )
+     else:
+         # Initialize the filter class instance from the given arguments
+         _filter = Wavelet(
+             ndims=ndims,
+             wavelet_name=wavelet_name,
+             rot_invariance=rot_invariance,
+             padding=padding
+         )
+         # Run the convolution
+         result = _filter.convolve(
+             input_images,
+             _filter=subband,
+             level=level
+         )
+
+     if spatial_ref:
+         return image_volume_obj(np.squeeze(result), spatial_ref)
+     else:
+         return np.squeeze(result)
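A minimal standalone sketch of `apply_wavelet` without a MEDscan object. The cubic random volume is a made-up input; with the haar basis at level 1, the internal padding grows each axis to a multiple of 2 (as `pywt.swtn` requires) and `__unpad` removes it again, so the output matches the input shape.

import numpy as np

# Hypothetical input volume; any 3D array works as long as the subband
# string length matches ndims.
volume = np.random.rand(32, 32, 32)

filtered = apply_wavelet(
    input_images=volume,
    ndims=3,
    wavelet_name="haar",
    subband="LHL",
    level=1,
    padding="symmetric",
    rot_invariance=False
)
print(filtered.shape)  # (32, 32, 32): same shape as the input after unpadding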
MEDiml/learning/DataCleaner.py
@@ -0,0 +1,198 @@
+ import random
+ from typing import Dict, List
+
+ import numpy as np
+ import pandas as pd
+
+
+ class DataCleaner:
+     """
+     Cleans a table of features by removing features with too many missing values or
+     too little variation, removing samples with too many missing features, and
+     imputing the remaining missing values.
+     """
+     def __init__(self, df_features: pd.DataFrame, type: str = "continuous"):
+         """
+         Constructor of the DataCleaner class.
+
+         Args:
+             df_features (pd.DataFrame): Table of features.
+             type (str): Type of variable: "continuous", "hcategorical" or "icategorical". Defaults to "continuous".
+         """
+         self.df_features = df_features
+         self.type = type
+
+     def __update_df_features(self, var_of_type: List[str], flag_var_out: List[bool]) -> List[str]:
+         """
+         Updates the feature table by deleting the flagged-out variables.
+
+         Args:
+             var_of_type (List[str]): List of variable names.
+             flag_var_out (List[bool]): List of variables to flag out.
+
+         Returns:
+             List[str]: List of variable names that were not flagged out.
+         """
+         var_to_delete = np.delete(var_of_type, [i for i, v in enumerate(flag_var_out) if not v])
+         var_of_type = np.delete(var_of_type, [i for i, v in enumerate(flag_var_out) if v])
+         self.df_features = self.df_features.drop(var_to_delete, axis=1)
+         return var_of_type
+
+     def cut_off_missing_per_sample(self, var_of_type: List[str], missing_cutoff: float = 0.25) -> None:
+         """
+         Removes observations/samples with more than ``missing_cutoff`` missing features.
+
+         Args:
+             var_of_type (List[str]): List of variable names.
+             missing_cutoff (float): Maximal percentage cut-off of missing features per sample. Defaults to 25%.
+
+         Returns:
+             None.
+         """
+         # Initialization
+         n_observation, n_features = self.df_features.shape
+         empty_vec = np.zeros(n_observation, dtype=int)
+         data = self.df_features[var_of_type]
+         empty_vec += data.isna().sum(axis=1).values
+
+         # Gathering results
+         ind_obs_out = np.where((empty_vec / n_features) > missing_cutoff)
+         self.df_features = self.df_features.drop(self.df_features.index[ind_obs_out])
+
+     def cut_off_missing_per_feature(self, var_of_type: List[str], missing_cutoff: float = 0.1) -> List[str]:
+         """
+         Removes features with more than ``missing_cutoff`` missing samples.
+
+         Args:
+             var_of_type (list): List of variable names.
+             missing_cutoff (float): Maximal percentage cut-off of missing patient samples per variable.
+
+         Returns:
+             List[str]: List of variable names that were not flagged out.
+         """
+         flag_var_out = ((self.df_features[var_of_type].isna().sum() / self.df_features.shape[0]) > missing_cutoff)
+         return self.__update_df_features(var_of_type, flag_var_out)
+
+     def cut_off_variation(self, var_of_type: List[str], cov_cutoff: float = 0.1) -> List[str]:
+         """
+         Removes features with a coefficient of variation (cov) less than ``cov_cutoff``.
+
+         Args:
+             var_of_type (list): List of variable names.
+             cov_cutoff (float): Minimal coefficient of variation cut-off over samples per variable. Defaults to 10%.
+
+         Returns:
+             List[str]: List of variable names that were not flagged out.
+         """
+         eps = np.finfo(np.float32).eps
+         cov_df_features = (self.df_features[var_of_type].std(skipna=True) / self.df_features[var_of_type].mean(skipna=True))
+         flag_var_out = cov_df_features.abs().add(eps) < cov_cutoff
+         return self.__update_df_features(var_of_type, flag_var_out)
+
+     def impute_missing(self, var_of_type: List[str], imputation_method: str = "mean") -> None:
+         """
+         Imputes missing values of the features of type.
+
+         Args:
+             var_of_type (list): List of variable names.
+             imputation_method (str): Method of imputation. Can be "mean", "median", "mode" or "random".
+                 For "random" imputation, a seed can be provided by appending the seed value to the method
+                 name, for example "random42".
+
+         Returns:
+             None.
+         """
+         if self.type in ['continuous', 'hcategorical']:
+             # Random imputation
+             if 'random' in imputation_method:
+                 if len(imputation_method) > 6:
+                     try:
+                         seed = int(imputation_method[6:])  # the seed digits follow "random"
+                         random.seed(seed)
+                     except Exception as e:
+                         print(f"Warning: Seed must be an integer. Random seed will be set to None. {str(e)}")
+                         random.seed(a=None)
+                 else:
+                     random.seed(a=None)
+                 self.df_features[var_of_type] = self.df_features[var_of_type].apply(lambda x: x.fillna(random.choice(list(x.dropna(axis=0)))))
+
+             # Imputation with median
+             elif 'median' in imputation_method:
+                 self.df_features[var_of_type] = self.df_features[var_of_type].fillna(self.df_features[var_of_type].median())
+
+             # Imputation with mean
+             elif 'mean' in imputation_method:
+                 self.df_features[var_of_type] = self.df_features[var_of_type].fillna(self.df_features[var_of_type].mean())
+
+             else:
+                 raise ValueError("Imputation method for continuous and hcategorical features must be 'random', 'median' or 'mean'.")
+
+         elif self.type in ['icategorical']:
+             if 'random' in imputation_method:
+                 if len(imputation_method) > 6:
+                     seed = int(imputation_method[6:])  # the seed digits follow "random"
+                     random.seed(seed)
+                 else:
+                     random.seed(a=None)
+
+                 self.df_features[var_of_type] = self.df_features[var_of_type].apply(lambda x: x.fillna(random.choice(list(x.dropna(axis=0)))))
+
+             if 'mode' in imputation_method:
+                 self.df_features[var_of_type] = self.df_features[var_of_type].fillna(self.df_features[var_of_type].mode().max())
+         else:
+             raise ValueError("Variable type must be 'continuous', 'hcategorical' or 'icategorical'.")
+
+     def __call__(self, cleaning_dict: Dict, imputation_method: str = "mean",
+                  missing_cutoff_ps: float = 0.25, missing_cutoff_pf: float = 0.1,
+                  cov_cutoff: float = 0.1) -> pd.DataFrame:
+         """
+         Applies data cleaning to the features of type.
+
+         Args:
+             cleaning_dict (dict): Dictionary of cleaning parameters (missing cutoffs, coefficient of variation cutoff, etc.).
+             imputation_method (str): Method of imputation. Can be "mean", "median", "mode" or "random".
+                 For "random" imputation, a seed can be provided by appending the seed value to the method
+                 name, for example "random42".
+             missing_cutoff_ps (float, optional): Maximal percentage cut-off of missing features per sample.
+             missing_cutoff_pf (float, optional): Maximal percentage cut-off of missing samples per variable.
+             cov_cutoff (float, optional): Minimal coefficient of variation cut-off over samples per variable.
+
+         Returns:
+             pd.DataFrame: Cleaned table of features.
+         """
+         # Initialization
+         var_of_type = self.df_features.Properties['userData']['variables']['continuous']
+
+         # Retrieve thresholds from cleaning_dict if not None
+         if cleaning_dict is not None:
+             missing_cutoff_pf = cleaning_dict['missingCutoffpf']
+             missing_cutoff_ps = cleaning_dict['missingCutoffps']
+             cov_cutoff = cleaning_dict['covCutoff']
+             imputation_method = cleaning_dict['imputation']
+
+         # Replace infinite values with NaNs
+         self.df_features = self.df_features.replace([np.inf, -np.inf], np.nan)
+
+         # Remove features with more than missing_cutoff_pf missing samples (NaNs)
+         var_of_type = self.cut_off_missing_per_feature(var_of_type, missing_cutoff_pf)
+         if len(var_of_type) == 0:
+             return None
+
+         # Remove features with a coefficient of variation less than cov_cutoff
+         var_of_type = self.cut_off_variation(var_of_type, cov_cutoff)
+         if len(var_of_type) == 0:
+             return None
+
+         # Remove scans with more than missing_cutoff_ps missing features
+         self.cut_off_missing_per_sample(var_of_type, missing_cutoff_ps)
+
+         # Impute missing values
+         self.impute_missing(var_of_type, imputation_method)
+
+         return self.df_features
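A minimal sketch of the cleaning steps on a made-up feature table. It calls the individual methods rather than `__call__`, since `__call__` expects the MEDiml-specific `Properties` metadata attached to the DataFrame; all values and the 0.5 per-sample cutoff below are illustrative.

import numpy as np
import pandas as pd

# Hypothetical table: three features over five patients, with one missing
# value and one near-constant column that should fail the variation cutoff.
df = pd.DataFrame({
    "morph_volume":  [100.0, 140.0, np.nan, 180.0, 160.0],
    "glcm_contrast": [0.8, 1.1, 0.9, 1.3, 1.0],
    "flat_feature":  [5.0, 5.0, 5.0, 5.0, 5.0],
})

cleaner = DataCleaner(df, type="continuous")
features = list(df.columns)

features = cleaner.cut_off_missing_per_feature(features, missing_cutoff=0.25)
features = cleaner.cut_off_variation(features, cov_cutoff=0.1)    # drops flat_feature
cleaner.cut_off_missing_per_sample(features, missing_cutoff=0.5)  # keeps all rows here
cleaner.impute_missing(features, imputation_method="mean")        # fills the NaN with 145.0
print(cleaner.df_features)

Note the order mirrors `__call__`: features are pruned first, then samples, and imputation runs last so it only fills values that survived both cutoffs.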