dragon-ml-toolbox 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of dragon-ml-toolbox has been flagged as potentially problematic on its registry page.
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/METADATA +1 -1
- dragon_ml_toolbox-2.2.0.dist-info/RECORD +21 -0
- ml_tools/ETL_engineering.py +543 -0
- ml_tools/MICE_imputation.py +27 -28
- ml_tools/PSO_optimization.py +15 -15
- ml_tools/VIF_factor.py +20 -17
- ml_tools/data_exploration.py +58 -32
- ml_tools/ensemble_learning.py +40 -42
- ml_tools/handle_excel.py +98 -78
- ml_tools/logger.py +13 -11
- ml_tools/utilities.py +165 -60
- dragon_ml_toolbox-2.0.0.dist-info/RECORD +0 -20
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/top_level.txt +0 -0
ml_tools/MICE_imputation.py
CHANGED
```diff
@@ -1,11 +1,11 @@
 import pandas as pd
 import miceforest as mf
-import os
+from pathlib import Path
 import matplotlib.pyplot as plt
 import numpy as np
-from .utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe, threshold_binary_values
+from .utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe, threshold_binary_values, make_fullpath
 from plotnine import ggplot, labs, theme, element_blank # type: ignore
-from typing import Optional
+from typing import Optional, Union


 __all__ = [
@@ -60,7 +60,7 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str
     return kernel, imputed_datasets, imputed_dataset_names


-def save_imputed_datasets(save_dir: str, imputed_datasets: list, df_targets: pd.DataFrame, imputed_dataset_names: list[str]):
+def save_imputed_datasets(save_dir: Union[str, Path], imputed_datasets: list, df_targets: pd.DataFrame, imputed_dataset_names: list[str]):
     for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
         merged_df = merge_dataframes(imputed_df, df_targets, direction="horizontal", verbose=False)
         save_dataframe(df=merged_df, save_dir=save_dir, filename=subname)
@@ -72,7 +72,7 @@ def get_na_column_names(df: pd.DataFrame):


 #Convergence diagnostic
-def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_names: list[str], column_names: list[str], root_dir: str, fontsize: int=16):
+def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_names: list[str], column_names: list[str], root_dir: Union[str,Path], fontsize: int=16):
     """
     Generate and save convergence diagnostic plots for imputed variables.

@@ -90,7 +90,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
         raise ValueError(f"Expected {dataset_count} names in imputed_dataset_names, got {len(imputed_dataset_names)}")

     # Check path
-
+    root_path = make_fullpath(root_dir, make=True)

     # Styling parameters
     label_font = {'size': fontsize, 'weight': 'bold'}
@@ -99,8 +99,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
     for dataset_id, imputed_dataset_name in zip(range(dataset_count), imputed_dataset_names):
         #Check directory for current dataset
         dataset_file_dir = f"Convergence_Metrics_{imputed_dataset_name}"
-        local_save_dir =
-        os.makedirs(local_save_dir, exist_ok=True)
+        local_save_dir = make_fullpath(input_path=root_path / dataset_file_dir, make=True)

         for feature_name in column_names:
             means_per_iteration = []
@@ -121,8 +120,8 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
         plt.grid(True)

         feature_save_name = sanitize_filename(feature_name)
-
-        save_path =
+        feature_save_name = feature_save_name + ".svg"
+        save_path = local_save_dir / feature_save_name
         plt.savefig(save_path, bbox_inches='tight', format="svg")
         plt.close()

@@ -130,18 +129,17 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name


 # Imputed distributions
-def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_dir: str, column_names: list[str], one_plot: bool=False, fontsize: int=14):
+def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_dir: Union[str, Path], column_names: list[str], one_plot: bool=False, fontsize: int=14):
     '''
     It works using miceforest's authors implementation of the method `.plot_imputed_distributions()`.

     Set `one_plot=True` to save a single image including all feature distribution plots instead.
     '''
     # Check path
-
+    root_path = make_fullpath(root_dir, make=True)
+
     local_dir_name = f"Distribution_Metrics_{df_name}_imputed"
-    local_save_dir =
-    if not os.path.isdir(local_save_dir):
-        os.makedirs(local_save_dir)
+    local_save_dir = make_fullpath(root_path / local_dir_name, make=True)

     # Styling parameters
     legend_kwargs = {'frameon': True, 'facecolor': 'white', 'framealpha': 0.8}
@@ -191,9 +189,11 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di

         # sanitize savename
         feature_save_name = sanitize_filename(filename)
+        feature_save_name = feature_save_name + ".svg"
+        new_save_path = local_save_dir / feature_save_name

         fig.savefig(
-
+            new_save_path,
             format='svg',
             bbox_inches='tight',
             pad_inches=0.1
@@ -213,8 +213,8 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     print(f"{local_dir_name} completed.")


-def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str],
-                      save_datasets_dir: str, save_metrics_dir: str,
+def run_mice_pipeline(df_path_or_dir: Union[str,Path], target_columns: list[str],
+                      save_datasets_dir: Union[str,Path], save_metrics_dir: Union[str,Path],
                       binary_columns: Optional[list[str]]=None,
                       resulting_datasets: int=1,
                       iterations: int=20,
@@ -230,15 +230,14 @@ def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str],
     Target columns must be skipped from the imputation. Binary columns will be thresholded after imputation.
     """
     # Check paths
-
-
+    save_datasets_path = make_fullpath(save_datasets_dir, make=True)
+    save_metrics_path = make_fullpath(save_metrics_dir, make=True)

-
-
-
-        all_file_paths = list(list_csv_paths(df_path_or_dir).values())
+    input_path = make_fullpath(df_path_or_dir)
+    if input_path.is_file():
+        all_file_paths = [input_path]
     else:
-
+        all_file_paths = list(list_csv_paths(input_path).values())

     for df_path in all_file_paths:
         df, df_name = load_dataframe(df_path=df_path)
@@ -247,13 +246,13 @@ def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str],

         kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, binary_columns=binary_columns, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)

-        save_imputed_datasets(save_dir=
+        save_imputed_datasets(save_dir=save_datasets_path, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)

         imputed_column_names = get_na_column_names(df=df)

-        get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=
+        get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_path)

-        get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=
+        get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_path, column_names=imputed_column_names)


 def _skip_targets(df: pd.DataFrame, target_cols: list[str]):
```
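The recurring change in this file (and in the three modules below) is the retirement of `os`-based path handling in favor of `pathlib`, funneled through a new `make_fullpath` helper imported from `ml_tools/utilities.py`. The helper's body is not part of this diff; judging only from its call sites (`make_fullpath(some_dir, make=True)` returns an object that supports the `/` operator and is accepted wherever a `Path` is expected), a minimal sketch could look like this. It is an inferred reconstruction, not the library's actual code:

```python
from pathlib import Path
from typing import Union

def make_fullpath(input_path: Union[str, Path], make: bool = False) -> Path:
    """Hypothetical reconstruction inferred from call sites in this diff."""
    # Normalize user input ("~", relative paths) into an absolute Path.
    path = Path(input_path).expanduser().resolve()
    # Call sites pass make=True for output directories, so the helper
    # presumably creates them up front instead of scattering os.makedirs.
    if make:
        path.mkdir(parents=True, exist_ok=True)
    return path
```

Whatever the real implementation, centralizing this logic is what lets the repeated `os.makedirs`/`os.path.isdir` boilerplate visible in the removed lines above collapse into single calls.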
ml_tools/PSO_optimization.py
CHANGED
```diff
@@ -1,5 +1,5 @@
 import numpy as np
-import os
+from pathlib import Path
 import xgboost as xgb
 import lightgbm as lgb
 from sklearn.ensemble import HistGradientBoostingRegressor
@@ -7,7 +7,7 @@ from sklearn.base import ClassifierMixin
 from typing import Literal, Union, Tuple, Dict, Optional
 import pandas as pd
 from copy import deepcopy
-from .utilities import _script_info, threshold_binary_values, threshold_binary_values_batch, deserialize_object, list_files_by_extension, save_dataframe
+from .utilities import _script_info, threshold_binary_values, threshold_binary_values_batch, deserialize_object, list_files_by_extension, save_dataframe, make_fullpath
 import torch
 from tqdm import trange

@@ -36,7 +36,7 @@ class ObjectiveFunction():
     binary_features : int
         Number of binary features located at the END of the feature vector. Model should be trained with continuous features first, followed by binary.
     """
-    def __init__(self, trained_model_path: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int) -> None:
+    def __init__(self, trained_model_path: Union[str, Path], add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int) -> None:
         self.binary_features = binary_features
         self.is_hybrid = False if binary_features <= 0 else True
         self.use_noise = add_noise
@@ -129,7 +129,7 @@ class ObjectiveFunction():
         return (f"<ObjectiveFunction(model={type(self.model).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")


-def multiple_objective_functions_from_dir(directory: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int):
+def multiple_objective_functions_from_dir(directory: Union[str,Path], add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int):
     """
     Loads multiple objective functions from serialized models in the given directory.

@@ -174,7 +174,7 @@ def _set_feature_names(size: int, names: Union[list[str], None]):
     return names


-def _save_results(*dicts, save_dir: str, target_name: str):
+def _save_results(*dicts, save_dir: Union[str,Path], target_name: str):
     combined_dict = dict()
     for single_dict in dicts:
         combined_dict.update(single_dict)
@@ -187,14 +187,14 @@ def _save_results(*dicts, save_dir: str, target_name: str):
 def run_pso(lower_boundaries: list[float],
             upper_boundaries: list[float],
             objective_function: ObjectiveFunction,
-            save_results_dir: str,
+            save_results_dir: Union[str,Path],
             auto_binary_boundaries: bool=True,
             target_name: Union[str, None]=None,
             feature_names: Union[list[str], None]=None,
             swarm_size: int=200,
-            max_iterations: int=
+            max_iterations: int=3000,
             random_state: int=101,
-            post_hoc_analysis: Optional[int]=
+            post_hoc_analysis: Optional[int]=10) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
     """
     Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.

@@ -206,7 +206,7 @@ def run_pso(lower_boundaries: list[float],
         Upper bounds for each feature in the search space (as many as features expected by the model).
     objective_function : ObjectiveFunction
         A callable object encapsulating a tree-based regression model.
-    save_results_dir : str
+    save_results_dir : str | Path
         Directory path to save the results CSV file.
     auto_binary_boundaries : bool
         Use `ObjectiveFunction.binary_features` to append as many binary boundaries as needed to `lower_boundaries` and `upper_boundaries` automatically.
@@ -281,7 +281,7 @@ def run_pso(lower_boundaries: list[float],
         "particle_output": False,
     }

-
+    save_results_path = make_fullpath(save_results_dir, make=True)

     if post_hoc_analysis is None or post_hoc_analysis == 1:
         arguments.update({"seed": random_state})
@@ -301,7 +301,7 @@ def run_pso(lower_boundaries: list[float],
         best_target_named = {target_name: best_target}

         # save results
-        _save_results(best_features_named, best_target_named, save_dir=
+        _save_results(best_features_named, best_target_named, save_dir=save_results_path, target_name=target_name)

         return best_features_named, best_target_named
     else:
@@ -327,7 +327,7 @@ def run_pso(lower_boundaries: list[float],
         all_best_targets_named = {target_name: all_best_targets}

         # save results
-        _save_results(all_best_features_named, all_best_targets_named, save_dir=
+        _save_results(all_best_features_named, all_best_targets_named, save_dir=save_results_path, target_name=target_name)

         return all_best_features_named, all_best_targets_named # type: ignore

@@ -340,8 +340,8 @@ def _pso(func: ObjectiveFunction,
         lb: np.ndarray,
         ub: np.ndarray,
         device: torch.device,
-        swarmsize
-        maxiter
+        swarmsize: int,
+        maxiter: int,
         omega = 0.729, # Clerc and Kennedy’s constriction coefficient
         phip = 1.49445, # Clerc and Kennedy’s constriction coefficient
         phig = 1.49445, # Clerc and Kennedy’s constriction coefficient
@@ -391,7 +391,7 @@ def _pso(func: ObjectiveFunction,
         If True, returns the full history of particle positions and objective scores at each iteration.

     seed : int or None, default=None
-        Random seed for reproducibility. If None,
+        Random seed for reproducibility. If None, the random state is not fixed.

     Returns
     -------
```
ml_tools/VIF_factor.py
CHANGED
```diff
@@ -2,12 +2,12 @@
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
-from typing import Optional
+from typing import Optional, Union
 from statsmodels.stats.outliers_influence import variance_inflation_factor
 from statsmodels.tools.tools import add_constant
 import warnings
-import os
-from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe, _script_info
+from pathlib import Path
+from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe, _script_info, make_fullpath


 __all__ = [
@@ -22,7 +22,7 @@ def compute_vif(
     use_columns: Optional[list[str]] = None,
     ignore_columns: Optional[list[str]] = None,
     max_features_to_plot: int = 20,
-    save_dir: Optional[str] = None,
+    save_dir: Optional[Union[str,Path]] = None,
     filename: Optional[str] = None,
     fontsize: int = 14,
     show_plot: bool = True,
@@ -36,7 +36,7 @@ def compute_vif(
         use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
         ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
         max_features_to_plot (int): Adjust the number of features shown in the plot.
-        save_dir (str | None): Directory to save the plot as SVG. If None, the plot is not saved.
+        save_dir (str | Path | None): Directory to save the plot as SVG. If None, the plot is not saved.
         filename (str | None): Optional filename for saving the plot. Defaults to "VIF_plot.svg".
         fontsize (int): Base fontsize to scale title and labels on the plot.
         show_plot (bool): Display plot.
@@ -128,15 +128,16 @@ def compute_vif(
     plt.tight_layout()

     if save_dir:
-
+        save_path = make_fullpath(save_dir, make=True)
         if filename is None:
             filename = "VIF_plot.svg"
         else:
             filename = sanitize_filename(filename)
+            filename = "VIF_" + filename
         if not filename.endswith(".svg"):
             filename += ".svg"
-
-        plt.savefig(
+        full_save_path = save_path / filename
+        plt.savefig(full_save_path, format='svg', bbox_inches='tight')
         print(f"\tSaved VIF plot: '{filename}'")

     if show_plot:
@@ -176,9 +177,9 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
     return result_df, to_drop


-def compute_vif_multi(input_directory: str,
-                      output_plot_directory: str,
-                      output_dataset_directory: Optional[str] = None,
+def compute_vif_multi(input_directory: Union[str, Path],
+                      output_plot_directory: Union[str, Path],
+                      output_dataset_directory: Optional[Union[str, Path]] = None,
                       use_columns: Optional[list[str]] = None,
                       ignore_columns: Optional[list[str]] = None,
                       max_features_to_plot: int = 20,
@@ -188,9 +189,9 @@ def compute_vif_multi(input_directory: str,
     Generates a bar plot of VIF values. Optionally drops columns with VIF >= 10 and saves as a new CSV file.

     Args:
-        input_directory (str): Target directory with CSV files able to be loaded as DataFrame.
-        output_plot_directory (str): Save plots to this directory.
-        output_dataset_directory (str | None): If provided, saves new CSV files to this directory.
+        input_directory (str | Path): Target directory with CSV files able to be loaded as DataFrame.
+        output_plot_directory (str | Path): Save plots to this directory.
+        output_dataset_directory (str | Path | None): If provided, saves new CSV files to this directory.
         use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
         ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
         max_features_to_plot (int): Adjust the number of features shown in the plot.
@@ -202,7 +203,9 @@ def compute_vif_multi(input_directory: str,
     A VIF of 1 suggests no correlation, values between 1 and 5 indicate moderate correlation, and values greater than 10 typically signal high multicollinearity, which may distort model interpretation and degrade performance.
     """
     if output_dataset_directory is not None:
-
+        output_dataset_path = make_fullpath(output_dataset_directory, make=True)
+    else:
+        output_dataset_path = None

     for df, df_name in yield_dataframes_from_dir(datasets_dir=input_directory):
         vif_dataframe = compute_vif(df=df,
@@ -215,12 +218,12 @@ def compute_vif_multi(input_directory: str,
                                     show_plot=False,
                                     verbose=False)

-        if
+        if output_dataset_path is not None:
             new_filename = df_name + '_VIF'
             result_df, dropped_cols = drop_vif_based(df=df, vif_df=vif_dataframe)

             if len(dropped_cols) > 0:
-                save_dataframe(df=result_df, save_dir=
+                save_dataframe(df=result_df, save_dir=output_dataset_path, filename=new_filename)


 def info():
```
ml_tools/data_exploration.py
CHANGED
```diff
@@ -5,9 +5,9 @@ import seaborn as sns
 from IPython import get_ipython
 from IPython.display import clear_output
 import time
-from typing import Union, Literal, Dict, Tuple, List
-import os
-from .utilities import sanitize_filename, _script_info
+from typing import Union, Literal, Dict, Tuple, List, Optional
+from pathlib import Path
+from .utilities import sanitize_filename, _script_info, make_fullpath
 import re


@@ -59,26 +59,48 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
     return summary


-def drop_rows_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
+def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]], threshold: float = 0.7) -> pd.DataFrame:
     """
-    Drops rows
+    Drops rows from the DataFrame using a two-stage strategy:
+
+    1. If `targets`, remove any row where all target columns are missing.
+    2. Among features, drop those with more than `threshold` fraction of missing values.

     Parameters:
         df (pd.DataFrame): The input DataFrame.
-
+        targets (list[str] | None): List of target column names.
+        threshold (float): Maximum allowed fraction of missing values in feature columns.

     Returns:
-        pd.DataFrame: A
+        pd.DataFrame: A cleaned DataFrame with problematic rows removed.
     """
-
-
-
-    if
-
+    df_clean = df.copy()
+
+    # Stage 1: Drop rows with all target columns missing
+    if targets is not None:
+        target_na = df_clean[targets].isnull().all(axis=1)
+        if target_na.any():
+            print(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
+            df_clean = df_clean[~target_na]
+        else:
+            print("✅ No rows with all targets missing.")
     else:
-
+        targets = []
+
+    # Stage 2: Drop rows based on feature column missing values
+    feature_cols = [col for col in df_clean.columns if col not in targets]
+    if feature_cols:
+        feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)
+        rows_to_drop = feature_na_frac[feature_na_frac > threshold].index
+        if len(rows_to_drop) > 0:
+            print(f"📉 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
+            df_clean = df_clean.drop(index=rows_to_drop)
+        else:
+            print(f"✅ No rows exceed the {threshold*100:.0f}% missing feature data threshold.")
+    else:
+        print("⚠️ No feature columns available to evaluate.")

-    return
+    return df_clean


 def split_features_targets(df: pd.DataFrame, targets: list[str]):
@@ -205,13 +227,16 @@ def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFram
     return df_cont, df_bin # type: ignore


-def plot_correlation_heatmap(df: pd.DataFrame,
+def plot_correlation_heatmap(df: pd.DataFrame,
+                             save_dir: Union[str, Path, None] = None,
+                             plot_title: str="Correlation Heatmap",
+                             method: Literal["pearson", "kendall", "spearman"]="pearson"):
     """
     Plots a heatmap of pairwise correlations between numeric features in a DataFrame.

     Args:
         df (pd.DataFrame): The input dataset.
-        save_dir (str | None): If provided, the heatmap will be saved to this directory as a svg file.
+        save_dir (str | Path | None): If provided, the heatmap will be saved to this directory as a svg file.
         plot_title: To make different plots, or overwrite existing ones.
         method (str): Correlation method to use. Must be one of:
             - 'pearson' (default): measures linear correlation (assumes normally distributed data),
@@ -254,10 +279,13 @@ def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None
     plt.tight_layout()

     if save_dir:
+        save_path = make_fullpath(save_dir, make=True)
         # sanitize the plot title to save the file
         plot_title = sanitize_filename(plot_title)
-
-
+        plot_title = plot_title + ".svg"
+
+        full_path = save_path / plot_title
+
         plt.savefig(full_path, bbox_inches="tight", format='svg')
         print(f"Saved correlation heatmap: '{plot_title}.svg'")

@@ -322,7 +350,7 @@ def check_value_distributions(df: pd.DataFrame, view_frequencies: bool=True, bin
         user_input_ = input("Press enter to continue")


-def plot_value_distributions(df: pd.DataFrame, save_dir: str, bin_threshold: int=10, skip_cols_with_key: Union[str, None]=None):
+def plot_value_distributions(df: pd.DataFrame, save_dir: Union[str, Path], bin_threshold: int=10, skip_cols_with_key: Union[str, None]=None):
     """
     Plots and saves the value distributions for all (or selected) columns in a DataFrame,
     with adaptive binning for numerical columns when appropriate.
@@ -335,7 +363,7 @@ def plot_value_distributions(df: pd.DataFrame, save_dir: str, bin_threshold: int

     Args:
         df (pd.DataFrame): The input DataFrame whose columns are to be analyzed.
-        save_dir (str): Directory path where the plots will be saved. Will be created if it does not exist.
+        save_dir (str | Path): Directory path where the plots will be saved. Will be created if it does not exist.
         bin_threshold (int): Minimum number of unique values required to trigger binning
             for numerical columns.
         skip_cols_with_key (str | None): If provided, any column whose name contains this
@@ -346,8 +374,7 @@ def plot_value_distributions(df: pd.DataFrame, save_dir: str, bin_threshold: int
     - All non-alphanumeric characters in column names are sanitized for safe file naming.
     - Colormap is automatically adapted based on the number of categories or bins.
     """
-
-    os.makedirs(save_dir, exist_ok=True)
+    save_path = make_fullpath(save_dir, make=True)

     dict_to_plot_std = dict()
     dict_to_plot_freq = dict()
@@ -384,13 +411,12 @@ def plot_value_distributions(df: pd.DataFrame, save_dir: str, bin_threshold: int
         view_freq = 100 * view_std / view_std.sum() # Percentage
         # view_freq = df[col].value_counts(normalize=True, bins=10) # relative percentages

-
-
-
-        saved_plots += 1
+        dict_to_plot_std[col] = dict(view_std)
+        dict_to_plot_freq[col] = dict(view_freq)
+        saved_plots += 1

     # plot helper
-    def _plot_helper(dict_: dict, target_dir:
+    def _plot_helper(dict_: dict, target_dir: Path, ylabel: Literal["Frequency", "Counts"], base_fontsize: int=12):
         for col, data in dict_.items():
             safe_col = sanitize_filename(col)

@@ -412,15 +438,15 @@ def plot_value_distributions(df: pd.DataFrame, save_dir: str, bin_threshold: int
             plt.gca().set_facecolor('#f9f9f9')
             plt.tight_layout()

-            plot_path =
+            plot_path = target_dir / f"{safe_col}.png"
             plt.savefig(plot_path, dpi=300, bbox_inches="tight")
             plt.close()

     # Save plots
-    freq_dir =
-    std_dir =
-
-
+    freq_dir = save_path / "Distribution_Frequency"
+    std_dir = save_path / "Distribution_Counts"
+    freq_dir.mkdir(parents=True, exist_ok=True)
+    std_dir.mkdir(parents=True, exist_ok=True)
     _plot_helper(dict_=dict_to_plot_std, target_dir=std_dir, ylabel="Counts")
     _plot_helper(dict_=dict_to_plot_freq, target_dir=freq_dir, ylabel="Frequency")

```
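The reworked `drop_rows_with_missing_data` is the one behavioral change in this file that goes beyond path handling: it now requires a `targets` argument and cleans in two stages, exactly as the new docstring describes. A usage sketch against a toy DataFrame (the data is invented):

```python
import numpy as np
import pandas as pd
from ml_tools.data_exploration import drop_rows_with_missing_data

df = pd.DataFrame({
    "f1": [1.0, 2.0, 3.0, np.nan],
    "f2": [4.0, np.nan, 6.0, np.nan],
    "t1": [0.1, 0.2, np.nan, 0.4],
    "t2": [1.1, 1.2, np.nan, 1.4],
})

# Stage 1 drops row 2 (every target column is NaN); stage 2 drops row 3
# (2 of 2 feature values missing = 100% > the 70% threshold).
clean = drop_rows_with_missing_data(df, targets=["t1", "t2"], threshold=0.7)
print(clean.index.tolist())  # -> [0, 1]
```

Row 1 survives because only half of its feature values are missing, which is under the threshold.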