dragon-ml-toolbox 2.0.0__tar.gz → 2.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dragon-ml-toolbox has been flagged as possibly problematic.

Files changed (25)
  1. {dragon_ml_toolbox-2.0.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-2.1.0}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/ml_tools/MICE_imputation.py +27 -28
  4. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/ml_tools/PSO_optimization.py +12 -12
  5. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/ml_tools/VIF_factor.py +20 -17
  6. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/ml_tools/data_exploration.py +58 -32
  7. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/ml_tools/ensemble_learning.py +40 -42
  8. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/ml_tools/handle_excel.py +98 -78
  9. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/ml_tools/logger.py +13 -11
  10. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/ml_tools/utilities.py +100 -46
  11. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/pyproject.toml +1 -1
  12. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/LICENSE +0 -0
  13. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/LICENSE-THIRD-PARTY.md +0 -0
  14. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/README.md +0 -0
  15. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  16. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  17. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  18. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  19. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/ml_tools/__init__.py +0 -0
  20. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/ml_tools/_particle_swarm_optimization.py +0 -0
  21. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/ml_tools/datasetmaster.py +0 -0
  22. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/ml_tools/pytorch_models.py +0 -0
  23. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/ml_tools/trainer.py +0 -0
  24. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/ml_tools/vision_helpers.py +0 -0
  25. {dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/setup.cfg +0 -0
{dragon_ml_toolbox-2.0.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-2.1.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 2.0.0
+Version: 2.1.0
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
{dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0/dragon_ml_toolbox.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 2.0.0
+Version: 2.1.0
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
{dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/ml_tools/MICE_imputation.py
@@ -1,11 +1,11 @@
 import pandas as pd
 import miceforest as mf
-import os
+from pathlib import Path
 import matplotlib.pyplot as plt
 import numpy as np
-from .utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe, threshold_binary_values
+from .utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe, threshold_binary_values, make_fullpath
 from plotnine import ggplot, labs, theme, element_blank # type: ignore
-from typing import Optional
+from typing import Optional, Union
 
 
 __all__ = [
@@ -60,7 +60,7 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str
     return kernel, imputed_datasets, imputed_dataset_names
 
 
-def save_imputed_datasets(save_dir: str, imputed_datasets: list, df_targets: pd.DataFrame, imputed_dataset_names: list[str]):
+def save_imputed_datasets(save_dir: Union[str, Path], imputed_datasets: list, df_targets: pd.DataFrame, imputed_dataset_names: list[str]):
     for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
         merged_df = merge_dataframes(imputed_df, df_targets, direction="horizontal", verbose=False)
         save_dataframe(df=merged_df, save_dir=save_dir, filename=subname)
@@ -72,7 +72,7 @@ def get_na_column_names(df: pd.DataFrame):
 
 
 #Convergence diagnostic
-def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_names: list[str], column_names: list[str], root_dir: str, fontsize: int=16):
+def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_names: list[str], column_names: list[str], root_dir: Union[str,Path], fontsize: int=16):
     """
     Generate and save convergence diagnostic plots for imputed variables.
 
@@ -90,7 +90,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
         raise ValueError(f"Expected {dataset_count} names in imputed_dataset_names, got {len(imputed_dataset_names)}")
 
     # Check path
-    os.makedirs(root_dir, exist_ok=True)
+    root_path = make_fullpath(root_dir, make=True)
 
     # Styling parameters
     label_font = {'size': fontsize, 'weight': 'bold'}
@@ -99,8 +99,7 @@
     for dataset_id, imputed_dataset_name in zip(range(dataset_count), imputed_dataset_names):
         #Check directory for current dataset
         dataset_file_dir = f"Convergence_Metrics_{imputed_dataset_name}"
-        local_save_dir = os.path.join(root_dir, dataset_file_dir)
-        os.makedirs(local_save_dir, exist_ok=True)
+        local_save_dir = make_fullpath(input_path=root_path / dataset_file_dir, make=True)
 
         for feature_name in column_names:
             means_per_iteration = []
@@ -121,8 +120,8 @@
             plt.grid(True)
 
             feature_save_name = sanitize_filename(feature_name)
-
-            save_path = os.path.join(local_save_dir, feature_save_name + ".svg")
+            feature_save_name = feature_save_name + ".svg"
+            save_path = local_save_dir / feature_save_name
             plt.savefig(save_path, bbox_inches='tight', format="svg")
             plt.close()
 
@@ -130,18 +129,17 @@
 
 
 # Imputed distributions
-def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_dir: str, column_names: list[str], one_plot: bool=False, fontsize: int=14):
+def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_dir: Union[str, Path], column_names: list[str], one_plot: bool=False, fontsize: int=14):
     '''
     It works using miceforest's authors implementation of the method `.plot_imputed_distributions()`.
 
     Set `one_plot=True` to save a single image including all feature distribution plots instead.
     '''
     # Check path
-    os.makedirs(root_dir, exist_ok=True)
+    root_path = make_fullpath(root_dir, make=True)
+
     local_dir_name = f"Distribution_Metrics_{df_name}_imputed"
-    local_save_dir = os.path.join(root_dir, local_dir_name)
-    if not os.path.isdir(local_save_dir):
-        os.makedirs(local_save_dir)
+    local_save_dir = make_fullpath(root_path / local_dir_name, make=True)
 
     # Styling parameters
     legend_kwargs = {'frameon': True, 'facecolor': 'white', 'framealpha': 0.8}
@@ -191,9 +189,11 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
 
         # sanitize savename
        feature_save_name = sanitize_filename(filename)
+        feature_save_name = feature_save_name + ".svg"
+        new_save_path = local_save_dir / feature_save_name
 
         fig.savefig(
-            os.path.join(local_save_dir, feature_save_name + ".svg"),
+            new_save_path,
             format='svg',
             bbox_inches='tight',
             pad_inches=0.1
@@ -213,8 +213,8 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     print(f"{local_dir_name} completed.")
 
 
-def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str],
-                      save_datasets_dir: str, save_metrics_dir: str,
+def run_mice_pipeline(df_path_or_dir: Union[str,Path], target_columns: list[str],
+                      save_datasets_dir: Union[str,Path], save_metrics_dir: Union[str,Path],
                       binary_columns: Optional[list[str]]=None,
                       resulting_datasets: int=1,
                       iterations: int=20,
@@ -230,15 +230,14 @@ def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str],
     Target columns must be skipped from the imputation. Binary columns will be thresholded after imputation.
     """
     # Check paths
-    os.makedirs(save_datasets_dir, exist_ok=True)
-    os.makedirs(save_metrics_dir, exist_ok=True)
+    save_datasets_path = make_fullpath(save_datasets_dir, make=True)
+    save_metrics_path = make_fullpath(save_metrics_dir, make=True)
 
-    if os.path.isfile(df_path_or_dir):
-        all_file_paths = [df_path_or_dir]
-    elif os.path.isdir(df_path_or_dir):
-        all_file_paths = list(list_csv_paths(df_path_or_dir).values())
+    input_path = make_fullpath(df_path_or_dir)
+    if input_path.is_file():
+        all_file_paths = [input_path]
     else:
-        raise ValueError(f"Invalid path or directory: {df_path_or_dir}")
+        all_file_paths = list(list_csv_paths(input_path).values())
 
     for df_path in all_file_paths:
         df, df_name = load_dataframe(df_path=df_path)
@@ -247,13 +246,13 @@ def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str],
 
         kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, binary_columns=binary_columns, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)
 
-        save_imputed_datasets(save_dir=save_datasets_dir, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)
+        save_imputed_datasets(save_dir=save_datasets_path, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)
 
         imputed_column_names = get_na_column_names(df=df)
 
-        get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_dir)
+        get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_path)
 
-        get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_dir, column_names=imputed_column_names)
+        get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_path, column_names=imputed_column_names)
 
 
 def _skip_targets(df: pd.DataFrame, target_cols: list[str]):
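The recurring change across this release is the replacement of `os.makedirs`/`os.path.join` with a new `make_fullpath` helper imported from `ml_tools/utilities.py` (that file changed by +100/-46 lines, but its diff is not reproduced above). Judging only from the call sites — `make_fullpath(root_dir, make=True)` returning something that supports the `/` operator and `.is_file()` — it resolves a `str` or `Path` into an absolute `Path` and can create the directory on demand. A minimal sketch of such a helper, purely illustrative and not the library's actual implementation:

```python
from pathlib import Path
from typing import Union

def make_fullpath(input_path: Union[str, Path], make: bool = False) -> Path:
    """Illustrative stand-in for ml_tools.utilities.make_fullpath (behavior inferred from call sites)."""
    path = Path(input_path).expanduser().resolve()
    if make:
        # call sites pass directories when make=True, so create the directory tree
        path.mkdir(parents=True, exist_ok=True)
    return path

# Mirrors the new call pattern in get_convergence_diagnostic():
root_path = make_fullpath("mice_metrics", make=True)
local_save_dir = make_fullpath(input_path=root_path / "Convergence_Metrics_df1", make=True)
```

The practical upshot for callers is that every `save_dir`/`root_dir`-style argument in 2.1.0 accepts either a `str` or a `pathlib.Path`.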
{dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/ml_tools/PSO_optimization.py
@@ -1,5 +1,5 @@
 import numpy as np
-import os
+from pathlib import Path
 import xgboost as xgb
 import lightgbm as lgb
 from sklearn.ensemble import HistGradientBoostingRegressor
@@ -7,7 +7,7 @@ from sklearn.base import ClassifierMixin
 from typing import Literal, Union, Tuple, Dict, Optional
 import pandas as pd
 from copy import deepcopy
-from .utilities import _script_info, threshold_binary_values, threshold_binary_values_batch, deserialize_object, list_files_by_extension, save_dataframe
+from .utilities import _script_info, threshold_binary_values, threshold_binary_values_batch, deserialize_object, list_files_by_extension, save_dataframe, make_fullpath
 import torch
 from tqdm import trange
 
@@ -36,7 +36,7 @@ class ObjectiveFunction():
     binary_features : int
         Number of binary features located at the END of the feature vector. Model should be trained with continuous features first, followed by binary.
     """
-    def __init__(self, trained_model_path: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int) -> None:
+    def __init__(self, trained_model_path: Union[str, Path], add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int) -> None:
        self.binary_features = binary_features
        self.is_hybrid = False if binary_features <= 0 else True
        self.use_noise = add_noise
@@ -129,7 +129,7 @@
        return (f"<ObjectiveFunction(model={type(self.model).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")
 
 
-def multiple_objective_functions_from_dir(directory: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int):
+def multiple_objective_functions_from_dir(directory: Union[str,Path], add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int):
     """
     Loads multiple objective functions from serialized models in the given directory.
 
@@ -174,7 +174,7 @@ def _set_feature_names(size: int, names: Union[list[str], None]):
     return names
 
 
-def _save_results(*dicts, save_dir: str, target_name: str):
+def _save_results(*dicts, save_dir: Union[str,Path], target_name: str):
     combined_dict = dict()
     for single_dict in dicts:
         combined_dict.update(single_dict)
@@ -187,14 +187,14 @@ def _save_results(*dicts, save_dir: str, target_name: str):
 def run_pso(lower_boundaries: list[float],
             upper_boundaries: list[float],
             objective_function: ObjectiveFunction,
-            save_results_dir: str,
+            save_results_dir: Union[str,Path],
             auto_binary_boundaries: bool=True,
             target_name: Union[str, None]=None,
             feature_names: Union[list[str], None]=None,
             swarm_size: int=200,
-            max_iterations: int=1000,
+            max_iterations: int=3000,
             random_state: int=101,
-            post_hoc_analysis: Optional[int]=3) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
+            post_hoc_analysis: Optional[int]=10) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
     """
     Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.
 
@@ -206,7 +206,7 @@ def run_pso(lower_boundaries: list[float],
         Upper bounds for each feature in the search space (as many as features expected by the model).
     objective_function : ObjectiveFunction
         A callable object encapsulating a tree-based regression model.
-    save_results_dir : str
+    save_results_dir : str | Path
         Directory path to save the results CSV file.
     auto_binary_boundaries : bool
         Use `ObjectiveFunction.binary_features` to append as many binary boundaries as needed to `lower_boundaries` and `upper_boundaries` automatically.
@@ -281,7 +281,7 @@
         "particle_output": False,
     }
 
-    os.makedirs(save_results_dir, exist_ok=True)
+    save_results_path = make_fullpath(save_results_dir, make=True)
 
     if post_hoc_analysis is None or post_hoc_analysis == 1:
         arguments.update({"seed": random_state})
@@ -301,7 +301,7 @@
         best_target_named = {target_name: best_target}
 
         # save results
-        _save_results(best_features_named, best_target_named, save_dir=save_results_dir, target_name=target_name)
+        _save_results(best_features_named, best_target_named, save_dir=save_results_path, target_name=target_name)
 
         return best_features_named, best_target_named
     else:
@@ -327,7 +327,7 @@
         all_best_targets_named = {target_name: all_best_targets}
 
         # save results
-        _save_results(all_best_features_named, all_best_targets_named, save_dir=save_results_dir, target_name=target_name)
+        _save_results(all_best_features_named, all_best_targets_named, save_dir=save_results_path, target_name=target_name)
 
         return all_best_features_named, all_best_targets_named # type: ignore
 
{dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/ml_tools/VIF_factor.py
@@ -2,12 +2,12 @@
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
-from typing import Optional
+from typing import Optional, Union
 from statsmodels.stats.outliers_influence import variance_inflation_factor
 from statsmodels.tools.tools import add_constant
 import warnings
-import os
-from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe, _script_info
+from pathlib import Path
+from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe, _script_info, make_fullpath
 
 
 __all__ = [
@@ -22,7 +22,7 @@ def compute_vif(
     use_columns: Optional[list[str]] = None,
     ignore_columns: Optional[list[str]] = None,
     max_features_to_plot: int = 20,
-    save_dir: Optional[str] = None,
+    save_dir: Optional[Union[str,Path]] = None,
     filename: Optional[str] = None,
     fontsize: int = 14,
     show_plot: bool = True,
@@ -36,7 +36,7 @@ def compute_vif(
         use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
         ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
         max_features_to_plot (int): Adjust the number of features shown in the plot.
-        save_dir (str | None): Directory to save the plot as SVG. If None, the plot is not saved.
+        save_dir (str | Path | None): Directory to save the plot as SVG. If None, the plot is not saved.
         filename (str | None): Optional filename for saving the plot. Defaults to "VIF_plot.svg".
         fontsize (int): Base fontsize to scale title and labels on the plot.
         show_plot (bool): Display plot.
@@ -128,15 +128,16 @@
     plt.tight_layout()
 
     if save_dir:
-        os.makedirs(save_dir, exist_ok=True)
+        save_path = make_fullpath(save_dir, make=True)
         if filename is None:
             filename = "VIF_plot.svg"
         else:
             filename = sanitize_filename(filename)
+            filename = "VIF_" + filename
             if not filename.endswith(".svg"):
                 filename += ".svg"
-        save_path = os.path.join(save_dir, "VIF_" + filename)
-        plt.savefig(save_path, format='svg', bbox_inches='tight')
+        full_save_path = save_path / filename
+        plt.savefig(full_save_path, format='svg', bbox_inches='tight')
         print(f"\tSaved VIF plot: '{filename}'")
 
     if show_plot:
@@ -176,9 +177,9 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
     return result_df, to_drop
 
 
-def compute_vif_multi(input_directory: str,
-                      output_plot_directory: str,
-                      output_dataset_directory: Optional[str] = None,
+def compute_vif_multi(input_directory: Union[str, Path],
+                      output_plot_directory: Union[str, Path],
+                      output_dataset_directory: Optional[Union[str, Path]] = None,
                       use_columns: Optional[list[str]] = None,
                       ignore_columns: Optional[list[str]] = None,
                       max_features_to_plot: int = 20,
@@ -188,9 +189,9 @@ def compute_vif_multi(input_directory: str,
     Generates a bar plot of VIF values. Optionally drops columns with VIF >= 10 and saves as a new CSV file.
 
     Args:
-        input_directory (str): Target directory with CSV files able to be loaded as DataFrame.
-        output_plot_directory (str): Save plots to this directory.
-        output_dataset_directory (str | None): If provided, saves new CSV files to this directory.
+        input_directory (str | Path): Target directory with CSV files able to be loaded as DataFrame.
+        output_plot_directory (str | Path): Save plots to this directory.
+        output_dataset_directory (str | Path | None): If provided, saves new CSV files to this directory.
         use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
         ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
         max_features_to_plot (int): Adjust the number of features shown in the plot.
@@ -202,7 +203,9 @@
     A VIF of 1 suggests no correlation, values between 1 and 5 indicate moderate correlation, and values greater than 10 typically signal high multicollinearity, which may distort model interpretation and degrade performance.
     """
     if output_dataset_directory is not None:
-        os.makedirs(output_dataset_directory, exist_ok=True)
+        output_dataset_path = make_fullpath(output_dataset_directory, make=True)
+    else:
+        output_dataset_path = None
 
     for df, df_name in yield_dataframes_from_dir(datasets_dir=input_directory):
         vif_dataframe = compute_vif(df=df,
@@ -215,12 +218,12 @@
                                     show_plot=False,
                                     verbose=False)
 
-        if output_dataset_directory is not None:
+        if output_dataset_path is not None:
             new_filename = df_name + '_VIF'
             result_df, dropped_cols = drop_vif_based(df=df, vif_df=vif_dataframe)
 
             if len(dropped_cols) > 0:
-                save_dataframe(df=result_df, save_dir=output_dataset_directory, filename=new_filename)
+                save_dataframe(df=result_df, save_dir=output_dataset_path, filename=new_filename)
 
 
 def info():
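`compute_vif` builds on statsmodels' `variance_inflation_factor`, which is what the docstring's rule of thumb refers to (VIF ≈ 1 no correlation, 1–5 moderate, > 10 problematic multicollinearity). A small self-contained illustration of that underlying computation on a deliberately collinear frame (the column names are made up):

```python
import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

rng = np.random.default_rng(101)
x1 = rng.normal(size=200)
df = pd.DataFrame({
    "x1": x1,
    "x2": x1 * 2 + rng.normal(scale=0.05, size=200),  # nearly a copy of x1 -> very high VIF
    "x3": rng.normal(size=200),                        # independent -> VIF near 1
})

X = add_constant(df)  # VIF is computed with an intercept column present
vifs = {col: variance_inflation_factor(X.values, i)
        for i, col in enumerate(X.columns) if col != "const"}
print(vifs)  # x1 and x2 come out far above 10; x3 sits near 1
```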
{dragon_ml_toolbox-2.0.0 → dragon_ml_toolbox-2.1.0}/ml_tools/data_exploration.py
@@ -5,9 +5,9 @@ import seaborn as sns
 from IPython import get_ipython
 from IPython.display import clear_output
 import time
-from typing import Union, Literal, Dict, Tuple, List
-import os
-from .utilities import sanitize_filename, _script_info
+from typing import Union, Literal, Dict, Tuple, List, Optional
+from pathlib import Path
+from .utilities import sanitize_filename, _script_info, make_fullpath
 import re
 
 
@@ -59,26 +59,48 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
     return summary
 
 
-def drop_rows_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
+def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]], threshold: float = 0.7) -> pd.DataFrame:
     """
-    Drops rows with more than `threshold` fraction of missing values.
+    Drops rows from the DataFrame using a two-stage strategy:
+
+    1. If `targets`, remove any row where all target columns are missing.
+    2. Among features, drop those with more than `threshold` fraction of missing values.
 
     Parameters:
         df (pd.DataFrame): The input DataFrame.
-        threshold (float): Fraction of missing values above which rows are dropped.
+        targets (list[str] | None): List of target column names.
+        threshold (float): Maximum allowed fraction of missing values in feature columns.
 
     Returns:
-        pd.DataFrame: A new DataFrame without the dropped rows.
+        pd.DataFrame: A cleaned DataFrame with problematic rows removed.
     """
-    missing_fraction = df.isnull().mean(axis=1)
-    rows_to_drop = missing_fraction[missing_fraction > threshold].index
-
-    if len(rows_to_drop) > 0:
-        print(f"Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing data.")
+    df_clean = df.copy()
+
+    # Stage 1: Drop rows with all target columns missing
+    if targets is not None:
+        target_na = df_clean[targets].isnull().all(axis=1)
+        if target_na.any():
+            print(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
+            df_clean = df_clean[~target_na]
+        else:
+            print("✅ No rows with all targets missing.")
     else:
-        print(f"No rows have more than {threshold*100:.0f}% missing data.")
+        targets = []
+
+    # Stage 2: Drop rows based on feature column missing values
+    feature_cols = [col for col in df_clean.columns if col not in targets]
+    if feature_cols:
+        feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)
+        rows_to_drop = feature_na_frac[feature_na_frac > threshold].index
+        if len(rows_to_drop) > 0:
+            print(f"📉 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
+            df_clean = df_clean.drop(index=rows_to_drop)
+        else:
+            print(f"✅ No rows exceed the {threshold*100:.0f}% missing feature data threshold.")
+    else:
+        print("⚠️ No feature columns available to evaluate.")
 
-    return df.drop(index=rows_to_drop)
+    return df_clean
 
 
 def split_features_targets(df: pd.DataFrame, targets: list[str]):
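The hunk above turns `drop_rows_with_missing_data` into a two-stage filter: rows where every target column is NaN are removed first, then rows whose feature columns exceed the missing-data threshold. A toy frame that reproduces the same logic with plain pandas (the function itself would additionally print the summaries shown in the diff):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "f1":     [1.0, np.nan, np.nan, 4.0],
    "f2":     [np.nan, np.nan, 2.0, 5.0],
    "target": [np.nan, 3.0, np.nan, 6.0],
})
targets, threshold = ["target"], 0.7

# Stage 1: drop rows where ALL target columns are missing (rows 0 and 2 here)
df_clean = df[~df[targets].isnull().all(axis=1)]

# Stage 2: among the remaining rows, drop those whose feature columns have more
# than `threshold` missing values (row 1: 2 of 2 features missing)
feature_cols = [c for c in df_clean.columns if c not in targets]
feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)
df_clean = df_clean.drop(index=feature_na_frac[feature_na_frac > threshold].index)

print(df_clean)  # only row 3 survives
```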
@@ -205,13 +227,16 @@ def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFram
     return df_cont, df_bin # type: ignore
 
 
-def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None, method: Literal["pearson", "kendall", "spearman"]="pearson", plot_title: str="Correlation Heatmap"):
+def plot_correlation_heatmap(df: pd.DataFrame,
+                             save_dir: Union[str, Path, None] = None,
+                             plot_title: str="Correlation Heatmap",
+                             method: Literal["pearson", "kendall", "spearman"]="pearson"):
     """
     Plots a heatmap of pairwise correlations between numeric features in a DataFrame.
 
     Args:
         df (pd.DataFrame): The input dataset.
-        save_dir (str | None): If provided, the heatmap will be saved to this directory as a svg file.
+        save_dir (str | Path | None): If provided, the heatmap will be saved to this directory as a svg file.
         plot_title: To make different plots, or overwrite existing ones.
         method (str): Correlation method to use. Must be one of:
             - 'pearson' (default): measures linear correlation (assumes normally distributed data),
@@ -254,10 +279,13 @@ def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None
     plt.tight_layout()
 
     if save_dir:
+        save_path = make_fullpath(save_dir, make=True)
         # sanitize the plot title to save the file
         plot_title = sanitize_filename(plot_title)
-        os.makedirs(save_dir, exist_ok=True)
-        full_path = os.path.join(save_dir, plot_title + ".svg")
+        plot_title = plot_title + ".svg"
+
+        full_path = save_path / plot_title
+
         plt.savefig(full_path, bbox_inches="tight", format='svg')
         print(f"Saved correlation heatmap: '{plot_title}.svg'")
 
@@ -322,7 +350,7 @@ def check_value_distributions(df: pd.DataFrame, view_frequencies: bool=True, bin
         user_input_ = input("Press enter to continue")
 
 
-def plot_value_distributions(df: pd.DataFrame, save_dir: str, bin_threshold: int=10, skip_cols_with_key: Union[str, None]=None):
+def plot_value_distributions(df: pd.DataFrame, save_dir: Union[str, Path], bin_threshold: int=10, skip_cols_with_key: Union[str, None]=None):
     """
     Plots and saves the value distributions for all (or selected) columns in a DataFrame,
     with adaptive binning for numerical columns when appropriate.
@@ -335,7 +363,7 @@ def plot_value_distributions(df: pd.DataFrame, save_dir: str, bin_threshold: int
 
     Args:
         df (pd.DataFrame): The input DataFrame whose columns are to be analyzed.
-        save_dir (str): Directory path where the plots will be saved. Will be created if it does not exist.
+        save_dir (str | Path): Directory path where the plots will be saved. Will be created if it does not exist.
         bin_threshold (int): Minimum number of unique values required to trigger binning
             for numerical columns.
         skip_cols_with_key (str | None): If provided, any column whose name contains this
@@ -346,8 +374,7 @@ def plot_value_distributions(df: pd.DataFrame, save_dir: str, bin_threshold: int
         - All non-alphanumeric characters in column names are sanitized for safe file naming.
         - Colormap is automatically adapted based on the number of categories or bins.
     """
-    if save_dir is not None:
-        os.makedirs(save_dir, exist_ok=True)
+    save_path = make_fullpath(save_dir, make=True)
 
     dict_to_plot_std = dict()
     dict_to_plot_freq = dict()
@@ -384,13 +411,12 @@ def plot_value_distributions(df: pd.DataFrame, save_dir: str, bin_threshold: int
         view_freq = 100 * view_std / view_std.sum() # Percentage
         # view_freq = df[col].value_counts(normalize=True, bins=10) # relative percentages
 
-        if save_dir:
-            dict_to_plot_std[col] = dict(view_std)
-            dict_to_plot_freq[col] = dict(view_freq)
-            saved_plots += 1
+        dict_to_plot_std[col] = dict(view_std)
+        dict_to_plot_freq[col] = dict(view_freq)
+        saved_plots += 1
 
     # plot helper
-    def _plot_helper(dict_: dict, target_dir: str, ylabel: Literal["Frequency", "Counts"], base_fontsize: int=12):
+    def _plot_helper(dict_: dict, target_dir: Path, ylabel: Literal["Frequency", "Counts"], base_fontsize: int=12):
         for col, data in dict_.items():
             safe_col = sanitize_filename(col)
 
@@ -412,15 +438,15 @@ def plot_value_distributions(df: pd.DataFrame, save_dir: str, bin_threshold: int
             plt.gca().set_facecolor('#f9f9f9')
             plt.tight_layout()
 
-            plot_path = os.path.join(target_dir, f"{safe_col}.png")
+            plot_path = target_dir / f"{safe_col}.png"
             plt.savefig(plot_path, dpi=300, bbox_inches="tight")
             plt.close()
 
     # Save plots
-    freq_dir = os.path.join(save_dir, "Distribution_Frequency")
-    std_dir = os.path.join(save_dir, "Distribution_Counts")
-    os.makedirs(freq_dir, exist_ok=True)
-    os.makedirs(std_dir, exist_ok=True)
+    freq_dir = save_path / "Distribution_Frequency"
+    std_dir = save_path / "Distribution_Counts"
+    freq_dir.mkdir(parents=True, exist_ok=True)
+    std_dir.mkdir(parents=True, exist_ok=True)
     _plot_helper(dict_=dict_to_plot_std, target_dir=std_dir, ylabel="Counts")
     _plot_helper(dict_=dict_to_plot_freq, target_dir=freq_dir, ylabel="Frequency")
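Finally, `plot_value_distributions` now resolves and creates its output directory unconditionally (the old `if save_dir is not None` guard is gone) and, like `plot_correlation_heatmap`, accepts `Path` objects. A brief usage sketch; the import path and the input CSV are assumptions:

```python
from pathlib import Path
import pandas as pd
# assumed import path for the module shown as ml_tools/data_exploration.py
from ml_tools.data_exploration import plot_value_distributions, plot_correlation_heatmap

df = pd.read_csv("my_dataset.csv")  # hypothetical input file

out_dir = Path("eda_plots")
# Creates eda_plots/Distribution_Counts and eda_plots/Distribution_Frequency,
# one PNG per column (columns whose name contains "id" are skipped here).
plot_value_distributions(df, save_dir=out_dir, bin_threshold=10, skip_cols_with_key="id")

# SVG heatmap saved into the same directory; note the reordered keyword arguments in 2.1.0.
plot_correlation_heatmap(df, save_dir=out_dir, plot_title="EDA Correlation Heatmap", method="spearman")
```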