dragon-ml-toolbox 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of dragon-ml-toolbox has been flagged as potentially problematic on its registry page.
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/METADATA +1 -1
- dragon_ml_toolbox-2.2.0.dist-info/RECORD +21 -0
- ml_tools/ETL_engineering.py +543 -0
- ml_tools/MICE_imputation.py +27 -28
- ml_tools/PSO_optimization.py +15 -15
- ml_tools/VIF_factor.py +20 -17
- ml_tools/data_exploration.py +58 -32
- ml_tools/ensemble_learning.py +40 -42
- ml_tools/handle_excel.py +98 -78
- ml_tools/logger.py +13 -11
- ml_tools/utilities.py +165 -60
- dragon_ml_toolbox-2.0.0.dist-info/RECORD +0 -20
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/top_level.txt +0 -0
ml_tools/MICE_imputation.py
CHANGED
```diff
@@ -1,11 +1,11 @@
 import pandas as pd
 import miceforest as mf
-import os
+from pathlib import Path
 import matplotlib.pyplot as plt
 import numpy as np
-from .utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe, threshold_binary_values
+from .utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe, threshold_binary_values, make_fullpath
 from plotnine import ggplot, labs, theme, element_blank # type: ignore
-from typing import Optional
+from typing import Optional, Union


 __all__ = [
@@ -60,7 +60,7 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str
     return kernel, imputed_datasets, imputed_dataset_names


-def save_imputed_datasets(save_dir: str, imputed_datasets: list, df_targets: pd.DataFrame, imputed_dataset_names: list[str]):
+def save_imputed_datasets(save_dir: Union[str, Path], imputed_datasets: list, df_targets: pd.DataFrame, imputed_dataset_names: list[str]):
     for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
         merged_df = merge_dataframes(imputed_df, df_targets, direction="horizontal", verbose=False)
         save_dataframe(df=merged_df, save_dir=save_dir, filename=subname)
@@ -72,7 +72,7 @@ def get_na_column_names(df: pd.DataFrame):


 #Convergence diagnostic
-def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_names: list[str], column_names: list[str], root_dir: str, fontsize: int=16):
+def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_names: list[str], column_names: list[str], root_dir: Union[str,Path], fontsize: int=16):
     """
     Generate and save convergence diagnostic plots for imputed variables.

@@ -90,7 +90,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
         raise ValueError(f"Expected {dataset_count} names in imputed_dataset_names, got {len(imputed_dataset_names)}")

     # Check path
-
+    root_path = make_fullpath(root_dir, make=True)

     # Styling parameters
     label_font = {'size': fontsize, 'weight': 'bold'}
@@ -99,8 +99,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
     for dataset_id, imputed_dataset_name in zip(range(dataset_count), imputed_dataset_names):
         #Check directory for current dataset
         dataset_file_dir = f"Convergence_Metrics_{imputed_dataset_name}"
-        local_save_dir =
-        os.makedirs(local_save_dir, exist_ok=True)
+        local_save_dir = make_fullpath(input_path=root_path / dataset_file_dir, make=True)

         for feature_name in column_names:
             means_per_iteration = []
@@ -121,8 +120,8 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
         plt.grid(True)

         feature_save_name = sanitize_filename(feature_name)
-
-        save_path =
+        feature_save_name = feature_save_name + ".svg"
+        save_path = local_save_dir / feature_save_name
         plt.savefig(save_path, bbox_inches='tight', format="svg")
         plt.close()

@@ -130,18 +129,17 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name


 # Imputed distributions
-def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_dir: str, column_names: list[str], one_plot: bool=False, fontsize: int=14):
+def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_dir: Union[str, Path], column_names: list[str], one_plot: bool=False, fontsize: int=14):
     '''
     It works using miceforest's authors implementation of the method `.plot_imputed_distributions()`.

     Set `one_plot=True` to save a single image including all feature distribution plots instead.
     '''
     # Check path
-
+    root_path = make_fullpath(root_dir, make=True)
+
     local_dir_name = f"Distribution_Metrics_{df_name}_imputed"
-    local_save_dir =
-    if not os.path.isdir(local_save_dir):
-        os.makedirs(local_save_dir)
+    local_save_dir = make_fullpath(root_path / local_dir_name, make=True)

     # Styling parameters
     legend_kwargs = {'frameon': True, 'facecolor': 'white', 'framealpha': 0.8}
@@ -191,9 +189,11 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di

         # sanitize savename
         feature_save_name = sanitize_filename(filename)
+        feature_save_name = feature_save_name + ".svg"
+        new_save_path = local_save_dir / feature_save_name

         fig.savefig(
-
+            new_save_path,
             format='svg',
             bbox_inches='tight',
             pad_inches=0.1
@@ -213,8 +213,8 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     print(f"{local_dir_name} completed.")


-def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str],
-                      save_datasets_dir: str, save_metrics_dir: str,
+def run_mice_pipeline(df_path_or_dir: Union[str,Path], target_columns: list[str],
+                      save_datasets_dir: Union[str,Path], save_metrics_dir: Union[str,Path],
                       binary_columns: Optional[list[str]]=None,
                       resulting_datasets: int=1,
                       iterations: int=20,
@@ -230,15 +230,14 @@ def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str],
     Target columns must be skipped from the imputation. Binary columns will be thresholded after imputation.
     """
     # Check paths
-
-
+    save_datasets_path = make_fullpath(save_datasets_dir, make=True)
+    save_metrics_path = make_fullpath(save_metrics_dir, make=True)

-
-
-
-        all_file_paths = list(list_csv_paths(df_path_or_dir).values())
+    input_path = make_fullpath(df_path_or_dir)
+    if input_path.is_file():
+        all_file_paths = [input_path]
     else:
-
+        all_file_paths = list(list_csv_paths(input_path).values())

     for df_path in all_file_paths:
         df, df_name = load_dataframe(df_path=df_path)
@@ -247,13 +246,13 @@ def run_mice_pipeline(df_path_or_dir: str, target_columns: list[str],

         kernel, imputed_datasets, imputed_dataset_names = apply_mice(df=df, df_name=df_name, binary_columns=binary_columns, resulting_datasets=resulting_datasets, iterations=iterations, random_state=random_state)

-        save_imputed_datasets(save_dir=
+        save_imputed_datasets(save_dir=save_datasets_path, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)

         imputed_column_names = get_na_column_names(df=df)

-        get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=
+        get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_path)

-        get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=
+        get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_path, column_names=imputed_column_names)


 def _skip_targets(df: pd.DataFrame, target_cols: list[str]):
```
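The recurring change in this file (and in the three modules below) is the retirement of `os`-based path handling in favor of `pathlib`, funneled through a new `make_fullpath` helper imported from `ml_tools/utilities.py`. The helper's body is not part of this diff; judging only from its call sites (`make_fullpath(some_dir, make=True)` returns an object that supports the `/` operator and is accepted wherever a `Path` is expected), a minimal sketch could look like this. It is an inferred reconstruction, not the library's actual code:

```python
from pathlib import Path
from typing import Union

def make_fullpath(input_path: Union[str, Path], make: bool = False) -> Path:
    """Hypothetical reconstruction inferred from call sites in this diff."""
    # Normalize user input ("~", relative paths) into an absolute Path.
    path = Path(input_path).expanduser().resolve()
    # Call sites pass make=True for output directories, so the helper
    # presumably creates them up front instead of scattering os.makedirs.
    if make:
        path.mkdir(parents=True, exist_ok=True)
    return path
```

Whatever the real implementation, centralizing this logic is what lets the repeated `os.makedirs`/`os.path.isdir` boilerplate visible in the removed lines above collapse into single calls.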
ml_tools/PSO_optimization.py
CHANGED
```diff
@@ -1,5 +1,5 @@
 import numpy as np
-import os
+from pathlib import Path
 import xgboost as xgb
 import lightgbm as lgb
 from sklearn.ensemble import HistGradientBoostingRegressor
@@ -7,7 +7,7 @@ from sklearn.base import ClassifierMixin
 from typing import Literal, Union, Tuple, Dict, Optional
 import pandas as pd
 from copy import deepcopy
-from .utilities import _script_info, threshold_binary_values, threshold_binary_values_batch, deserialize_object, list_files_by_extension, save_dataframe
+from .utilities import _script_info, threshold_binary_values, threshold_binary_values_batch, deserialize_object, list_files_by_extension, save_dataframe, make_fullpath
 import torch
 from tqdm import trange

@@ -36,7 +36,7 @@ class ObjectiveFunction():
     binary_features : int
         Number of binary features located at the END of the feature vector. Model should be trained with continuous features first, followed by binary.
     """
-    def __init__(self, trained_model_path: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int) -> None:
+    def __init__(self, trained_model_path: Union[str, Path], add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int) -> None:
         self.binary_features = binary_features
         self.is_hybrid = False if binary_features <= 0 else True
         self.use_noise = add_noise
@@ -129,7 +129,7 @@ class ObjectiveFunction():
         return (f"<ObjectiveFunction(model={type(self.model).__name__}, use_noise={self.use_noise}, is_hybrid={self.is_hybrid}, task='{self.task}')>")


-def multiple_objective_functions_from_dir(directory: str, add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int):
+def multiple_objective_functions_from_dir(directory: Union[str,Path], add_noise: bool, task: Literal["maximization", "minimization"], binary_features: int):
     """
     Loads multiple objective functions from serialized models in the given directory.

@@ -174,7 +174,7 @@ def _set_feature_names(size: int, names: Union[list[str], None]):
     return names


-def _save_results(*dicts, save_dir: str, target_name: str):
+def _save_results(*dicts, save_dir: Union[str,Path], target_name: str):
     combined_dict = dict()
     for single_dict in dicts:
         combined_dict.update(single_dict)
@@ -187,14 +187,14 @@ def _save_results(*dicts, save_dir: str, target_name: str):
 def run_pso(lower_boundaries: list[float],
             upper_boundaries: list[float],
             objective_function: ObjectiveFunction,
-            save_results_dir: str,
+            save_results_dir: Union[str,Path],
             auto_binary_boundaries: bool=True,
             target_name: Union[str, None]=None,
             feature_names: Union[list[str], None]=None,
             swarm_size: int=200,
-            max_iterations: int=
+            max_iterations: int=3000,
             random_state: int=101,
-            post_hoc_analysis: Optional[int]=
+            post_hoc_analysis: Optional[int]=10) -> Tuple[Dict[str, float | list[float]], Dict[str, float | list[float]]]:
     """
     Executes Particle Swarm Optimization (PSO) to optimize a given objective function and saves the results as a CSV file.

@@ -206,7 +206,7 @@ def run_pso(lower_boundaries: list[float],
         Upper bounds for each feature in the search space (as many as features expected by the model).
     objective_function : ObjectiveFunction
         A callable object encapsulating a tree-based regression model.
-    save_results_dir : str
+    save_results_dir : str | Path
         Directory path to save the results CSV file.
     auto_binary_boundaries : bool
         Use `ObjectiveFunction.binary_features` to append as many binary boundaries as needed to `lower_boundaries` and `upper_boundaries` automatically.
@@ -281,7 +281,7 @@ def run_pso(lower_boundaries: list[float],
         "particle_output": False,
     }

-
+    save_results_path = make_fullpath(save_results_dir, make=True)

     if post_hoc_analysis is None or post_hoc_analysis == 1:
         arguments.update({"seed": random_state})
@@ -301,7 +301,7 @@ def run_pso(lower_boundaries: list[float],
         best_target_named = {target_name: best_target}

         # save results
-        _save_results(best_features_named, best_target_named, save_dir=
+        _save_results(best_features_named, best_target_named, save_dir=save_results_path, target_name=target_name)

         return best_features_named, best_target_named
     else:
@@ -327,7 +327,7 @@ def run_pso(lower_boundaries: list[float],
         all_best_targets_named = {target_name: all_best_targets}

         # save results
-        _save_results(all_best_features_named, all_best_targets_named, save_dir=
+        _save_results(all_best_features_named, all_best_targets_named, save_dir=save_results_path, target_name=target_name)

         return all_best_features_named, all_best_targets_named # type: ignore

@@ -340,8 +340,8 @@ def _pso(func: ObjectiveFunction,
         lb: np.ndarray,
         ub: np.ndarray,
         device: torch.device,
-        swarmsize
-        maxiter
+        swarmsize: int,
+        maxiter: int,
         omega = 0.729, # Clerc and Kennedy’s constriction coefficient
         phip = 1.49445, # Clerc and Kennedy’s constriction coefficient
         phig = 1.49445, # Clerc and Kennedy’s constriction coefficient
@@ -391,7 +391,7 @@ def _pso(func: ObjectiveFunction,
         If True, returns the full history of particle positions and objective scores at each iteration.

     seed : int or None, default=None
-        Random seed for reproducibility. If None,
+        Random seed for reproducibility. If None, the random state is not fixed.

     Returns
     -------
```
ml_tools/VIF_factor.py
CHANGED
```diff
@@ -2,12 +2,12 @@
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
-from typing import Optional
+from typing import Optional, Union
 from statsmodels.stats.outliers_influence import variance_inflation_factor
 from statsmodels.tools.tools import add_constant
 import warnings
-import os
-from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe, _script_info
+from pathlib import Path
+from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe, _script_info, make_fullpath


 __all__ = [
@@ -22,7 +22,7 @@ def compute_vif(
     use_columns: Optional[list[str]] = None,
     ignore_columns: Optional[list[str]] = None,
     max_features_to_plot: int = 20,
-    save_dir: Optional[str] = None,
+    save_dir: Optional[Union[str,Path]] = None,
     filename: Optional[str] = None,
     fontsize: int = 14,
     show_plot: bool = True,
@@ -36,7 +36,7 @@ def compute_vif(
         use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
         ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
         max_features_to_plot (int): Adjust the number of features shown in the plot.
-        save_dir (str | None): Directory to save the plot as SVG. If None, the plot is not saved.
+        save_dir (str | Path | None): Directory to save the plot as SVG. If None, the plot is not saved.
         filename (str | None): Optional filename for saving the plot. Defaults to "VIF_plot.svg".
         fontsize (int): Base fontsize to scale title and labels on the plot.
         show_plot (bool): Display plot.
@@ -128,15 +128,16 @@ def compute_vif(
     plt.tight_layout()

     if save_dir:
-
+        save_path = make_fullpath(save_dir, make=True)
         if filename is None:
             filename = "VIF_plot.svg"
         else:
             filename = sanitize_filename(filename)
+            filename = "VIF_" + filename
         if not filename.endswith(".svg"):
             filename += ".svg"
-
-        plt.savefig(
+        full_save_path = save_path / filename
+        plt.savefig(full_save_path, format='svg', bbox_inches='tight')
         print(f"\tSaved VIF plot: '{filename}'")

     if show_plot:
@@ -176,9 +177,9 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
     return result_df, to_drop


-def compute_vif_multi(input_directory: str,
-                      output_plot_directory: str,
-                      output_dataset_directory: Optional[str] = None,
+def compute_vif_multi(input_directory: Union[str, Path],
+                      output_plot_directory: Union[str, Path],
+                      output_dataset_directory: Optional[Union[str, Path]] = None,
                       use_columns: Optional[list[str]] = None,
                       ignore_columns: Optional[list[str]] = None,
                       max_features_to_plot: int = 20,
@@ -188,9 +189,9 @@ def compute_vif_multi(input_directory: str,
     Generates a bar plot of VIF values. Optionally drops columns with VIF >= 10 and saves as a new CSV file.

     Args:
-        input_directory (str): Target directory with CSV files able to be loaded as DataFrame.
-        output_plot_directory (str): Save plots to this directory.
-        output_dataset_directory (str | None): If provided, saves new CSV files to this directory.
+        input_directory (str | Path): Target directory with CSV files able to be loaded as DataFrame.
+        output_plot_directory (str | Path): Save plots to this directory.
+        output_dataset_directory (str | Path | None): If provided, saves new CSV files to this directory.
         use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
         ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
         max_features_to_plot (int): Adjust the number of features shown in the plot.
@@ -202,7 +203,9 @@ def compute_vif_multi(input_directory: str,
     A VIF of 1 suggests no correlation, values between 1 and 5 indicate moderate correlation, and values greater than 10 typically signal high multicollinearity, which may distort model interpretation and degrade performance.
     """
     if output_dataset_directory is not None:
-
+        output_dataset_path = make_fullpath(output_dataset_directory, make=True)
+    else:
+        output_dataset_path = None

     for df, df_name in yield_dataframes_from_dir(datasets_dir=input_directory):
         vif_dataframe = compute_vif(df=df,
@@ -215,12 +218,12 @@ def compute_vif_multi(input_directory: str,
                                     show_plot=False,
                                     verbose=False)

-        if
+        if output_dataset_path is not None:
             new_filename = df_name + '_VIF'
             result_df, dropped_cols = drop_vif_based(df=df, vif_df=vif_dataframe)

             if len(dropped_cols) > 0:
-                save_dataframe(df=result_df, save_dir=
+                save_dataframe(df=result_df, save_dir=output_dataset_path, filename=new_filename)


 def info():
```
ml_tools/data_exploration.py
CHANGED
```diff
@@ -5,9 +5,9 @@ import seaborn as sns
 from IPython import get_ipython
 from IPython.display import clear_output
 import time
-from typing import Union, Literal, Dict, Tuple, List
-import os
-from .utilities import sanitize_filename, _script_info
+from typing import Union, Literal, Dict, Tuple, List, Optional
+from pathlib import Path
+from .utilities import sanitize_filename, _script_info, make_fullpath
 import re


@@ -59,26 +59,48 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
     return summary


-def drop_rows_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
+def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]], threshold: float = 0.7) -> pd.DataFrame:
     """
-    Drops rows
+    Drops rows from the DataFrame using a two-stage strategy:
+
+    1. If `targets`, remove any row where all target columns are missing.
+    2. Among features, drop those with more than `threshold` fraction of missing values.

     Parameters:
         df (pd.DataFrame): The input DataFrame.
-
+        targets (list[str] | None): List of target column names.
+        threshold (float): Maximum allowed fraction of missing values in feature columns.

     Returns:
-        pd.DataFrame: A
+        pd.DataFrame: A cleaned DataFrame with problematic rows removed.
     """
-
-
-
-    if
-
+    df_clean = df.copy()
+
+    # Stage 1: Drop rows with all target columns missing
+    if targets is not None:
+        target_na = df_clean[targets].isnull().all(axis=1)
+        if target_na.any():
+            print(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
+            df_clean = df_clean[~target_na]
+        else:
+            print("✅ No rows with all targets missing.")
     else:
-
+        targets = []
+
+    # Stage 2: Drop rows based on feature column missing values
+    feature_cols = [col for col in df_clean.columns if col not in targets]
+    if feature_cols:
+        feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)
+        rows_to_drop = feature_na_frac[feature_na_frac > threshold].index
+        if len(rows_to_drop) > 0:
+            print(f"📉 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
+            df_clean = df_clean.drop(index=rows_to_drop)
+        else:
+            print(f"✅ No rows exceed the {threshold*100:.0f}% missing feature data threshold.")
+    else:
+        print("⚠️ No feature columns available to evaluate.")

-    return
+    return df_clean


 def split_features_targets(df: pd.DataFrame, targets: list[str]):
@@ -205,13 +227,16 @@ def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFram
     return df_cont, df_bin # type: ignore


-def plot_correlation_heatmap(df: pd.DataFrame,
+def plot_correlation_heatmap(df: pd.DataFrame,
+                             save_dir: Union[str, Path, None] = None,
+                             plot_title: str="Correlation Heatmap",
+                             method: Literal["pearson", "kendall", "spearman"]="pearson"):
     """
     Plots a heatmap of pairwise correlations between numeric features in a DataFrame.

     Args:
         df (pd.DataFrame): The input dataset.
-        save_dir (str | None): If provided, the heatmap will be saved to this directory as a svg file.
+        save_dir (str | Path | None): If provided, the heatmap will be saved to this directory as a svg file.
         plot_title: To make different plots, or overwrite existing ones.
         method (str): Correlation method to use. Must be one of:
             - 'pearson' (default): measures linear correlation (assumes normally distributed data),
@@ -254,10 +279,13 @@ def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None
     plt.tight_layout()

     if save_dir:
+        save_path = make_fullpath(save_dir, make=True)
         # sanitize the plot title to save the file
         plot_title = sanitize_filename(plot_title)
-
-
+        plot_title = plot_title + ".svg"
+
+        full_path = save_path / plot_title
+
         plt.savefig(full_path, bbox_inches="tight", format='svg')
         print(f"Saved correlation heatmap: '{plot_title}.svg'")

@@ -322,7 +350,7 @@ def check_value_distributions(df: pd.DataFrame, view_frequencies: bool=True, bin
         user_input_ = input("Press enter to continue")


-def plot_value_distributions(df: pd.DataFrame, save_dir: str, bin_threshold: int=10, skip_cols_with_key: Union[str, None]=None):
+def plot_value_distributions(df: pd.DataFrame, save_dir: Union[str, Path], bin_threshold: int=10, skip_cols_with_key: Union[str, None]=None):
     """
     Plots and saves the value distributions for all (or selected) columns in a DataFrame,
     with adaptive binning for numerical columns when appropriate.
@@ -335,7 +363,7 @@ def plot_value_distributions(df: pd.DataFrame, save_dir: str, bin_threshold: int

     Args:
         df (pd.DataFrame): The input DataFrame whose columns are to be analyzed.
-        save_dir (str): Directory path where the plots will be saved. Will be created if it does not exist.
+        save_dir (str | Path): Directory path where the plots will be saved. Will be created if it does not exist.
         bin_threshold (int): Minimum number of unique values required to trigger binning
             for numerical columns.
         skip_cols_with_key (str | None): If provided, any column whose name contains this
@@ -346,8 +374,7 @@ def plot_value_distributions(df: pd.DataFrame, save_dir: str, bin_threshold: int
     - All non-alphanumeric characters in column names are sanitized for safe file naming.
     - Colormap is automatically adapted based on the number of categories or bins.
     """
-
-    os.makedirs(save_dir, exist_ok=True)
+    save_path = make_fullpath(save_dir, make=True)

     dict_to_plot_std = dict()
     dict_to_plot_freq = dict()
@@ -384,13 +411,12 @@ def plot_value_distributions(df: pd.DataFrame, save_dir: str, bin_threshold: int
         view_freq = 100 * view_std / view_std.sum() # Percentage
         # view_freq = df[col].value_counts(normalize=True, bins=10) # relative percentages

-
-
-
-        saved_plots += 1
+        dict_to_plot_std[col] = dict(view_std)
+        dict_to_plot_freq[col] = dict(view_freq)
+        saved_plots += 1

     # plot helper
-    def _plot_helper(dict_: dict, target_dir:
+    def _plot_helper(dict_: dict, target_dir: Path, ylabel: Literal["Frequency", "Counts"], base_fontsize: int=12):
         for col, data in dict_.items():
             safe_col = sanitize_filename(col)

@@ -412,15 +438,15 @@ def plot_value_distributions(df: pd.DataFrame, save_dir: str, bin_threshold: int
             plt.gca().set_facecolor('#f9f9f9')
             plt.tight_layout()

-            plot_path =
+            plot_path = target_dir / f"{safe_col}.png"
             plt.savefig(plot_path, dpi=300, bbox_inches="tight")
             plt.close()

     # Save plots
-    freq_dir =
-    std_dir =
-
-
+    freq_dir = save_path / "Distribution_Frequency"
+    std_dir = save_path / "Distribution_Counts"
+    freq_dir.mkdir(parents=True, exist_ok=True)
+    std_dir.mkdir(parents=True, exist_ok=True)
     _plot_helper(dict_=dict_to_plot_std, target_dir=std_dir, ylabel="Counts")
     _plot_helper(dict_=dict_to_plot_freq, target_dir=freq_dir, ylabel="Frequency")

```
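The reworked `drop_rows_with_missing_data` is the one behavioral change in this file that goes beyond path handling: it now requires a `targets` argument and cleans in two stages, exactly as the new docstring describes. A usage sketch against a toy DataFrame (the data is invented):

```python
import numpy as np
import pandas as pd
from ml_tools.data_exploration import drop_rows_with_missing_data

df = pd.DataFrame({
    "f1": [1.0, 2.0, 3.0, np.nan],
    "f2": [4.0, np.nan, 6.0, np.nan],
    "t1": [0.1, 0.2, np.nan, 0.4],
    "t2": [1.1, 1.2, np.nan, 1.4],
})

# Stage 1 drops row 2 (every target column is NaN); stage 2 drops row 3
# (2 of 2 feature values missing = 100% > the 70% threshold).
clean = drop_rows_with_missing_data(df, targets=["t1", "t2"], threshold=0.7)
print(clean.index.tolist())  # -> [0, 1]
```

Row 1 survives because only half of its feature values are missing, which is under the threshold.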