dragon-ml-toolbox 1.3.2__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


dragon_ml_toolbox-1.4.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 1.3.2
+Version: 1.4.0
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -27,6 +27,7 @@ Requires-Dist: ipython
 Requires-Dist: ipykernel
 Requires-Dist: notebook
 Requires-Dist: jupyterlab
+Requires-Dist: ipywidgets
 Requires-Dist: joblib
 Requires-Dist: xgboost
 Requires-Dist: lightgbm<=4.5.0
dragon_ml_toolbox-1.4.0.dist-info/RECORD CHANGED
@@ -1,18 +1,19 @@
-dragon_ml_toolbox-1.3.2.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-1.3.2.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=e1Hg5ZtaBpDV7ZvxhLe1ac28l7nMjvi1MSE5YvB1s-o,1472
-ml_tools/MICE_imputation.py,sha256=71Kdi5rhPePIT5rJKIyRCM7ORPSjeujQCzKcLIwXs90,9428
+dragon_ml_toolbox-1.4.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-1.4.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=e1Hg5ZtaBpDV7ZvxhLe1ac28l7nMjvi1MSE5YvB1s-o,1472
+ml_tools/MICE_imputation.py,sha256=4kqZiesk8vyh4MBLnNE9grflG4fDusqzuYBElsbk4LY,9484
+ml_tools/VIF_factor.py,sha256=rHSAxQcXLrG8dIjCXBAvETsSkCBfYus9NqimOnm2Bvk,9559
 ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ml_tools/data_exploration.py,sha256=laTNbN5_xlhqWiKfF-cJ9yMZ8zAM2a-AryqgiIQBBLg,26649
+ml_tools/data_exploration.py,sha256=qtkGumckC2PmTpj3brVFi072ewX0OI6dwUF4Or7Yikg,21341
 ml_tools/datasetmaster.py,sha256=VUneKshnmjOGbtqVVGTFcIMRKF3s6ZDYrosIYKDjD80,28956
-ml_tools/ensemble_learning.py,sha256=5UmlXI3Orm5zL0P07Ub_Y0gwjruH-REHY-cFWQpJWb0,29085
+ml_tools/ensemble_learning.py,sha256=wK6mtOE4v9AWlxkcWhJj5XZjREChxb46kE0i2IxS-OE,28372
 ml_tools/handle_excel.py,sha256=IR0VQc3hYdmjwC31E5YxDnRcWig4jSIx7Y_7to-KZz4,11969
 ml_tools/logger.py,sha256=XwSpCUzw2Le24fJHyljBxNLgw63SwjZ0pMjTJqf0ylI,4622
 ml_tools/particle_swarm_optimization.py,sha256=jpkje4OETC9fyISxxUTx4XGrImSU6gDEcwz46ZDs2bQ,19250
 ml_tools/pytorch_models.py,sha256=Oykw02sOZLCjvSadQd64UGesBN7kq0x1EGXHusvYiQI,9908
 ml_tools/trainer.py,sha256=Zd7AaHeoNd8dEas2JChWoHaCUpWUVRDUMybuHaKJ0XY,16740
-ml_tools/utilities.py,sha256=mG_--EFplfI9H7OhrWI8VkdNJtTbs4Wbz32xvcFWps8,5518
+ml_tools/utilities.py,sha256=gr1cyRUfZcRo9fjWpCaQkrvWY0-xJnDJdrE8JEsOi8o,6309
 ml_tools/vision_helpers.py,sha256=lBAW6dzAK-HOswAt1fU_tfP9hkNLY5D8c_I_7hhEXno,7528
-dragon_ml_toolbox-1.3.2.dist-info/METADATA,sha256=NgNKZD1v97kBBdE96OJELolvlAXviJ-DgJvZAjjy5Ik,2309
-dragon_ml_toolbox-1.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-1.3.2.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-1.3.2.dist-info/RECORD,,
+dragon_ml_toolbox-1.4.0.dist-info/METADATA,sha256=V7Y96iAbgX6Xl6RWzEt4nGfKMZe4cuLs0BrFQghXxX8,2335
+dragon_ml_toolbox-1.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-1.4.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-1.4.0.dist-info/RECORD,,
ml_tools/MICE_imputation.py CHANGED
@@ -120,7 +120,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     '''
     # Check path
     os.makedirs(root_dir, exist_ok=True)
-    local_save_dir = os.path.join(root_dir, f"Distribution_Metrics_{df_name}")
+    local_save_dir = os.path.join(root_dir, f"Distribution_Metrics_{df_name}_imputed")
     if not os.path.isdir(local_save_dir):
         os.makedirs(local_save_dir)

@@ -169,8 +169,12 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
         # Adjust layout and save
         # fig.tight_layout()
         # fig.subplots_adjust(bottom=0.2, left=0.2) # Optional, depending on overflow
+
+        # sanitize savename
+        feature_save_name = sanitize_filename(filename)
+
         fig.savefig(
-            os.path.join(local_save_dir, filename + ".svg"),
+            os.path.join(local_save_dir, feature_save_name + ".svg"),
             format='svg',
             bbox_inches='tight',
             pad_inches=0.1
@@ -185,8 +189,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     else:
         for feature in column_names:
             fig = kernel.plot_imputed_distributions(variables=[feature])
-            feature_save_name = sanitize_filename(feature)
-            _process_figure(fig, feature_save_name)
+            _process_figure(fig, feature)

     print("\tImputed distributions saved successfully.")

@@ -207,7 +210,7 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
     if os.path.isfile(df_path_or_dir):
         all_file_paths = [df_path_or_dir]
     elif os.path.isdir(df_path_or_dir):
-        all_file_paths, _ = list_csv_paths(df_path_or_dir)
+        all_file_paths = list_csv_paths(df_path_or_dir).values()
     else:
         raise ValueError(f"Invalid path or directory: {df_path_or_dir}")
ml_tools/VIF_factor.py ADDED
@@ -0,0 +1,209 @@
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from typing import Optional
+from statsmodels.stats.outliers_influence import variance_inflation_factor
+from statsmodels.tools.tools import add_constant
+import warnings
+import os
+from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe
+
+
+def compute_vif(
+    df: pd.DataFrame,
+    target_columns: Optional[list[str]] = None,
+    ignore_columns: Optional[list[str]] = None,
+    max_features_to_plot: int = 20,
+    save_dir: Optional[str] = None,
+    filename: Optional[str] = None,
+    fontsize: int = 14,
+    show_plot: bool = True,
+) -> pd.DataFrame:
+    """
+    Computes Variance Inflation Factors (VIF) for numeric columns in a DataFrame. Optionally generates a bar plot of VIF values.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame.
+        target_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
+        ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
+        max_features_to_plot (int): Adjust the number of features shown in the plot.
+        save_dir (str | None): Directory to save the plot as SVG. If None, the plot is not saved.
+        filename (str | None): Optional filename for saving the plot. Defaults to "VIF_plot.svg".
+        fontsize (int): Base fontsize to scale title and labels on the plot.
+        show_plot (bool): Display plot.
+
+    Returns:
+        pd.DataFrame: DataFrame with features and their corresponding VIF values.
+
+    NOTE:
+        **Variance Inflation Factor (VIF)** quantifies the degree of multicollinearity among features in a dataset.
+        A VIF value indicates how much the variance of a regression coefficient is inflated due to linear dependence with other features.
+        A VIF of 1 suggests no correlation, values between 1 and 5 indicate moderate correlation, and values greater than 10 typically signal high multicollinearity, which may distort model interpretation and degrade performance.
+    """
+    ground_truth_cols = df.columns.to_list()
+    if target_columns is None:
+        sanitized_columns = df.select_dtypes(include='number').columns.tolist()
+        missing_features = set(ground_truth_cols) - set(sanitized_columns)
+        if missing_features:
+            print(f"⚠️ These columns are not numeric:\n{missing_features}")
+    else:
+        sanitized_columns = list()
+        for feature in target_columns:
+            if feature not in ground_truth_cols:
+                print(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
+            else:
+                sanitized_columns.append(feature)
+
+    if ignore_columns is not None and target_columns is None:
+        missing_ignore = set(ignore_columns) - set(ground_truth_cols)
+        if missing_ignore:
+            print(f"⚠️ Warning: The following 'columns to ignore' are not in the DataFrame:\n{missing_ignore}")
+        sanitized_columns = [f for f in sanitized_columns if f not in ignore_columns]
+
+    X = df[sanitized_columns].copy()
+    X = add_constant(X, has_constant='add')
+
+    vif_data = pd.DataFrame()
+    vif_data["feature"] = X.columns # type: ignore
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=RuntimeWarning)
+        vif_data["VIF"] = [
+            variance_inflation_factor(X.values, i) for i in range(X.shape[1]) # type: ignore
+        ]
+
+    # Replace infinite values (perfect multicollinearity)
+    vif_data["VIF"] = vif_data["VIF"].replace([np.inf, -np.inf], 999.0)
+
+    # Drop the constant column
+    vif_data = vif_data[vif_data["feature"] != "const"]
+
+    # Add color coding
+    def vif_color(v: float) -> str:
+        if v >= 10:
+            return "red"
+        elif v >= 5:
+            return "gold"
+        else:
+            return "green"
+
+    vif_data["color"] = vif_data["VIF"].apply(vif_color)
+
+    # Sort by VIF descending
+    vif_data = vif_data.sort_values(by="VIF", ascending=False).reset_index(drop=True)
+
+    # Filter for plotting
+    plot_data = vif_data.head(max_features_to_plot)
+
+    if save_dir or show_plot:
+        if not plot_data.empty:
+            plt.figure(figsize=(10, 6))
+            plt.barh(
+                plot_data["feature"],
+                plot_data["VIF"],
+                color=plot_data["color"],
+                edgecolor='black'
+            )
+            plt.title("Variance Inflation Factor (VIF) per Feature", fontsize=fontsize+1)
+            plt.xlabel("VIF value", fontsize=fontsize)
+            plt.xticks(fontsize=fontsize)
+            plt.yticks(fontsize=fontsize)
+            plt.axvline(x=5, color='gold', linestyle='--', label='VIF = 5')
+            plt.axvline(x=10, color='red', linestyle='--', label='VIF = 10')
+            plt.xlim(0, 12)
+            plt.legend(loc='lower right', fontsize=fontsize-1)
+            plt.gca().invert_yaxis()
+            plt.grid(axis='x', linestyle='--', alpha=0.5)
+            plt.tight_layout()
+
+            if save_dir:
+                os.makedirs(save_dir, exist_ok=True)
+                if filename is None:
+                    filename = "VIF_plot.svg"
+                else:
+                    filename = sanitize_filename(filename)
+                    if not filename.endswith(".svg"):
+                        filename += ".svg"
+                save_path = os.path.join(save_dir, "VIF_" + filename)
+                plt.savefig(save_path, format='svg', bbox_inches='tight')
+                print(f"\tSaved VIF plot: '{filename}'")
+
+            if show_plot:
+                plt.show()
+            plt.close()
+
+    return vif_data.drop(columns="color")
+
+
+def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10.0) -> pd.DataFrame:
+    """
+    Drops columns from the original DataFrame whose VIF values exceed a given threshold.
+
+    Args:
+        df (pd.DataFrame): Original DataFrame containing the columns to test.
+        vif_df (pd.DataFrame): DataFrame with 'feature' and 'VIF' columns as returned by `compute_vif()`.
+        threshold (float): VIF threshold above which columns will be dropped.
+
+    Returns:
+        pd.DataFrame: A new DataFrame with high-VIF columns removed.
+    """
+    # Ensure expected structure
+    if 'feature' not in vif_df.columns or 'VIF' not in vif_df.columns:
+        raise ValueError("`vif_df` must contain 'feature' and 'VIF' columns.")
+
+    # Identify features to drop
+    to_drop = vif_df[vif_df["VIF"] > threshold]["feature"].tolist()
+    print(f"\tDropping {len(to_drop)} column(s) with VIF > {threshold}: {to_drop}")
+
+    result_df = df.drop(columns=to_drop)
+
+    if result_df.empty:
+        print("\t⚠️ Warning: All columns were dropped.")
+
+    return result_df
+
+
+def compute_vif_multi(input_directory: str,
+                      output_plot_directory: str,
+                      output_dataset_directory: Optional[str] = None,
+                      target_columns: Optional[list[str]] = None,
+                      ignore_columns: Optional[list[str]] = None,
+                      max_features_to_plot: int = 20,
+                      fontsize: int = 14):
+    """
+    Computes Variance Inflation Factors (VIF) for numeric columns in each CSV file of a directory (loaded as pandas DataFrames).
+    Generates a bar plot of VIF values per file. Optionally drops columns with VIF > 10 and saves the result as a new CSV file.
+
+    Args:
+        input_directory (str): Target directory with CSV files that can be loaded as DataFrames.
+        output_plot_directory (str): Save plots to this directory.
+        output_dataset_directory (str | None): If provided, saves new CSV files to this directory.
+        target_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
+        ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `target_columns` is provided.
+        max_features_to_plot (int): Adjust the number of features shown in the plot.
+        fontsize (int): Base fontsize to scale title and labels on the plot.
+
+    NOTE:
+        **Variance Inflation Factor (VIF)** quantifies the degree of multicollinearity among features in a dataset.
+        A VIF value indicates how much the variance of a regression coefficient is inflated due to linear dependence with other features.
+        A VIF of 1 suggests no correlation, values between 1 and 5 indicate moderate correlation, and values greater than 10 typically signal high multicollinearity, which may distort model interpretation and degrade performance.
+    """
+    if output_dataset_directory is not None:
+        os.makedirs(output_dataset_directory, exist_ok=True)
+
+    for df, df_name in yield_dataframes_from_dir(datasets_dir=input_directory):
+        vif_dataframe = compute_vif(df=df,
+                                    target_columns=target_columns,
+                                    ignore_columns=ignore_columns,
+                                    max_features_to_plot=max_features_to_plot,
+                                    fontsize=fontsize,
+                                    save_dir=output_plot_directory,
+                                    filename=df_name,
+                                    show_plot=False)

+        if output_dataset_directory is not None:
+            new_filename = 'VIF_' + df_name
+            result_df = drop_vif_based(df=df, vif_df=vif_dataframe)
+            save_dataframe(df=result_df, save_dir=output_dataset_directory, filename=new_filename)
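
The new module supports a two-step workflow: score multicollinearity with `compute_vif`, then prune with `drop_vif_based`. A minimal usage sketch (the CSV path and output directory are hypothetical):

```python
import pandas as pd
from ml_tools.VIF_factor import compute_vif, drop_vif_based

df = pd.read_csv("data/features.csv")  # hypothetical dataset

# Score every numeric column; save the bar plot instead of displaying it.
vif_df = compute_vif(df, save_dir="plots", filename="features", show_plot=False)

# Drop columns whose VIF exceeds the default threshold of 10.0.
df_pruned = drop_vif_based(df, vif_df)
```

`compute_vif_multi` wires these same steps over every CSV file in a directory.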
ml_tools/data_exploration.py CHANGED
@@ -2,12 +2,10 @@ import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
-from statsmodels.stats.outliers_influence import variance_inflation_factor
-from statsmodels.tools.tools import add_constant
 from IPython import get_ipython
 from IPython.display import clear_output
 import time
-from typing import Union, Literal, Dict, Tuple, Optional
+from typing import Union, Literal, Dict, Tuple
 import os
 import sys
 import textwrap
@@ -26,10 +24,7 @@ __all__ = ["summarize_dataframe",
            "plot_value_distributions",
            "clip_outliers_single",
            "clip_outliers_multi",
-           "merge_dataframes",
-           "save_dataframe",
-           "compute_vif",
-           "drop_vif_based"]
+           "merge_dataframes"]


 def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
@@ -278,7 +273,7 @@ def check_value_distributions(df: pd.DataFrame, view_frequencies: bool=True, bin
     Notes:
         - Binning is adaptive: if quantile binning results in ≤ 2 unique bins, raw values are used instead.
     """
-    # cherrypick columns
+    # cherry-pick columns
     if skip_cols_with_key is not None:
         columns = [col for col in df.columns if skip_cols_with_key not in col]
     else:
@@ -351,7 +346,7 @@ def plot_value_distributions(df: pd.DataFrame, save_dir: str, bin_threshold: int
     dict_to_plot_std = dict()
     dict_to_plot_freq = dict()

-    # cherrypick columns
+    # cherry-pick columns
    if skip_cols_with_key is not None:
         columns = [col for col in df.columns if skip_cols_with_key not in col]
     else:
@@ -399,7 +394,7 @@ def plot_value_distributions(df: pd.DataFrame, save_dir: str, bin_threshold: int
     labels = data.keys()

     plt.figure(figsize=(10, 6))
-    colors = plt.cm.tab20.colors if len(data) <= 20 else plt.cm.viridis(np.linspace(0, 1, len(data)))
+    colors = plt.cm.tab20.colors if len(data) <= 20 else plt.cm.viridis(np.linspace(0, 1, len(data))) # type: ignore

     plt.bar(labels, data.values(), color=colors[:len(data)], alpha=0.85)
     plt.xlabel("Values", fontsize=base_fontsize)
@@ -574,141 +569,6 @@ def merge_dataframes(
     return merged_df


-def save_dataframe(df: pd.DataFrame, save_dir: str, filename: str) -> None:
-    """
-    Save a pandas DataFrame to a CSV file.
-
-    Parameters:
-        df: pandas.DataFrame to save
-        save_dir: str, directory where the CSV file will be saved.
-        filename: str, CSV filename, extension will be added if missing.
-    """
-    os.makedirs(save_dir, exist_ok=True)
-
-    filename = sanitize_filename(filename)
-
-    if not filename.endswith('.csv'):
-        filename += '.csv'
-
-    output_path = os.path.join(save_dir, filename)
-
-    df.to_csv(output_path, index=False, encoding='utf-8')
-    print(f"Saved file: '{filename}'")
-
-
-def compute_vif(
-    df: pd.DataFrame,
-    features: Optional[list[str]] = None,
-    ignore_cols: Optional[list[str]] = None,
-    plot: bool = True,
-    save_dir: Union[str, None] = None
-) -> pd.DataFrame:
-    """
-    Computes Variance Inflation Factors (VIF) for numeric features, optionally plots and saves the results.
-
-    There cannot be empty values in the dataset.
-
-    Args:
-        df (pd.DataFrame): The input DataFrame.
-        features (list[str] | None): Optional list of column names to evaluate. Defaults to all numeric columns.
-        ignore_cols (list[str] | None): Optional list of column names to ignore.
-        plot (bool): Whether to display a barplot of VIF values.
-        save_dir (str | None): Directory to save the plot as SVG. If None, plot is not saved.
-
-    Returns:
-        pd.DataFrame: DataFrame with features and corresponding VIF values, sorted descending.
-
-    NOTE:
-        **Variance Inflation Factor (VIF)** quantifies the degree of multicollinearity among features in a dataset.
-        A VIF value indicates how much the variance of a regression coefficient is inflated due to linear dependence with other features.
-        A VIF of 1 suggests no correlation, values between 1 and 5 indicate moderate correlation, and values greater than 10 typically signal high multicollinearity, which may distort model interpretation and degrade performance.
-
-    """
-    if features is None:
-        features = df.select_dtypes(include='number').columns.tolist()
-
-    if ignore_cols is not None:
-        missing = set(ignore_cols) - set(features)
-        if missing:
-            raise ValueError(f"The following 'columns to ignore' are not in the Dataframe:\n{missing}")
-        features = [f for f in features if f not in ignore_cols]
-
-    X = df[features].copy()
-    X = add_constant(X, has_constant='add')
-
-    vif_data = pd.DataFrame()
-    vif_data["feature"] = X.columns
-    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
-
-    # Drop the constant column
-    vif_data = vif_data[vif_data["feature"] != "const"]
-    vif_data = vif_data.sort_values(by="VIF", ascending=False).reset_index(drop=True) # type: ignore
-
-    # Add color coding based on thresholds
-    def vif_color(v: float) -> str:
-        if v > 10:
-            return "red"
-        elif v > 5:
-            return "gold"
-        else:
-            return "green"
-
-    vif_data["color"] = vif_data["VIF"].apply(vif_color)
-
-    # Plot
-    if plot or save_dir:
-        plt.figure(figsize=(10, 6))
-        bars = plt.barh(
-            vif_data["feature"],
-            vif_data["VIF"],
-            color=vif_data["color"],
-            edgecolor='black'
-        )
-        plt.title("Variance Inflation Factor (VIF) per Feature")
-        plt.xlabel("VIF")
-        plt.axvline(x=5, color='gold', linestyle='--', label='VIF = 5')
-        plt.axvline(x=10, color='red', linestyle='--', label='VIF = 10')
-        plt.legend(loc='lower right')
-        plt.gca().invert_yaxis()
-        plt.grid(axis='x', linestyle='--', alpha=0.5)
-        plt.tight_layout()
-
-        if save_dir:
-            os.makedirs(save_dir, exist_ok=True)
-            save_path = os.path.join(save_dir, "VIF_plot.svg")
-            plt.savefig(save_path, format='svg', bbox_inches='tight')
-            print(f"Saved VIF plot to: {save_path}")
-
-        if plot:
-            plt.show()
-        plt.close()
-
-    return vif_data.drop(columns="color")
-
-
-def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10.0) -> pd.DataFrame:
-    """
-    Drops features from the original DataFrame based on their VIF values exceeding a given threshold.
-
-    Args:
-        df (pd.DataFrame): Original DataFrame containing the features.
-        vif_df (pd.DataFrame): DataFrame with 'feature' and 'VIF' columns as returned by `compute_vif()`.
-        threshold (float): VIF threshold above which features will be dropped.
-
-    Returns:
-        pd.DataFrame: A new DataFrame with high-VIF features removed.
-    """
-    # Ensure expected structure
-    if 'feature' not in vif_df.columns or 'VIF' not in vif_df.columns:
-        raise ValueError("`vif_df` must contain 'feature' and 'VIF' columns.")
-
-    # Identify features to drop
-    to_drop = vif_df[vif_df["VIF"] > threshold]["feature"].tolist()
-    print(f"Dropping {len(to_drop)} feature(s) with VIF > {threshold}: {to_drop}")
-
-    return df.drop(columns=to_drop, errors="ignore")
-
-
 def _is_notebook():
     return get_ipython() is not None

ml_tools/ensemble_learning.py CHANGED
@@ -139,8 +139,9 @@ def get_models(task: Literal["classification", "regression"], random_state: int=

 ###### 3. Process Dataset ######
 # function to split data into train and test
-def _split_data(features, target, test_size, random_state):
-    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=test_size, random_state=random_state, stratify=target)
+def _split_data(features, target, test_size, random_state, task):
+    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=test_size, random_state=random_state,
+                                                        stratify=target if task=="classification" else None)
     return X_train, X_test, y_train, y_test

 # function to standardize the data
@@ -176,7 +177,7 @@ def _resample(X_train_scaled: np.ndarray, y_train: pd.Series,
     else:
         raise ValueError(f"Invalid resampling strategy: {strategy}")

-    X_res, y_res = resample_algorithm.fit_resample(X_train_scaled, y_train)
+    X_res, y_res, *_ = resample_algorithm.fit_resample(X_train_scaled, y_train)
     return X_res, y_res

 # DATASET PIPELINE
@@ -199,7 +200,7 @@ def dataset_pipeline(df_features: pd.DataFrame, df_target: pd.Series, task: Lite
     print(f"\tUnique values for '{df_target.name}': {unique_values}")

     #Train test split
-    X_train, X_test, y_train, y_test = _split_data(features=df_features, target=df_target, test_size=test_size, random_state=random_state)
+    X_train, X_test, y_train, y_test = _split_data(features=df_features, target=df_target, test_size=test_size, random_state=random_state, task=task)

     #DEBUG
     if debug:
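
The reworked `_split_data` only stratifies for classification: scikit-learn's `train_test_split` raises when `stratify` receives a continuous regression target, since stratification needs class labels. A sketch of the distinction on toy data, assuming scikit-learn's standard API:

```python
import numpy as np
from sklearn.model_selection import train_test_split

X = np.random.rand(100, 4)               # toy feature matrix
y_class = np.random.randint(0, 2, 100)   # class labels
y_reg = np.random.rand(100)              # continuous target

# Classification: preserve class proportions across the split.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y_class, test_size=0.2, random_state=101, stratify=y_class)

# Regression: continuous targets cannot be stratified, so pass None.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y_reg, test_size=0.2, random_state=101, stratify=None)
```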
@@ -343,8 +344,7 @@ def plot_roc_curve(
     color: str = "darkorange",
     figure_size: tuple = (10, 10),
     linewidth: int = 2,
-    title_fontsize: int = 24,
-    label_fontsize: int = 24,
+    base_fontsize: int = 24,
     input_features: Optional[np.ndarray] = None,
 ) -> plt.Figure: # type: ignore
     """
@@ -402,11 +402,11 @@ def plot_roc_curve(
     ax.plot(fpr, tpr, color=color, lw=linewidth, label=f"AUC = {auc_score:.2f}")
     ax.plot([0, 1], [0, 1], color="gray", linestyle="--", lw=1)

-    ax.set_title(f"{model_name} - {target_name}", fontsize=title_fontsize)
-    ax.set_xlabel("False Positive Rate", fontsize=label_fontsize)
-    ax.set_ylabel("True Positive Rate", fontsize=label_fontsize)
-    ax.tick_params(axis='both', labelsize=label_fontsize)
-    ax.legend(loc="lower right", fontsize=label_fontsize)
+    ax.set_title(f"{model_name} - {target_name}", fontsize=base_fontsize)
+    ax.set_xlabel("False Positive Rate", fontsize=base_fontsize)
+    ax.set_ylabel("True Positive Rate", fontsize=base_fontsize)
+    ax.tick_params(axis='both', labelsize=base_fontsize)
+    ax.legend(loc="lower right", fontsize=base_fontsize)
     ax.grid(True)

     # Save figure
@@ -416,6 +416,7 @@ def plot_roc_curve(

     return fig

+
 # function to evaluate the model and save metrics (Regression)
 def evaluate_model_regression(model, model_name: str,
                               save_dir: str,
@@ -423,8 +424,7 @@ def evaluate_model_regression(model, model_name: str,
                               target_id: str,
                               figure_size: tuple = (12, 8),
                               alpha_transparency: float = 0.5,
-                              title_fontsize: int = 24,
-                              normal_fontsize: int = 24):
+                              base_fontsize: int = 24):
     # Generate predictions
     y_pred = model.predict(x_test_scaled)
@@ -448,9 +448,9 @@ def evaluate_model_regression(model, model_name: str,
     plt.figure(figsize=figure_size)
     plt.scatter(y_pred, residuals, alpha=alpha_transparency)
     plt.axhline(0, color='red', linestyle='--')
-    plt.xlabel("Predicted Values", fontsize=normal_fontsize)
-    plt.ylabel("Residuals", fontsize=normal_fontsize)
-    plt.title(f"{model_name} - Residual Plot for {target_id}", fontsize=title_fontsize)
+    plt.xlabel("Predicted Values", fontsize=base_fontsize)
+    plt.ylabel("Residuals", fontsize=base_fontsize)
+    plt.title(f"{model_name} - Residual Plot for {target_id}", fontsize=base_fontsize)
     plt.grid(True)
     plt.tight_layout()
     plt.savefig(os.path.join(save_dir, f"Residual_Plot_{target_id}.svg"), bbox_inches='tight', format="svg")
@@ -462,9 +462,9 @@ def evaluate_model_regression(model, model_name: str,
     plt.plot([single_y_test.min(), single_y_test.max()],
              [single_y_test.min(), single_y_test.max()],
              'k--', lw=2)
-    plt.xlabel('True Values', fontsize=normal_fontsize)
-    plt.ylabel('Predictions', fontsize=normal_fontsize)
-    plt.title(f"{model_name} - True vs Predicted for {target_id}", fontsize=title_fontsize)
+    plt.xlabel('True Values', fontsize=base_fontsize)
+    plt.ylabel('Predictions', fontsize=base_fontsize)
+    plt.title(f"{model_name} - True vs Predicted for {target_id}", fontsize=base_fontsize)
     plt.grid(True)
     plot_path = os.path.join(save_dir, f"Regression_Plot_{target_id}.svg")
     plt.savefig(plot_path, bbox_inches='tight', format="svg")
@@ -473,52 +473,53 @@ def evaluate_model_regression(model, model_name: str,
     return y_pred

 # Get SHAP values
-def get_shap_values(model, model_name: str,
-                    save_dir: str,
-                    features_to_explain: np.ndarray,
-                    feature_names: list[str],
-                    target_id: str,
-                    task: Literal["classification", "regression"],
-                    max_display_features: int=8,
-                    figsize: tuple=(14, 20),
-                    title_fontsize: int=38,
-                    label_fontsize: int=38,
-                    plot_type: Literal["bar", "dot"] = "dot"
-                    ):
+def get_shap_values(
+    model,
+    model_name: str,
+    save_dir: str,
+    features_to_explain: np.ndarray,
+    feature_names: list[str],
+    target_id: str,
+    task: Literal["classification", "regression"],
+    max_display_features: int = 10,
+    figsize: tuple = (16, 20),
+    base_fontsize: int = 38,
+):
     """
     Universal SHAP explainer for regression and classification.
-    - Use `X_train` (or a subsample of it) to see how the model explains the data it was trained on.
-    - Use `X_test` (or a hold-out set) to see how the model explains unseen data.
-    - Use the entire dataset to get the global view.
+    * Use `X_train` (or a subsample of it) to see how the model explains the data it was trained on.
+
+    * Use `X_test` (or a hold-out set) to see how the model explains unseen data.
+
+    * Use the entire dataset to get the global view.

     Parameters:
-    - 'task': 'regression' or 'classification'
-    - 'features_to_explain': Should match the model's training data format, including scaling.
-    - 'save_dir': Directory to save visualizations
+        task: 'regression' or 'classification'
+        features_to_explain: Should match the model's training data format, including scaling.
+        save_dir: Directory to save visualizations
     """
-    def _create_shap_plot(shap_values, features, feature_names,
-                          full_save_path: str, plot_type: str,
-                          title: str):
-        """Helper function to create and save SHAP plots"""
-        # Set style
-        preferred_styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']
-        for style in preferred_styles:
+
+    def _apply_plot_style():
+        styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']
+        for style in styles:
             if style in plt.style.available or style == 'default':
                 plt.style.use(style)
                 break
-
+
+    def _configure_rcparams():
+        plt.rc('font', size=base_fontsize)
+        plt.rc('axes', titlesize=base_fontsize)
+        plt.rc('axes', labelsize=base_fontsize)
+        plt.rc('xtick', labelsize=base_fontsize)
+        plt.rc('ytick', labelsize=base_fontsize + 2)
+        plt.rc('legend', fontsize=base_fontsize)
+        plt.rc('figure', titlesize=base_fontsize)
+
+    def _create_shap_plot(shap_values, features, save_path: str, plot_type: str, title: str):
+        _apply_plot_style()
+        _configure_rcparams()
         plt.figure(figsize=figsize)
-
-        #set rc parameters for better readability
-        plt.rc('font', size=label_fontsize)
-        plt.rc('axes', titlesize=title_fontsize)
-        plt.rc('axes', labelsize=label_fontsize)
-        plt.rc('xtick', labelsize=label_fontsize)
-        plt.rc('ytick', labelsize=label_fontsize)
-        plt.rc('legend', fontsize=label_fontsize)
-        plt.rc('figure', titlesize=title_fontsize)
-
-        # Create the SHAP plot
+
         shap.summary_plot(
             shap_values=shap_values,
             features=features,
@@ -528,85 +529,75 @@ def get_shap_values(model, model_name: str,
             plot_size=figsize,
             max_display=max_display_features,
             alpha=0.7,
-            color=plt.get_cmap('viridis') # type: ignore
+            # color='viridis'
         )
-
-        # Add professional styling
+
         ax = plt.gca()
-        ax.set_xlabel("SHAP Value Impact", fontsize=title_fontsize, weight='bold')
-        ax.set_ylabel("Features", fontsize=title_fontsize, weight='bold')
-        plt.title(title, fontsize=title_fontsize, pad=20, weight='bold')
-
-        # Manually fix tick fonts
+        ax.set_xlabel("SHAP Value Impact", fontsize=base_fontsize + 2, weight='bold', labelpad=20)
+        plt.title(title, fontsize=base_fontsize + 2, pad=20, weight='bold')
+
         for tick in ax.get_xticklabels():
-            tick.set_fontsize(label_fontsize)
-            tick.set_rotation(45)
+            tick.set_fontsize(base_fontsize)
+            tick.set_rotation(30)
         for tick in ax.get_yticklabels():
-            tick.set_fontsize(label_fontsize)
+            tick.set_fontsize(base_fontsize + 2)

-        # Handle colorbar for dot plots
         if plot_type == "dot":
             cb = plt.gcf().axes[-1]
-            # cb.set_ylabel("Feature Value", size=label_fontsize)
             cb.set_ylabel("", size=1)
-            cb.tick_params(labelsize=label_fontsize - 2)
-
-        # Save and clean up
-        plt.savefig(
-            full_save_path,
-            bbox_inches='tight',
-            facecolor='white',
-            format="svg"
-        )
+            cb.tick_params(labelsize=base_fontsize - 2)
+
+        plt.savefig(save_path, bbox_inches='tight', facecolor='white', format="svg")
         plt.close()
-        rcdefaults() # Reset rc parameters to default
-
-    # START
-    explainer = shap.TreeExplainer(model)
-    shap_values = explainer.shap_values(features_to_explain)
-
-    # Handle different model types
-    if task == 'classification':
-        # Determine if multiclass
-        try:
-            is_multiclass = len(model.classes_) > 2
-            class_names = model.classes_
-        except AttributeError:
-            is_multiclass = isinstance(shap_values, list) and len(shap_values) > 1
-            class_names = list(range(len(shap_values))) if is_multiclass else [0, 1]
-
+        rcdefaults()
+
+    def _plot_for_classification(shap_values, class_names):
+        is_multiclass = isinstance(shap_values, list) and len(shap_values) > 1
+
         if is_multiclass:
-            for class_idx, (class_shap, class_name) in enumerate(zip(shap_values, class_names)):
+            for class_shap, class_name in zip(shap_values, class_names):
+                for plot_type in ["bar", "dot"]:
+                    _create_shap_plot(
+                        shap_values=class_shap,
+                        features=features_to_explain,
+                        save_path=os.path.join(save_dir, f"SHAP_{target_id}_Class{class_name}_{plot_type}.svg"),
+                        plot_type=plot_type,
+                        title=f"{model_name} - {target_id} (Class {class_name})"
+                    )
+        else:
+            values = shap_values[1] if isinstance(shap_values, list) else shap_values
+            for plot_type in ["bar", "dot"]:
                 _create_shap_plot(
-                    shap_values=class_shap,
+                    shap_values=values,
                     features=features_to_explain,
-                    feature_names=feature_names,
-                    full_save_path=os.path.join(save_dir, f"SHAP_{target_id}_Class{class_name}.svg"),
+                    save_path=os.path.join(save_dir, f"SHAP_{target_id}_{plot_type}.svg"),
                     plot_type=plot_type,
-                    title=f"{model_name} - {target_id} (Class {class_name})"
+                    title=f"{model_name} - {target_id}"
                 )
-        else:
-            # Handle binary classification (single array case)
-            plot_vals = shap_values[1] if isinstance(shap_values, list) else shap_values
+
+    def _plot_for_regression(shap_values):
+        for plot_type in ["bar", "dot"]:
             _create_shap_plot(
-                shap_values=plot_vals,
+                shap_values=shap_values,
                 features=features_to_explain,
-                feature_names=feature_names,
-                full_save_path=os.path.join(save_dir, f"SHAP_{target_id}.svg"),
+                save_path=os.path.join(save_dir, f"SHAP_{target_id}_{plot_type}.svg"),
                 plot_type=plot_type,
                 title=f"{model_name} - {target_id}"
             )
-
-    else: # Regression
-        _create_shap_plot(
-            shap_values=shap_values,
-            features=features_to_explain,
-            feature_names=feature_names,
-            full_save_path=os.path.join(save_dir, f"SHAP_{target_id}.svg"),
-            plot_type=plot_type,
-            title=f"{model_name} - {target_id}"
-        )
-
+
+    explainer = shap.TreeExplainer(model)
+    shap_values = explainer.shap_values(features_to_explain)
+
+    if task == 'classification':
+        try:
+            class_names = model.classes_ if hasattr(model, 'classes_') else list(range(len(shap_values)))
+        except Exception:
+            class_names = list(range(len(shap_values)))
+        _plot_for_classification(shap_values, class_names)
+    else:
+        _plot_for_regression(shap_values)
+
+
 # TRAIN TEST PIPELINE
 def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["classification", "regression"],
                         train_features: np.ndarray, train_target: np.ndarray,
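
After the refactor, `get_shap_values` takes a single `base_fontsize` and always writes both a bar and a dot summary plot per target, so the old `plot_type` argument is gone. A hypothetical call under the new signature, assuming the function is importable from `ml_tools.ensemble_learning`:

```python
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from ml_tools.ensemble_learning import get_shap_values  # assumed import path

X = np.random.rand(200, 5)
y = X @ np.array([1.0, 0.5, 0.0, -0.3, 2.0])  # toy linear target
model = RandomForestRegressor(random_state=101).fit(X, y)

# Writes SHAP_target_A_bar.svg and SHAP_target_A_dot.svg into save_dir.
get_shap_values(
    model=model,
    model_name="RandomForest",
    save_dir="shap_plots",
    features_to_explain=X,
    feature_names=[f"f{i}" for i in range(5)],
    target_id="target_A",
    task="regression",
)
```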
@@ -653,7 +644,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["
     return trained_model, y_pred

 ###### 5. Execution ######
-def run_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], task: Literal["classification", "regression"]="regression",
+def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], task: Literal["classification", "regression"],
                  resample_strategy: Literal[r"ADASYN", r'SMOTE', r'RANDOM', r'UNDERSAMPLE', None]=None, scaler: Literal["standard", "minmax", "maxabs"]="minmax", save_model: bool=False,
                  test_size: float=0.2, debug:bool=False, L1_regularization: float=0.5, L2_regularization: float=0.5, learning_rate: float=0.005, random_state: int=101):
     #Check paths
@@ -672,15 +663,15 @@ def run_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], ta
     #Train models
     for model_name, model in models_dict.items():
         train_test_pipeline(model=model, model_name=model_name, dataset_id=dataframe_name, task=task,
-                            train_features=X_train, train_target=y_train,
+                            train_features=X_train, train_target=y_train, # type: ignore
                             test_features=X_test, test_target=y_test,
                             feature_names=feature_names,target_id=target_name, scaler_object=scaler_object,
                             debug=debug, save_dir=save_dir, save_model=save_model)
-    print("\nTraining and evaluation complete.")
+    print("\n✅ Training and evaluation complete.")


 def _check_paths(datasets_dir: str, save_dir:str):
     if not os.path.isdir(save_dir):
         os.makedirs(save_dir)
     if not os.path.isdir(datasets_dir):
-        raise IOError(f"Datasets directory '{datasets_dir}' not found.\nCheck path or run MICE script first.")
+        raise IOError(f"Datasets directory '{datasets_dir}' not found.")
ml_tools/utilities.py CHANGED
@@ -6,17 +6,15 @@ from pathlib import Path
 import re


-def list_csv_paths(directory: str) -> tuple[list[str], list[str]]:
+def list_csv_paths(directory: str) -> dict[str, str]:
     """
-    Lists all CSV files in a given directory and returns their paths with corresponding base names.
+    Lists all `.csv` files in the specified directory and returns a mapping of filenames (without extensions) to their absolute paths.

     Parameters:
         directory (str): Path to the directory containing `.csv` files.

     Returns:
-        Tuple ([List[str], List[str]]):
-            - List of absolute paths to `.csv` files.
-            - List of corresponding base names (without extensions).
+        (dict[str, str]): Mapping {name: path}.
     """
     dir_path = Path(directory).expanduser().resolve()
@@ -26,11 +24,15 @@ def list_csv_paths(directory: str) -> tuple[list[str], list[str]]:
     csv_paths = list(dir_path.glob("*.csv"))
     if not csv_paths:
         raise IOError(f"No CSV files found in directory: {dir_path}")
+
+    # make a dictionary of names and paths
+    name_path_dict = {p.stem: str(p) for p in csv_paths}
+
+    print("🗂️ CSV files found:")
+    for name in name_path_dict.keys():
+        print(f"\t{name}")

-    paths = [str(p) for p in csv_paths]
-    names = [p.stem for p in csv_paths]
-
-    return paths, names
+    return name_path_dict


 def load_dataframe(df_path: str) -> tuple[pd.DataFrame, str]:
@@ -49,7 +51,7 @@ def load_dataframe(df_path: str) -> tuple[pd.DataFrame, str]:
     df_name = path.stem
     if df.empty:
         raise ValueError(f"DataFrame '{df_name}' is empty.")
-    print(f"Loaded dataset: '{df_name}' with shape: {df.shape}")
+    print(f"\n💿 Loaded dataset: '{df_name}' with shape: {df.shape}")
     return df, df_name


@@ -71,9 +73,8 @@ def yield_dataframes_from_dir(datasets_dir: str):
     - CSV files are read using UTF-8 encoding.
     - Output is streamed via a generator to support lazy loading of multiple datasets.
     """
-    for df_path, df_name in list_csv_paths(datasets_dir):
-        df = pd.read_csv(df_path)
-        print(f"Loaded dataset: '{df_name}' with shape: {df.shape}")
+    for df_name, df_path in list_csv_paths(datasets_dir).items():
+        df, _ = load_dataframe(df_path)
         yield df, df_name


@@ -166,3 +167,28 @@ def sanitize_filename(filename: str) -> str:

     return sanitized

+
+def save_dataframe(df: pd.DataFrame, save_dir: str, filename: str) -> None:
+    """
+    Save a pandas DataFrame to a CSV file.
+
+    Parameters:
+        df: pandas.DataFrame to save.
+        save_dir: str, directory where the CSV file will be saved.
+        filename: str, CSV filename; the extension will be added if missing.
+    """
+    if df.empty:
+        print(f"⚠️ Attempted to save an empty DataFrame: '{filename}'. Process skipped.")
+        return
+
+    os.makedirs(save_dir, exist_ok=True)
+
+    filename = sanitize_filename(filename)
+
+    if not filename.endswith('.csv'):
+        filename += '.csv'
+
+    output_path = os.path.join(save_dir, filename)
+
+    df.to_csv(output_path, index=False, encoding='utf-8')
+    print(f"✅ Saved file: '{filename}'")