PyPI - dragon-ml-toolbox - Versions diffs - 1.3.2__py3-none-any.whl → 1.4.1__py3-none-any.whl - Mend

dragon-ml-toolbox 1.3.2__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (19) hide show

{dragon_ml_toolbox-1.3.2.dist-info → dragon_ml_toolbox-1.4.1.dist-info}/METADATA +19 -2
dragon_ml_toolbox-1.4.1.dist-info/RECORD +19 -0
ml_tools/MICE_imputation.py +24 -6
ml_tools/VIF_factor.py +224 -0
ml_tools/data_exploration.py +74 -286
ml_tools/datasetmaster.py +13 -1
ml_tools/ensemble_learning.py +128 -129
ml_tools/handle_excel.py +32 -9
ml_tools/logger.py +10 -1
ml_tools/particle_swarm_optimization.py +71 -34
ml_tools/pytorch_models.py +13 -1
ml_tools/trainer.py +10 -30
ml_tools/utilities.py +122 -14
ml_tools/vision_helpers.py +14 -1
dragon_ml_toolbox-1.3.2.dist-info/RECORD +0 -18
{dragon_ml_toolbox-1.3.2.dist-info → dragon_ml_toolbox-1.4.1.dist-info}/WHEEL +0 -0
{dragon_ml_toolbox-1.3.2.dist-info → dragon_ml_toolbox-1.4.1.dist-info}/licenses/LICENSE +0 -0
{dragon_ml_toolbox-1.3.2.dist-info → dragon_ml_toolbox-1.4.1.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
{dragon_ml_toolbox-1.3.2.dist-info → dragon_ml_toolbox-1.4.1.dist-info}/top_level.txt +0 -0

{dragon_ml_toolbox-1.3.2.dist-info → dragon_ml_toolbox-1.4.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 1.3.2
+Version: 1.4.1
 Summary: A collection of tools for data science and machine learning projects
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -27,6 +27,7 @@ Requires-Dist: ipython
 Requires-Dist: ipykernel
 Requires-Dist: notebook
 Requires-Dist: jupyterlab
+Requires-Dist: ipywidgets
 Requires-Dist: joblib
 Requires-Dist: xgboost
 Requires-Dist: lightgbm<=4.5.0
@@ -79,7 +80,7 @@ Clone the repository and install in editable mode with optional dependencies:
 ```bash
 git clone https://github.com/DrAg0n-BoRn/ML_tools.git
 cd ML_tools
-pip install -e '.[pytorch]'
+pip install -e .
 ```
 ## Usage
@@ -90,3 +91,19 @@ After installation, import modules like this:
 from ml_tools.utilities import sanitize_filename
 from ml_tools.logger import custom_logger
 ```
+## Available modules
+```bash
+data_exploration
+datasetmaster
+ensemble_learning
+handle_excel
+logger
+MICE_imputation
+particle_swarm_optimization
+trainer
+utilities
+VIF_factor
+vision_helpers
+```

dragon_ml_toolbox-1.4.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,19 @@
+dragon_ml_toolbox-1.4.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-1.4.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=e1Hg5ZtaBpDV7ZvxhLe1ac28l7nMjvi1MSE5YvB1s-o,1472
+ml_tools/MICE_imputation.py,sha256=CK0tYZ_kQkdETohOlhI7RP7oFkJTXrP-XtIxb--dzpU,9726
+ml_tools/VIF_factor.py,sha256=LQWr1P8WYij07FX_3RZC6Rr22bfAMnrt0Lhvi7SbBpY,9846
+ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ml_tools/data_exploration.py,sha256=FXP5i6bQo8J3RCyLRmlX-qJVh4VH8DbMjrdUmyd1mF0,18708
+ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
+ml_tools/ensemble_learning.py,sha256=khXXRiR7boWwI4CAvb2bxzS3fhLADNETMOiRe3ihZ4Y,28821
+ml_tools/handle_excel.py,sha256=ZJui5__0rc2T8UGHTheqZGhKmdVZ7Q2I54IoYCjAqJw,12612
+ml_tools/logger.py,sha256=NOtL3YSuffAGmpTpXjY-uJjqFLdRG_jpL7MDyloBw9c,4712
+ml_tools/particle_swarm_optimization.py,sha256=714kZo6lvUvRaPTtj6kJGecZwHcehcSkLysokXAf3No,20706
+ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
+ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
+ml_tools/utilities.py,sha256=z2JPy4GM2YBLUC0sPq7aNLuesPFAQu5KNcsgmuOywdU,8738
+ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
+dragon_ml_toolbox-1.4.1.dist-info/METADATA,sha256=0XdPwNWe81rCvJLJfSS5XvB2ZdJKpBLLoqMU5uxYLMc,2516
+dragon_ml_toolbox-1.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-1.4.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-1.4.1.dist-info/RECORD,,

ml_tools/MICE_imputation.py CHANGED Viewed

@@ -3,9 +3,20 @@ import miceforest as mf
 import os
 import matplotlib.pyplot as plt
 import numpy as np
-from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename
+from ml_tools.utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info
 from plotnine import ggplot, labs, theme, element_blank # type: ignore
+__all__ = [
+    "apply_mice",
+    "save_imputed_datasets",
+    "get_na_column_names",
+    "get_convergence_diagnostic",
+    "get_imputed_distributions",
+    "run_mice_pipeline"
+]
 def apply_mice(df: pd.DataFrame, df_name: str, resulting_datasets: int=1, iterations: int=20, random_state: int=101):
     # Initialize kernel with number of imputed datasets to generate
@@ -120,7 +131,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     '''
     # Check path
     os.makedirs(root_dir, exist_ok=True)
-    local_save_dir = os.path.join(root_dir, f"Distribution_Metrics_{df_name}")
+    local_save_dir = os.path.join(root_dir, f"Distribution_Metrics_{df_name}_imputed")
     if not os.path.isdir(local_save_dir):
         os.makedirs(local_save_dir)
@@ -169,8 +180,12 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
         # Adjust layout and save
         # fig.tight_layout()
         # fig.subplots_adjust(bottom=0.2, left=0.2)  # Optional, depending on overflow
+        # sanitize savename
+        feature_save_name = sanitize_filename(filename)
         fig.savefig(
-            os.path.join(local_save_dir, filename + ".svg"),
+            os.path.join(local_save_dir, feature_save_name + ".svg"),
             format='svg',
             bbox_inches='tight',
             pad_inches=0.1
@@ -185,8 +200,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     else:
         for feature in column_names:
             fig = kernel.plot_imputed_distributions(variables=[feature])
-            feature_save_name = sanitize_filename(feature)
-            _process_figure(fig, feature_save_name)
+            _process_figure(fig, feature)
     print("\tImputed distributions saved successfully.")
@@ -207,7 +221,7 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
     if os.path.isfile(df_path_or_dir):
         all_file_paths = [df_path_or_dir]
     elif os.path.isdir(df_path_or_dir):
-        all_file_paths, _ = list_csv_paths(df_path_or_dir)
+        all_file_paths = list(list_csv_paths(df_path_or_dir).values())
     else:
         raise ValueError(f"Invalid path or directory: {df_path_or_dir}")
@@ -223,3 +237,7 @@ def run_mice_pipeline(df_path_or_dir: str, save_datasets_dir: str, save_metrics_
         get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_dir)
         get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_dir, column_names=imputed_column_names)
+def info():
+    # Hands this module's public-name list (`__all__`) to `_script_info`.
+    # NOTE(review): `_script_info` lives in ml_tools.utilities and is not visible here;
+    # presumably it prints the available functions — confirm.
+    _script_info(__all__)

ml_tools/VIF_factor.py ADDED Viewed

@@ -0,0 +1,224 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from typing import Optional
+from statsmodels.stats.outliers_influence import variance_inflation_factor
+from statsmodels.tools.tools import add_constant
+import warnings
+import os
+from .utilities import sanitize_filename, yield_dataframes_from_dir, save_dataframe, _script_info
+__all__ = [
+    "compute_vif",
+    "drop_vif_based",
+    "compute_vif_multi"
+]
+def compute_vif(
+    df: pd.DataFrame,
+    use_columns: Optional[list[str]] = None,
+    ignore_columns: Optional[list[str]] = None,
+    max_features_to_plot: int = 20,
+    save_dir: Optional[str] = None,
+    filename: Optional[str] = None,
+    fontsize: int = 14,
+    show_plot: bool = True,
+) -> pd.DataFrame:
+    """
+    Computes Variance Inflation Factors (VIF) for numeric columns in a DataFrame. Optionally, generates a bar plot of VIF values.
+    Args:
+        df (pd.DataFrame): The input DataFrame.
+        use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns. Names that are missing from `df` or are not numeric are reported and skipped.
+        ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `use_columns` is provided.
+        max_features_to_plot (int): Adjust the number of features shown in the plot (highest VIF first).
+        save_dir (str | None): Directory to save the plot as SVG. If None, the plot is not saved.
+        filename (str | None): Optional filename for saving the plot. The file is written as "VIF_<filename>.svg"; when omitted it is "VIF_plot.svg".
+        fontsize (int): Base fontsize to scale title and labels on the plot.
+        show_plot (bool): Display plot.
+    Returns:
+        pd.DataFrame: DataFrame with 'feature' and 'VIF' columns, sorted by VIF in descending order. Empty when no usable column remains.
+    NOTE:
+    **Variance Inflation Factor (VIF)** quantifies the degree of multicollinearity among features in a dataset.
+    A VIF value indicates how much the variance of a regression coefficient is inflated due to linear dependence with other features.
+    A VIF of 1 suggests no correlation, values between 1 and 5 indicate moderate correlation, and values greater than 10 typically signal high multicollinearity, which may distort model interpretation and degrade performance.
+    Infinite VIF values (perfect multicollinearity) are reported as 999.0.
+    """
+    ground_truth_cols = df.columns.to_list()
+    numeric_columns = df.select_dtypes(include='number').columns.tolist()
+    if use_columns is None:
+        sanitized_columns = numeric_columns
+        missing_features = set(ground_truth_cols) - set(sanitized_columns)
+        if missing_features:
+            print(f"⚠️ These columns are not Numeric:\n{missing_features}")
+    else:
+        sanitized_columns = list()
+        for feature in use_columns:
+            if feature not in ground_truth_cols:
+                print(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
+            elif feature not in numeric_columns:
+                # Non-numeric data cannot enter the regression; skip it here instead of failing later.
+                print(f"⚠️ The provided column '{feature}' is not Numeric and will be skipped.")
+            else:
+                sanitized_columns.append(feature)
+    if ignore_columns is not None and use_columns is None:
+        missing_ignore = set(ignore_columns) - set(ground_truth_cols)
+        if missing_ignore:
+            print(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
+        sanitized_columns = [f for f in sanitized_columns if f not in ignore_columns]
+    # Nothing left to analyse: return an empty, correctly-shaped result instead of regressing on the constant alone.
+    if not sanitized_columns:
+        print("⚠️ No valid numeric columns available to compute VIF.")
+        return pd.DataFrame(columns=["feature", "VIF"])
+    X = df[sanitized_columns].copy()
+    # The intercept column is required by the auxiliary regressions; it is removed from the result below.
+    X = add_constant(X, has_constant='add')
+    vif_data = pd.DataFrame()
+    vif_data["feature"] = X.columns # type: ignore
+    with warnings.catch_warnings():
+        # Perfectly collinear features trigger divide-by-zero RuntimeWarnings; they are handled right after.
+        warnings.simplefilter("ignore", category=RuntimeWarning)
+        vif_data["VIF"] = [
+            variance_inflation_factor(X.values, i) for i in range(X.shape[1]) # type: ignore
+        ]
+    # Replace infinite values (perfect multicollinearity)
+    vif_data["VIF"] = vif_data["VIF"].replace([np.inf, -np.inf], 999.0)
+    # Drop the constant column; copy so the column assignment below does not target a slice of the original frame
+    vif_data = vif_data[vif_data["feature"] != "const"].copy()
+    # Add color coding
+    def vif_color(v: float) -> str:
+        if v >= 10:
+            return "red"
+        elif v >= 5:
+            return "gold"
+        else:
+            return "green"
+    vif_data["color"] = vif_data["VIF"].apply(vif_color)
+    # Sort by VIF descending
+    vif_data = vif_data.sort_values(by="VIF", ascending=False).reset_index(drop=True)
+    # Filter for plotting
+    plot_data = vif_data.head(max_features_to_plot)
+    if (save_dir or show_plot) and not plot_data.empty:
+        plt.figure(figsize=(10, 6))
+        plt.barh(
+            plot_data["feature"],
+            plot_data["VIF"],
+            color=plot_data["color"],
+            edgecolor='black'
+        )
+        plt.title("Variance Inflation Factor (VIF) per Feature", fontsize=fontsize+1)
+        plt.xlabel("VIF value", fontsize=fontsize)
+        plt.xticks(fontsize=fontsize)
+        plt.yticks(fontsize=fontsize)
+        plt.axvline(x=5, color='gold', linestyle='--', label='VIF = 5')
+        plt.axvline(x=10, color='red', linestyle='--', label='VIF = 10')
+        # Fixed axis range: bars above 12 are clipped so the 5/10 reference lines stay readable.
+        plt.xlim(0, 12)
+        plt.legend(loc='lower right', fontsize=fontsize-1)
+        plt.gca().invert_yaxis()
+        plt.grid(axis='x', linestyle='--', alpha=0.5)
+        plt.tight_layout()
+        if save_dir:
+            os.makedirs(save_dir, exist_ok=True)
+            if filename is None:
+                # Documented default name; no extra prefix so it does not become "VIF_VIF_plot.svg".
+                save_name = "VIF_plot.svg"
+            else:
+                save_name = sanitize_filename(filename)
+                if not save_name.endswith(".svg"):
+                    save_name += ".svg"
+                save_name = "VIF_" + save_name
+            save_path = os.path.join(save_dir, save_name)
+            plt.savefig(save_path, format='svg', bbox_inches='tight')
+            # Report the name actually written to disk.
+            print(f"\tSaved VIF plot: '{save_name}'")
+        if show_plot:
+            plt.show()
+        plt.close()
+    return vif_data.drop(columns="color")
+def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10.0) -> tuple[pd.DataFrame, list[str]]:
+    """
+    Drops columns from the original DataFrame based on their VIF values exceeding a given threshold.
+    Args:
+        df (pd.DataFrame): Original DataFrame containing the columns to test.
+        vif_df (pd.DataFrame): DataFrame with 'feature' and 'VIF' columns as returned by `compute_vif()`.
+        threshold (float): VIF threshold; columns with a VIF strictly greater than this value are dropped.
+    Returns:
+        (tuple[pd.DataFrame, list[str]]):
+            - A new DataFrame with high-VIF columns removed.
+            - A list with dropped column names.
+    Raises:
+        ValueError: If `vif_df` lacks the 'feature' or 'VIF' column.
+    """
+    # Ensure expected structure
+    if 'feature' not in vif_df.columns or 'VIF' not in vif_df.columns:
+        raise ValueError("`vif_df` must contain 'feature' and 'VIF' columns.")
+    # Identify features to drop
+    flagged = vif_df.loc[vif_df["VIF"] > threshold, "feature"].tolist()
+    # Only columns that exist in `df` can be dropped; report the rest instead of raising a KeyError.
+    to_drop = [col for col in flagged if col in df.columns]
+    not_found = [col for col in flagged if col not in df.columns]
+    if not_found:
+        print(f"\t⚠️ Warning: These high-VIF features are not in the DataFrame and were skipped: {not_found}")
+    print(f"\tDropping {len(to_drop)} column(s) with VIF > {threshold}: {to_drop}")
+    result_df = df.drop(columns=to_drop)
+    # Check the column count, not `.empty`: a zero-row frame is "empty" even when columns remain.
+    if to_drop and result_df.shape[1] == 0:
+        print("\t⚠️ Warning: All columns were dropped.")
+    return result_df, to_drop
+def compute_vif_multi(input_directory: str,
+                      output_plot_directory: str,
+                      output_dataset_directory: Optional[str] = None,
+                      use_columns: Optional[list[str]] = None,
+                      ignore_columns: Optional[list[str]] = None,
+                      max_features_to_plot: int = 20,
+                      fontsize: int = 14,
+                      vif_threshold: float = 10.0):
+    """
+    Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames).
+    Generates a bar plot of VIF values. Optionally drops columns with VIF > `vif_threshold` and saves the result as a new CSV file.
+    Args:
+        input_directory (str): Target directory with CSV files able to be loaded as DataFrame.
+        output_plot_directory (str): Save plots to this directory.
+        output_dataset_directory (str | None): If provided, saves new CSV files to this directory. A file is only written for datasets where at least one column was dropped.
+        use_columns (list[str] | None): Optional list of columns to include. Defaults to all numeric columns.
+        ignore_columns (list[str] | None): Optional list of columns to exclude from the VIF computation. Skipped if `use_columns` is provided.
+        max_features_to_plot (int): Adjust the number of features shown in the plot.
+        fontsize (int): Base fontsize to scale title and labels on the plot.
+        vif_threshold (float): Columns with a VIF strictly greater than this value are dropped from the saved datasets. Defaults to 10.0.
+    NOTE:
+    **Variance Inflation Factor (VIF)** quantifies the degree of multicollinearity among features in a dataset.
+    A VIF value indicates how much the variance of a regression coefficient is inflated due to linear dependence with other features.
+    A VIF of 1 suggests no correlation, values between 1 and 5 indicate moderate correlation, and values greater than 10 typically signal high multicollinearity, which may distort model interpretation and degrade performance.
+    """
+    if output_dataset_directory is not None:
+        os.makedirs(output_dataset_directory, exist_ok=True)
+    for df, df_name in yield_dataframes_from_dir(datasets_dir=input_directory):
+        # Plots are always saved (never shown) so the loop can run unattended.
+        vif_dataframe = compute_vif(df=df,
+                            use_columns=use_columns,
+                            ignore_columns=ignore_columns,
+                            max_features_to_plot=max_features_to_plot,
+                            fontsize=fontsize,
+                            save_dir=output_plot_directory,
+                            filename=df_name,
+                            show_plot=False)
+        if output_dataset_directory is not None:
+            new_filename = 'VIF_' + df_name
+            result_df, dropped_cols = drop_vif_based(df=df, vif_df=vif_dataframe, threshold=vif_threshold)
+            # Skip the save when nothing changed, to avoid duplicating the input dataset.
+            if len(dropped_cols) > 0:
+                save_dataframe(df=result_df, save_dir=output_dataset_directory, filename=new_filename)
+def info():
+    # Hands this module's public-name list (`__all__`) to `_script_info`.
+    # NOTE(review): `_script_info` is imported from `.utilities` and is not visible here;
+    # presumably it prints the available functions — confirm.
+    _script_info(__all__)

dragon-ml-toolbox 1.3.2__py3-none-any.whl → 1.4.1__py3-none-any.whl

Potentially problematic release.

dragon-ml-toolbox 1.3.2__py3-none-any.whl → 1.4.1__py3-none-any.whl