PyPI - dragon-ml-toolbox - Versions diffs - 1.4.8__py3-none-any.whl → 2.1.0__py3-none-any.whl - Mend

dragon-ml-toolbox 1.4.8__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (16)

{dragon_ml_toolbox-1.4.8.dist-info → dragon_ml_toolbox-2.1.0.dist-info}/METADATA +24 -14
dragon_ml_toolbox-2.1.0.dist-info/RECORD +20 -0
{dragon_ml_toolbox-1.4.8.dist-info → dragon_ml_toolbox-2.1.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +5 -4
ml_tools/MICE_imputation.py +27 -28
ml_tools/PSO_optimization.py +490 -0
ml_tools/VIF_factor.py +20 -17
ml_tools/{particle_swarm_optimization.py → _particle_swarm_optimization.py} +5 -0
ml_tools/data_exploration.py +58 -32
ml_tools/ensemble_learning.py +40 -42
ml_tools/handle_excel.py +98 -78
ml_tools/logger.py +13 -11
ml_tools/utilities.py +134 -46
dragon_ml_toolbox-1.4.8.dist-info/RECORD +0 -19
{dragon_ml_toolbox-1.4.8.dist-info → dragon_ml_toolbox-2.1.0.dist-info}/WHEEL +0 -0
{dragon_ml_toolbox-1.4.8.dist-info → dragon_ml_toolbox-2.1.0.dist-info}/licenses/LICENSE +0 -0
{dragon_ml_toolbox-1.4.8.dist-info → dragon_ml_toolbox-2.1.0.dist-info}/top_level.txt +0 -0

ml_tools/data_exploration.py CHANGED Viewed

@@ -5,9 +5,9 @@ import seaborn as sns
 from IPython import get_ipython
 from IPython.display import clear_output
 import time
-from typing import Union, Literal, Dict, Tuple, List
-import os
-from .utilities import sanitize_filename, _script_info
+from typing import Union, Literal, Dict, Tuple, List, Optional
+from pathlib import Path
+from .utilities import sanitize_filename, _script_info, make_fullpath
 import re
@@ -59,26 +59,48 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
     return summary
-def drop_rows_with_missing_data(df: pd.DataFrame, threshold: float = 0.7) -> pd.DataFrame:
+def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]], threshold: float = 0.7) -> pd.DataFrame:
     """
-    Drops rows with more than `threshold` fraction of missing values.
+    Drops rows from the DataFrame using a two-stage strategy:
+    1. If `targets`, remove any row where all target columns are missing.
+    2. Among features, drop those with more than `threshold` fraction of missing values.
     Parameters:
         df (pd.DataFrame): The input DataFrame.
-        threshold (float): Fraction of missing values above which rows are dropped.
+        targets (list[str] | None): List of target column names.
+        threshold (float): Maximum allowed fraction of missing values in feature columns.
     Returns:
-        pd.DataFrame: A new DataFrame without the dropped rows.
+        pd.DataFrame: A cleaned DataFrame with problematic rows removed.
     """
-    missing_fraction = df.isnull().mean(axis=1)
-    rows_to_drop = missing_fraction[missing_fraction > threshold].index
-    if len(rows_to_drop) > 0:
-        print(f"Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing data.")
+    df_clean = df.copy()
+    # Stage 1: Drop rows with all target columns missing
+    if targets is not None:
+        target_na = df_clean[targets].isnull().all(axis=1)
+        if target_na.any():
+            print(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
+            df_clean = df_clean[~target_na]
+        else:
+            print("✅ No rows with all targets missing.")
     else:
-        print(f"No rows have more than {threshold*100:.0f}% missing data.")
+        targets = []
+    # Stage 2: Drop rows based on feature column missing values
+    feature_cols = [col for col in df_clean.columns if col not in targets]
+    if feature_cols:
+        feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)
+        rows_to_drop = feature_na_frac[feature_na_frac > threshold].index
+        if len(rows_to_drop) > 0:
+            print(f"📉 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
+            df_clean = df_clean.drop(index=rows_to_drop)
+        else:
+            print(f"✅ No rows exceed the {threshold*100:.0f}% missing feature data threshold.")
+    else:
+        print("⚠️ No feature columns available to evaluate.")
-    return df.drop(index=rows_to_drop)
+    return df_clean
 def split_features_targets(df: pd.DataFrame, targets: list[str]):
@@ -205,13 +227,16 @@ def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFram
     return df_cont, df_bin # type: ignore
-def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None, method: Literal["pearson", "kendall", "spearman"]="pearson", plot_title: str="Correlation Heatmap"):
+def plot_correlation_heatmap(df: pd.DataFrame,
+                             save_dir: Union[str, Path, None] = None,
+                             plot_title: str="Correlation Heatmap",
+                             method: Literal["pearson", "kendall", "spearman"]="pearson"):
     """
     Plots a heatmap of pairwise correlations between numeric features in a DataFrame.
     Args:
         df (pd.DataFrame): The input dataset.
-        save_dir (str | None): If provided, the heatmap will be saved to this directory as a svg file.
+        save_dir (str | Path | None): If provided, the heatmap will be saved to this directory as a svg file.
         plot_title: To make different plots, or overwrite existing ones.
         method (str): Correlation method to use. Must be one of:
             - 'pearson' (default): measures linear correlation (assumes normally distributed data),
@@ -254,10 +279,13 @@ def plot_correlation_heatmap(df: pd.DataFrame, save_dir: Union[str, None] = None
     plt.tight_layout()
     if save_dir:
+        save_path = make_fullpath(save_dir, make=True)
         # sanitize the plot title to save the file
         plot_title = sanitize_filename(plot_title)
-        os.makedirs(save_dir, exist_ok=True)
-        full_path = os.path.join(save_dir, plot_title + ".svg")
+        plot_title = plot_title + ".svg"
+        full_path = save_path / plot_title
         plt.savefig(full_path, bbox_inches="tight", format='svg')
         print(f"Saved correlation heatmap: '{plot_title}.svg'")
@@ -322,7 +350,7 @@ def check_value_distributions(df: pd.DataFrame, view_frequencies: bool=True, bin
         user_input_ = input("Press enter to continue")
-def plot_value_distributions(df: pd.DataFrame, save_dir: str, bin_threshold: int=10, skip_cols_with_key: Union[str, None]=None):
+def plot_value_distributions(df: pd.DataFrame, save_dir: Union[str, Path], bin_threshold: int=10, skip_cols_with_key: Union[str, None]=None):
     """
     Plots and saves the value distributions for all (or selected) columns in a DataFrame,
     with adaptive binning for numerical columns when appropriate.
@@ -335,7 +363,7 @@ def plot_value_distributions(df: pd.DataFrame, save_dir: str, bin_threshold: int
     Args:
         df (pd.DataFrame): The input DataFrame whose columns are to be analyzed.
-        save_dir (str): Directory path where the plots will be saved. Will be created if it does not exist.
+        save_dir (str | Path): Directory path where the plots will be saved. Will be created if it does not exist.
         bin_threshold (int): Minimum number of unique values required to trigger binning
             for numerical columns.
         skip_cols_with_key (str | None): If provided, any column whose name contains this
@@ -346,8 +374,7 @@ def plot_value_distributions(df: pd.DataFrame, save_dir: str, bin_threshold: int
         - All non-alphanumeric characters in column names are sanitized for safe file naming.
         - Colormap is automatically adapted based on the number of categories or bins.
     """
-    if save_dir is not None:
-        os.makedirs(save_dir, exist_ok=True)
+    save_path = make_fullpath(save_dir, make=True)
     dict_to_plot_std = dict()
     dict_to_plot_freq = dict()
@@ -384,13 +411,12 @@ def plot_value_distributions(df: pd.DataFrame, save_dir: str, bin_threshold: int
             view_freq = 100 * view_std / view_std.sum() # Percentage
         # view_freq = df[col].value_counts(normalize=True, bins=10)  # relative percentages
-        if save_dir:
-            dict_to_plot_std[col] = dict(view_std)
-            dict_to_plot_freq[col] = dict(view_freq)
-            saved_plots += 1
+        dict_to_plot_std[col] = dict(view_std)
+        dict_to_plot_freq[col] = dict(view_freq)
+        saved_plots += 1
     # plot helper
-    def _plot_helper(dict_: dict, target_dir: str, ylabel: Literal["Frequency", "Counts"], base_fontsize: int=12):
+    def _plot_helper(dict_: dict, target_dir: Path, ylabel: Literal["Frequency", "Counts"], base_fontsize: int=12):
         for col, data in dict_.items():
             safe_col = sanitize_filename(col)
@@ -412,15 +438,15 @@ def plot_value_distributions(df: pd.DataFrame, save_dir: str, bin_threshold: int
             plt.gca().set_facecolor('#f9f9f9')
             plt.tight_layout()
-            plot_path = os.path.join(target_dir, f"{safe_col}.png")
+            plot_path = target_dir / f"{safe_col}.png"
             plt.savefig(plot_path, dpi=300, bbox_inches="tight")
             plt.close()
     # Save plots
-    freq_dir = os.path.join(save_dir, "Distribution_Frequency")
-    std_dir = os.path.join(save_dir, "Distribution_Counts")
-    os.makedirs(freq_dir, exist_ok=True)
-    os.makedirs(std_dir, exist_ok=True)
+    freq_dir = save_path / "Distribution_Frequency"
+    std_dir = save_path / "Distribution_Counts"
+    freq_dir.mkdir(parents=True, exist_ok=True)
+    std_dir.mkdir(parents=True, exist_ok=True)
     _plot_helper(dict_=dict_to_plot_std, target_dir=std_dir, ylabel="Counts")
     _plot_helper(dict_=dict_to_plot_freq, target_dir=freq_dir, ylabel="Frequency")

ml_tools/ensemble_learning.py CHANGED Viewed

@@ -5,7 +5,7 @@ import matplotlib.pyplot as plt
 from matplotlib.colors import Colormap
 from matplotlib import rcdefaults
-import os
+from pathlib import Path
 from typing import Literal, Union, Optional, Iterator, Tuple
 from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
@@ -19,7 +19,7 @@ from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, mean_absolute_error, mean_squared_error, r2_score, roc_curve, roc_auc_score
 import shap
-from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info, serialize_object
+from .utilities import yield_dataframes_from_dir, sanitize_filename, _script_info, serialize_object, make_fullpath
 import warnings # Ignore warnings
 warnings.filterwarnings('ignore', category=DeprecationWarning)
@@ -469,30 +469,31 @@ def _train_model(model, train_features, train_target):
     return model
 # handle local directories
-def _local_directories(model_name: str, dataset_id: str, save_dir: str):
-    dataset_dir = os.path.join(save_dir, dataset_id)
-    if not os.path.isdir(dataset_dir):
-        os.makedirs(dataset_dir)
+def _local_directories(model_name: str, dataset_id: str, save_dir: Union[str,Path]):
+    save_path = make_fullpath(save_dir, make=True)
-    model_dir = os.path.join(dataset_dir, model_name)
-    if not os.path.isdir(model_dir):
-        os.makedirs(model_dir)
+    dataset_dir = save_path / dataset_id
+    dataset_dir.mkdir(parents=True, exist_ok=True)
+    model_dir = dataset_dir / model_name
+    model_dir.mkdir(parents=True, exist_ok=True)
     return model_dir
 # save model
-def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: str):
+def _save_model(trained_model, model_name: str, target_name:str, feature_names: list[str], save_directory: Union[str,Path]):
     #Sanitize filenames to save
     sanitized_target_name = sanitize_filename(target_name)
     filename = f"{model_name}_{sanitized_target_name}"
     to_save = {'model': trained_model, 'feature_names': feature_names, 'target_name':target_name}
     serialize_object(obj=to_save, save_dir=save_directory, filename=filename, verbose=False, raise_on_error=True)
 # function to evaluate the model and save metrics (Classification)
 def evaluate_model_classification(
     model,
     model_name: str,
-    save_dir: str,
+    save_dir: Union[str,Path],
     x_test_scaled: np.ndarray,
     single_y_test: np.ndarray,
     target_name: str,
@@ -524,7 +525,7 @@ def evaluate_model_classification(
     Returns:
         y_pred: Predicted class labels
     """
-    os.makedirs(save_dir, exist_ok=True)
+    save_path = make_fullpath(save_dir, make=True)
     y_pred = model.predict(x_test_scaled)
     accuracy = accuracy_score(single_y_test, y_pred)
@@ -538,7 +539,7 @@ def evaluate_model_classification(
     # Save text report
     sanitized_target_name = sanitize_filename(target_name)
-    report_path = os.path.join(save_dir, f"Classification_Report_{sanitized_target_name}.txt")
+    report_path = save_path / f"Classification_Report_{sanitized_target_name}.txt"
     with open(report_path, "w") as f:
         f.write(f"{model_name} - {target_name}\t\tAccuracy: {accuracy:.2f}\n")
         f.write("Classification Report:\n")
@@ -568,7 +569,7 @@ def evaluate_model_classification(
         text.set_fontsize(base_fontsize+4)
     fig.tight_layout()
-    fig_path = os.path.join(save_dir, f"Confusion_Matrix_{sanitized_target_name}.svg")
+    fig_path = save_path / f"Confusion_Matrix_{sanitized_target_name}.svg"
     fig.savefig(fig_path, format="svg", bbox_inches="tight")
     plt.close(fig)
@@ -580,7 +581,7 @@ def plot_roc_curve(
     probabilities_or_model: Union[np.ndarray, xgb.XGBClassifier, lgb.LGBMClassifier, object],
     model_name: str,
     target_name: str,
-    save_directory: str,
+    save_directory: Union[str,Path],
     color: str = "darkorange",
     figure_size: tuple = (10, 10),
     linewidth: int = 2,
@@ -594,7 +595,7 @@ def plot_roc_curve(
         true_labels: np.ndarray of shape (n_samples,), ground truth binary labels (0 or 1).
         probabilities_or_model: either predicted probabilities (ndarray), or a trained model with attribute `.predict_proba()`.
         target_name: str, Target name.
-        save_directory: str, path to directory where figure is saved.
+        save_directory: str or Path, path to directory where figure is saved.
         color: color of the ROC curve. Accepts any valid Matplotlib color specification. Examples:
             - Named colors: "darkorange", "blue", "red", "green", "black"
             - Hex codes: "#1f77b4", "#ff7f0e"
@@ -650,17 +651,17 @@ def plot_roc_curve(
     ax.grid(True)
     # Save figure
-    os.makedirs(save_directory, exist_ok=True)
+    save_path = make_fullpath(save_directory, make=True)
     sanitized_target_name = sanitize_filename(target_name)
-    save_path = os.path.join(save_directory, f"ROC_{sanitized_target_name}.svg")
-    fig.savefig(save_path, bbox_inches="tight", format="svg")
+    full_save_path = save_path / f"ROC_{sanitized_target_name}.svg"
+    fig.savefig(full_save_path, bbox_inches="tight", format="svg")
     return fig
 # function to evaluate the model and save metrics (Regression)
 def evaluate_model_regression(model, model_name: str,
-                               save_dir: str,
+                               save_dir: Union[str,Path],
                                x_test_scaled: np.ndarray, single_y_test: np.ndarray,
                                target_name: str,
                                figure_size: tuple = (12, 8),
@@ -677,7 +678,8 @@ def evaluate_model_regression(model, model_name: str,
     # Create formatted report
     sanitized_target_name = sanitize_filename(target_name)
-    report_path = os.path.join(save_dir, f"Regression_Report_{sanitized_target_name}.txt")
+    save_path = make_fullpath(save_dir, make=True)
+    report_path = save_path / f"Regression_Report_{sanitized_target_name}.txt"
     with open(report_path, "w") as f:
         f.write(f"{model_name} - Regression Performance for '{target_name}'\n\n")
         f.write(f"Mean Absolute Error (MAE): {mae:.4f}\n")
@@ -695,7 +697,8 @@ def evaluate_model_regression(model, model_name: str,
     plt.title(f"{model_name} - Residual Plot for {target_name}", fontsize=base_fontsize)
     plt.grid(True)
     plt.tight_layout()
-    plt.savefig(os.path.join(save_dir, f"Residual_Plot_{sanitized_target_name}.svg"), bbox_inches='tight', format="svg")
+    residual_path = save_path / f"Residual_Plot_{sanitized_target_name}.svg"
+    plt.savefig(residual_path, bbox_inches='tight', format="svg")
     plt.close()
     # Create true vs predicted values plot
@@ -708,7 +711,7 @@ def evaluate_model_regression(model, model_name: str,
     plt.ylabel('Predictions', fontsize=base_fontsize)
     plt.title(f"{model_name} - True vs Predicted for {target_name}", fontsize=base_fontsize)
     plt.grid(True)
-    plot_path = os.path.join(save_dir, f"Regression_Plot_{sanitized_target_name}.svg")
+    plot_path = save_path / f"Regression_Plot_{sanitized_target_name}.svg"
     plt.savefig(plot_path, bbox_inches='tight', format="svg")
     plt.close()
@@ -719,7 +722,7 @@ def evaluate_model_regression(model, model_name: str,
 def get_shap_values(
     model,
     model_name: str,
-    save_dir: str,
+    save_dir: Union[str, Path],
     features_to_explain: np.ndarray,
     feature_names: list[str],
     target_name: str,
@@ -737,11 +740,12 @@ def get_shap_values(
 	    * Use the entire dataset to get the global view.
     Parameters:
-        task: 'regression' or 'classification'
+        task: 'regression' or 'classification'.
         features_to_explain: Should match the model's training data format, including scaling.
-        save_dir: Directory to save visualizations
+        save_dir: Directory to save visualizations.
     """
     sanitized_target_name = sanitize_filename(target_name)
+    global_save_path = make_fullpath(save_dir, make=True)
     def _apply_plot_style():
         styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']
@@ -759,7 +763,7 @@ def get_shap_values(
         plt.rc('legend', fontsize=base_fontsize)
         plt.rc('figure', titlesize=base_fontsize)
-    def _create_shap_plot(shap_values, features, save_path: str, plot_type: str, title: str):
+    def _create_shap_plot(shap_values, features, save_path: Path, plot_type: str, title: str):
         _apply_plot_style()
         _configure_rcparams()
         plt.figure(figsize=figsize)
@@ -804,7 +808,7 @@ def get_shap_values(
                     _create_shap_plot(
                         shap_values=class_shap,
                         features=features_to_explain,
-                        save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_Class{class_name}_{plot_type}.svg"),
+                        save_path=global_save_path / f"SHAP_{sanitized_target_name}_Class{class_name}_{plot_type}.svg",
                         plot_type=plot_type,
                         title=f"{model_name} - {target_name} (Class {class_name})"
                     )
@@ -814,7 +818,7 @@ def get_shap_values(
                 _create_shap_plot(
                     shap_values=values,
                     features=features_to_explain,
-                    save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_{plot_type}.svg"),
+                    save_path=global_save_path / f"SHAP_{sanitized_target_name}_{plot_type}.svg",
                     plot_type=plot_type,
                     title=f"{model_name} - {target_name}"
                 )
@@ -824,7 +828,7 @@ def get_shap_values(
             _create_shap_plot(
                 shap_values=shap_values,
                 features=features_to_explain,
-                save_path=os.path.join(save_dir, f"SHAP_{sanitized_target_name}_{plot_type}.svg"),
+                save_path=global_save_path / f"SHAP_{sanitized_target_name}_{plot_type}.svg",
                 plot_type=plot_type,
                 title=f"{model_name} - {target_name}"
             )
@@ -848,7 +852,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: TaskType,
              train_features: np.ndarray, train_target: np.ndarray,
              test_features: np.ndarray, test_target: np.ndarray,
              feature_names: list[str], target_name: str,
-             save_dir: str,
+             save_dir: Union[str,Path],
              debug: bool=False, save_model: bool=False):
     '''
     1. Train model.
@@ -889,7 +893,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: TaskType,
     return trained_model, y_pred
 ###### 5. Execution ######
-def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list[str], model_object: Union[RegressionTreeModels, ClassificationTreeModels],
+def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Path], target_columns: list[str], model_object: Union[RegressionTreeModels, ClassificationTreeModels],
          handle_classification_imbalance: HandleImbalanceStrategy=None, save_model: bool=False,
          test_size: float=0.2, debug:bool=False):
     #Check models
@@ -907,10 +911,11 @@ def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list
         raise TypeError(f"Unrecognized model {type(model_object)}")
     #Check paths
-    _check_paths(datasets_dir, save_dir)
+    datasets_path = make_fullpath(datasets_dir)
+    save_path = make_fullpath(save_dir, make=True)
     #Yield imputed dataset
-    for dataframe, dataframe_name in yield_dataframes_from_dir(datasets_dir):
+    for dataframe, dataframe_name in yield_dataframes_from_dir(datasets_path):
         #Yield features dataframe and target dataframe
         for df_features, df_target, feature_names, target_name in dataset_yielder(df=dataframe, target_cols=target_columns):
             #Dataset pipeline
@@ -925,15 +930,8 @@ def run_ensemble_pipeline(datasets_dir: str, save_dir: str, target_columns: list
                                     train_features=X_train, train_target=y_train, # type: ignore
                                     test_features=X_test, test_target=y_test,
                                     feature_names=feature_names,target_name=target_name,
-                                    debug=debug, save_dir=save_dir, save_model=save_model)
+                                    debug=debug, save_dir=save_path, save_model=save_model)
     print("\n✅ Training and evaluation complete.")
-def _check_paths(datasets_dir: str, save_dir:str):
-    if not os.path.isdir(save_dir):
-        os.makedirs(save_dir)
-    if not os.path.isdir(datasets_dir):
-        raise IOError(f"Datasets directory '{datasets_dir}' not found.")
 def info():

dragon-ml-toolbox 1.4.8__py3-none-any.whl → 2.1.0__py3-none-any.whl

Potentially problematic release.

dragon-ml-toolbox 1.4.8__py3-none-any.whl → 2.1.0__py3-none-any.whl