dragon-ml-toolbox 5.3.0-py3-none-any.whl → 6.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 5.3.0
+ Version: 6.0.0
  Summary: A collection of tools for data science and machine learning projects.
  Author-email: Karl Loza <luigiloza@gmail.com>
  License-Expression: MIT
@@ -141,19 +141,22 @@ pip install "dragon-ml-toolbox[pytorch]"
  ```bash
  custom_logger
  data_exploration
- ensemble_learning
+ ensemble_evaluation
  ensemble_inference
+ ensemble_learning
  ETL_engineering
- ML_datasetmaster
- ML_models
  ML_callbacks
+ ML_datasetmaster
  ML_evaluation
- ML_trainer
  ML_inference
+ ML_models
+ ML_optimization
+ ML_trainer
+ optimization_tools
  path_manager
  PSO_optimization
- SQL
  RNN_forecast
+ SQL
  utilities
  ```
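The module list above shows the shape of the 6.0.0 reorganization: `ensemble_evaluation`, `ML_optimization`, and `optimization_tools` are new entries, and the list is now alphabetized. A minimal import sketch of the migration, assuming the modules remain importable from the `ml_tools` package as the RECORD section below indicates (the exact public names inside `ensemble_evaluation` are not shown in this diff):

```python
# Hedged migration sketch: module names come from the 6.0.0 list above;
# the functions inside each module are not visible in this diff.

# 5.3.0 -- evaluation helpers appear to have lived inside ensemble_learning
# (its RECORD entry shrinks from ~35 KB to ~22 KB in 6.0.0):
# from ml_tools import ensemble_learning

# 6.0.0 -- evaluation is split into its own module, and two
# optimization modules are added:
from ml_tools import ensemble_evaluation   # split out of ensemble_learning
from ml_tools import ensemble_learning     # training-focused remainder
from ml_tools import ML_optimization       # new in 6.0.0
from ml_tools import optimization_tools    # new in 6.0.0
```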
dist-info/RECORD CHANGED
@@ -1,16 +1,16 @@
- dragon_ml_toolbox-5.3.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
- dragon_ml_toolbox-5.3.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
+ dragon_ml_toolbox-6.0.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+ dragon_ml_toolbox-6.0.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
  ml_tools/ETL_engineering.py,sha256=4wwZXi9_U7xfCY70jGBaKniOeZ0m75ppxWpQBd_DmLc,39369
  ml_tools/GUI_tools.py,sha256=n4ZZ5kEjwK5rkOCFJE41HeLFfjhpJVLUSzk9Kd9Kr_0,45410
  ml_tools/MICE_imputation.py,sha256=oFHg-OytOzPYTzBR_wIRHhP71cMn3aupDeT59ABsXlQ,11576
- ml_tools/ML_callbacks.py,sha256=eOCSc-1_e5vC2dQN1ydHGKDLeJ3DqB-eLRLuXp2DpFM,13257
+ ml_tools/ML_callbacks.py,sha256=FEJ80TSEtY0-hdnOsAWeVApQt1mdzTdOntqtoWmMAzE,13310
  ml_tools/ML_datasetmaster.py,sha256=bbKCNA_b_uDIfxP9YIYKZm-VSfUSD15LvegFxpE9DIQ,34315
- ml_tools/ML_evaluation.py,sha256=4dVqe6JF1Ukmk1sAcY8E5EG1oB1_oy2HXE5OT-pZwCs,10273
+ ml_tools/ML_evaluation.py,sha256=A7AlEjy4ZOcdQMh9M3TJIDvCOXqzAbhgLxyhli8S_WY,13593
  ml_tools/ML_inference.py,sha256=Fh-X2UQn3AznWBjf-7iPSxwE-EzkGQm1VEIRUAkURmE,5336
  ml_tools/ML_models.py,sha256=SJhKHGAN2VTBqzcHUOpFWuVZ2Y7U1M4P_axG_LNYWcI,6460
  ml_tools/ML_optimization.py,sha256=zGKpWW4SL1-3iiHglDP-dkuADL73T0kxs3Dc-Lyishs,9671
- ml_tools/ML_trainer.py,sha256=t58Ka6ryaYm0Fi5xje-e-fkmz9DwDLIeJLbh04n_gDg,15034
- ml_tools/PSO_optimization.py,sha256=stH2Ux1sftQgX5EwLc85kHcoT4Rmz6zv7sH2yzf4Zrw,22710
+ ml_tools/ML_trainer.py,sha256=1q_CDXuMfndRsPuNofUn2mg2TlhG6MYuGqjWxTDgN9c,15112
+ ml_tools/PSO_optimization.py,sha256=9Y074d-B5h4Wvp9YPiy6KAeXM-Yv6Il3gWalKvOLVgo,22705
  ml_tools/RNN_forecast.py,sha256=2CyjBLSYYc3xLHxwLXUmP5Qv8AmV1OB_EndETNX1IBk,1956
  ml_tools/SQL.py,sha256=9zzS6AFEJM9aj6nE31hDe8S9TqLonk-J1amwZoiHNbk,10468
  ml_tools/VIF_factor.py,sha256=2nUMupfUoogf8o6ghoFZk_OwWhFXU0R3C9Gj0HOlI14,10415
@@ -19,14 +19,15 @@ ml_tools/_logger.py,sha256=TpgYguxO-CWYqqgLW0tqFjtwZ58PE_W2OCfWNGZr0n0,1175
  ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
  ml_tools/custom_logger.py,sha256=njM_0XPbQ1S-x5LeSQAaTo2if-XVOR_pQSGg4EDeiTU,4603
  ml_tools/data_exploration.py,sha256=P4f8OpRa7Q4i-11nkppxXw5Lx2lwlpn20GwWBbN_xbM,23901
- ml_tools/ensemble_inference.py,sha256=0SNX3YAz5bpvtwYmqEwqyWeIJP2Pb-v-bemENRSO7qg,9426
- ml_tools/ensemble_learning.py,sha256=Zi1oy6G2FWnTI5hBwjlexwF3JKALFS2FN6F8HAlVi_s,35391
+ ml_tools/ensemble_evaluation.py,sha256=ywpBCvmVImocZAcKv52mSdQKKHdLswozknoev39l4Yo,24682
+ ml_tools/ensemble_inference.py,sha256=rtU7eUaQne615n2g7IHZCJI-OvrBCcjxbTkEIvtCGFQ,9414
+ ml_tools/ensemble_learning.py,sha256=dAyFgSTyvxJWjc_enJ_8EUoWwiekBeoNyJNxVY-kcUU,21868
  ml_tools/handle_excel.py,sha256=J9iwIqMZemoxK49J5osSwp9Ge0h9YTKyYGbOm53hcno,13007
- ml_tools/keys.py,sha256=kK9UF-hek2VcPGFILCKl5geoN6flmMOu7IzhdEA6z5Y,1068
+ ml_tools/keys.py,sha256=HtPG8-MWh89C32A7eIlfuuA-DLwkxGkoDfwR2TGN9CQ,1074
  ml_tools/optimization_tools.py,sha256=MuT4OG7_r1QqLUti-yYix7QeCpglezD0oe9BDCq0QXk,5086
  ml_tools/path_manager.py,sha256=Z8e7w3MPqQaN8xmTnKuXZS6CIW59BFwwqGhGc00sdp4,13692
- ml_tools/utilities.py,sha256=T5xbxzBr14odUj7KncSeg-tJzqjmSDLOOmxEaGYLLi4,18447
- dragon_ml_toolbox-5.3.0.dist-info/METADATA,sha256=Lu_JBMfkCPssLk-a2v4b-oZu86cFK1OIB4HtHspVRIk,6643
- dragon_ml_toolbox-5.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dragon_ml_toolbox-5.3.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
- dragon_ml_toolbox-5.3.0.dist-info/RECORD,,
+ ml_tools/utilities.py,sha256=LqXXTovaHbA5AOKRk6Ru6DgAPAM0wPfYU70kUjYBryo,19231
+ dragon_ml_toolbox-6.0.0.dist-info/METADATA,sha256=v7JMG994i_tGqZJmN87pWxswxJEGQTsH2m2fQ_qz0C0,6698
+ dragon_ml_toolbox-6.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-6.0.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-6.0.0.dist-info/RECORD,,
ml_tools/ML_callbacks.py CHANGED
@@ -2,7 +2,7 @@ import numpy as np
  import torch
  from tqdm.auto import tqdm
  from .path_manager import make_fullpath
- from .keys import LogKeys
+ from .keys import PyTorchLogKeys
  from ._logger import _LOGGER
  from typing import Optional
  from ._script_info import _script_info
@@ -96,14 +96,14 @@ class TqdmProgressBar(Callback):
      def on_batch_end(self, batch, logs=None):
          self.batch_bar.update(1) # type: ignore
          if logs:
-             self.batch_bar.set_postfix(loss=f"{logs.get(LogKeys.BATCH_LOSS, 0):.4f}") # type: ignore
+             self.batch_bar.set_postfix(loss=f"{logs.get(PyTorchLogKeys.BATCH_LOSS, 0):.4f}") # type: ignore
 
      def on_epoch_end(self, epoch, logs=None):
          self.batch_bar.close() # type: ignore
          self.epoch_bar.update(1) # type: ignore
          if logs:
-             train_loss_str = f"{logs.get(LogKeys.TRAIN_LOSS, 0):.4f}"
-             val_loss_str = f"{logs.get(LogKeys.VAL_LOSS, 0):.4f}"
+             train_loss_str = f"{logs.get(PyTorchLogKeys.TRAIN_LOSS, 0):.4f}"
+             val_loss_str = f"{logs.get(PyTorchLogKeys.VAL_LOSS, 0):.4f}"
              self.epoch_bar.set_postfix_str(f"Train Loss: {train_loss_str}, Val Loss: {val_loss_str}") # type: ignore
 
      def on_train_end(self, logs=None):
@@ -124,7 +124,7 @@ class EarlyStopping(Callback):
              inferred from the name of the monitored quantity.
          verbose (int): Verbosity mode.
      """
-     def __init__(self, monitor: str=LogKeys.VAL_LOSS, min_delta=0.0, patience=3, mode: Literal['auto', 'min', 'max']='auto', verbose: int=1):
+     def __init__(self, monitor: str=PyTorchLogKeys.VAL_LOSS, min_delta: float=0.0, patience: int=5, mode: Literal['auto', 'min', 'max']='auto', verbose: int=1):
          super().__init__()
          self.monitor = monitor
          self.patience = patience
@@ -201,8 +201,8 @@ class ModelCheckpoint(Callback):
          mode (str): One of {'auto', 'min', 'max'}.
          verbose (int): Verbosity mode.
      """
-     def __init__(self, save_dir: Union[str,Path], monitor: str = LogKeys.VAL_LOSS,
-                  save_best_only: bool = False, mode: Literal['auto', 'min', 'max']= 'auto', verbose: int = 1):
+     def __init__(self, save_dir: Union[str,Path], monitor: str = PyTorchLogKeys.VAL_LOSS,
+                  save_best_only: bool = True, mode: Literal['auto', 'min', 'max']= 'auto', verbose: int = 0):
          super().__init__()
          self.save_dir = make_fullpath(save_dir, make=True, enforce="directory")
          if not self.save_dir.is_dir():
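Net effect for callers: `LogKeys` is now `PyTorchLogKeys`, `EarlyStopping` defaults to `patience=5` (was 3), and `ModelCheckpoint` defaults to `save_best_only=True, verbose=0` (was `False, 1`). A hedged usage sketch; the package-level import paths are assumed from the file layout in the RECORD section, since inside the library these are relative imports:

```python
# Sketch of the 6.0.0 callback API shown in this diff. Import paths are
# assumptions based on the ml_tools/ layout, not confirmed by the diff.
from ml_tools.ML_callbacks import EarlyStopping, ModelCheckpoint
from ml_tools.keys import PyTorchLogKeys  # renamed from LogKeys in 6.0.0

# patience default changed from 3 to 5; passing it explicitly keeps
# behavior stable across both versions.
early_stop = EarlyStopping(monitor=PyTorchLogKeys.VAL_LOSS, patience=5)

# save_best_only now defaults to True and verbose to 0, so a bare call
# keeps only the best checkpoint and stays quiet.
checkpoint = ModelCheckpoint(save_dir="checkpoints/")
```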
ml_tools/ML_evaluation.py CHANGED
@@ -1,6 +1,8 @@
  import numpy as np
  import pandas as pd
  import matplotlib.pyplot as plt
+ import seaborn as sns
+ from sklearn.calibration import CalibrationDisplay
  from sklearn.metrics import (
      classification_report,
      ConfusionMatrixDisplay,
@@ -9,7 +11,9 @@ from sklearn.metrics import (
      mean_squared_error,
      mean_absolute_error,
      r2_score,
-     median_absolute_error
+     median_absolute_error,
+     precision_recall_curve,
+     average_precision_score
  )
  import torch
  import shap
@@ -28,13 +32,13 @@ __all__ = [
  ]
 
 
- def plot_losses(history: dict, save_dir: Optional[Union[str, Path]] = None):
+ def plot_losses(history: dict, save_dir: Union[str, Path]):
      """
      Plots training & validation loss curves from a history object.
 
      Args:
          history (dict): A dictionary containing 'train_loss' and 'val_loss'.
-         save_dir (str | Path | None): Directory to save the plot image.
+         save_dir (str | Path): Directory to save the plot image.
      """
      train_loss = history.get('train_loss', [])
      val_loss = history.get('val_loss', [])
@@ -62,86 +66,123 @@ def plot_losses(history: dict, save_dir: Optional[Union[str, Path]] = None):
      ax.grid(True)
      plt.tight_layout()
 
-     if save_dir:
-         save_dir_path = make_fullpath(save_dir, make=True, enforce="directory")
-         save_path = save_dir_path / "loss_plot.svg"
-         plt.savefig(save_path)
-         _LOGGER.info(f"📉 Loss plot saved as '{save_path.name}'")
-     else:
-         plt.show()
+     save_dir_path = make_fullpath(save_dir, make=True, enforce="directory")
+     save_path = save_dir_path / "loss_plot.svg"
+     plt.savefig(save_path)
+     _LOGGER.info(f"📉 Loss plot saved as '{save_path.name}'")
+
      plt.close(fig)
 
 
- def classification_metrics(y_true: np.ndarray, y_pred: np.ndarray, y_prob: Optional[np.ndarray] = None,
-                            cmap: str = "Blues", save_dir: Optional[Union[str, Path]] = None):
+ def classification_metrics(save_dir: Union[str, Path], y_true: np.ndarray, y_pred: np.ndarray, y_prob: Optional[np.ndarray] = None,
+                            cmap: str = "Blues"):
      """
-     Displays and optionally saves classification metrics and plots.
+     Saves classification metrics and plots.
 
      Args:
          y_true (np.ndarray): Ground truth labels.
          y_pred (np.ndarray): Predicted labels.
          y_prob (np.ndarray, optional): Predicted probabilities for ROC curve.
          cmap (str): Colormap for the confusion matrix.
-         save_dir (str | Path | None): Directory to save plots. If None, plots are shown not saved.
+         save_dir (str | Path): Directory to save plots.
      """
      print("--- Classification Report ---")
-     report: str = classification_report(y_true, y_pred) # type: ignore
-     print(report)
+     # Generate report as both text and dictionary
+     report_text: str = classification_report(y_true, y_pred) # type: ignore
+     report_dict: dict = classification_report(y_true, y_pred, output_dict=True) # type: ignore
+     print(report_text)
 
-     if save_dir:
-         save_dir_path = make_fullpath(save_dir, make=True, enforce="directory")
-         # Save text report
-         report_path = save_dir_path / "classification_report.txt"
-         report_path.write_text(report, encoding="utf-8")
-         _LOGGER.info(f"📝 Classification report saved as '{report_path.name}'")
+     save_dir_path = make_fullpath(save_dir, make=True, enforce="directory")
+     # Save text report
+     report_path = save_dir_path / "classification_report.txt"
+     report_path.write_text(report_text, encoding="utf-8")
+     _LOGGER.info(f"📝 Classification report saved as '{report_path.name}'")
+
+     # --- Save Classification Report Heatmap ---
+     try:
+         plt.figure(figsize=(8, 6), dpi=100)
+         sns.heatmap(pd.DataFrame(report_dict).iloc[:-1, :].T, annot=True, cmap='viridis', fmt='.2f')
+         plt.title("Classification Report")
+         plt.tight_layout()
+         heatmap_path = save_dir_path / "classification_report_heatmap.svg"
+         plt.savefig(heatmap_path)
+         _LOGGER.info(f"📊 Report heatmap saved as '{heatmap_path.name}'")
+         plt.close()
+     except Exception as e:
+         _LOGGER.error(f"❌ Could not generate classification report heatmap: {e}")
 
-         # Save Confusion Matrix
-         fig_cm, ax_cm = plt.subplots(figsize=(6, 6), dpi=100)
-         ConfusionMatrixDisplay.from_predictions(y_true, y_pred, cmap=cmap, ax=ax_cm)
-         ax_cm.set_title("Confusion Matrix")
-         cm_path = save_dir_path / "confusion_matrix.svg"
-         plt.savefig(cm_path)
-         _LOGGER.info(f"❇️ Confusion matrix saved as '{cm_path.name}'")
-         plt.close(fig_cm)
+     # Save Confusion Matrix
+     fig_cm, ax_cm = plt.subplots(figsize=(6, 6), dpi=100)
+     ConfusionMatrixDisplay.from_predictions(y_true, y_pred, cmap=cmap, ax=ax_cm)
+     ax_cm.set_title("Confusion Matrix")
+     cm_path = save_dir_path / "confusion_matrix.svg"
+     plt.savefig(cm_path)
+     _LOGGER.info(f"❇️ Confusion matrix saved as '{cm_path.name}'")
+     plt.close(fig_cm)
 
-         # Save ROC Curve
-         if y_prob is not None and y_prob.ndim > 1 and y_prob.shape[1] >= 2:
-             fpr, tpr, _ = roc_curve(y_true, y_prob[:, 1])
-             auc = roc_auc_score(y_true, y_prob[:, 1])
-             fig_roc, ax_roc = plt.subplots(figsize=(6, 6), dpi=100)
-             ax_roc.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
-             ax_roc.plot([0, 1], [0, 1], 'k--')
-             ax_roc.set_title('Receiver Operating Characteristic (ROC) Curve')
-             ax_roc.set_xlabel('False Positive Rate')
-             ax_roc.set_ylabel('True Positive Rate')
-             ax_roc.legend(loc='lower right')
-             ax_roc.grid(True)
-             roc_path = save_dir_path / "roc_curve.svg"
-             plt.savefig(roc_path)
-             _LOGGER.info(f"📈 ROC curve saved as '{roc_path.name}'")
-             plt.close(fig_roc)
-     else:
-         # Show plots if not saving
-         ConfusionMatrixDisplay.from_predictions(y_true, y_pred, cmap=cmap)
-         plt.show()
-         if y_prob is not None and y_prob.ndim > 1 and y_prob.shape[1] >= 2:
-             fpr, tpr, _ = roc_curve(y_true, y_prob[:, 1])
-             auc = roc_auc_score(y_true, y_prob[:, 1])
-             plt.figure()
-             plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
-             plt.plot([0, 1], [0, 1], 'k--')
-             plt.title('ROC Curve')
-             plt.show()
+     # Plotting logic for ROC and PR Curves
+     if y_prob is not None and y_prob.ndim > 1 and y_prob.shape[1] >= 2:
+         # Use probabilities of the positive class
+         y_score = y_prob[:, 1]
+
+         # --- Save ROC Curve ---
+         fpr, tpr, _ = roc_curve(y_true, y_score)
+         auc = roc_auc_score(y_true, y_score)
+         fig_roc, ax_roc = plt.subplots(figsize=(6, 6), dpi=100)
+         ax_roc.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
+         ax_roc.plot([0, 1], [0, 1], 'k--')
+         ax_roc.set_title('Receiver Operating Characteristic (ROC) Curve')
+         ax_roc.set_xlabel('False Positive Rate')
+         ax_roc.set_ylabel('True Positive Rate')
+         ax_roc.legend(loc='lower right')
+         ax_roc.grid(True)
+         roc_path = save_dir_path / "roc_curve.svg"
+         plt.savefig(roc_path)
+         _LOGGER.info(f"📈 ROC curve saved as '{roc_path.name}'")
+         plt.close(fig_roc)
+
+         # --- Save Precision-Recall Curve ---
+         precision, recall, _ = precision_recall_curve(y_true, y_score)
+         ap_score = average_precision_score(y_true, y_score)
+         fig_pr, ax_pr = plt.subplots(figsize=(6, 6), dpi=100)
+         ax_pr.plot(recall, precision, label=f'AP = {ap_score:.2f}')
+         ax_pr.set_title('Precision-Recall Curve')
+         ax_pr.set_xlabel('Recall')
+         ax_pr.set_ylabel('Precision')
+         ax_pr.legend(loc='lower left')
+         ax_pr.grid(True)
+         pr_path = save_dir_path / "pr_curve.svg"
+         plt.savefig(pr_path)
+         _LOGGER.info(f"📈 PR curve saved as '{pr_path.name}'")
+         plt.close(fig_pr)
+
+         # --- Save Calibration Plot ---
+         if y_prob.ndim > 1 and y_prob.shape[1] >= 2:
+             y_score = y_prob[:, 1] # Use probabilities of the positive class
+
+             fig_cal, ax_cal = plt.subplots(figsize=(8, 8), dpi=100)
+             CalibrationDisplay.from_predictions(y_true, y_score, n_bins=15, ax=ax_cal)
+
+             ax_cal.set_title('Calibration Plot (Reliability Curve)')
+             ax_cal.set_xlabel('Mean Predicted Probability')
+             ax_cal.set_ylabel('Fraction of Positives')
+             ax_cal.grid(True)
+             plt.tight_layout()
+
+             cal_path = save_dir_path / "calibration_plot.svg"
+             plt.savefig(cal_path)
+             _LOGGER.info(f"✅ Calibration plot saved as '{cal_path.name}'")
+             plt.close(fig_cal)
 
 
- def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Optional[Union[str, Path]] = None):
+ def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Union[str, Path]):
      """
-     Displays regression metrics and optionally saves plots and report.
+     Saves regression metrics and plots.
 
      Args:
          y_true (np.ndarray): Ground truth values.
          y_pred (np.ndarray): Predicted values.
-         save_dir (str | None): Directory to save plots and report.
+         save_dir (str | Path): Directory to save plots and report.
      """
      rmse = np.sqrt(mean_squared_error(y_true, y_pred))
      mae = mean_absolute_error(y_true, y_pred)
@@ -158,44 +199,56 @@ def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Optional[Union[str, Path]] = None):
      report_string = "\n".join(report_lines)
      print(report_string)
 
-     if save_dir:
-         save_dir_path = make_fullpath(save_dir, make=True, enforce="directory")
-         # Save text report
-         report_path = save_dir_path / "regression_report.txt"
-         report_path.write_text(report_string)
-         _LOGGER.info(f"📝 Regression report saved as '{report_path.name}'")
+     save_dir_path = make_fullpath(save_dir, make=True, enforce="directory")
+     # Save text report
+     report_path = save_dir_path / "regression_report.txt"
+     report_path.write_text(report_string)
+     _LOGGER.info(f"📝 Regression report saved as '{report_path.name}'")
 
-         # Save residual plot
-         residuals = y_true - y_pred
-         fig_res, ax_res = plt.subplots(figsize=(8, 6), dpi=100)
-         ax_res.scatter(y_pred, residuals, alpha=0.6)
-         ax_res.axhline(0, color='red', linestyle='--')
-         ax_res.set_xlabel("Predicted Values")
-         ax_res.set_ylabel("Residuals")
-         ax_res.set_title("Residual Plot")
-         ax_res.grid(True)
-         plt.tight_layout()
-         res_path = save_dir_path / "residual_plot.svg"
-         plt.savefig(res_path)
-         _LOGGER.info(f"📈 Residual plot saved as '{res_path.name}'")
-         plt.close(fig_res)
+     # Save residual plot
+     residuals = y_true - y_pred
+     fig_res, ax_res = plt.subplots(figsize=(8, 6), dpi=100)
+     ax_res.scatter(y_pred, residuals, alpha=0.6)
+     ax_res.axhline(0, color='red', linestyle='--')
+     ax_res.set_xlabel("Predicted Values")
+     ax_res.set_ylabel("Residuals")
+     ax_res.set_title("Residual Plot")
+     ax_res.grid(True)
+     plt.tight_layout()
+     res_path = save_dir_path / "residual_plot.svg"
+     plt.savefig(res_path)
+     _LOGGER.info(f"📈 Residual plot saved as '{res_path.name}'")
+     plt.close(fig_res)
 
-         # Save true vs predicted plot
-         fig_tvp, ax_tvp = plt.subplots(figsize=(8, 6), dpi=100)
-         ax_tvp.scatter(y_true, y_pred, alpha=0.6)
-         ax_tvp.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'k--', lw=2)
-         ax_tvp.set_xlabel('True Values')
-         ax_tvp.set_ylabel('Predictions')
-         ax_tvp.set_title('True vs. Predicted Values')
-         ax_tvp.grid(True)
-         plt.tight_layout()
-         tvp_path = save_dir_path / "true_vs_predicted_plot.svg"
-         plt.savefig(tvp_path)
-         _LOGGER.info(f"📉 True vs. Predicted plot saved as '{tvp_path.name}'")
-         plt.close(fig_tvp)
+     # Save true vs predicted plot
+     fig_tvp, ax_tvp = plt.subplots(figsize=(8, 6), dpi=100)
+     ax_tvp.scatter(y_true, y_pred, alpha=0.6)
+     ax_tvp.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'k--', lw=2)
+     ax_tvp.set_xlabel('True Values')
+     ax_tvp.set_ylabel('Predictions')
+     ax_tvp.set_title('True vs. Predicted Values')
+     ax_tvp.grid(True)
+     plt.tight_layout()
+     tvp_path = save_dir_path / "true_vs_predicted_plot.svg"
+     plt.savefig(tvp_path)
+     _LOGGER.info(f"📉 True vs. Predicted plot saved as '{tvp_path.name}'")
+     plt.close(fig_tvp)
+
+     # Save Histogram of Residuals
+     fig_hist, ax_hist = plt.subplots(figsize=(8, 6), dpi=100)
+     sns.histplot(residuals, kde=True, ax=ax_hist)
+     ax_hist.set_xlabel("Residual Value")
+     ax_hist.set_ylabel("Frequency")
+     ax_hist.set_title("Distribution of Residuals")
+     ax_hist.grid(True)
+     plt.tight_layout()
+     hist_path = save_dir_path / "residuals_histogram.svg"
+     plt.savefig(hist_path)
+     _LOGGER.info(f"📊 Residuals histogram saved as '{hist_path.name}'")
+     plt.close(fig_hist)
 
 
- def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain: torch.Tensor,
+ def shap_summary_plot(model, background_data: Union[torch.Tensor,np.ndarray], instances_to_explain: Union[torch.Tensor,np.ndarray],
                        feature_names: Optional[list[str]]=None, save_dir: Optional[Union[str, Path]] = None):
      """
      Calculates SHAP values and saves summary plots and data.
@@ -207,24 +260,54 @@ def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain
          feature_names (list of str | None): Names of the features for plot labeling.
          save_dir (str | Path | None): Directory to save SHAP artifacts. If None, dot plot is shown.
      """
+     # everything to numpy
+     if isinstance(background_data, np.ndarray):
+         background_data_np = background_data
+     else:
+         background_data_np = background_data.numpy()
+
+     if isinstance(instances_to_explain, np.ndarray):
+         instances_to_explain_np = instances_to_explain
+     else:
+         instances_to_explain_np = instances_to_explain.numpy()
+
+     # --- Data Validation Step ---
+     if np.isnan(background_data_np).any() or np.isnan(instances_to_explain_np).any():
+         _LOGGER.error("❌ Input data for SHAP contains NaN values. Aborting explanation.")
+         return
+
      print("\n--- SHAP Value Explanation ---")
-     print("Calculating SHAP values... ")
 
      model.eval()
      model.cpu()
 
-     explainer = shap.DeepExplainer(model, background_data)
-     shap_values = explainer.shap_values(instances_to_explain)
-
-     shap_values_for_plot = shap_values[1] if isinstance(shap_values, list) else shap_values
-     if isinstance(shap_values, list):
-         _LOGGER.info("Using SHAP values for the positive class (class 1) for plots.")
+     # 1. Summarize the background data.
+     # Summarize the background data using k-means. 10-50 clusters is a good starting point.
+     background_summary = shap.kmeans(background_data_np, 30)
+
+     # 2. Define a prediction function wrapper that SHAP can use. It must take a numpy array and return a numpy array.
+     def prediction_wrapper(x_np: np.ndarray) -> np.ndarray:
+         # Convert numpy data to torch tensor
+         x_torch = torch.from_numpy(x_np).float()
+         with torch.no_grad():
+             # Get model output
+             output = model(x_torch)
+         # Return as numpy array
+         return output.cpu().numpy().flatten()
 
+     # 3. Create the KernelExplainer
+     explainer = shap.KernelExplainer(prediction_wrapper, background_summary)
+
+     print("Calculating SHAP values with KernelExplainer...")
+     shap_values = explainer.shap_values(instances_to_explain_np, l1_reg="aic")
+
      if save_dir:
          save_dir_path = make_fullpath(save_dir, make=True, enforce="directory")
+         plt.ioff()
+
          # Save Bar Plot
          bar_path = save_dir_path / "shap_bar_plot.svg"
-         shap.summary_plot(shap_values_for_plot, instances_to_explain, feature_names=feature_names, plot_type="bar", show=False)
+         shap.summary_plot(shap_values, instances_to_explain_np, feature_names=feature_names, plot_type="bar", show=False)
          plt.title("SHAP Feature Importance")
          plt.tight_layout()
          plt.savefig(bar_path)
@@ -233,7 +316,7 @@ def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain
 
          # Save Dot Plot
          dot_path = save_dir_path / "shap_dot_plot.svg"
-         shap.summary_plot(shap_values_for_plot, instances_to_explain, feature_names=feature_names, plot_type="dot", show=False)
+         shap.summary_plot(shap_values, instances_to_explain_np, feature_names=feature_names, plot_type="dot", show=False)
          plt.title("SHAP Feature Importance")
          plt.tight_layout()
          plt.savefig(dot_path)
@@ -242,18 +325,25 @@ def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain
 
          # Save Summary Data to CSV
          summary_path = save_dir_path / "shap_summary.csv"
-         mean_abs_shap = np.abs(shap_values_for_plot).mean(axis=0)
+         # Ensure the array is 1D before creating the DataFrame
+         mean_abs_shap = np.abs(shap_values).mean(axis=0).flatten()
+
          if feature_names is None:
              feature_names = [f'feature_{i}' for i in range(len(mean_abs_shap))]
+
          summary_df = pd.DataFrame({
              'feature': feature_names,
              'mean_abs_shap_value': mean_abs_shap
          }).sort_values('mean_abs_shap_value', ascending=False)
+
          summary_df.to_csv(summary_path, index=False)
+
         _LOGGER.info(f"📝 SHAP summary data saved as '{summary_path.name}'")
+         plt.ion()
+
      else:
          _LOGGER.info("No save directory provided. Displaying SHAP dot plot.")
-         shap.summary_plot(shap_values_for_plot, instances_to_explain, feature_names=feature_names, plot_type="dot")
+         shap.summary_plot(shap_values, instances_to_explain_np, feature_names=feature_names, plot_type="dot")
 
 
  def info():
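The breaking change for callers is that `save_dir` is now required: `plot_losses` and `regression_metrics` take it as a mandatory argument, and `classification_metrics` moves it to the first position. The `shap_summary_plot` rewrite also swaps `shap.DeepExplainer` for a `shap.KernelExplainer` over a k-means background summary, and accepts NumPy arrays as well as tensors. A call-site sketch with placeholder data and paths, assuming package-level imports:

```python
import numpy as np
from ml_tools.ML_evaluation import (
    classification_metrics, regression_metrics, plot_losses,
)

# Placeholder arrays standing in for real model outputs.
y_true = np.array([0, 1, 1, 0, 1])
y_pred = np.array([0, 1, 0, 0, 1])
y_prob = np.array([[0.8, 0.2], [0.3, 0.7], [0.6, 0.4], [0.9, 0.1], [0.2, 0.8]])

# 5.3.0: classification_metrics(y_true, y_pred, y_prob, save_dir=None)
# 6.0.0: save_dir is first and mandatory; plots are saved, never shown.
classification_metrics("reports/", y_true, y_pred, y_prob)

# regression_metrics and plot_losses now also require the directory.
regression_metrics(y_true.astype(float), y_pred.astype(float), "reports/")
plot_losses({"train_loss": [1.0, 0.5], "val_loss": [1.1, 0.7]}, save_dir="reports/")
```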
ml_tools/ML_trainer.py CHANGED
@@ -8,16 +8,16 @@ import numpy as np
  from .ML_callbacks import Callback, History, TqdmProgressBar
  from .ML_evaluation import classification_metrics, regression_metrics, plot_losses, shap_summary_plot
  from ._script_info import _script_info
- from .keys import LogKeys
+ from .keys import PyTorchLogKeys
  from ._logger import _LOGGER
 
 
  __all__ = [
-     "MyTrainer"
+     "MLTrainer"
  ]
 
 
- class MyTrainer:
+ class MLTrainer:
      def __init__(self, model: nn.Module, train_dataset: Dataset, test_dataset: Dataset,
                   kind: Literal["regression", "classification"],
                   criterion: nn.Module, optimizer: torch.optim.Optimizer,
@@ -95,14 +95,16 @@ class MyTrainer:
              batch_size=batch_size,
              shuffle=shuffle,
              num_workers=loader_workers,
-             pin_memory=(self.device.type == "cuda")
+             pin_memory=("cuda" in self.device.type),
+             drop_last=True # Drops the last batch if incomplete, selecting a good batch size is key.
          )
+
          self.test_loader = DataLoader(
              dataset=self.test_dataset,
              batch_size=batch_size,
              shuffle=False,
              num_workers=loader_workers,
-             pin_memory=(self.device.type == "cuda")
+             pin_memory=("cuda" in self.device.type)
          )
 
      def fit(self, epochs: int = 10, batch_size: int = 10, shuffle: bool = True):
@@ -159,8 +161,8 @@ class MyTrainer:
          for batch_idx, (features, target) in enumerate(self.train_loader): # type: ignore
              # Create a log dictionary for the batch
              batch_logs = {
-                 LogKeys.BATCH_INDEX: batch_idx,
-                 LogKeys.BATCH_SIZE: features.size(0)
+                 PyTorchLogKeys.BATCH_INDEX: batch_idx,
+                 PyTorchLogKeys.BATCH_SIZE: features.size(0)
              }
              self.callbacks_hook('on_batch_begin', batch_idx, logs=batch_logs)
 
@@ -178,11 +180,11 @@ class MyTrainer:
              running_loss += batch_loss * features.size(0)
 
              # Add the batch loss to the logs and call the end-of-batch hook
-             batch_logs[LogKeys.BATCH_LOSS] = batch_loss
+             batch_logs[PyTorchLogKeys.BATCH_LOSS] = batch_loss
              self.callbacks_hook('on_batch_end', batch_idx, logs=batch_logs)
 
          # Return the average loss for the entire epoch
-         return {LogKeys.TRAIN_LOSS: running_loss / len(self.train_loader.dataset)} # type: ignore
+         return {PyTorchLogKeys.TRAIN_LOSS: running_loss / len(self.train_loader.dataset)} # type: ignore
 
      def _validation_step(self):
          self.model.eval()
@@ -195,7 +197,7 @@ class MyTrainer:
                  output = output.view_as(target)
                  loss = self.criterion(output, target)
                  running_loss += loss.item() * features.size(0)
-         logs = {LogKeys.VAL_LOSS: running_loss / len(self.test_loader.dataset)} # type: ignore
+         logs = {PyTorchLogKeys.VAL_LOSS: running_loss / len(self.test_loader.dataset)} # type: ignore
          return logs
 
      def _predict_for_eval(self, dataloader: DataLoader):
@@ -230,14 +232,14 @@ class MyTrainer:
 
              yield y_pred_batch, y_prob_batch, y_true_batch
 
-     def evaluate(self, save_dir: Optional[Union[str,Path]], data: Optional[Union[DataLoader, Dataset]] = None):
+     def evaluate(self, save_dir: Union[str,Path], data: Optional[Union[DataLoader, Dataset]] = None):
          """
          Evaluates the model on the given data.
 
          Args:
              data (DataLoader | Dataset | None ): The data to evaluate on.
                  Can be a DataLoader or a Dataset. If None, defaults to the trainer's internal test_dataset.
-             save_dir (str | Path | None): Directory to save all reports and plots. If None, metrics are shown but not saved.
+             save_dir (str | Path): Directory to save all reports and plots.
          """
          eval_loader = None
          if isinstance(data, DataLoader):
@@ -273,14 +275,14 @@ class MyTrainer:
          y_prob = np.concatenate(all_probs) if self.kind == "classification" else None
 
          if self.kind == "classification":
-             classification_metrics(y_true, y_pred, y_prob, save_dir=save_dir)
+             classification_metrics(save_dir, y_true, y_pred, y_prob)
          else:
-             regression_metrics(y_true.flatten(), y_pred.flatten(), save_dir=save_dir)
+             regression_metrics(y_true.flatten(), y_pred.flatten(), save_dir)
 
          print("\n--- Training History ---")
          plot_losses(self.history, save_dir=save_dir)
 
-     def explain(self, explain_dataset: Optional[Dataset] = None, n_samples: int = 100,
+     def explain(self, explain_dataset: Optional[Dataset] = None, n_samples: int = 1000,
                  feature_names: Optional[List[str]] = None, save_dir: Optional[Union[str,Path]] = None):
          """
          Explains model predictions using SHAP and saves all artifacts.
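For downstream code, the headline change is the rename `MyTrainer` → `MLTrainer`; beyond that, `evaluate()` now requires `save_dir`, the training loader drops the last incomplete batch (`drop_last=True`), and `explain()` defaults to `n_samples=1000` (was 100). A self-contained sketch, under the assumption that the constructor arguments beyond those visible in this hunk keep their 5.3.0 defaults:

```python
import torch
from torch import nn
from torch.utils.data import TensorDataset
from ml_tools.ML_trainer import MLTrainer  # renamed from MyTrainer in 6.0.0

# Toy regression data so the sketch is runnable end to end.
X = torch.randn(128, 4)
y = X.sum(dim=1, keepdim=True)
model = nn.Linear(4, 1)

trainer = MLTrainer(
    model=model,
    train_dataset=TensorDataset(X[:96], y[:96]),
    test_dataset=TensorDataset(X[96:], y[96:]),
    kind="regression",
    criterion=nn.MSELoss(),
    optimizer=torch.optim.Adam(model.parameters()),
)

# drop_last=True now applies to the training loader, so pick a batch
# size that does not leave a tiny final batch (96 / 16 = 6 full batches).
trainer.fit(epochs=5, batch_size=16)

# evaluate() now requires a save directory; reports and plots are
# written there instead of being displayed.
trainer.evaluate(save_dir="results/")
```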