PyPI - dragon-ml-toolbox - Versions diffs - 2.3.0__py3-none-any.whl → 3.0.0__py3-none-any.whl - Mend

dragon-ml-toolbox 2.3.0py3-none-any.whl → 3.0.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

{dragon_ml_toolbox-2.3.0.dist-info → dragon_ml_toolbox-3.0.0.dist-info}/METADATA +26 -9
dragon_ml_toolbox-3.0.0.dist-info/RECORD +25 -0
ml_tools/ETL_engineering.py +8 -7
ml_tools/GUI_tools.py +495 -0
ml_tools/MICE_imputation.py +8 -4
ml_tools/ML_callbacks.py +341 -0
ml_tools/ML_evaluation.py +255 -0
ml_tools/ML_trainer.py +344 -0
ml_tools/ML_tutorial.py +300 -0
ml_tools/PSO_optimization.py +27 -20
ml_tools/RNN_forecast.py +49 -0
ml_tools/VIF_factor.py +6 -5
ml_tools/datasetmaster.py +601 -527
ml_tools/ensemble_learning.py +12 -9
ml_tools/handle_excel.py +9 -10
ml_tools/logger.py +45 -8
ml_tools/utilities.py +18 -1
dragon_ml_toolbox-2.3.0.dist-info/RECORD +0 -21
ml_tools/trainer.py +0 -346
ml_tools/vision_helpers.py +0 -231
{dragon_ml_toolbox-2.3.0.dist-info → dragon_ml_toolbox-3.0.0.dist-info}/WHEEL +0 -0
{dragon_ml_toolbox-2.3.0.dist-info → dragon_ml_toolbox-3.0.0.dist-info}/licenses/LICENSE +0 -0
{dragon_ml_toolbox-2.3.0.dist-info → dragon_ml_toolbox-3.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
{dragon_ml_toolbox-2.3.0.dist-info → dragon_ml_toolbox-3.0.0.dist-info}/top_level.txt +0 -0
/ml_tools/{pytorch_models.py → _pytorch_models.py} +0 -0

ml_tools/MICE_imputation.py CHANGED Viewed

@@ -6,6 +6,7 @@ import numpy as np
 from .utilities import load_dataframe, list_csv_paths, sanitize_filename, _script_info, merge_dataframes, save_dataframe, threshold_binary_values, make_fullpath
 from plotnine import ggplot, labs, theme, element_blank # type: ignore
 from typing import Optional, Union
+from .logger import _LOGGER
 __all__ = [
@@ -40,7 +41,9 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str
     if binary_columns is not None:
         invalid_binary_columns = set(binary_columns) - set(df.columns)
         if invalid_binary_columns:
-            print(f"⚠️ These 'binary columns' are not in the dataset: {invalid_binary_columns}")
+            _LOGGER.warning(f"⚠️ These 'binary columns' are not in the dataset:")
+            for invalid_binary_col in invalid_binary_columns:
+                print(f"  - {invalid_binary_col}")
         valid_binary_columns = [col for col in binary_columns if col not in invalid_binary_columns]
         for imputed_df in imputed_datasets:
             for binary_column_name in valid_binary_columns:
@@ -125,7 +128,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
             plt.savefig(save_path, bbox_inches='tight', format="svg")
             plt.close()
-        print(f"{dataset_file_dir} completed.")
+        _LOGGER.info(f"{dataset_file_dir} completed.")
 # Imputed distributions
@@ -210,7 +213,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
             fig = kernel.plot_imputed_distributions(variables=[feature])
             _process_figure(fig, feature)
-    print(f"{local_dir_name} completed.")
+    _LOGGER.info(f"{local_dir_name} completed.")
 def run_mice_pipeline(df_path_or_dir: Union[str,Path], target_columns: list[str],
@@ -240,7 +243,8 @@ def run_mice_pipeline(df_path_or_dir: Union[str,Path], target_columns: list[str]
         all_file_paths = list(list_csv_paths(input_path).values())
     for df_path in all_file_paths:
-        df, df_name = load_dataframe(df_path=df_path)
+        df: pd.DataFrame
+        df, df_name = load_dataframe(df_path=df_path, kind="pandas") # type: ignore
         df, df_targets = _skip_targets(df, target_columns)

ml_tools/ML_callbacks.py ADDED Viewed

@@ -0,0 +1,341 @@
+import numpy as np
+import torch
+from tqdm.auto import tqdm
+from .utilities import make_fullpath, LogKeys
+from .logger import _LOGGER
+from typing import Optional
+# Names exported by `from <module> import *`: the callback base class and the
+# concrete callbacks defined in this module.
+__all__ = [
+    "Callback",
+    "History",
+    "TqdmProgressBar",
+    "EarlyStopping",
+    "ModelCheckpoint",
+    "LRScheduler"
+]
+class Callback:
+    """
+    Base class for training callbacks.
+    The Trainer invokes the hook methods below at fixed points of the training
+    loop. Each hook does nothing here; subclasses override only the hooks they
+    need in order to add custom logic.
+    """
+    def __init__(self):
+        # No trainer is attached until `set_trainer` is called.
+        self.trainer = None
+    def set_trainer(self, trainer):
+        """Store a reference to the Trainer that drives this callback."""
+        self.trainer = trainer
+    def on_train_begin(self, logs=None):
+        """Hook invoked at the beginning of training."""
+        return None
+    def on_train_end(self, logs=None):
+        """Hook invoked at the end of training."""
+        return None
+    def on_epoch_begin(self, epoch, logs=None):
+        """Hook invoked at the beginning of an epoch."""
+        return None
+    def on_epoch_end(self, epoch, logs=None):
+        """Hook invoked at the end of an epoch."""
+        return None
+    def on_batch_begin(self, batch, logs=None):
+        """Hook invoked at the beginning of a training batch."""
+        return None
+    def on_batch_end(self, batch, logs=None):
+        """Hook invoked at the end of a training batch."""
+        return None
+class History(Callback):
+    """
+    Callback that accumulates per-epoch logs in the trainer's `history` dictionary.
+    This callback is automatically applied to every MyTrainer model.
+    `history` maps each metric name (e.g., 'val_loss') to the list of values
+    logged for it, one entry per epoch in which the metric appeared.
+    """
+    def on_train_begin(self, logs=None):
+        # Every training run starts from an empty record.
+        self.trainer.history = dict() # type: ignore
+    def on_epoch_end(self, epoch, logs=None):
+        if not logs:
+            # Nothing was logged for this epoch.
+            return
+        record = self.trainer.history # type: ignore
+        for metric_name, metric_value in logs.items():
+            # Create the list on first sight of a metric, then extend it.
+            record.setdefault(metric_name, []).append(metric_value)
+class TqdmProgressBar(Callback):
+    """
+    Callback that provides tqdm progress bars for training.
+    An outer bar advances once per epoch; an inner bar (created anew for each
+    epoch with `leave=False`) advances once per training batch.
+    """
+    def __init__(self):
+        # Initialise the base class so `self.trainer` always exists, matching
+        # the other callbacks in this module.
+        super().__init__()
+        self.epochs = None
+        self.epoch_bar = None
+        self.batch_bar = None
+    def on_train_begin(self, logs=None):
+        """Create the epoch-level bar, sized from the trainer's `epochs` attribute."""
+        self.epochs = self.trainer.epochs # type: ignore
+        self.epoch_bar = tqdm(total=self.epochs, desc="Training Progress")
+    def on_epoch_begin(self, epoch, logs=None):
+        """Create the batch-level bar, sized from the length of the trainer's `train_loader`."""
+        total_batches = len(self.trainer.train_loader) # type: ignore
+        self.batch_bar = tqdm(total=total_batches, desc=f"Epoch {epoch}/{self.epochs}", leave=False)
+    def on_batch_end(self, batch, logs=None):
+        """Advance the batch bar and show the batch loss when it is present in `logs`."""
+        self.batch_bar.update(1) # type: ignore
+        if logs:
+            self.batch_bar.set_postfix(loss=f"{logs.get(LogKeys.BATCH_LOSS, 0):.4f}") # type: ignore
+    def on_epoch_end(self, epoch, logs=None):
+        """Close the batch bar, advance the epoch bar and show the epoch losses."""
+        self.batch_bar.close() # type: ignore
+        self.epoch_bar.update(1) # type: ignore
+        if logs:
+            # Missing losses are displayed as 0.0000 rather than raising.
+            train_loss_str = f"{logs.get(LogKeys.TRAIN_LOSS, 0):.4f}"
+            val_loss_str = f"{logs.get(LogKeys.VAL_LOSS, 0):.4f}"
+            self.epoch_bar.set_postfix_str(f"Train Loss: {train_loss_str}, Val Loss: {val_loss_str}") # type: ignore
+    def on_train_end(self, logs=None):
+        """Close every bar that was created, including a batch bar left open by an interrupted epoch."""
+        for bar in (self.batch_bar, self.epoch_bar):
+            if bar is not None:
+                bar.close()
+class EarlyStopping(Callback):
+    """
+    Stop training when a monitored metric has stopped improving.
+    Args:
+        monitor (str): Quantity to be monitored. Defaults to 'val_loss'.
+        min_delta (float): Minimum change in the monitored quantity to qualify as an improvement.
+        patience (int): Number of epochs with no improvement after which training will be stopped.
+        mode (str): One of {'auto', 'min', 'max'}. In 'min' mode, training will stop when the quantity
+                    monitored has stopped decreasing; in 'max' mode it will stop when the quantity
+                    monitored has stopped increasing; in 'auto' mode, a monitor name containing
+                    'acc' (case-insensitive) is maximised and anything else is minimised.
+        verbose (int): Verbosity mode. Values above 0 log the stop event; values above 1
+                    also log every improvement.
+    Raises:
+        ValueError: If `mode` is not one of the accepted values.
+    """
+    def __init__(self, monitor: str=LogKeys.VAL_LOSS, min_delta=0.0, patience=3, mode='auto', verbose=1):
+        super().__init__()
+        self.monitor = monitor
+        self.patience = patience
+        self.min_delta = min_delta
+        self.wait = 0
+        self.stopped_epoch = 0
+        self.verbose = verbose
+        if mode not in ['auto', 'min', 'max']:
+            raise ValueError(f"EarlyStopping mode {mode} is unknown, choose one of ('auto', 'min', 'max')")
+        self.mode = mode
+        # Determine the comparison operator based on the mode
+        if self.mode == 'min':
+            self.monitor_op = np.less
+        elif self.mode == 'max':
+            self.monitor_op = np.greater
+        elif 'acc' in self.monitor.lower():
+            # auto mode: accuracy-like metrics are maximised
+            self.monitor_op = np.greater
+        else:
+            # auto mode: default to minimising (loss or other metrics)
+            self.monitor_op = np.less
+        self.best = self._initial_best()
+    def _initial_best(self):
+        """Return the worst possible score for the active comparison direction."""
+        # `np.inf` is used because the `np.Inf` alias was removed in NumPy 2.0.
+        return np.inf if self.monitor_op is np.less else -np.inf
+    def on_train_begin(self, logs=None):
+        """Reset the patience counter and best score so the callback can be reused."""
+        self.wait = 0
+        self.stopped_epoch = 0
+        self.best = self._initial_best()
+    def on_epoch_end(self, epoch, logs=None):
+        """Compare the monitored metric with the best value so far and request a stop once patience runs out."""
+        # `logs` defaults to None, so guard before calling `.get`.
+        logs = logs or {}
+        current = logs.get(self.monitor)
+        if current is None:
+            # The metric was not logged this epoch; leave the counters untouched.
+            return
+        # An improvement must beat `best` by at least `min_delta` in the monitored direction.
+        if self.monitor_op is np.less:
+            threshold = self.best - self.min_delta
+        else:
+            threshold = self.best + self.min_delta
+        if self.monitor_op(current, threshold):
+            if self.verbose > 1:
+                _LOGGER.info(f"EarlyStopping: {self.monitor} improved from {self.best:.4f} to {current:.4f}")
+            self.best = current
+            self.wait = 0
+            return
+        self.wait += 1
+        if self.wait >= self.patience:
+            self.stopped_epoch = epoch
+            # The trainer is expected to check this flag and leave its loop.
+            self.trainer.stop_training = True # type: ignore
+            if self.verbose > 0:
+                # Blank line so the message does not collide with an active progress bar.
+                print("")
+                _LOGGER.info(f"Epoch {epoch+1}: early stopping after {self.wait} epochs with no improvement.")
+class ModelCheckpoint(Callback):
+    """
+    Saves the model's `state_dict` to a directory with automated filename generation and rotation.
+    - If `save_best_only` is True, it saves the single best model (the filename
+      includes the epoch and score), deleting the previous best.
+    - If `save_best_only` is False, it saves after every epoch and keeps the 3 most
+      recent checkpoints, deleting the oldest ones automatically.
+    Args:
+        save_dir (str): Directory where checkpoint files will be saved.
+        monitor (str): Metric to monitor for `save_best_only=True`.
+        save_best_only (bool): If true, save only the best model.
+        mode (str): One of {'auto', 'min', 'max'}. In 'auto' mode a monitor name
+                    containing 'loss' is minimised and anything else is maximised.
+        verbose (int): Verbosity mode. Values above 0 log every save and deletion.
+    Raises:
+        IOError: If `save_dir` does not resolve to a directory.
+        ValueError: If `mode` is not one of the accepted values.
+    """
+    # Number of files retained when `save_best_only` is False.
+    _MAX_ROLLING_CHECKPOINTS = 3
+    def __init__(self, save_dir: str, monitor: str = LogKeys.VAL_LOSS,
+                 save_best_only: bool = False, mode: str = 'auto', verbose: int = 1):
+        super().__init__()
+        self.save_dir = make_fullpath(save_dir, make=True)
+        if not self.save_dir.is_dir():
+            message = f"{save_dir} is not a valid directory."
+            _LOGGER.error(message)
+            # Carry the reason in the exception as well as in the log.
+            raise IOError(message)
+        self.monitor = monitor
+        self.save_best_only = save_best_only
+        self.verbose = verbose
+        # State variables to be managed during training
+        self.saved_checkpoints = []
+        self.last_best_filepath = None
+        if mode not in ['auto', 'min', 'max']:
+            raise ValueError(f"ModelCheckpoint mode {mode} is unknown.")
+        self.mode = mode
+        if self.mode == 'min':
+            self.monitor_op = np.less
+        elif self.mode == 'max':
+            self.monitor_op = np.greater
+        else:
+            self.monitor_op = np.less if 'loss' in self.monitor else np.greater
+        self.best = self._initial_best()
+    def _initial_best(self):
+        """Return the worst possible score for the active comparison direction."""
+        # `np.inf` is used because the `np.Inf` alias was removed in NumPy 2.0.
+        return np.inf if self.monitor_op is np.less else -np.inf
+    def on_train_begin(self, logs=None):
+        """Reset state when training starts."""
+        self.best = self._initial_best()
+        self.saved_checkpoints = []
+        self.last_best_filepath = None
+    def on_epoch_end(self, epoch, logs=None):
+        """Save a checkpoint for this epoch according to `save_best_only`."""
+        logs = logs or {}
+        # Recreate the directory in case it was removed while training.
+        self.save_dir.mkdir(parents=True, exist_ok=True)
+        if self.save_best_only:
+            self._save_best_model(epoch, logs)
+        else:
+            self._save_rolling_checkpoints(epoch, logs)
+    def _save_best_model(self, epoch, logs):
+        """Saves a single best model and deletes the previous one."""
+        current = logs.get(self.monitor)
+        if current is None:
+            # The monitored metric was not logged; nothing to compare.
+            return
+        if not self.monitor_op(current, self.best):
+            return
+        old_best_str = "inf" if np.isinf(self.best) else f"{self.best:.4f}"
+        # Create a descriptive filename
+        filename = f"epoch_{epoch}-{self.monitor}_{current:.4f}.pth"
+        new_filepath = self.save_dir / filename
+        if self.verbose > 0:
+            print("")
+            _LOGGER.info(f"Epoch {epoch}: {self.monitor} improved from {old_best_str} to {current:.4f}, saving model to {new_filepath}")
+        # Save the new best model before removing the old one, so a failed save
+        # never leaves the directory without a checkpoint.
+        torch.save(self.trainer.model.state_dict(), new_filepath) # type: ignore
+        if self.last_best_filepath and self.last_best_filepath.exists():
+            self.last_best_filepath.unlink()
+        # Update state
+        self.best = current
+        self.last_best_filepath = new_filepath
+    def _save_rolling_checkpoints(self, epoch, logs):
+        """Saves the latest model and keeps only the 3 most recent checkpoints."""
+        filename = f"epoch_{epoch}.pth"
+        filepath = self.save_dir / filename
+        if self.verbose > 0:
+            print("")
+            _LOGGER.info(f'Epoch {epoch}: saving model to {filepath}')
+        torch.save(self.trainer.model.state_dict(), filepath) # type: ignore
+        self.saved_checkpoints.append(filepath)
+        # One file is added per call, so at most one has to be removed.
+        if len(self.saved_checkpoints) > self._MAX_ROLLING_CHECKPOINTS:
+            file_to_delete = self.saved_checkpoints.pop(0)
+            if file_to_delete.exists():
+                if self.verbose > 0:
+                    _LOGGER.info(f"  -> Deleting old checkpoint: {file_to_delete.name}")
+                file_to_delete.unlink()
+class LRScheduler(Callback):
+    """
+    Callback to manage a PyTorch learning rate scheduler.
+    This callback automatically calls the scheduler's `step()` method at the
+    end of each epoch. It also logs a message when the learning rate of the
+    optimizer's first parameter group changes.
+    Args:
+        scheduler: An initialized PyTorch learning rate scheduler.
+        monitor (str, optional): The metric to monitor for schedulers that
+                                 require it, like `ReduceLROnPlateau`.
+                                 Should match a key in the logs (e.g., 'val_loss').
+    """
+    def __init__(self, scheduler, monitor: Optional[str] = None):
+        super().__init__()
+        self.scheduler = scheduler
+        self.monitor = monitor
+        self.previous_lr = None
+    def on_train_begin(self, logs=None):
+        """Store the initial learning rate."""
+        self.previous_lr = self.trainer.optimizer.param_groups[0]['lr'] # type: ignore
+    def on_epoch_end(self, epoch, logs=None):
+        """
+        Step the scheduler and log any change in learning rate.
+        Raises:
+            ValueError: If the scheduler is a `ReduceLROnPlateau` and no `monitor` was given.
+        """
+        # `logs` defaults to None, so guard before calling `.get`.
+        logs = logs or {}
+        # For schedulers that need a metric (e.g., val_loss)
+        if isinstance(self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
+            if self.monitor is None:
+                raise ValueError("LRScheduler needs a `monitor` metric for ReduceLROnPlateau.")
+            metric_val = logs.get(self.monitor)
+            if metric_val is not None:
+                self.scheduler.step(metric_val)
+            else:
+                # Skip the step rather than feeding the scheduler a missing value.
+                print("")
+                _LOGGER.warning(f"LRScheduler could not find metric '{self.monitor}' in logs.")
+        # For all other schedulers
+        else:
+            self.scheduler.step()
+        # Log the change if the LR was updated
+        current_lr = self.trainer.optimizer.param_groups[0]['lr'] # type: ignore
+        if current_lr != self.previous_lr:
+            print("")
+            _LOGGER.info(f"Epoch {epoch}: Learning rate changed to {current_lr:.6f}")
+            self.previous_lr = current_lr

ml_tools/ML_evaluation.py ADDED Viewed

@@ -0,0 +1,255 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn.metrics import (
+    classification_report,
+    ConfusionMatrixDisplay,
+    roc_curve,
+    roc_auc_score,
+    mean_squared_error,
+    mean_absolute_error,
+    r2_score,
+    median_absolute_error
+)
+import torch
+import shap
+from pathlib import Path
+from .utilities import make_fullpath
+from .logger import _LOGGER
+from typing import Union, Optional
+# Names exported by `from <module> import *`: the plotting and metric-report
+# functions defined in this module.
+__all__ = [
+    "plot_losses",
+    "classification_metrics",
+    "regression_metrics",
+    "shap_summary_plot"
+]
+def plot_losses(history: dict, save_dir: Optional[Union[str, Path]] = None):
+    """
+    Plots training & validation loss curves from a history object.
+    Either curve is drawn only when its list is non-empty; if both are missing
+    or empty a warning is logged and nothing is plotted.
+    Args:
+        history (dict): A dictionary containing 'train_loss' and/or 'val_loss' lists.
+        save_dir (str | Path | None): Directory to save the plot image ("loss_plot.svg").
+                                      If None, the plot is shown instead of saved.
+    """
+    train_loss = history.get('train_loss', [])
+    val_loss = history.get('val_loss', [])
+    if not train_loss and not val_loss:
+        # Routed through the module logger like every other message in this module.
+        _LOGGER.warning("Loss history is empty or incomplete. Cannot plot.")
+        return
+    fig, ax = plt.subplots(figsize=(10, 5), dpi=100)
+    # Each curve gets its own x-axis because the two lists may differ in length.
+    for losses, label in ((train_loss, 'Training Loss'), (val_loss, 'Validation Loss')):
+        if losses:
+            epochs = range(1, len(losses) + 1)
+            ax.plot(epochs, losses, 'o-', label=label)
+    ax.set_title('Training and Validation Loss')
+    ax.set_xlabel('Epochs')
+    ax.set_ylabel('Loss')
+    ax.legend()
+    ax.grid(True)
+    # Operate on `fig` explicitly instead of relying on pyplot's current figure.
+    fig.tight_layout()
+    if save_dir:
+        save_dir_path = make_fullpath(save_dir, make=True)
+        save_path = save_dir_path / "loss_plot.svg"
+        fig.savefig(save_path)
+        _LOGGER.info(f"Loss plot saved as '{save_path.name}'")
+    else:
+        plt.show()
+    plt.close(fig)
+def classification_metrics(y_true: np.ndarray, y_pred: np.ndarray, y_prob: Optional[np.ndarray] = None,
+                           cmap: str = "Blues", save_dir: Optional[Union[str, Path]] = None):
+    """
+    Displays and optionally saves classification metrics and plots.
+    The text report is always printed. A confusion matrix is always produced.
+    A ROC curve is produced only for binary problems, i.e. when `y_prob` has
+    exactly two columns; column 1 is used as the positive-class score.
+    Args:
+        y_true (np.ndarray): Ground truth labels.
+        y_pred (np.ndarray): Predicted labels.
+        y_prob (np.ndarray, optional): Predicted probabilities of shape (n_samples, 2) for the ROC curve.
+        cmap (str): Colormap for the confusion matrix.
+        save_dir (str | Path | None): Directory to save plots. If None, plots are shown not saved.
+    """
+    print("--- Classification Report ---")
+    report: str = classification_report(y_true, y_pred) # type: ignore
+    print(report)
+    # `roc_curve` only supports binary targets, so any other probability layout
+    # is skipped with a warning instead of raising after the report was produced.
+    has_binary_probs = y_prob is not None and y_prob.ndim == 2 and y_prob.shape[1] == 2
+    if y_prob is not None and not has_binary_probs:
+        _LOGGER.warning("ROC curve skipped: `y_prob` must have shape (n_samples, 2).")
+    if save_dir:
+        save_dir_path = make_fullpath(save_dir, make=True)
+        # Save text report
+        report_path = save_dir_path / "classification_report.txt"
+        report_path.write_text(report, encoding="utf-8")
+        _LOGGER.info(f"Classification report saved as '{report_path.name}'")
+        # Save Confusion Matrix
+        fig_cm, ax_cm = plt.subplots(figsize=(6, 6), dpi=100)
+        ConfusionMatrixDisplay.from_predictions(y_true, y_pred, cmap=cmap, ax=ax_cm)
+        ax_cm.set_title("Confusion Matrix")
+        cm_path = save_dir_path / "confusion_matrix.svg"
+        fig_cm.savefig(cm_path)
+        _LOGGER.info(f"Confusion matrix saved as '{cm_path.name}'")
+        plt.close(fig_cm)
+        # Save ROC Curve
+        if has_binary_probs:
+            fig_roc = _build_roc_figure(y_true, y_prob[:, 1]) # type: ignore
+            roc_path = save_dir_path / "roc_curve.svg"
+            fig_roc.savefig(roc_path)
+            _LOGGER.info(f"ROC curve saved as '{roc_path.name}'")
+            plt.close(fig_roc)
+    else:
+        # Show plots if not saving
+        ConfusionMatrixDisplay.from_predictions(y_true, y_pred, cmap=cmap)
+        plt.show()
+        if has_binary_probs:
+            _build_roc_figure(y_true, y_prob[:, 1]) # type: ignore
+            plt.show()
+def _build_roc_figure(y_true: np.ndarray, positive_scores: np.ndarray):
+    """Draw the ROC curve with its AUC for a binary problem and return the figure."""
+    fpr, tpr, _ = roc_curve(y_true, positive_scores)
+    auc = roc_auc_score(y_true, positive_scores)
+    fig_roc, ax_roc = plt.subplots(figsize=(6, 6), dpi=100)
+    ax_roc.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
+    # Diagonal reference line: the expected curve of a random classifier.
+    ax_roc.plot([0, 1], [0, 1], 'k--')
+    ax_roc.set_title('Receiver Operating Characteristic (ROC) Curve')
+    ax_roc.set_xlabel('False Positive Rate')
+    ax_roc.set_ylabel('True Positive Rate')
+    ax_roc.legend(loc='lower right')
+    ax_roc.grid(True)
+    return fig_roc
+def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Optional[Union[str, Path]] = None):
+    """
+    Displays regression metrics and optionally saves plots and report.
+    The report (RMSE, MAE, MedAE, R²) is always printed. The text report, the
+    residual plot and the true-vs-predicted plot are written only when
+    `save_dir` is given; nothing is plotted otherwise.
+    Args:
+        y_true (np.ndarray): Ground truth values.
+        y_pred (np.ndarray): Predicted values.
+        save_dir (str | Path | None): Directory to save plots and report.
+    """
+    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
+    mae = mean_absolute_error(y_true, y_pred)
+    r2 = r2_score(y_true, y_pred)
+    medae = median_absolute_error(y_true, y_pred)
+    report_lines = [
+        "--- Regression Report ---",
+        f"  Root Mean Squared Error (RMSE): {rmse:.4f}",
+        f"  Mean Absolute Error (MAE):      {mae:.4f}",
+        f"  Median Absolute Error (MedAE):  {medae:.4f}",
+        f"  Coefficient of Determination (R²): {r2:.4f}"
+    ]
+    report_string = "\n".join(report_lines)
+    print(report_string)
+    if not save_dir:
+        return
+    save_dir_path = make_fullpath(save_dir, make=True)
+    # Save text report. The encoding is explicit because the report contains a
+    # non-ASCII character ("²") and the platform default encoding may not support it.
+    report_path = save_dir_path / "regression_report.txt"
+    report_path.write_text(report_string, encoding="utf-8")
+    _LOGGER.info(f"Regression report saved as '{report_path.name}'")
+    # Save residual plot
+    # NOTE(review): assumes y_true and y_pred have identical shapes; a (n,) vs
+    # (n, 1) pair would broadcast here — confirm against callers.
+    residuals = y_true - y_pred
+    fig_res, ax_res = plt.subplots(figsize=(8, 6), dpi=100)
+    ax_res.scatter(y_pred, residuals, alpha=0.6)
+    ax_res.axhline(0, color='red', linestyle='--')
+    ax_res.set_xlabel("Predicted Values")
+    ax_res.set_ylabel("Residuals")
+    ax_res.set_title("Residual Plot")
+    ax_res.grid(True)
+    fig_res.tight_layout()
+    res_path = save_dir_path / "residual_plot.svg"
+    fig_res.savefig(res_path)
+    _LOGGER.info(f"Residual plot saved as '{res_path.name}'")
+    plt.close(fig_res)
+    # Save true vs predicted plot
+    fig_tvp, ax_tvp = plt.subplots(figsize=(8, 6), dpi=100)
+    ax_tvp.scatter(y_true, y_pred, alpha=0.6)
+    # Identity line: points on it are perfect predictions.
+    true_range = [y_true.min(), y_true.max()]
+    ax_tvp.plot(true_range, true_range, 'k--', lw=2)
+    ax_tvp.set_xlabel('True Values')
+    ax_tvp.set_ylabel('Predictions')
+    ax_tvp.set_title('True vs. Predicted Values')
+    ax_tvp.grid(True)
+    fig_tvp.tight_layout()
+    tvp_path = save_dir_path / "true_vs_predicted_plot.svg"
+    fig_tvp.savefig(tvp_path)
+    _LOGGER.info(f"True vs. Predicted plot saved as '{tvp_path.name}'")
+    plt.close(fig_tvp)
+def shap_summary_plot(model, background_data: torch.Tensor, instances_to_explain: torch.Tensor,
+                      feature_names: Optional[list[str]]=None, save_dir: Optional[Union[str, Path]] = None):
+    """
+    Calculates SHAP values and saves summary plots and data.
+    Side effect: the model is switched to eval mode and moved to the CPU, and it
+    is left that way when the function returns.
+    Args:
+        model (nn.Module): The trained PyTorch model.
+        background_data (torch.Tensor): A sample of data for the explainer background.
+        instances_to_explain (torch.Tensor): The specific data instances to explain.
+        feature_names (list of str | None): Names of the features for plot labeling.
+        save_dir (str | Path | None): Directory to save SHAP artifacts (bar plot, dot plot,
+                                      CSV summary). If None, dot plot is shown.
+    """
+    print("\n--- SHAP Value Explanation ---")
+    print("Calculating SHAP values... ")
+    model.eval()
+    model.cpu()
+    # The model now lives on the CPU, so the tensors must too; `.cpu()` is a
+    # no-op for tensors that are already there.
+    background_cpu = background_data.detach().cpu()
+    instances_cpu = instances_to_explain.detach().cpu()
+    explainer = shap.DeepExplainer(model, background_cpu)
+    shap_values = explainer.shap_values(instances_cpu)
+    # The plotting API documents NumPy/pandas feature matrices, not torch tensors.
+    instances_np = instances_cpu.numpy()
+    # NOTE(review): a list result is treated as one array per class and class 1
+    # is plotted; SHAP releases that return a single array with a trailing class
+    # axis are not handled here — confirm against the pinned SHAP version.
+    shap_values_for_plot = shap_values[1] if isinstance(shap_values, list) else shap_values
+    if isinstance(shap_values, list):
+        _LOGGER.info("Using SHAP values for the positive class (class 1) for plots.")
+    if save_dir:
+        save_dir_path = make_fullpath(save_dir, make=True)
+        # Save Bar Plot
+        bar_path = save_dir_path / "shap_bar_plot.svg"
+        shap.summary_plot(shap_values_for_plot, instances_np, feature_names=feature_names, plot_type="bar", show=False)
+        plt.title("SHAP Feature Importance")
+        plt.tight_layout()
+        plt.savefig(bar_path)
+        _LOGGER.info(f"SHAP bar plot saved as '{bar_path.name}'")
+        plt.close()
+        # Save Dot Plot
+        dot_path = save_dir_path / "shap_dot_plot.svg"
+        shap.summary_plot(shap_values_for_plot, instances_np, feature_names=feature_names, plot_type="dot", show=False)
+        plt.title("SHAP Feature Importance")
+        plt.tight_layout()
+        plt.savefig(dot_path)
+        _LOGGER.info(f"SHAP dot plot saved as '{dot_path.name}'")
+        plt.close()
+        # Save Summary Data to CSV: mean absolute SHAP value per feature, largest first.
+        summary_path = save_dir_path / "shap_summary.csv"
+        mean_abs_shap = np.abs(shap_values_for_plot).mean(axis=0)
+        if feature_names is None:
+            # Fall back to positional names so the CSV always has a label column.
+            feature_names = [f'feature_{i}' for i in range(len(mean_abs_shap))]
+        summary_df = pd.DataFrame({
+            'feature': feature_names,
+            'mean_abs_shap_value': mean_abs_shap
+        }).sort_values('mean_abs_shap_value', ascending=False)
+        summary_df.to_csv(summary_path, index=False)
+        _LOGGER.info(f"SHAP summary data saved as '{summary_path.name}'")
+    else:
+        _LOGGER.info("No save directory provided. Displaying SHAP dot plot.")
+        shap.summary_plot(shap_values_for_plot, instances_np, feature_names=feature_names, plot_type="dot")

dragon-ml-toolbox 2.3.0__py3-none-any.whl → 3.0.0__py3-none-any.whl

dragon-ml-toolbox 2.3.0py3-none-any.whl → 3.0.0py3-none-any.whl