PyPI - dragon-ml-toolbox - Versions diffs - 12.12.0.tar.gz → 13.0.0.tar.gz - Mend

dragon-ml-toolbox 12.12.0.tar.gz → 13.0.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (46) hide show

{dragon_ml_toolbox-12.12.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-13.0.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 12.12.0
+Version: 13.0.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
 License-Expression: MIT

{dragon_ml_toolbox-12.12.0 → dragon_ml_toolbox-13.0.0/dragon_ml_toolbox.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 12.12.0
+Version: 13.0.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
 License-Expression: MIT

{dragon_ml_toolbox-12.12.0 → dragon_ml_toolbox-13.0.0}/ml_tools/ML_callbacks.py RENAMED Viewed

@@ -5,7 +5,7 @@ from typing import Union, Literal, Optional
 from pathlib import Path
 from .path_manager import make_fullpath, sanitize_filename
-from .keys import PyTorchLogKeys
+from .keys import PyTorchLogKeys, PyTorchCheckpointKeys
 from ._logger import _LOGGER
 from ._script_info import _script_info
@@ -189,7 +189,7 @@ class EarlyStopping(Callback):
 class ModelCheckpoint(Callback):
     """
-    Saves the model weights to a directory with automated filename generation and rotation.
+    Saves the model weights, optimizer state, LR scheduler state (if any), and epoch number to a directory with automated filename generation and rotation.
     """
     def __init__(self, save_dir: Union[str,Path], checkpoint_name: Optional[str]=None, monitor: str = PyTorchLogKeys.VAL_LOSS,
                  save_best_only: bool = True, mode: Literal['auto', 'min', 'max']= 'auto', verbose: int = 0):
@@ -200,7 +200,7 @@ class ModelCheckpoint(Callback):
         Args:
             save_dir (str): Directory where checkpoint files will be saved.
             checkpoint_name (str| None): If None, the filename will include the epoch and score.
-            monitor (str): Metric to monitor for `save_best_only=True`.
+            monitor (str): Metric to monitor.
             save_best_only (bool): If true, save only the best model.
             mode (str): One of {'auto', 'min', 'max'}.
             verbose (int): Verbosity mode.
@@ -270,15 +270,29 @@ class ModelCheckpoint(Callback):
             if self.verbose > 0:
                 _LOGGER.info(f"Epoch {epoch}: {self.monitor} improved from {old_best_str} to {current:.4f}, saving model to {new_filepath}")
+            # Update best score *before* saving
+            self.best = current
+            # Create a comprehensive checkpoint dictionary
+            checkpoint_data = {
+                PyTorchCheckpointKeys.EPOCH: epoch,
+                PyTorchCheckpointKeys.MODEL_STATE: self.trainer.model.state_dict(), # type: ignore
+                PyTorchCheckpointKeys.OPTIMIZER_STATE: self.trainer.optimizer.state_dict(), # type: ignore
+                PyTorchCheckpointKeys.BEST_SCORE: self.best,
+            }
+            # Check for scheduler
+            if hasattr(self.trainer, 'scheduler') and self.trainer.scheduler is not None: # type: ignore
+                checkpoint_data[PyTorchCheckpointKeys.SCHEDULER_STATE] = self.trainer.scheduler.state_dict() # type: ignore
             # Save the new best model
-            torch.save(self.trainer.model.state_dict(), new_filepath) # type: ignore
+            torch.save(checkpoint_data, new_filepath)
             # Delete the old best model file
             if self.last_best_filepath and self.last_best_filepath.exists():
                 self.last_best_filepath.unlink()
             # Update state
-            self.best = current
             self.last_best_filepath = new_filepath
     def _save_rolling_checkpoints(self, epoch, logs):
@@ -292,7 +306,19 @@ class ModelCheckpoint(Callback):
         if self.verbose > 0:
             _LOGGER.info(f'Epoch {epoch}: saving model to {filepath}')
-        torch.save(self.trainer.model.state_dict(), filepath) # type: ignore
+        # Create a comprehensive checkpoint dictionary
+        checkpoint_data = {
+            PyTorchCheckpointKeys.EPOCH: epoch,
+            PyTorchCheckpointKeys.MODEL_STATE: self.trainer.model.state_dict(), # type: ignore
+            PyTorchCheckpointKeys.OPTIMIZER_STATE: self.trainer.optimizer.state_dict(), # type: ignore
+            PyTorchCheckpointKeys.BEST_SCORE: self.best, # Save the current best score
+        }
+        if hasattr(self.trainer, 'scheduler') and self.trainer.scheduler is not None: # type: ignore
+            checkpoint_data[PyTorchCheckpointKeys.SCHEDULER_STATE] = self.trainer.scheduler.state_dict() # type: ignore
+        torch.save(checkpoint_data, filepath)
         self.saved_checkpoints.append(filepath)
@@ -309,19 +335,25 @@ class LRScheduler(Callback):
     """
     Callback to manage a PyTorch learning rate scheduler.
     """
-    def __init__(self, scheduler, monitor: Optional[str] = None):
+    def __init__(self, scheduler, monitor: Optional[str] = PyTorchLogKeys.VAL_LOSS):
         """
         This callback automatically calls the scheduler's `step()` method at the
         end of each epoch. It also logs a message when the learning rate changes.
         Args:
             scheduler: An initialized PyTorch learning rate scheduler.
-            monitor (str, optional): The metric to monitor for schedulers that require it, like `ReduceLROnPlateau`. Should match a key in the logs (e.g., 'val_loss').
+            monitor (str): The metric to monitor for schedulers that require it, like `ReduceLROnPlateau`. Should match a key in the logs (e.g., 'val_loss').
         """
         super().__init__()
         self.scheduler = scheduler
         self.monitor = monitor
         self.previous_lr = None
+    def set_trainer(self, trainer):
+        """This is called by the Trainer to associate itself with the callback."""
+        super().set_trainer(trainer)
+        # Register the scheduler with the trainer so it can be added to the checkpoint
+        self.trainer.scheduler = self.scheduler # type: ignore
     def on_train_begin(self, logs=None):
         """Store the initial learning rate."""

{dragon_ml_toolbox-12.12.0 → dragon_ml_toolbox-13.0.0}/ml_tools/ML_evaluation.py RENAMED Viewed

@@ -18,7 +18,8 @@ from sklearn.metrics import (
 import torch
 import shap
 from pathlib import Path
-from typing import Union, Optional, List
+from typing import Union, Optional, List, Literal
+import warnings
 from .path_manager import make_fullpath
 from ._logger import _LOGGER
@@ -249,13 +250,15 @@ def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Union[s
     plt.savefig(hist_path)
     _LOGGER.info(f"📊 Residuals histogram saved as '{hist_path.name}'")
     plt.close(fig_hist)
 def shap_summary_plot(model,
                       background_data: Union[torch.Tensor,np.ndarray],
                       instances_to_explain: Union[torch.Tensor,np.ndarray],
                       feature_names: Optional[list[str]],
-                      save_dir: Union[str, Path]):
+                      save_dir: Union[str, Path],
+                      device: torch.device = torch.device('cpu'),
+                      explainer_type: Literal['deep', 'kernel'] = 'deep'):
     """
     Calculates SHAP values and saves summary plots and data.
@@ -265,48 +268,88 @@ def shap_summary_plot(model,
         instances_to_explain (torch.Tensor): The specific data instances to explain.
         feature_names (list of str | None): Names of the features for plot labeling.
         save_dir (str | Path): Directory to save SHAP artifacts.
+        device (torch.device): The torch device for SHAP calculations.
+        explainer_type (Literal['deep', 'kernel']): The explainer to use.
+            - 'deep': (Default) Uses shap.DeepExplainer. Fast and efficient for
+              PyTorch models.
+            - 'kernel': Uses shap.KernelExplainer. Model-agnostic but EXTREMELY
+              slow and memory-intensive.
     """
-    # everything to numpy
-    if isinstance(background_data, np.ndarray):
-        background_data_np = background_data
-    else:
-        background_data_np = background_data.numpy()
-    if isinstance(instances_to_explain, np.ndarray):
-        instances_to_explain_np = instances_to_explain
-    else:
-        instances_to_explain_np = instances_to_explain.numpy()
-    # --- Data Validation Step ---
-    if np.isnan(background_data_np).any() or np.isnan(instances_to_explain_np).any():
-        _LOGGER.error("Input data for SHAP contains NaN values. Aborting explanation.")
-        return
-    print("\n--- SHAP Value Explanation ---")
+    print(f"\n--- SHAP Value Explanation Using {explainer_type.upper()} Explainer ---")
     model.eval()
-    model.cpu()
-    # 1. Summarize the background data.
-    # Summarize the background data using k-means. 10-50 clusters is a good starting point.
-    background_summary = shap.kmeans(background_data_np, 30)
-    # 2. Define a prediction function wrapper that SHAP can use. It must take a numpy array and return a numpy array.
-    def prediction_wrapper(x_np: np.ndarray) -> np.ndarray:
-        # Convert numpy data to torch tensor
-        x_torch = torch.from_numpy(x_np).float()
-        with torch.no_grad():
-            # Get model output
-            output = model(x_torch)
-        # Return as numpy array
-        return output.cpu().numpy().flatten()
-    # 3. Create the KernelExplainer
-    explainer = shap.KernelExplainer(prediction_wrapper, background_summary)
+    # model.cpu() # Run explanations on CPU
-    print("Calculating SHAP values with KernelExplainer...")
-    shap_values = explainer.shap_values(instances_to_explain_np, l1_reg="aic")
+    shap_values = None
+    instances_to_explain_np = None
+    if explainer_type == 'deep':
+        # --- 1. Use DeepExplainer (Preferred) ---
+        # Ensure data is torch.Tensor
+        if isinstance(background_data, np.ndarray):
+            background_data = torch.from_numpy(background_data).float()
+        if isinstance(instances_to_explain, np.ndarray):
+            instances_to_explain = torch.from_numpy(instances_to_explain).float()
+        if torch.isnan(background_data).any() or torch.isnan(instances_to_explain).any():
+            _LOGGER.error("Input data for SHAP contains NaN values. Aborting explanation.")
+            return
+        background_data = background_data.to(device)
+        instances_to_explain = instances_to_explain.to(device)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", category=UserWarning)
+            explainer = shap.DeepExplainer(model, background_data)
+        # print("Calculating SHAP values with DeepExplainer...")
+        shap_values = explainer.shap_values(instances_to_explain)
+        instances_to_explain_np = instances_to_explain.cpu().numpy()
+    elif explainer_type == 'kernel':
+        # --- 2. Use KernelExplainer (Slow Fallback) ---
+        _LOGGER.warning(
+            "Using KernelExplainer. This is memory-intensive and slow. "
+            "Consider reducing 'n_samples' if the process terminates unexpectedly."
+        )
+        # Ensure data is np.ndarray
+        if isinstance(background_data, torch.Tensor):
+            background_data_np = background_data.cpu().numpy()
+        else:
+            background_data_np = background_data
+        if isinstance(instances_to_explain, torch.Tensor):
+            instances_to_explain_np = instances_to_explain.cpu().numpy()
+        else:
+            instances_to_explain_np = instances_to_explain
+        if np.isnan(background_data_np).any() or np.isnan(instances_to_explain_np).any():
+            _LOGGER.error("Input data for SHAP contains NaN values. Aborting explanation.")
+            return
+        # Summarize background data
+        background_summary = shap.kmeans(background_data_np, 30)
+        def prediction_wrapper(x_np: np.ndarray) -> np.ndarray:
+            x_torch = torch.from_numpy(x_np).float().to(device)
+            with torch.no_grad():
+                output = model(x_torch)
+            # Return as numpy array
+            return output.cpu().numpy()
+        explainer = shap.KernelExplainer(prediction_wrapper, background_summary)
+        # print("Calculating SHAP values with KernelExplainer...")
+        shap_values = explainer.shap_values(instances_to_explain_np, l1_reg="aic")
+        # instances_to_explain_np is already set
+    else:
+        _LOGGER.error(f"Invalid explainer_type: '{explainer_type}'. Must be 'deep' or 'kernel'.")
+        raise ValueError()
+    # --- 3. Plotting and Saving ---
     save_dir_path = make_fullpath(save_dir, make=True, enforce="directory")
     plt.ioff()
@@ -326,8 +369,9 @@ def shap_summary_plot(model,
     shap.summary_plot(shap_values, instances_to_explain_np, feature_names=feature_names, plot_type="dot", show=False)
     ax = plt.gca()
     ax.set_xlabel("SHAP Value Impact", labelpad=10)
-    cb = plt.gcf().axes[-1]
-    cb.set_ylabel("", size=1)
+    if plt.gcf().axes and len(plt.gcf().axes) > 1:
+        cb = plt.gcf().axes[-1]
+        cb.set_ylabel("", size=1)
     plt.title("SHAP Feature Importance")
     plt.tight_layout()
     plt.savefig(dot_path)
@@ -337,8 +381,14 @@ def shap_summary_plot(model,
     # Save Summary Data to CSV
     shap_summary_filename = SHAPKeys.SAVENAME + ".csv"
     summary_path = save_dir_path / shap_summary_filename
-    # Ensure the array is 1D before creating the DataFrame
-    mean_abs_shap = np.abs(shap_values).mean(axis=0).flatten()
+    # Handle multi-class (list of arrays) vs. regression (single array)
+    if isinstance(shap_values, list):
+        mean_abs_shap = np.abs(np.stack(shap_values)).mean(axis=0).mean(axis=0)
+    else:
+        mean_abs_shap = np.abs(shap_values).mean(axis=0)
+    mean_abs_shap = mean_abs_shap.flatten()
     if feature_names is None:
         feature_names = [f'feature_{i}' for i in range(len(mean_abs_shap))]
@@ -351,7 +401,7 @@ def shap_summary_plot(model,
     summary_df.to_csv(summary_path, index=False)
     _LOGGER.info(f"📝 SHAP summary data saved as '{summary_path.name}'")
-    plt.ion()
+    plt.ion()
 def plot_attention_importance(weights: List[torch.Tensor], feature_names: Optional[List[str]], save_dir: Union[str, Path], top_n: int = 10):

{dragon_ml_toolbox-12.12.0 → dragon_ml_toolbox-13.0.0}/ml_tools/ML_evaluation_multi.py RENAMED Viewed

@@ -19,11 +19,13 @@ from sklearn.metrics import (
     jaccard_score
 )
 from pathlib import Path
-from typing import Union, List
+from typing import Union, List, Literal
+import warnings
 from .path_manager import make_fullpath, sanitize_filename
 from ._logger import _LOGGER
 from ._script_info import _script_info
+from .keys import SHAPKeys
 __all__ = [
@@ -231,10 +233,12 @@ def multi_target_shap_summary_plot(
     instances_to_explain: Union[torch.Tensor, np.ndarray],
     feature_names: List[str],
     target_names: List[str],
-    save_dir: Union[str, Path]
+    save_dir: Union[str, Path],
+    device: torch.device = torch.device('cpu'),
+    explainer_type: Literal['deep', 'kernel'] = 'deep'
 ):
     """
-    Calculates SHAP values for a multi-target model and saves summary plots for each target.
+    Calculates SHAP values for a multi-target model and saves summary plots and data for each target.
     Args:
         model (torch.nn.Module): The trained PyTorch model.
@@ -243,40 +247,94 @@ def multi_target_shap_summary_plot(
         feature_names (List[str]): Names of the features for plot labeling.
         target_names (List[str]): Names of the output targets.
         save_dir (str | Path): Directory to save SHAP artifacts.
+        device (torch.device): The torch device for SHAP calculations.
+        explainer_type (Literal['deep', 'kernel']): The explainer to use.
+            - 'deep': (Default) Uses shap.DeepExplainer. Fast and efficient.
+            - 'kernel': Uses shap.KernelExplainer. Model-agnostic but slow and memory-intensive.
     """
-    # Convert all data to numpy
-    background_data_np = background_data.numpy() if isinstance(background_data, torch.Tensor) else background_data
-    instances_to_explain_np = instances_to_explain.numpy() if isinstance(instances_to_explain, torch.Tensor) else instances_to_explain
-    if np.isnan(background_data_np).any() or np.isnan(instances_to_explain_np).any():
-        _LOGGER.error("Input data for SHAP contains NaN values. Aborting explanation.")
-        return
-    _LOGGER.info("--- Multi-Target SHAP Value Explanation ---")
+    _LOGGER.info(f"--- Multi-Target SHAP Value Explanation (Using: {explainer_type.upper()}Explainer) ---")
     model.eval()
-    model.cpu()
-    # 1. Summarize the background data.
-    background_summary = shap.kmeans(background_data_np, 30)
-    # 2. Define a prediction function wrapper for the multi-target model.
-    def prediction_wrapper(x_np: np.ndarray) -> np.ndarray:
-        x_torch = torch.from_numpy(x_np).float()
-        with torch.no_grad():
-            output = model(x_torch)
-        return output.cpu().numpy()
-    # 3. Create the KernelExplainer.
-    explainer = shap.KernelExplainer(prediction_wrapper, background_summary)
-    print("Calculating SHAP values with KernelExplainer...")
-    # For multi-output models, shap_values is a list of arrays.
-    shap_values_list = explainer.shap_values(instances_to_explain_np, l1_reg="aic")
+    # model.cpu()
+    shap_values_list = None
+    instances_to_explain_np = None
+    if explainer_type == 'deep':
+        # --- 1. Use DeepExplainer (Preferred) ---
+        # Ensure data is torch.Tensor
+        if isinstance(background_data, np.ndarray):
+            background_data = torch.from_numpy(background_data).float()
+        if isinstance(instances_to_explain, np.ndarray):
+            instances_to_explain = torch.from_numpy(instances_to_explain).float()
+        if torch.isnan(background_data).any() or torch.isnan(instances_to_explain).any():
+            _LOGGER.error("Input data for SHAP contains NaN values. Aborting explanation.")
+            return
+        background_data = background_data.to(device)
+        instances_to_explain = instances_to_explain.to(device)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", category=UserWarning)
+            explainer = shap.DeepExplainer(model, background_data)
+        # print("Calculating SHAP values with DeepExplainer...")
+        # DeepExplainer returns a list of arrays for multi-output models
+        shap_values_list = explainer.shap_values(instances_to_explain)
+        instances_to_explain_np = instances_to_explain.cpu().numpy()
+    elif explainer_type == 'kernel':
+        # --- 2. Use KernelExplainer (Slow Fallback) ---
+        _LOGGER.warning(
+            "Using KernelExplainer. This is memory-intensive and slow. "
+            "Consider reducing 'n_samples' if the process terminates."
+        )
+        # Convert all data to numpy
+        background_data_np = background_data.numpy() if isinstance(background_data, torch.Tensor) else background_data
+        instances_to_explain_np = instances_to_explain.numpy() if isinstance(instances_to_explain, torch.Tensor) else instances_to_explain
+        if np.isnan(background_data_np).any() or np.isnan(instances_to_explain_np).any():
+            _LOGGER.error("Input data for SHAP contains NaN values. Aborting explanation.")
+            return
+        background_summary = shap.kmeans(background_data_np, 30)
+        def prediction_wrapper(x_np: np.ndarray) -> np.ndarray:
+            x_torch = torch.from_numpy(x_np).float().to(device)
+            with torch.no_grad():
+                output = model(x_torch)
+            return output.cpu().numpy() # Return full multi-output array
+        explainer = shap.KernelExplainer(prediction_wrapper, background_summary)
+        # print("Calculating SHAP values with KernelExplainer...")
+        # KernelExplainer also returns a list of arrays for multi-output models
+        shap_values_list = explainer.shap_values(instances_to_explain_np, l1_reg="aic")
+        # instances_to_explain_np is already set
+    else:
+        _LOGGER.error(f"Invalid explainer_type: '{explainer_type}'. Must be 'deep' or 'kernel'.")
+        raise ValueError("Invalid explainer_type")
+    # --- 3. Plotting and Saving (Common Logic) ---
+    if shap_values_list is None or instances_to_explain_np is None:
+        _LOGGER.error("SHAP value calculation failed. Aborting plotting.")
+        return
+    # Ensure number of SHAP value arrays matches number of target names
+    if len(shap_values_list) != len(target_names):
+        _LOGGER.error(
+            f"SHAP explanation mismatch: Model produced {len(shap_values_list)} "
+            f"outputs, but {len(target_names)} target_names were provided."
+        )
+        return
     save_dir_path = make_fullpath(save_dir, make=True, enforce="directory")
     plt.ioff()
-    # 4. Iterate through each target's SHAP values and generate plots.
+    # Iterate through each target's SHAP values and generate plots.
     for i, target_name in enumerate(target_names):
         print(f"  -> Generating SHAP plots for target: '{target_name}'")
         shap_values_for_target = shap_values_list[i]
@@ -293,11 +351,28 @@ def multi_target_shap_summary_plot(
         # Save Dot Plot for the target
         shap.summary_plot(shap_values_for_target, instances_to_explain_np, feature_names=feature_names, plot_type="dot", show=False)
         plt.title(f"SHAP Feature Importance for '{target_name}'")
+        if plt.gcf().axes and len(plt.gcf().axes) > 1:
+            cb = plt.gcf().axes[-1]
+            cb.set_ylabel("", size=1)
         plt.tight_layout()
         dot_path = save_dir_path / f"shap_dot_plot_{sanitized_target_name}.svg"
         plt.savefig(dot_path)
         plt.close()
+        # --- Save Summary Data to CSV for this target ---
+        shap_summary_filename = f"{SHAPKeys.SAVENAME}_{sanitized_target_name}.csv"
+        summary_path = save_dir_path / shap_summary_filename
+        # For a specific target, shap_values_for_target is just a 2D array
+        mean_abs_shap = np.abs(shap_values_for_target).mean(axis=0).flatten()
+        summary_df = pd.DataFrame({
+            SHAPKeys.FEATURE_COLUMN: feature_names,
+            SHAPKeys.SHAP_VALUE_COLUMN: mean_abs_shap
+        }).sort_values(SHAPKeys.SHAP_VALUE_COLUMN, ascending=False)
+        summary_df.to_csv(summary_path, index=False)
     plt.ion()
     _LOGGER.info(f"All SHAP plots saved to '{save_dir_path.name}'")

{dragon_ml_toolbox-12.12.0 → dragon_ml_toolbox-13.0.0}/ml_tools/ML_inference.py RENAMED Viewed

@@ -9,7 +9,7 @@ from .ML_scaler import PytorchScaler
 from ._script_info import _script_info
 from ._logger import _LOGGER
 from .path_manager import make_fullpath
-from .keys import PyTorchInferenceKeys
+from .keys import PyTorchInferenceKeys, PyTorchCheckpointKeys
 __all__ = [
@@ -56,11 +56,21 @@ class _BaseInferenceHandler(ABC):
         model_p = make_fullpath(state_dict, enforce="file")
         try:
-            # Load the state dictionary and apply it to the model structure
-            self.model.load_state_dict(torch.load(model_p, map_location=self.device))
+            # Load whatever is in the file
+            loaded_data = torch.load(model_p, map_location=self.device)
+            # Check if it's the new checkpoint dictionary or an old weights-only file
+            if isinstance(loaded_data, dict) and PyTorchCheckpointKeys.MODEL_STATE in loaded_data:
+                # It's a new training checkpoint, extract the weights
+                self.model.load_state_dict(loaded_data[PyTorchCheckpointKeys.MODEL_STATE])
+            else:
+                # It's an old-style file (or just a state_dict), load it directly
+                self.model.load_state_dict(loaded_data)
+            _LOGGER.info(f"Model state loaded from '{model_p.name}'.")
             self.model.to(self.device)
             self.model.eval()  # Set the model to evaluation mode
-            _LOGGER.info(f"Model state loaded from '{model_p.name}' and set to evaluation mode.")
         except Exception as e:
             _LOGGER.error(f"Failed to load model state from '{model_p}': {e}")
             raise

{dragon_ml_toolbox-12.12.0 → dragon_ml_toolbox-13.0.0}/ml_tools/ML_trainer.py RENAMED Viewed

@@ -5,12 +5,13 @@ import torch
 from torch import nn
 import numpy as np
-from .ML_callbacks import Callback, History, TqdmProgressBar
+from .ML_callbacks import Callback, History, TqdmProgressBar, ModelCheckpoint
 from .ML_evaluation import classification_metrics, regression_metrics, plot_losses, shap_summary_plot, plot_attention_importance
 from .ML_evaluation_multi import multi_target_regression_metrics, multi_label_classification_metrics, multi_target_shap_summary_plot
 from ._script_info import _script_info
-from .keys import PyTorchLogKeys
+from .keys import PyTorchLogKeys, PyTorchCheckpointKeys
 from ._logger import _LOGGER
+from .path_manager import make_fullpath
 __all__ = [
@@ -55,6 +56,7 @@ class MLTrainer:
         self.kind = kind
         self.criterion = criterion
         self.optimizer = optimizer
+        self.scheduler = None
         self.device = self._validate_device(device)
         self.dataloader_workers = dataloader_workers
@@ -70,6 +72,7 @@ class MLTrainer:
         self.history = {}
         self.epoch = 0
         self.epochs = 0 # Total epochs for the fit run
+        self.start_epoch = 1
         self.stop_training = False
     def _validate_device(self, device: str) -> torch.device:
@@ -109,8 +112,66 @@ class MLTrainer:
             num_workers=loader_workers,
             pin_memory=("cuda" in self.device.type)
         )
+    def _load_checkpoint(self, path: Union[str, Path]):
+        """Loads a training checkpoint to resume training."""
+        p = make_fullpath(path, enforce="file")
+        _LOGGER.info(f"Loading checkpoint from '{p.name}' to resume training...")
+        try:
+            checkpoint = torch.load(p, map_location=self.device)
+            if PyTorchCheckpointKeys.MODEL_STATE not in checkpoint or PyTorchCheckpointKeys.OPTIMIZER_STATE not in checkpoint:
+                _LOGGER.error(f"Checkpoint file '{p.name}' is invalid. Missing 'model_state_dict' or 'optimizer_state_dict'.")
+                raise KeyError()
-    def fit(self, epochs: int = 10, batch_size: int = 10, shuffle: bool = True):
+            self.model.load_state_dict(checkpoint[PyTorchCheckpointKeys.MODEL_STATE])
+            self.optimizer.load_state_dict(checkpoint[PyTorchCheckpointKeys.OPTIMIZER_STATE])
+            self.start_epoch = checkpoint.get(PyTorchCheckpointKeys.EPOCH, 0) + 1 # Resume on the *next* epoch
+            # --- Scheduler State Loading Logic ---
+            scheduler_state_exists = PyTorchCheckpointKeys.SCHEDULER_STATE in checkpoint
+            scheduler_object_exists = self.scheduler is not None
+            if scheduler_object_exists and scheduler_state_exists:
+                # Case 1: Both exist. Attempt to load.
+                try:
+                    self.scheduler.load_state_dict(checkpoint[PyTorchCheckpointKeys.SCHEDULER_STATE]) # type: ignore
+                    scheduler_name = self.scheduler.__class__.__name__
+                    _LOGGER.info(f"Restored LR scheduler state for: {scheduler_name}")
+                except Exception as e:
+                    # Loading failed, likely a mismatch
+                    scheduler_name = self.scheduler.__class__.__name__
+                    _LOGGER.error(f"Failed to load scheduler state for '{scheduler_name}'. A different scheduler type might have been used.")
+                    raise e
+            elif scheduler_object_exists and not scheduler_state_exists:
+                # Case 2: Scheduler provided, but no state in checkpoint.
+                scheduler_name = self.scheduler.__class__.__name__
+                _LOGGER.warning(f"'{scheduler_name}' was provided, but no scheduler state was found in the checkpoint. The scheduler will start from its initial state.")
+            elif not scheduler_object_exists and scheduler_state_exists:
+                # Case 3: State in checkpoint, but no scheduler provided.
+                _LOGGER.error("Checkpoint contains an LR scheduler state, but no LRScheduler callback was provided.")
+                raise ValueError()
+            # Restore callback states
+            for cb in self.callbacks:
+                if isinstance(cb, ModelCheckpoint) and PyTorchCheckpointKeys.BEST_SCORE in checkpoint:
+                    cb.best = checkpoint[PyTorchCheckpointKeys.BEST_SCORE]
+                    _LOGGER.info(f"Restored {cb.__class__.__name__} 'best' score to: {cb.best:.4f}")
+            _LOGGER.info(f"Checkpoint loaded. Resuming training from epoch {self.start_epoch}.")
+        except Exception as e:
+            _LOGGER.error(f"Failed to load checkpoint from '{p}': {e}")
+            raise
+    def fit(self,
+            epochs: int = 10,
+            batch_size: int = 10,
+            shuffle: bool = True,
+            resume_from_checkpoint: Optional[Union[str, Path]] = None):
         """
         Starts the training-validation process of the model.
@@ -120,6 +181,7 @@ class MLTrainer:
             epochs (int): The total number of epochs to train for.
             batch_size (int): The number of samples per batch.
             shuffle (bool): Whether to shuffle the training data at each epoch.
+            resume_from_checkpoint (str | Path | None): Optional path to a checkpoint to resume training.
         Note:
             For regression tasks using `nn.MSELoss` or `nn.L1Loss`, the trainer
@@ -132,15 +194,18 @@ class MLTrainer:
         self._create_dataloaders(batch_size, shuffle)
         self.model.to(self.device)
+        if resume_from_checkpoint:
+            self._load_checkpoint(resume_from_checkpoint)
         # Reset stop_training flag on the trainer
         self.stop_training = False
-        self.callbacks_hook('on_train_begin')
+        self._callbacks_hook('on_train_begin')
-        for epoch in range(1, self.epochs + 1):
+        for epoch in range(self.start_epoch, self.epochs + 1):
             self.epoch = epoch
             epoch_logs = {}
-            self.callbacks_hook('on_epoch_begin', epoch, logs=epoch_logs)
+            self._callbacks_hook('on_epoch_begin', epoch, logs=epoch_logs)
             train_logs = self._train_step()
             epoch_logs.update(train_logs)
@@ -148,13 +213,13 @@ class MLTrainer:
             val_logs = self._validation_step()
             epoch_logs.update(val_logs)
-            self.callbacks_hook('on_epoch_end', epoch, logs=epoch_logs)
+            self._callbacks_hook('on_epoch_end', epoch, logs=epoch_logs)
             # Check the early stopping flag
             if self.stop_training:
                 break
-        self.callbacks_hook('on_train_end')
+        self._callbacks_hook('on_train_end')
         return self.history
     def _train_step(self):
@@ -166,7 +231,7 @@ class MLTrainer:
                 PyTorchLogKeys.BATCH_INDEX: batch_idx,
                 PyTorchLogKeys.BATCH_SIZE: features.size(0)
             }
-            self.callbacks_hook('on_batch_begin', batch_idx, logs=batch_logs)
+            self._callbacks_hook('on_batch_begin', batch_idx, logs=batch_logs)
             features, target = features.to(self.device), target.to(self.device)
             self.optimizer.zero_grad()
@@ -188,7 +253,7 @@ class MLTrainer:
             # Add the batch loss to the logs and call the end-of-batch hook
             batch_logs[PyTorchLogKeys.BATCH_LOSS] = batch_loss
-            self.callbacks_hook('on_batch_end', batch_idx, logs=batch_logs)
+            self._callbacks_hook('on_batch_end', batch_idx, logs=batch_logs)
         return {PyTorchLogKeys.TRAIN_LOSS: running_loss / len(self.train_loader.dataset)} # type: ignore
@@ -340,9 +405,10 @@ class MLTrainer:
     def explain(self,
                 save_dir: Union[str,Path],
                 explain_dataset: Optional[Dataset] = None,
-                n_samples: int = 1000,
+                n_samples: int = 300,
                 feature_names: Optional[List[str]] = None,
-                target_names: Optional[List[str]] = None):
+                target_names: Optional[List[str]] = None,
+                explainer_type: Literal['deep', 'kernel'] = 'deep'):
         """
         Explains model predictions using SHAP and saves all artifacts.
@@ -359,6 +425,9 @@ class MLTrainer:
             feature_names (list[str] | None): Feature names.
             target_names (list[str] | None): Target names for multi-target tasks.
             save_dir (str | Path): Directory to save all SHAP artifacts.
+            explainer_type (Literal['deep', 'kernel']): The explainer to use.
+                - 'deep': (Default) Uses shap.DeepExplainer. Fast and efficient for PyTorch models.
+                - 'kernel': Uses shap.KernelExplainer. Model-agnostic but EXTREMELY slow and memory-intensive. Use with a very low 'n_samples' (< 100).
         """
         # Internal helper to create a dataloader and get a random sample
         def _get_random_sample(dataset: Dataset, num_samples: int):
@@ -410,6 +479,9 @@ class MLTrainer:
             else:
                 _LOGGER.error("Could not extract `feature_names` from the dataset. It must be provided if the dataset object does not have a `feature_names` attribute.")
                 raise ValueError()
+        # move model to device
+        self.model.to(self.device)
         # 3. Call the plotting function
         if self.kind in ["regression", "classification"]:
@@ -418,7 +490,9 @@ class MLTrainer:
                 background_data=background_data,
                 instances_to_explain=instances_to_explain,
                 feature_names=feature_names,
-                save_dir=save_dir
+                save_dir=save_dir,
+                explainer_type=explainer_type,
+                device=self.device
             )
         elif self.kind in ["multi_target_regression", "multi_label_classification"]:
             # try to get target names
@@ -442,7 +516,9 @@ class MLTrainer:
                 instances_to_explain=instances_to_explain,
                 feature_names=feature_names, # type: ignore
                 target_names=target_names, # type: ignore
-                save_dir=save_dir
+                save_dir=save_dir,
+                explainer_type=explainer_type,
+                device=self.device
             )
     def _attention_helper(self, dataloader: DataLoader):
@@ -527,11 +603,33 @@ class MLTrainer:
         else:
             _LOGGER.error("No attention weights were collected from the model.")
-    def callbacks_hook(self, method_name: str, *args, **kwargs):
+    def _callbacks_hook(self, method_name: str, *args, **kwargs):
         """Calls the specified method on all callbacks."""
         for callback in self.callbacks:
             method = getattr(callback, method_name)
             method(*args, **kwargs)
+    def to_cpu(self):
+        """
+        Moves the model to the CPU and updates the trainer's device setting.
+        This is useful for running operations that require the CPU.
+        """
+        self.device = torch.device('cpu')
+        self.model.to(self.device)
+        _LOGGER.info("Trainer and model moved to CPU.")
+    def to_device(self, device: str):
+        """
+        Moves the model to the specified device and updates the trainer's device setting.
+        Args:
+            device (str): The target device (e.g., 'cuda', 'mps', 'cpu').
+        """
+        self.device = self._validate_device(device)
+        self.model.to(self.device)
+        _LOGGER.info(f"Trainer and model moved to {self.device}.")
 def info():
     _script_info(__all__)

{dragon_ml_toolbox-12.12.0 → dragon_ml_toolbox-13.0.0}/ml_tools/keys.py RENAMED Viewed

@@ -68,6 +68,15 @@ class SHAPKeys:
     SAVENAME = "shap_summary"
+class PyTorchCheckpointKeys:
+    """Keys for saving/loading a training checkpoint dictionary."""
+    MODEL_STATE = "model_state_dict"
+    OPTIMIZER_STATE = "optimizer_state_dict"
+    SCHEDULER_STATE = "scheduler_state_dict"
+    EPOCH = "epoch"
+    BEST_SCORE = "best_score"
 class _OneHotOtherPlaceholder:
     """Used internally by GUI_tools."""
     OTHER_GUI = "OTHER"

{dragon_ml_toolbox-12.12.0 → dragon_ml_toolbox-13.0.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "12.12.0"
+version = "13.0.0"
 description = "A collection of tools for data science and machine learning projects."
 authors = [
     { name = "Karl L. Loza Vidaurre", email = "luigiloza@gmail.com" }