dragon-ml-toolbox 19.10.0__py3-none-any.whl → 19.12.0__py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
- {dragon_ml_toolbox-19.10.0.dist-info → dragon_ml_toolbox-19.12.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-19.10.0.dist-info → dragon_ml_toolbox-19.12.0.dist-info}/RECORD +19 -19
- ml_tools/ML_callbacks.py +8 -4
- ml_tools/_core/_MICE_imputation.py +2 -2
- ml_tools/_core/_ML_callbacks.py +461 -171
- ml_tools/_core/_ML_trainer.py +50 -50
- ml_tools/_core/_ML_utilities.py +153 -50
- ml_tools/_core/_PSO_optimization.py +1 -1
- ml_tools/_core/_ensemble_inference.py +1 -1
- ml_tools/_core/_keys.py +32 -1
- ml_tools/_core/_optimization_tools.py +1 -1
- ml_tools/_core/_path_manager.py +149 -27
- ml_tools/_core/_utilities.py +6 -2
- ml_tools/keys.py +2 -0
- ml_tools/path_manager.py +5 -1
- {dragon_ml_toolbox-19.10.0.dist-info → dragon_ml_toolbox-19.12.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-19.10.0.dist-info → dragon_ml_toolbox-19.12.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-19.10.0.dist-info → dragon_ml_toolbox-19.12.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-19.10.0.dist-info → dragon_ml_toolbox-19.12.0.dist-info}/top_level.txt +0 -0
ml_tools/_core/_ML_callbacks.py
CHANGED
@@ -1,5 +1,6 @@
 import numpy as np
 import torch
+from collections import deque
 from tqdm.auto import tqdm
 from typing import Union, Literal, Optional
 from pathlib import Path
@@ -16,9 +17,11 @@ _LOGGER = get_logger("Callbacks")
 __all__ = [
     "History",
     "TqdmProgressBar",
-    "DragonEarlyStopping",
+    "DragonPatienceEarlyStopping",
+    "DragonPrecheltEarlyStopping",
     "DragonModelCheckpoint",
-    "DragonLRScheduler",
+    "DragonScheduler",
+    "DragonReduceLROnPlateau"
 ]
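The renames above change the public surface of the callbacks module (the re-export module `ml_tools/ML_callbacks.py` also changed in this release, +8/-4). A hypothetical migration sketch for downstream code, assuming that module re-exports the names listed in `__all__` (the old names are reconstructed from the hunk headers below):

# Hypothetical migration sketch; assumes ml_tools.ML_callbacks re-exports
# the names listed in __all__ above.

# 19.10.0 (old, reconstructed names):
# from ml_tools.ML_callbacks import DragonEarlyStopping, DragonLRScheduler

# 19.12.0 (new): one class per strategy.
from ml_tools.ML_callbacks import (
    DragonPatienceEarlyStopping,  # patience counter on a monitored loss
    DragonPrecheltEarlyStopping,  # Prechelt's progress-modified GL criterion
    DragonModelCheckpoint,
    DragonScheduler,              # standard PyTorch schedulers (StepLR, etc.)
    DragonReduceLROnPlateau,      # self-initializing ReduceLROnPlateau wrapper
)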
@@ -112,67 +115,89 @@ class TqdmProgressBar(_Callback):
         self.epoch_bar.close() # type: ignore
 
 
-class DragonEarlyStopping(_Callback):
+class _DragonEarlyStopping(_Callback):
     """
-
+    Base class for Early Stopping strategies.
+    Ensures type compatibility and shared logging logic.
     """
-    def __init__(self,
-
+    def __init__(self,
+                 monitor: str,
+                 verbose: int = 1):
+        super().__init__()
+        self.monitor = monitor
+        self.verbose = verbose
+        self.stopped_epoch = 0
+
+    def _stop_training(self, epoch: int, reason: str):
+        """Helper to trigger the stop."""
+        self.stopped_epoch = epoch
+        self.trainer.stop_training = True # type: ignore
+        if self.verbose > 0:
+            _LOGGER.info(f"Epoch {epoch}: Early stopping triggered. Reason: {reason}")
+
+
+class DragonPatienceEarlyStopping(_DragonEarlyStopping):
+    """
+    Standard early stopping: Tracks minimum validation loss (or other metric) with a patience counter.
+    """
+    def __init__(self,
+                 monitor: Literal["Training Loss", "Validation Loss"] = "Validation Loss",
+                 min_delta: float = 0.0,
+                 patience: int = 10,
+                 mode: Literal['min', 'max'] = 'min',
+                 verbose: int = 1):
+        """
         Args:
-            monitor (str):
-            min_delta (float): Minimum change
+            monitor (str): Metric to monitor.
+            min_delta (float): Minimum change to qualify as an improvement.
             patience (int): Number of epochs with no improvement after which training will be stopped.
-            mode (str): One of {'min', 'max', 'auto'}. In 'min' mode, training will stop when the quantity
-                monitored has stopped decreasing; in 'max' mode it will stop when the quantity
-                monitored has stopped increasing; in 'auto' mode, the direction is automatically
-                inferred from the name of the monitored quantity.
+            mode (str): One of {'min', 'max'}. In 'min' mode, training will stop when the quantity monitored has stopped decreasing; in 'max' mode it will stop when the quantity monitored has stopped increasing.
             verbose (int): Verbosity mode.
         """
-
-
+        # standardize monitor key
+        if monitor == "Training Loss":
+            std_monitor = PyTorchLogKeys.TRAIN_LOSS
+        elif monitor == "Validation Loss":
+            std_monitor = PyTorchLogKeys.VAL_LOSS
+        else:
+            _LOGGER.error(f"Unknown monitor key: {monitor}.")
+            raise ValueError()
+
+        super().__init__(std_monitor, verbose)
         self.patience = patience
         self.min_delta = min_delta
         self.wait = 0
-        self.
-        self.verbose = verbose
+        self.mode = mode
 
-        if mode not in ['min', 'max', 'auto']:
-            _LOGGER.error(f"EarlyStopping mode {mode} is unknown, choose one of ('min', 'max', 'auto')")
+        if mode not in ['min', 'max']:
+            _LOGGER.error(f"EarlyStopping mode {mode} is unknown, choose one of ('min', 'max')")
             raise ValueError()
-        self.mode = mode
 
-        # Determine the comparison operator
+        # Determine the comparison operator
         if self.mode == 'min':
             self.monitor_op = np.less
         elif self.mode == 'max':
             self.monitor_op = np.greater
-        else:
-
-
-
-            self.monitor_op = np.less
+        else:
+            # raise error for unknown mode
+            _LOGGER.error(f"EarlyStopping mode {mode} is unknown, choose one of ('min', 'max')")
+            raise ValueError()
 
         self.best = np.inf if self.monitor_op == np.less else -np.inf
 
     def on_train_begin(self, logs=None):
-        # Reset state at the beginning of training
         self.wait = 0
-        self.stopped_epoch = 0
         self.best = np.inf if self.monitor_op == np.less else -np.inf
-
+
     def on_epoch_end(self, epoch, logs=None):
         current = logs.get(self.monitor) # type: ignore
         if current is None:
             return
 
-        #
+        # Check improvement
         if self.monitor_op == np.less:
-            # For 'min' mode, we need to be smaller than 'best' by at least 'min_delta'
-            # Correct check: current < self.best - self.min_delta
            is_improvement = self.monitor_op(current, self.best - self.min_delta)
         else:
-            # For 'max' mode, we need to be greater than 'best' by at least 'min_delta'
-            # Correct check: current > self.best + self.min_delta
            is_improvement = self.monitor_op(current, self.best + self.min_delta)
 
         if is_improvement:
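The hunk above introduces the shared `_DragonEarlyStopping` base and the constructor of `DragonPatienceEarlyStopping`; its epoch-end logic continues in the next hunk. A minimal usage sketch, assuming the trainer exposes a `callbacks` list as suggested by the `set_trainer`/`self.trainer` plumbing (the trainer itself lives in the changed `_ML_trainer.py`, not shown in this diff):

from ml_tools.ML_callbacks import DragonPatienceEarlyStopping

# Stop once validation loss fails to improve by at least 1e-4
# for 10 consecutive epochs.
early_stop = DragonPatienceEarlyStopping(
    monitor="Validation Loss",
    min_delta=1e-4,
    patience=10,
    mode="min",
)
# trainer = MLTrainer(..., callbacks=[early_stop])  # hypothetical wiring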
@@ -183,137 +208,224 @@ class DragonEarlyStopping(_Callback):
         else:
             self.wait += 1
             if self.wait >= self.patience:
-                self.
-
-
-
+                self._stop_training(epoch, f"No improvement in {self.monitor} for {self.wait} epochs.")
+
+
+class DragonPrecheltEarlyStopping(_DragonEarlyStopping):
+    """
+    Implements Prechelt's 'Progress-Modified GL' criterion.
+    Tracks the ratio between Generalization Loss (overfitting) and Training Progress.
+
+    References:
+        Prechelt, L. (1998). Early Stopping - But When?
+    """
+    def __init__(self,
+                 alpha: float = 0.75,
+                 k: int = 5,
+                 verbose: int = 1):
+        """
+        This early stopping strategy monitors both validation loss and training loss to determine the optimal stopping point.
+
+        Args:
+            alpha (float): The threshold for the stopping criterion.
+            k (int): The window size for calculating training progress.
+            verbose (int): Verbosity mode.
+
+        NOTE:
+
+        - **The Strip Size (k)**:
+            - `5`: The empirical "gold standard." It is long enough to smooth out batch noise but short enough to react to convergence plateaus quickly.
+            - `10` to `20`: Use if the training curve is very jagged (e.g., noisy data, small batch sizes, high dropout, or Reinforcement Learning). A larger k value prevents premature stopping due to random volatility.
+        - **The threshold (alpha)**:
+            - `< 0.5`: Aggressive. Stops training very early.
+            - `0.75` to `0.80`: Prechelt found this range to be the most robust across different datasets. It typically yields the best trade-off between generalization and training cost.
+            - `1.0` to `1.2`: Useful for complex tasks (like Transformers) where training progress might dip temporarily before recovering. It risks slightly more overfitting but ensures potential is exhausted.
+        """
+        super().__init__(PyTorchLogKeys.VAL_LOSS, verbose)
+        self.train_monitor = PyTorchLogKeys.TRAIN_LOSS
+        self.alpha = alpha
+        self.k = k
+
+        self.best_val_loss = np.inf
+        self.train_strip = deque(maxlen=k)
+
+    def on_train_begin(self, logs=None):
+        self.best_val_loss = np.inf
+        self.train_strip.clear()
+
+    def on_epoch_end(self, epoch, logs=None):
+        val_loss = logs.get(self.monitor) # type: ignore
+        train_loss = logs.get(self.train_monitor) # type: ignore
+
+        if val_loss is None or train_loss is None:
+            return
+
+        # 1. Update Best Validation Loss
+        if val_loss < self.best_val_loss:
+            self.best_val_loss = val_loss
+
+        # 2. Update Training Strip
+        self.train_strip.append(train_loss)
+
+        # 3. Calculate Generalization Loss (GL)
+        # GL(t) = 100 * (E_val / E_opt - 1)
+        # Low GL is good. High GL means we are drifting away from best val score (overfitting).
+        gl = 100 * ((val_loss / self.best_val_loss) - 1)
+
+        # 4. Calculate Progress (Pk)
+        # Pk(t) = 1000 * (Sum(strip) / (k * min(strip)) - 1)
+        # High Pk is good (training loss is still dropping fast). Low Pk means training has stalled.
+        if len(self.train_strip) < self.k:
+            # Not enough data for progress yet
+            return
+
+        strip_sum = sum(self.train_strip)
+        strip_min = min(self.train_strip)
+
+        # Avoid division by zero
+        if strip_min == 0:
+            pk = 0.1 # Arbitrary small number
+        else:
+            pk = 1000 * ((strip_sum / (self.k * strip_min)) - 1)
+
+        # 5. The Quotient Criterion
+        # Stop if GL / Pk > alpha
+        # Intuition: Stop if Overfitting is high AND Progress is low.
+
+        # Avoid division by zero
+        if pk == 0:
+            pk = 1e-6
+
+        quotient = gl / pk
+
+        if self.verbose > 1:
+            _LOGGER.info(f"Epoch {epoch}: GL={gl:.3f} | Pk={pk:.3f} | Quotient={quotient:.3f} (Threshold={self.alpha})")
+
+        if quotient > self.alpha:
+            self._stop_training(epoch, f"Prechelt Criterion triggered. Generalization/Progress quotient ({quotient:.3f}) > alpha ({self.alpha}).")
 
 
 class DragonModelCheckpoint(_Callback):
     """
     Saves the model weights, optimizer state, LR scheduler state (if any), and epoch number to a directory with automated filename generation and rotation.
     """
-    def __init__(self,
-
+    def __init__(self,
+                 save_dir: Union[str, Path],
+                 monitor: Literal["Training Loss", "Validation Loss", "both"] = "Validation Loss",
+                 save_three_best: bool = True,
+                 mode: Literal['min', 'max'] = 'min',
+                 verbose: int = 0):
         """
-        - If `save_best_only` is True, it saves the single best model, deleting the previous best.
-        - If `save_best_only` is False, it keeps the 3 most recent checkpoints, deleting the oldest ones automatically.
-
         Args:
             save_dir (str): Directory where checkpoint files will be saved.
-            monitor (str): Metric to monitor.
-
-
+            monitor (str): Metric to monitor. If "both", the sum of training loss and validation loss is used.
+            save_three_best (bool):
+                - If True, keeps the top 3 best checkpoints found during training (based on metric).
+                - If False, keeps the 3 most recent checkpoints (rolling window).
+            mode (str): One of {'min', 'max'}.
             verbose (int): Verbosity mode.
         """
-
         super().__init__()
         self.save_dir = make_fullpath(save_dir, make=True, enforce="directory")
-        if not self.save_dir.is_dir():
-            _LOGGER.error(f"{save_dir} is not a valid directory.")
-            raise IOError()
 
-
-
+        # Standardize monitor key
+        if monitor == "Training Loss":
+            std_monitor = PyTorchLogKeys.TRAIN_LOSS
+        elif monitor == "Validation Loss":
+            std_monitor = PyTorchLogKeys.VAL_LOSS
+        elif monitor == "both":
+            std_monitor = "both"
+        else:
+            _LOGGER.error(f"Unknown monitor key: {monitor}.")
+            raise ValueError()
+
+        self.monitor = std_monitor
+        self.save_three_best = save_three_best
         self.verbose = verbose
         self._latest_checkpoint_path = None
         self._checkpoint_name = PyTorchCheckpointKeys.CHECKPOINT_NAME
 
-        # State variables
-
-        self.
+        # State variables
+        # stored as list of dicts: [{'path': Path, 'score': float, 'epoch': int}]
+        self.best_checkpoints = []
+        # For rolling check (save_three_best=False)
+        self.recent_checkpoints = []
 
-        if mode not in ['
-            _LOGGER.error(f"ModelCheckpoint mode {mode} is unknown.")
+        if mode not in ['min', 'max']:
+            _LOGGER.error(f"ModelCheckpoint mode {mode} is unknown. Use 'min' or 'max'.")
             raise ValueError()
         self.mode = mode
 
+        # Determine comparison operator
         if self.mode == 'min':
             self.monitor_op = np.less
-
-            self.monitor_op = np.greater
+            self.best = np.inf
         else:
-            self.monitor_op = np.
-
-        self.best = np.inf if self.monitor_op == np.less else -np.inf
+            self.monitor_op = np.greater
+            self.best = -np.inf
 
     def on_train_begin(self, logs=None):
-        """Reset state when training starts.
-        self.best
-        self.
-        self.
+        """Reset file tracking state when training starts.
+        NOTE: Do NOT reset self.best here if it differs from the default. This allows the Trainer to restore 'best' from a checkpoint before calling train()."""
+        self.best_checkpoints = []
+        self.recent_checkpoints = []
+
+        # Check if self.best is at default initialization value
+        is_default_min = (self.mode == 'min' and self.best == np.inf)
+        is_default_max = (self.mode == 'max' and self.best == -np.inf)
+
+        # If it is NOT default, it means it was restored.
+        if not (is_default_min or is_default_max):
+            _LOGGER.debug(f"Resuming with best score: {self.best:.4f}")
+
+    def _get_metric_value(self, logs):
+        """Extracts or calculates the metric value based on configuration."""
+        if self.monitor == "both":
+            t_loss = logs.get(PyTorchLogKeys.TRAIN_LOSS)
+            v_loss = logs.get(PyTorchLogKeys.VAL_LOSS)
+            if t_loss is None or v_loss is None:
+                return None
+            return t_loss + v_loss
+        else:
+            return logs.get(self.monitor)
 
     def on_epoch_end(self, epoch, logs=None):
         logs = logs or {}
+        current_score = self._get_metric_value(logs)
 
-        if
-            self.
-
-            self._save_rolling_checkpoints(epoch, logs)
-
-    def _save_best_model(self, epoch, logs):
-        """Saves a single best model and deletes the previous one."""
-        current = logs.get(self.monitor)
-        if current is None:
+        if current_score is None:
+            if self.verbose > 0:
+                _LOGGER.warning(f"Epoch {epoch}: Metric '{self.monitor}' not found in logs. Skipping checkpoint.")
             return
-
-
-
-
-        # Create a descriptive filename
-        self.save_dir.mkdir(parents=True, exist_ok=True)
-        current_string = str(round(current, ndigits=2)).replace('.', '_')
-        filename = f"epoch{epoch}_{self._checkpoint_name}-{current_string}.pth"
-        new_filepath = self.save_dir / filename
-
+
+        # 1. Update global best score (for logging/metadata)
+        if self.monitor_op(current_score, self.best):
             if self.verbose > 0:
-
-
-
-            self.best =
+                # Only log explicit "improvement" if we are beating the historical best
+                old_best_str = f"{self.best:.4f}" if not np.isinf(self.best) else "inf"
+                _LOGGER.info(f"Epoch {epoch}: {self.monitor} improved from {old_best_str} to {current_score:.4f}")
+            self.best = current_score
 
-
-
-
-
-            PyTorchCheckpointKeys.OPTIMIZER_STATE: self.trainer.optimizer.state_dict(), # type: ignore
-            PyTorchCheckpointKeys.BEST_SCORE: self.best,
-            PyTorchCheckpointKeys.HISTORY: self.trainer.history, # type: ignore
-        }
-
-        # Check for scheduler
-        if hasattr(self.trainer, 'scheduler') and self.trainer.scheduler is not None: # type: ignore
-            checkpoint_data[PyTorchCheckpointKeys.SCHEDULER_STATE] = self.trainer.scheduler.state_dict() # type: ignore
-
-        # Save the new best model
-        torch.save(checkpoint_data, new_filepath)
-        self._latest_checkpoint_path = new_filepath
-
-        # Delete the old best model file
-        if self.last_best_filepath and self.last_best_filepath.exists():
-            self.last_best_filepath.unlink()
-
-        # Update state
-        self.last_best_filepath = new_filepath
+        if self.save_three_best:
+            self._save_top_k_checkpoints(epoch, current_score)
+        else:
+            self._save_rolling_checkpoints(epoch, current_score)
 
-    def
-        """
-        current = logs.get(self.monitor)
-
+    def _save_checkpoint_file(self, epoch, current_score):
+        """Helper to physically save the file."""
         self.save_dir.mkdir(parents=True, exist_ok=True)
-        current_string = str(round(current, ndigits=2)).replace('.', '_')
-        filename = f"epoch{epoch}_{self._checkpoint_name}-{current_string}.pth"
-        filepath = self.save_dir / filename
 
-
-
+        # Create filename
+        score_str = f"{current_score:.4f}".replace('.', '_')
+        filename = f"epoch{epoch}_{self._checkpoint_name}-{score_str}.pth"
+        filepath = self.save_dir / filename
 
-        # Create
+        # Create checkpoint dict
         checkpoint_data = {
             PyTorchCheckpointKeys.EPOCH: epoch,
             PyTorchCheckpointKeys.MODEL_STATE: self.trainer.model.state_dict(), # type: ignore
             PyTorchCheckpointKeys.OPTIMIZER_STATE: self.trainer.optimizer.state_dict(), # type: ignore
-            PyTorchCheckpointKeys.BEST_SCORE:
+            PyTorchCheckpointKeys.BEST_SCORE: current_score,
             PyTorchCheckpointKeys.HISTORY: self.trainer.history, # type: ignore
         }
 
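To make the quotient criterion in `DragonPrecheltEarlyStopping.on_epoch_end` (hunk above) concrete, here is a standalone sketch of one epoch's arithmetic using the same formulas from the code; the loss values are invented for illustration:

# One epoch of the Prechelt check, outside the callback, with made-up losses.
train_strip = [0.50, 0.48, 0.47, 0.46, 0.45]  # last k=5 training losses
k, alpha = 5, 0.75
val_loss, best_val_loss = 0.42, 0.40

gl = 100 * ((val_loss / best_val_loss) - 1)                    # GL = 5.0 (5% above best)
pk = 1000 * ((sum(train_strip) / (k * min(train_strip))) - 1)  # Pk ~= 48.9 (still progressing)
quotient = gl / pk                                             # ~0.102

# 0.102 <= alpha, so training continues; the stop fires only once
# generalization loss (GL) grows while training progress (Pk) stalls.
print(f"GL={gl:.2f} Pk={pk:.2f} quotient={quotient:.3f} stop={quotient > alpha}")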
@@ -321,91 +433,269 @@ class DragonModelCheckpoint(_Callback):
             checkpoint_data[PyTorchCheckpointKeys.SCHEDULER_STATE] = self.trainer.scheduler.state_dict() # type: ignore
 
         torch.save(checkpoint_data, filepath)
-
         self._latest_checkpoint_path = filepath
+
+        return filepath
+
+    def _save_top_k_checkpoints(self, epoch, current_score):
+        """Logic for maintaining the top 3 best checkpoints."""
+
+        def sort_key(item): return item['score']
+
+        # Determine sort direction so that Index 0 is BEST and Index -1 is WORST
+        # Min mode (lower is better): Ascending (reverse=False) -> [0.1, 0.5, 0.9] (0.1 is best)
+        # Max mode (higher is better): Descending (reverse=True) -> [0.9, 0.5, 0.1] (0.9 is best)
+        is_reverse = (self.mode == 'max')
 
-
+        should_save = False
+
+        if len(self.best_checkpoints) < 3:
+            should_save = True
+        else:
+            # Sort current list to identify the worst (last item)
+            self.best_checkpoints.sort(key=sort_key, reverse=is_reverse)
+            worst_entry = self.best_checkpoints[-1]
+
+            # Check if current is better than the worst in the list
+            # min mode: current < worst['score']
+            # max mode: current > worst['score']
+            if self.monitor_op(current_score, worst_entry['score']):
+                should_save = True
+
+        if should_save:
+            filepath = self._save_checkpoint_file(epoch, current_score)
+
+            if self.verbose > 0:
+                _LOGGER.info(f"Epoch {epoch}: {self.monitor} ({current_score:.4f}) is in top 3. Saving to {filepath.name}")
 
-
-
-
+            self.best_checkpoints.append({'path': filepath, 'score': current_score, 'epoch': epoch})
+
+            # Prune if > 3
+            if len(self.best_checkpoints) > 3:
+                # Re-sort to ensure worst is at the end
+                self.best_checkpoints.sort(key=sort_key, reverse=is_reverse)
+
+                # Evict the last one (Worst)
+                entry_to_delete = self.best_checkpoints.pop(-1)
+
+                if entry_to_delete['path'].exists():
+                    if self.verbose > 0:
+                        _LOGGER.info(f" -> Deleting checkpoint outside top 3: {entry_to_delete['path'].name}")
+                    entry_to_delete['path'].unlink()
+
+    def _save_rolling_checkpoints(self, epoch, current_score):
+        """Saves the latest model and keeps only the 3 most recent ones."""
+        filepath = self._save_checkpoint_file(epoch, current_score)
+
+        if self.verbose > 0:
+            _LOGGER.info(f'Epoch {epoch}: saving rolling model to {filepath.name}')
+
+        self.recent_checkpoints.append(filepath)
+
+        # If we have more than 3 checkpoints, remove the oldest one
+        if len(self.recent_checkpoints) > 3:
+            file_to_delete = self.recent_checkpoints.pop(0)
             if file_to_delete.exists():
                 if self.verbose > 0:
-                    _LOGGER.info(f" -> Deleting old checkpoint: {file_to_delete.name}")
+                    _LOGGER.info(f" -> Deleting old rolling checkpoint: {file_to_delete.name}")
                 file_to_delete.unlink()
 
     @property
     def best_checkpoint_path(self):
-
+        # If tracking top 3, return the absolute best among them
+        if self.save_three_best and self.best_checkpoints:
+            def sort_key(item): return item['score']
+            is_reverse = (self.mode == 'max')
+            # Sort Best -> Worst
+            sorted_bests = sorted(self.best_checkpoints, key=sort_key, reverse=is_reverse)
+            # Index 0 is always the best based on the logic above
+            return sorted_bests[0]['path']
+
+        elif self._latest_checkpoint_path:
             return self._latest_checkpoint_path
         else:
             _LOGGER.error("No checkpoint paths saved.")
             raise ValueError()
 
 
-class DragonLRScheduler(_Callback):
+class _DragonLRScheduler(_Callback):
     """
-
+    Base class for Dragon LR Schedulers.
+    Handles common logic like logging and attaching to the trainer.
     """
-    def __init__(self
-
-
-
+    def __init__(self):
+        super().__init__()
+        self.scheduler = None
+        self.previous_lr = None
+
+    def set_trainer(self, trainer):
+        """Associates the callback with the trainer."""
+        super().set_trainer(trainer)
+        # Note: Subclasses must ensure self.scheduler is set before or during this call
+        # if they want to register it immediately.
+        if self.scheduler:
+            self.trainer.scheduler = self.scheduler # type: ignore
+
+    def on_train_begin(self, logs=None):
+        """Store the initial learning rate."""
+        if not self.trainer.optimizer: # type: ignore
+            _LOGGER.warning("No optimizer found in trainer. LRScheduler cannot track learning rate.")
+            return
+        self.previous_lr = self.trainer.optimizer.param_groups[0]['lr'] # type: ignore
+
+    def _check_and_log_lr(self, epoch, logs, verbose: bool):
+        """Helper to log LR changes and update history."""
+        if not self.trainer.optimizer: # type: ignore
+            return
 
+        current_lr = self.trainer.optimizer.param_groups[0]['lr'] # type: ignore
+
+        # Log change
+        if self.previous_lr is not None and current_lr != self.previous_lr:
+            if verbose:
+                print(f" > Epoch {epoch}: Learning rate changed to {current_lr:.6f}")
+            self.previous_lr = current_lr
+
+        # Log to dictionary
+        logs[PyTorchLogKeys.LEARNING_RATE] = current_lr
+
+        # Log to history
+        if hasattr(self.trainer, 'history'):
+            self.trainer.history.setdefault(PyTorchLogKeys.LEARNING_RATE, []).append(current_lr) # type: ignore
+
+
+class DragonScheduler(_DragonLRScheduler):
+    """
+    Callback for standard PyTorch Learning Rate Schedulers.
+
+    Compatible with: StepLR, MultiStepLR, ExponentialLR, CosineAnnealingLR, etc.
+
+    NOT Compatible with: ReduceLROnPlateau (Use `DragonReduceLROnPlateau` instead).
+    """
+    def __init__(self, scheduler, verbose: bool=True):
+        """
         Args:
-            scheduler: An initialized PyTorch learning rate scheduler.
-
+            scheduler: An initialized PyTorch learning rate scheduler instance.
+            verbose (bool): If True, logs learning rate changes to console.
         """
         super().__init__()
+        if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
+            raise ValueError(
+                "DragonLRScheduler does not support 'ReduceLROnPlateau'. "
+                "Please use the `DragonReduceLROnPlateau` callback instead."
+            )
         self.scheduler = scheduler
-        self.
-        self.previous_lr = None
+        self.verbose = verbose
 
     def set_trainer(self, trainer):
-        """This is called by the Trainer to associate itself with the callback."""
         super().set_trainer(trainer)
-        #
+        # Explicitly register the scheduler again to be safe
         self.trainer.scheduler = self.scheduler # type: ignore
-
-
-        """Store the initial learning rate."""
-        self.previous_lr = self.trainer.optimizer.param_groups[0]['lr'] # type: ignore
+        if self.verbose:
+            _LOGGER.info(f"Registered LR Scheduler: {self.scheduler.__class__.__name__}")
 
     def on_epoch_end(self, epoch, logs=None):
-        """Step the scheduler and log any change in learning rate."""
         logs = logs or {}
 
-        #
-
-        if self.monitor is None:
-            _LOGGER.error("LRScheduler needs a `monitor` metric for ReduceLROnPlateau.")
-            raise ValueError()
-
-        metric_val = logs.get(self.monitor) # type: ignore
-        if metric_val is not None:
-            self.scheduler.step(metric_val)
-        else:
-            _LOGGER.warning(f"LRScheduler could not find metric '{self.monitor}' in logs.")
+        # Standard step (no metrics needed)
+        self.scheduler.step()
 
-
+        self._check_and_log_lr(epoch, logs, self.verbose)
+
+
+class DragonReduceLROnPlateau(_DragonLRScheduler):
+    """
+    Specific callback for `torch.optim.lr_scheduler.ReduceLROnPlateau`. Reduces learning rate when a monitored metric has stopped improving.
+
+    This wrapper initializes the scheduler internally using the Trainer's optimizer, simplifying the setup process.
+    """
+    def __init__(self,
+                 monitor: Literal["Training Loss", "Validation Loss"] = "Validation Loss",
+                 mode: Literal['min', 'max'] = 'min',
+                 factor: float = 0.1,
+                 patience: int = 5,
+                 threshold: float = 1e-4,
+                 threshold_mode: Literal['rel', 'abs'] = 'rel',
+                 cooldown: int = 0,
+                 min_lr: float = 0,
+                 eps: float = 1e-8,
+                 verbose: bool = True):
+        """
+        Args:
+            monitor ("Training Loss", "Validation Loss"): Metric to monitor.
+            mode ('min', 'max'): One of 'min', 'max'.
+            factor (float): Factor by which the learning rate will be reduced. new_lr = lr * factor.
+            patience (int): Number of epochs with no improvement after which learning rate will be reduced.
+            threshold (float): Threshold for measuring the new optimum.
+            threshold_mode ('rel', 'abs'): One of 'rel', 'abs'.
+            cooldown (int): Number of epochs to wait before resuming normal operation after lr has been reduced.
+            min_lr (float or list): A scalar or a list of scalars.
+            eps (float): Minimal decay applied to lr.
+            verbose (bool): If True, logs learning rate changes to console.
+        """
+        super().__init__()
+
+        # Standardize monitor key
+        if monitor == "Training Loss":
+            std_monitor = PyTorchLogKeys.TRAIN_LOSS
+        elif monitor == "Validation Loss":
+            std_monitor = PyTorchLogKeys.VAL_LOSS
         else:
-
+            _LOGGER.error(f"Unknown monitor key: {monitor}.")
+            raise ValueError()
+
+        self.monitor = std_monitor
+        self.verbose = verbose
+
+        # Config storage for delayed initialization
+        self.config = {
+            'mode': mode,
+            'factor': factor,
+            'patience': patience,
+            'threshold': threshold,
+            'threshold_mode': threshold_mode,
+            'cooldown': cooldown,
+            'min_lr': min_lr,
+            'eps': eps,
+        }
+
+    def set_trainer(self, trainer):
+        """
+        Initializes the ReduceLROnPlateau scheduler using the trainer's optimizer and registers it.
+        """
+        super().set_trainer(trainer)
+
+        if not hasattr(self.trainer, 'optimizer'):
+            _LOGGER.error("Trainer has no optimizer. Cannot initialize ReduceLROnPlateau.")
+            raise ValueError()
 
-        #
-
+        # Initialize the actual scheduler with the optimizer
+        if self.verbose:
+            _LOGGER.info(f"Initializing ReduceLROnPlateau monitoring '{self.monitor}'")
+
+        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
+            optimizer=self.trainer.optimizer, # type: ignore
+            **self.config
+        )
+
+        # Register with trainer for checkpointing
+        self.trainer.scheduler = self.scheduler # type: ignore
 
-
-
-            _LOGGER.info(f"Epoch {epoch}: Learning rate changed to {current_lr:.6f}")
-        self.previous_lr = current_lr
+    def on_epoch_end(self, epoch, logs=None):
+        logs = logs or {}
 
-
-        # Add to the logs dict for any subsequent callbacks
-        logs[PyTorchLogKeys.LEARNING_RATE] = current_lr
+        metric_val = logs.get(self.monitor)
 
-
-
-
+        if metric_val is None:
+            _LOGGER.warning(f"DragonReduceLROnPlateau could not find metric '{self.monitor}' in logs. Scheduler step skipped.")
+            # Still log LR to keep history consistent
+            self._check_and_log_lr(epoch, logs, self.verbose)
+            return
+
+        # Step with metric
+        self.scheduler.step(metric_val)
+
+        self._check_and_log_lr(epoch, logs, self.verbose)
 
 
 def info():