congrads 1.0.6-py3-none-any.whl → 1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
congrads/core.py CHANGED
@@ -1,15 +1,15 @@
- """
- This module provides the CongradsCore class, which is designed to integrate
- constraint-guided optimization into neural network training.
- It extends traditional training processes by enforcing specific constraints
- on the model's outputs, ensuring that the network satisfies domain-specific
+ """This module provides the core CongradsCore class for the main training functionality.
+
+ It is designed to integrate constraint-guided optimization into neural network training.
+ It extends traditional training processes by enforcing specific constraints
+ on the model's outputs, ensuring that the network satisfies domain-specific
  requirements during both training and evaluation.

- The `CongradsCore` class serves as the central engine for managing the
- training, validation, and testing phases of a neural network model,
- incorporating constraints that influence the loss function and model updates.
- The model is trained with standard loss functions while also incorporating
- constraint-based adjustments, which are tracked and logged
+ The `CongradsCore` class serves as the central engine for managing the
+ training, validation, and testing phases of a neural network model,
+ incorporating constraints that influence the loss function and model updates.
+ The model is trained with standard loss functions while also incorporating
+ constraint-based adjustments, which are tracked and logged
  throughout the process.

 Key features:
@@ -18,37 +18,19 @@ Key features:
 - Metric management for tracking loss and constraint satisfaction.
 - Checkpoint management for saving and evaluating model states.

- Modules in this package provide the following:
-
- - `Descriptor`: Describes variable layers in the network that are
- subject to constraints.
- - `Constraint`: Defines various constraints, which are used to guide
- the training process.
- - `MetricManager`: Manages and tracks performance metrics such as loss
- and constraint satisfaction.
- - `CheckpointManager`: Manages saving and loading model checkpoints
- during training.
- - Utility functions to validate inputs and configurations.
-
- Dependencies:
- - PyTorch (`torch`)
- - tqdm (for progress tracking)
-
- The `CongradsCore` class allows for the use of additional callback functions
- at different stages of the training process to customize behavior for
- specific needs. These include callbacks for the start and end of epochs, as
+ The `CongradsCore` class allows for the use of additional callback functions
+ at different stages of the training process to customize behavior for
+ specific needs. These include callbacks for the start and end of epochs, as
  well as the start and end of the entire training process.

 """

 import warnings
- from numbers import Number
- from typing import Callable
+ from collections.abc import Callable

 import torch
-
- # pylint: disable-next=redefined-builtin
- from torch import Tensor, float32, maximum, no_grad, norm, numel, sum, tensor
+ from torch import Tensor, float32, no_grad, sum, tensor
+ from torch.linalg import vector_norm
 from torch.nn import Module
 from torch.nn.modules.loss import _Loss
 from torch.optim import Optimizer
@@ -60,7 +42,10 @@ from .constraints import Constraint
 from .descriptor import Descriptor
 from .metrics import MetricManager
 from .utils import (
+ is_torch_loss,
+ torch_loss_wrapper,
 validate_callable,
+ validate_callable_iterable,
 validate_iterable,
 validate_loaders,
 validate_type,
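One note on the import swap above: the 1.0.6 code used `torch.norm` (whose generic form is deprecated), while 1.1.0 imports `torch.linalg.vector_norm`, which `train_step` later uses for per-sample gradient norms. A quick equivalence sketch for the two call patterns that appear in this diff:

    # torch.linalg.vector_norm replaces the older torch.norm call pattern;
    # both compute a per-row L2 norm here.
    import torch
    from torch.linalg import vector_norm

    g = torch.randn(4, 3)
    old_style = torch.norm(g, dim=1, p=2, keepdim=True)     # 1.0.6 pattern
    new_style = vector_norm(g, dim=1, ord=2, keepdim=True)  # 1.1.0 pattern
    assert torch.allclose(old_style, new_style)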
@@ -68,32 +53,11 @@ from .utils import (


 class CongradsCore:
- """
- The CongradsCore class is the central training engine for constraint-guided
- neural network optimization. It integrates standard neural network training
+ """The CongradsCore class is the central training engine for constraint-guided optimization.
+
+ It integrates standard neural network training
  with additional constraint-driven adjustments to the loss function, ensuring
  that the network satisfies domain-specific constraints during training.
-
- Args:
- descriptor (Descriptor): Describes variable layers in the network.
- constraints (list[Constraint]): List of constraints to guide training.
- loaders (tuple[DataLoader, DataLoader, DataLoader]): DataLoaders for
- training, validation, and testing.
- network (Module): The neural network model to train.
- criterion (callable): The loss function used for
- training and validation.
- optimizer (Optimizer): The optimizer used for updating model parameters.
- metric_manager (MetricManager): Manages metric tracking and recording.
- device (torch.device): The device (e.g., CPU or GPU) for computations.
- checkpoint_manager (CheckpointManager, optional): Manages
- checkpointing. If not set, no checkpointing is done.
- epsilon (Number, optional): A small value to avoid division by zero
- in gradient calculations. Default is 1e-10.
-
- Note:
- A warning is logged if the descriptor has no variable layers,
- as at least one variable layer is required for the constraint logic
- to influence the training process.
 """

 def __init__(
@@ -106,29 +70,69 @@ class CongradsCore:
 optimizer: Optimizer,
 metric_manager: MetricManager,
 device: torch.device,
+ network_uses_grad: bool = False,
 checkpoint_manager: CheckpointManager = None,
- epsilon: Number = 1e-6,
+ epsilon: float = 1e-6,
+ constraint_aggregator: Callable[..., Tensor] = sum,
+ disable_progress_bar_epoch: bool = False,
+ disable_progress_bar_batch: bool = False,
+ enforce_all: bool = True,
 ):
- """
- Initialize the CongradsCore object.
- """
+ """Initialize the CongradsCore object.

+ Args:
+ descriptor (Descriptor): Describes variable layers in the network.
+ constraints (list[Constraint]): List of constraints to guide training.
+ loaders (tuple[DataLoader, DataLoader, DataLoader]): DataLoaders for
+ training, validation, and testing.
+ network (Module): The neural network model to train.
+ criterion (callable): The loss function used for
+ training and validation.
+ optimizer (Optimizer): The optimizer used for updating model parameters.
+ metric_manager (MetricManager): Manages metric tracking and recording.
+ device (torch.device): The device (e.g., CPU or GPU) for computations.
+ network_uses_grad (bool, optional): A flag indicating if the network
+ contains gradient calculation computations. Default is False.
+ checkpoint_manager (CheckpointManager, optional): Manages
+ checkpointing. If not set, no checkpointing is done.
+ epsilon (float, optional): A small value to avoid division by zero
+ in gradient calculations. Default is 1e-6.
+ constraint_aggregator (Callable[..., Tensor], optional): A function
+ to aggregate the constraint rescale loss. Default is `sum`.
+ disable_progress_bar_epoch (bool, optional): If set to True, the epoch
+ progress bar will not show. Defaults to False.
+ disable_progress_bar_batch (bool, optional): If set to True, the batch
+ progress bar will not show. Defaults to False.
+ enforce_all (bool, optional): If set to False, constraints will only be monitored and
+ not influence the training process. Overrides constraint-specific `enforce` parameters.
+ Defaults to True.
+
+ Note:
+ A warning is logged if the descriptor has no variable layers,
+ as at least one variable layer is required for the constraint logic
+ to influence the training process.
+ """
 # Type checking
 validate_type("descriptor", descriptor, Descriptor)
- validate_iterable("constraints", constraints, Constraint)
- validate_loaders()
+ validate_iterable("constraints", constraints, Constraint, allow_empty=True)
+ validate_loaders("loaders", loaders)
 validate_type("network", network, Module)
 validate_type("criterion", criterion, _Loss)
 validate_type("optimizer", optimizer, Optimizer)
 validate_type("metric_manager", metric_manager, MetricManager)
 validate_type("device", device, torch.device)
+ validate_type("network_uses_grad", network_uses_grad, bool)
 validate_type(
 "checkpoint_manager",
 checkpoint_manager,
 CheckpointManager,
 allow_none=True,
 )
- validate_type("epsilon", epsilon, Number)
+ validate_type("epsilon", epsilon, float)
+ validate_callable("constraint_aggregator", constraint_aggregator, allow_none=True)
+ validate_type("disable_progress_bar_epoch", disable_progress_bar_epoch, bool)
+ validate_type("disable_progress_bar_batch", disable_progress_bar_batch, bool)
+ validate_type("enforce_all", enforce_all, bool)

 # Init object variables
 self.descriptor = descriptor
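For readers tracking the API change: a minimal construction sketch against the 1.1.0 signature shown above. The descriptor, constraints, loaders, and metric manager are placeholders here, not defined in this diff.

    # Sketch only: my_descriptor, my_constraints, the three DataLoaders,
    # and metric_manager are placeholders, not part of this diff.
    import torch

    core = CongradsCore(
        descriptor=my_descriptor,
        constraints=my_constraints,        # may now be empty (allow_empty=True)
        loaders=(train_loader, valid_loader, test_loader),
        network=my_network,
        criterion=torch.nn.MSELoss(),      # plain torch losses get wrapped internally
        optimizer=torch.optim.Adam(my_network.parameters()),
        metric_manager=metric_manager,
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        network_uses_grad=False,           # new in 1.1.0
        epsilon=1e-6,                      # now a float, not a Number
        constraint_aggregator=torch.sum,   # new in 1.1.0
        disable_progress_bar_epoch=False,  # new in 1.1.0
        disable_progress_bar_batch=False,  # new in 1.1.0
        enforce_all=True,                  # new in 1.1.0
    )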
@@ -137,30 +141,38 @@ class CongradsCore:
 self.valid_loader = loaders[1]
 self.test_loader = loaders[2]
 self.network = network
- self.criterion = criterion
 self.optimizer = optimizer
 self.metric_manager = metric_manager
 self.device = device
+ self.network_uses_grad = network_uses_grad
 self.checkpoint_manager = checkpoint_manager
-
- # Init epsilon tensor
- self.epsilon = tensor(epsilon, device=self.device)
+ self.epsilon = epsilon
+ self.constraint_aggregator = constraint_aggregator
+ self.disable_progress_bar_epoch = disable_progress_bar_epoch
+ self.disable_progress_bar_batch = disable_progress_bar_batch
+ self.enforce_all = enforce_all
+
+ # Check if criterion is a torch loss function
+ if is_torch_loss(criterion):
+ # If so, wrap it in a custom loss function
+ self.criterion = torch_loss_wrapper(criterion)
+ else:
+ self.criterion = criterion

 # Perform checks
- if len(self.descriptor.variable_layers) == 0:
+ if len(self.descriptor.variable_keys) == 0:
 warnings.warn(
 "The descriptor object has no variable layers. The constraint \
 guided loss adjustment is therefore not used. \
- Is this the intended behavior?"
+ Is this the intended behavior?",
+ stacklevel=2,
 )

 # Initialize constraint metrics
 self._initialize_metrics()

 def _initialize_metrics(self) -> None:
- """
- Register metrics for loss, constraint satisfaction ratio (CSR),
- and individual constraints.
+ """Register metrics for loss, constraint satisfaction ratio (CSR), and constraints.

 This method registers the following metrics:

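The criterion wrapping above lets the rest of the core call every loss with a uniform `criterion(output, target, data=data)` signature, as the loss calls later in this diff show. The internals of `is_torch_loss` and `torch_loss_wrapper` live in `congrads/utils.py` and are not part of this hunk; a plausible sketch of the wrapper's shape, under that assumption:

    # Hypothetical sketch only: the real torch_loss_wrapper is defined in
    # congrads/utils.py and is not shown in this diff.
    from torch import Tensor
    from torch.nn.modules.loss import _Loss

    def torch_loss_wrapper_sketch(criterion: _Loss):
        # Adapt a standard torch loss so it tolerates the extra
        # `data` keyword that the core passes to every criterion.
        def wrapped(output: Tensor, target: Tensor, **kwargs) -> Tensor:
            return criterion(output, target)  # ignore data=... kwarg
        return wrapped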
@@ -173,7 +185,6 @@
 - One metric per constraint, for both training and validation.

 """
-
 self.metric_manager.register("Loss/train", "during_training")
 self.metric_manager.register("Loss/valid", "during_training")
 self.metric_manager.register("Loss/test", "after_training")
@@ -184,414 +195,579 @@
 self.metric_manager.register("CSR/test", "after_training")

 for constraint in self.constraints:
- self.metric_manager.register(
- f"{constraint.name}/train", "during_training"
- )
- self.metric_manager.register(
- f"{constraint.name}/valid", "during_training"
- )
- self.metric_manager.register(
- f"{constraint.name}/test", "after_training"
- )
+ self.metric_manager.register(f"{constraint.name}/train", "during_training")
+ self.metric_manager.register(f"{constraint.name}/valid", "during_training")
+ self.metric_manager.register(f"{constraint.name}/test", "after_training")

 def fit(
 self,
 start_epoch: int = 0,
 max_epochs: int = 100,
- on_epoch_start: Callable[[int], None] = None,
- on_epoch_end: Callable[[int], None] = None,
- on_train_start: Callable[[int], None] = None,
- on_train_end: Callable[[int], None] = None,
+ test_model: bool = True,
+ on_batch_start: list[Callable[[dict[str, Tensor]], dict[str, Tensor]]] | None = None,
+ on_batch_end: list[Callable[[dict[str, Tensor]], dict[str, Tensor]]] | None = None,
+ on_train_batch_start: list[Callable[[dict[str, Tensor]], dict[str, Tensor]]] | None = None,
+ on_train_batch_end: list[Callable[[dict[str, Tensor]], dict[str, Tensor]]] | None = None,
+ on_valid_batch_start: list[Callable[[dict[str, Tensor]], dict[str, Tensor]]] | None = None,
+ on_valid_batch_end: list[Callable[[dict[str, Tensor]], dict[str, Tensor]]] | None = None,
+ on_test_batch_start: list[Callable[[dict[str, Tensor]], dict[str, Tensor]]] | None = None,
+ on_test_batch_end: list[Callable[[dict[str, Tensor]], dict[str, Tensor]]] | None = None,
+ on_epoch_start: list[Callable[[int], None]] | None = None,
+ on_epoch_end: list[Callable[[int], None]] | None = None,
+ on_train_start: list[Callable[[int], None]] | None = None,
+ on_train_end: list[Callable[[int], None]] | None = None,
+ on_train_completion_forward_pass: list[Callable[[dict[str, Tensor]], dict[str, Tensor]]]
+ | None = None,
+ on_val_completion_forward_pass: list[Callable[[dict[str, Tensor]], dict[str, Tensor]]]
+ | None = None,
+ on_test_completion_forward_pass: list[Callable[[dict[str, Tensor]], dict[str, Tensor]]]
+ | None = None,
+ on_test_start: list[Callable[[int], None]] | None = None,
+ on_test_end: list[Callable[[int], None]] | None = None,
 ) -> None:
- """
- Train the model for a given number of epochs.
+ """Train the model over multiple epochs with optional validation and testing.
+
+ This method manages the full training loop, including:
+
+ - Executing epoch-level and batch-level callbacks.
+ - Training and validating the model each epoch.
+ - Adjusting losses according to constraints.
+ - Logging metrics via the metric manager.
+ - Optional evaluation on the test set.
+ - Checkpointing the model during and after training.

 Args:
- start_epoch (int, optional): The epoch number to start the training
- with. Default is 0.
- max_epochs (int, optional): The number of epochs to train the
- model. Default is 100.
- on_epoch_start (Callable[[int], None], optional): A callback
- function that will be executed at the start of each epoch.
- on_epoch_end (Callable[[int], None], optional): A callback
- function that will be executed at the end of each epoch.
- on_train_start (Callable[[int], None], optional): A callback
- function that will be executed before the training starts.
- on_train_end (Callable[[int], None], optional): A callback
- function that will be executed after training ends.
+ start_epoch (int, optional): Epoch number to start training from. Defaults to 0.
+ max_epochs (int, optional): Total number of epochs to train. Defaults to 100.
+ test_model (bool, optional): If True, evaluate the model on the test set after training. Defaults to True.
+ on_batch_start (list[Callable], optional): Callbacks executed at the start of every batch. Defaults to None.
+ on_batch_end (list[Callable], optional): Callbacks executed at the end of every batch. Defaults to None.
+ on_train_batch_start (list[Callable], optional): Callbacks executed at the start of each training batch. Defaults to `on_batch_start` if not provided.
+ on_train_batch_end (list[Callable], optional): Callbacks executed at the end of each training batch. Defaults to `on_batch_end` if not provided.
+ on_valid_batch_start (list[Callable], optional): Callbacks executed at the start of each validation batch. Defaults to `on_batch_start` if not provided.
+ on_valid_batch_end (list[Callable], optional): Callbacks executed at the end of each validation batch. Defaults to `on_batch_end` if not provided.
+ on_test_batch_start (list[Callable], optional): Callbacks executed at the start of each test batch. Defaults to `on_batch_start` if not provided.
+ on_test_batch_end (list[Callable], optional): Callbacks executed at the end of each test batch. Defaults to `on_batch_end` if not provided.
+ on_epoch_start (list[Callable], optional): Callbacks executed at the start of each epoch. Defaults to None.
+ on_epoch_end (list[Callable], optional): Callbacks executed at the end of each epoch. Defaults to None.
+ on_train_start (list[Callable], optional): Callbacks executed before training starts. Defaults to None.
+ on_train_end (list[Callable], optional): Callbacks executed after training ends. Defaults to None.
+ on_train_completion_forward_pass (list[Callable], optional): Callbacks executed after the forward pass during training. Defaults to None.
+ on_val_completion_forward_pass (list[Callable], optional): Callbacks executed after the forward pass during validation. Defaults to None.
+ on_test_completion_forward_pass (list[Callable], optional): Callbacks executed after the forward pass during testing. Defaults to None.
+ on_test_start (list[Callable], optional): Callbacks executed before testing starts. Defaults to None.
+ on_test_end (list[Callable], optional): Callbacks executed after testing ends. Defaults to None.
+
+ Notes:
+ - If phase-specific callbacks (train/valid/test) are not provided, the global `on_batch_start` and `on_batch_end` are used.
+ - Training metrics, loss adjustments, and constraint satisfaction ratios are automatically logged via the metric manager.
+ - The final model checkpoint is saved if a checkpoint manager is configured.
 """
-
 # Type checking
 validate_type("start_epoch", start_epoch, int)
- validate_callable("on_epoch_start", on_epoch_start, True)
- validate_callable("on_epoch_end", on_epoch_end, True)
- validate_callable("on_train_start", on_train_start, True)
- validate_callable("on_train_end", on_train_end, True)
+ validate_type("max_epochs", max_epochs, int)
+ validate_type("test_model", test_model, bool)
+ validate_callable_iterable("on_batch_start", on_batch_start, allow_none=True)
+ validate_callable_iterable("on_batch_end", on_batch_end, allow_none=True)
+ validate_callable_iterable("on_train_batch_start", on_train_batch_start, allow_none=True)
+ validate_callable_iterable("on_train_batch_end", on_train_batch_end, allow_none=True)
+ validate_callable_iterable("on_valid_batch_start", on_valid_batch_start, allow_none=True)
+ validate_callable_iterable("on_valid_batch_end", on_valid_batch_end, allow_none=True)
+ validate_callable_iterable("on_test_batch_start", on_test_batch_start, allow_none=True)
+ validate_callable_iterable("on_test_batch_end", on_test_batch_end, allow_none=True)
+ validate_callable_iterable("on_epoch_start", on_epoch_start, allow_none=True)
+ validate_callable_iterable("on_epoch_end", on_epoch_end, allow_none=True)
+ validate_callable_iterable("on_train_start", on_train_start, allow_none=True)
+ validate_callable_iterable("on_train_end", on_train_end, allow_none=True)
+ validate_callable_iterable(
+ "on_train_completion_forward_pass",
+ on_train_completion_forward_pass,
+ allow_none=True,
+ )
+ validate_callable_iterable(
+ "on_val_completion_forward_pass",
+ on_val_completion_forward_pass,
+ allow_none=True,
+ )
+ validate_callable_iterable(
+ "on_test_completion_forward_pass",
+ on_test_completion_forward_pass,
+ allow_none=True,
+ )
+ validate_callable_iterable("on_test_start", on_test_start, allow_none=True)
+ validate_callable_iterable("on_test_end", on_test_end, allow_none=True)
+
+ # Use global batch callback if phase-specific callback is unset
+ # Init callbacks as empty list if None
+ on_train_batch_start = on_train_batch_start or on_batch_start or []
+ on_train_batch_end = on_train_batch_end or on_batch_end or []
+ on_valid_batch_start = on_valid_batch_start or on_batch_start or []
+ on_valid_batch_end = on_valid_batch_end or on_batch_end or []
+ on_test_batch_start = on_test_batch_start or on_batch_start or []
+ on_test_batch_end = on_test_batch_end or on_batch_end or []
+ on_batch_start = on_batch_start or []
+ on_batch_end = on_batch_end or []
+ on_epoch_start = on_epoch_start or []
+ on_epoch_end = on_epoch_end or []
+ on_train_start = on_train_start or []
+ on_train_end = on_train_end or []
+ on_train_completion_forward_pass = on_train_completion_forward_pass or []
+ on_val_completion_forward_pass = on_val_completion_forward_pass or []
+ on_test_completion_forward_pass = on_test_completion_forward_pass or []
+ on_test_start = on_test_start or []
+ on_test_end = on_test_end or []
319
 
231
320
  # Keep track of epoch
232
321
  epoch = start_epoch
233
322
 
234
323
  # Execute training start hook if set
235
- if on_train_start:
236
- on_train_start(epoch)
237
-
238
- for i in tqdm(range(epoch, max_epochs), initial=epoch, desc="Epoch"):
324
+ for callback in on_train_start:
325
+ callback(epoch)
326
+
327
+ for i in tqdm(
328
+ range(epoch, max_epochs),
329
+ initial=epoch,
330
+ desc="Epoch",
331
+ disable=self.disable_progress_bar_epoch,
332
+ ):
239
333
  epoch = i
240
334
 
241
335
  # Execute epoch start hook if set
242
- if on_epoch_start:
243
- on_epoch_start(epoch)
336
+ for callback in on_epoch_start:
337
+ callback(epoch)
244
338
 
245
339
  # Execute training and validation epoch
246
- self._train_epoch()
247
- self._validate_epoch()
340
+ self._train_epoch(
341
+ on_train_batch_start,
342
+ on_train_batch_end,
343
+ on_train_completion_forward_pass,
344
+ )
345
+ self._validate_epoch(
346
+ on_valid_batch_start,
347
+ on_valid_batch_end,
348
+ on_val_completion_forward_pass,
349
+ )
248
350
 
249
351
  # Checkpointing
250
352
  if self.checkpoint_manager:
251
353
  self.checkpoint_manager.evaluate_criteria(epoch)
252
354
 
253
355
  # Execute epoch end hook if set
254
- if on_epoch_end:
255
- on_epoch_end(epoch)
356
+ for callback in on_epoch_end:
357
+ callback(epoch)
358
+
359
+ # Execute training end hook if set
360
+ for callback in on_train_end:
361
+ callback(epoch)
362
+
363
+ # Evaluate model performance on unseen test set if required
364
+ if test_model:
365
+ # Execute test end hook if set
366
+ for callback in on_test_start:
367
+ callback(epoch)
368
+
369
+ self._test_model(
370
+ on_test_batch_start,
371
+ on_test_batch_end,
372
+ on_test_completion_forward_pass,
373
+ )
256
374
 
257
- # Evaluate model performance on unseen test set
258
- self._test_model()
375
+ # Execute test end hook if set
376
+ for callback in on_test_end:
377
+ callback(epoch)
259
378
 
260
379
  # Save final model
261
380
  if self.checkpoint_manager:
262
381
  self.checkpoint_manager.save(epoch, "checkpoint_final.pth")
263
382
 
264
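A usage sketch for the reworked `fit()`: testing now runs by default after training, and every hook takes a list of callables rather than a single function. This assumes a configured `core` as in the constructor sketch earlier.

    # Usage sketch for the 1.1.0 fit() signature.
    def log_epoch(epoch: int) -> None:
        print(f"finished epoch {epoch}")

    core.fit(
        start_epoch=0,
        max_epochs=50,
        test_model=True,           # run _test_model after training (new default)
        on_epoch_end=[log_epoch],  # callbacks are now lists, not single callables
    )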
- # Execute training end hook if set
265
- if on_train_end:
266
- on_train_end(epoch)
267
-
268
- def _train_epoch(self) -> None:
269
- """
270
- Perform training for a single epoch.
383
+ def _train_epoch(
384
+ self,
385
+ on_train_batch_start: tuple[Callable[[dict[str, Tensor]], dict[str, Tensor]], ...],
386
+ on_train_batch_end: tuple[Callable[[dict[str, Tensor]], dict[str, Tensor]], ...],
387
+ on_train_completion_forward_pass: tuple[
388
+ Callable[[dict[str, Tensor]], dict[str, Tensor]], ...
389
+ ],
390
+ ) -> None:
391
+ """Perform a single training epoch over all batches.
271
392
 
272
- This method:
273
- - Sets the model to training mode.
274
- - Processes batches from the training DataLoader.
275
- - Computes predictions and losses.
276
- - Adjusts losses based on constraints.
277
- - Updates model parameters using backpropagation.
393
+ This method sets the network to training mode, iterates over the training
394
+ DataLoader, computes predictions, evaluates losses, applies constraint-based
395
+ adjustments, performs backpropagation, and updates model parameters. It also
396
+ supports executing optional callbacks at different stages of the batch
397
+ processing.
278
398
 
279
399
  Args:
280
- epoch (int): The current epoch number.
281
- """
400
+ on_train_batch_start (tuple[Callable[[dict[str, Tensor]], dict[str, Tensor]], ...]):
401
+ Callbacks executed at the start of each batch. Each callback receives the
402
+ data dictionary and returns updated versions.
403
+ on_train_batch_end (tuple[Callable[[dict[str, Tensor]], dict[str, Tensor]], ...]):
404
+ Callbacks executed at the end of each batch. Each callback receives the
405
+ data dictionary and returns updated versions.
406
+ on_train_completion_forward_pass (tuple[Callable[[dict[str, Tensor]], dict[str, Tensor]], ...]):
407
+ Callbacks executed immediately after the forward pass of the batch.
408
+ Each callback receives the data dictionary and returns updated versions.
282
409
 
410
+ Returns:
411
+ None
412
+ """
283
413
  # Set model in training mode
284
414
  self.network.train()
285
415
 
286
- for batch in tqdm(
287
- self.train_loader, desc="Training batches", leave=False
416
+ for data in tqdm(
417
+ self.train_loader,
418
+ desc="Training batches",
419
+ leave=False,
420
+ disable=self.disable_progress_bar_batch,
288
421
  ):
422
+ # Transfer batch data to GPU
423
+ data: dict[str, Tensor] = {key: value.to(self.device) for key, value in data.items()}
289
424
 
290
- # Get input-output pairs from batch
291
- inputs, outputs = batch
292
-
293
- # Transfer to GPU
294
- inputs, outputs = inputs.to(self.device), outputs.to(self.device)
425
+ # Execute on batch start callbacks
426
+ for callback in on_train_batch_start:
427
+ data = callback(data)
295
428
 
296
429
  # Model computations
297
- prediction = self.network(inputs)
430
+ data = self.network(data)
431
+
432
+ # Execute on completion forward pass callbacks
433
+ for callback in on_train_completion_forward_pass:
434
+ data = callback(data)
298
435
 
299
436
  # Calculate loss
300
- loss = self.criterion(prediction["output"], outputs)
437
+ loss = self.criterion(
438
+ data["output"],
439
+ data["target"],
440
+ data=data,
441
+ )
301
442
  self.metric_manager.accumulate("Loss/train", loss.unsqueeze(0))
302
443
 
303
444
  # Adjust loss based on constraints
304
- combined_loss = self.train_step(prediction, loss)
445
+ combined_loss = self.train_step(
446
+ data,
447
+ loss,
448
+ self.constraints,
449
+ self.descriptor,
450
+ self.metric_manager,
451
+ self.device,
452
+ constraint_aggregator=self.constraint_aggregator,
453
+ epsilon=self.epsilon,
454
+ enforce_all=self.enforce_all,
455
+ )
305
456
 
306
457
  # Backprop
307
458
  self.optimizer.zero_grad()
308
- combined_loss.backward(
309
- retain_graph=False, inputs=list(self.network.parameters())
310
- )
459
+ combined_loss.backward(retain_graph=False, inputs=list(self.network.parameters()))
311
460
  self.optimizer.step()
312
461
 
313
- def _validate_epoch(self) -> None:
314
- """
315
- Perform validation for a single epoch.
462
+ # Execute on batch end callbacks
463
+ for callback in on_train_batch_end:
464
+ data = callback(data)
465
+
466
+ def _validate_epoch(
467
+ self,
468
+ on_valid_batch_start: tuple[Callable[[dict[str, Tensor]], dict[str, Tensor]]],
469
+ on_valid_batch_end: tuple[Callable[[dict[str, Tensor]], dict[str, Tensor]]],
470
+ on_valid_completion_forward_pass: tuple[Callable[[dict[str, Tensor]], dict[str, Tensor]]],
471
+ ) -> None:
472
+ """Perform a single validation epoch over all batches.
316
473
 
317
- This method:
318
- - Sets the model to evaluation mode.
319
- - Processes batches from the validation DataLoader.
320
- - Computes predictions and losses.
321
- - Logs constraint satisfaction ratios.
474
+ This method sets the network to evaluation mode, iterates over the validation
475
+ DataLoader, computes predictions, evaluates losses, and logs constraint
476
+ satisfaction. Optional callbacks can be executed at the start and end of each
477
+ batch, as well as after the forward pass.
322
478
 
323
479
  Args:
324
- epoch (int): The current epoch number.
325
- """
480
+ on_valid_batch_start (tuple[Callable[[dict[str, Tensor]], dict[str, Tensor]]]):
481
+ Callbacks executed at the start of each validation batch. Each callback
482
+ receives the data dictionary and returns updated versions.
483
+ on_valid_batch_end (tuple[Callable[[dict[str, Tensor]], dict[str, Tensor]]]):
484
+ Callbacks executed at the end of each validation batch. Each callback
485
+ receives the data dictionary and returns updated versions.
486
+ on_valid_completion_forward_pass (tuple[Callable[[dict[str, Tensor]], dict[str, Tensor]]]):
487
+ Callbacks executed immediately after the forward pass of the validation batch.
488
+ Each callback receives the data dictionary and returns updated versions.
326
489
 
490
+ Returns:
491
+ None
492
+ """
327
493
  # Set model in evaluation mode
328
494
  self.network.eval()
329
495
 
330
- with no_grad():
331
- for batch in tqdm(
332
- self.valid_loader, desc="Validation batches", leave=False
496
+ # Enable or disable gradient tracking for validation pass
497
+ with torch.set_grad_enabled(self.network_uses_grad):
498
+ # Loop over validation batches
499
+ for data in tqdm(
500
+ self.valid_loader,
501
+ desc="Validation batches",
502
+ leave=False,
503
+ disable=self.disable_progress_bar_batch,
333
504
  ):
505
+ # Transfer batch data to GPU
506
+ data: dict[str, Tensor] = {
507
+ key: value.to(self.device) for key, value in data.items()
508
+ }
334
509
 
335
- # Get input-output pairs from batch
336
- inputs, outputs = batch
337
-
338
- # Transfer to GPU
339
- inputs, outputs = inputs.to(self.device), outputs.to(
340
- self.device
341
- )
510
+ # Execute on batch start callbacks
511
+ for callback in on_valid_batch_start:
512
+ data = callback(data)
342
513
 
343
514
  # Model computations
344
- prediction = self.network(inputs)
515
+ data = self.network(data)
516
+
517
+ # Execute on completion forward pass callbacks
518
+ for callback in on_valid_completion_forward_pass:
519
+ data = callback(data)
345
520
 
346
521
  # Calculate loss
347
- loss = self.criterion(prediction["output"], outputs)
522
+ loss = self.criterion(
523
+ data["output"],
524
+ data["target"],
525
+ data=data,
526
+ )
348
527
  self.metric_manager.accumulate("Loss/valid", loss.unsqueeze(0))
349
528
 
350
529
  # Validate constraints
351
- self.valid_step(prediction, loss)
530
+ self.valid_step(
531
+ data,
532
+ loss,
533
+ self.constraints,
534
+ self.metric_manager,
535
+ )
352
536
 
353
- def _test_model(self) -> None:
354
- """
355
- Evaluate model performance on the test set.
537
+ # Execute on batch end callbacks
538
+ for callback in on_valid_batch_end:
539
+ data = callback(data)
540
+
+ def _test_model(
+ self,
+ on_test_batch_start: tuple[Callable[[dict[str, Tensor]], dict[str, Tensor]]],
+ on_test_batch_end: tuple[Callable[[dict[str, Tensor]], dict[str, Tensor]]],
+ on_test_completion_forward_pass: tuple[Callable[[dict[str, Tensor]], dict[str, Tensor]]],
+ ) -> None:
+ """Evaluate the model on the test dataset.

- This method:
- - Sets the model to evaluation mode.
- - Processes batches from the test DataLoader.
- - Computes predictions and losses.
- - Logs constraint satisfaction ratios.
+ This method sets the network to evaluation mode, iterates over the test
+ DataLoader, computes predictions, evaluates losses, and logs constraint
+ satisfaction. Optional callbacks can be executed at the start and end of
+ each batch, as well as after the forward pass.

- """
+ Args:
+ on_test_batch_start (tuple[Callable[[dict[str, Tensor]], dict[str, Tensor]]]):
+ Callbacks executed at the start of each test batch. Each callback
+ receives the data dictionary and returns updated versions.
+ on_test_batch_end (tuple[Callable[[dict[str, Tensor]], dict[str, Tensor]]]):
+ Callbacks executed at the end of each test batch. Each callback
+ receives the data dictionary and returns updated versions.
+ on_test_completion_forward_pass (tuple[Callable[[dict[str, Tensor]], dict[str, Tensor]]]):
+ Callbacks executed immediately after the forward pass of the test batch.
+ Each callback receives the data dictionary and returns updated versions.

+ Returns:
+ None
+ """
 # Set model in evaluation mode
 self.network.eval()

- with no_grad():
- for batch in tqdm(
- self.test_loader, desc="Test batches", leave=False
+ # Enable or disable gradient tracking for test pass
+ with torch.set_grad_enabled(self.network_uses_grad):
+ # Loop over test batches
+ for data in tqdm(
+ self.test_loader,
+ desc="Test batches",
+ leave=False,
+ disable=self.disable_progress_bar_batch,
 ):
+ # Transfer batch data to GPU
+ data: dict[str, Tensor] = {
+ key: value.to(self.device) for key, value in data.items()
+ }

- # Get input-output pairs from batch
- inputs, outputs = batch
-
- # Transfer to GPU
- inputs, outputs = inputs.to(self.device), outputs.to(
- self.device
- )
+ # Execute on batch start callbacks
+ for callback in on_test_batch_start:
+ data = callback(data)

 # Model computations
- prediction = self.network(inputs)
+ data = self.network(data)
+
+ # Execute on completion forward pass callbacks
+ for callback in on_test_completion_forward_pass:
+ data = callback(data)

 # Calculate loss
- loss = self.criterion(prediction["output"], outputs)
+ loss = self.criterion(
+ data["output"],
+ data["target"],
+ data=data,
+ )
 self.metric_manager.accumulate("Loss/test", loss.unsqueeze(0))

 # Validate constraints
- self.test_step(prediction, loss)
+ self.test_step(
+ data,
+ loss,
+ self.constraints,
+ self.metric_manager,
+ )
+
+ # Execute on batch end callbacks
+ for callback in on_test_batch_end:
+ data = callback(data)

+ @staticmethod
 def train_step(
- self,
- prediction: dict[str, Tensor],
+ data: dict[str, Tensor],
 loss: Tensor,
+ constraints: list[Constraint],
+ descriptor: Descriptor,
+ metric_manager: MetricManager,
+ device: torch.device,
+ constraint_aggregator: Callable = torch.sum,
+ epsilon: float = 1e-6,
+ enforce_all: bool = True,
 ) -> Tensor:
- """
- Adjust the training loss based on constraints
- and compute the combined loss.
+ """Adjust the training loss based on constraints and compute the combined loss.
+
+ This method calculates the directions in which the network outputs should be
+ adjusted to satisfy constraints, scales these adjustments according to the
+ constraint's rescale factor and gradient norms, and adds the result to the
+ base loss. It also logs the constraint satisfaction ratio (CSR) for monitoring.

 Args:
- prediction (dict[str, Tensor]): Model predictions
- for variable layers.
+ data (dict[str, Tensor]): Dictionary containing the batch data, predictions and additional data.
 loss (Tensor): The base loss computed by the criterion.
+ constraints (list[Constraint]): List of constraints to enforce during training.
+ descriptor (Descriptor): Descriptor containing layer metadata and variable/loss layer info.
+ metric_manager (MetricManager): Metric manager for logging loss and CSR.
+ device (torch.device): Device on which computations are performed.
+ constraint_aggregator (Callable, optional): Function to aggregate per-layer rescaled losses. Defaults to `torch.sum`.
+ epsilon (float, optional): Small value to prevent division by zero in gradient normalization. Defaults to 1e-6.
+ enforce_all (bool, optional): If False, constraints are only monitored and do not influence the loss. Defaults to True.

 Returns:
- Tensor: The combined loss (base loss + constraint adjustments).
+ Tensor: The combined loss including the original loss and constraint-based adjustments.
 """
-
 # Init scalar tensor for loss
- total_rescale_loss = tensor(0, dtype=float32, device=self.device)
- loss_grads = {}
+ total_rescale_loss = tensor(0, dtype=float32, device=device)
+ norm_loss_grad: dict[str, Tensor] = {}

 # Precalculate loss gradients for each variable layer
- with no_grad():
- for layer in self.descriptor.variable_layers:
- self.optimizer.zero_grad()
- loss.backward(retain_graph=True, inputs=prediction[layer])
- loss_grads[layer] = prediction[layer].grad
+ for key in descriptor.variable_keys & descriptor.affects_loss_keys:
+ # Calculate gradients of loss w.r.t. predictions
+ grad = torch.autograd.grad(
+ outputs=loss, inputs=data[key], retain_graph=True, allow_unused=True
+ )[0]
+
+ # If the gradient is None, report an error
+ if grad is None:
+ raise RuntimeError(
+ f"Unable to compute loss gradients for layer '{key}'. "
+ "For layers not connected to the loss, set has_loss=False "
+ "when defining them in the Descriptor."
+ )

- for constraint in self.constraints:
+ # Flatten batch and compute L2 norm along each item
+ grad_flat = grad.view(grad.shape[0], -1)
+ norm_loss_grad[key] = (
+ vector_norm(grad_flat, dim=1, ord=2, keepdim=True).clamp(min=epsilon).detach()
+ )

+ for constraint in constraints:
 # Check if constraints are satisfied and calculate directions
- with no_grad():
- constraint_checks, relevant_constraint_count = (
- constraint.check_constraint(prediction)
- )
+ checks, mask = constraint.check_constraint(data)
+ directions = constraint.calculate_direction(data)
+
+ # Log constraint satisfaction ratio
+ csr = (sum(checks * mask) / sum(mask)).unsqueeze(0)
+ metric_manager.accumulate(f"{constraint.name}/train", csr)
+ metric_manager.accumulate("CSR/train", csr)

 # Only do adjusting calculation if constraint is not observant
- if not constraint.monitor_only:
+ if not enforce_all or not constraint.enforce:
+ continue
+
+ # Only do direction calculations for variable layers affecting constraint
+ for key in constraint.layers & descriptor.variable_keys:
 with no_grad():
- constraint_directions = constraint.calculate_direction(
- prediction
- )
-
- # Only do direction calculations for variable
- # layers affecting constraint
- for layer in (
- constraint.layers & self.descriptor.variable_layers
- ):
-
- with no_grad():
- # Multiply direction modifiers with constraint result
- constraint_result = (
- 1 - constraint_checks.unsqueeze(1)
- ) * constraint_directions[layer]
-
- # Multiply result with rescale factor of constraint
- constraint_result *= constraint.rescale_factor
-
- # Calculate loss gradient norm
- norm_loss_grad = norm(
- loss_grads[layer], dim=1, p=2, keepdim=True
- )
-
- # Apply minimum epsilon
- norm_loss_grad = maximum(norm_loss_grad, self.epsilon)
-
- # Calculate rescale loss
- rescale_loss = (
- prediction[layer]
- * constraint_result
- * norm_loss_grad.detach().clone()
- ).mean()
-
- # Store rescale loss for this reference space
- total_rescale_loss += rescale_loss
+ # Multiply direction modifiers with constraint result
+ constraint_result = (1 - checks) * directions[key]

- # Log constraint satisfaction ratio
- self.metric_manager.accumulate(
- f"{constraint.name}/train",
- (
- (
- sum(constraint_checks)
- - numel(constraint_checks)
- + relevant_constraint_count
- )
- / relevant_constraint_count
- ).unsqueeze(0),
- )
- self.metric_manager.accumulate(
- "CSR/train",
- (
- (
- sum(constraint_checks)
- - numel(constraint_checks)
- + relevant_constraint_count
- )
- / relevant_constraint_count
- ).unsqueeze(0),
- )
+ # Multiply result with rescale factor of constraint
+ constraint_result *= constraint.rescale_factor
+
+ # Calculate rescale loss
+ total_rescale_loss += constraint_aggregator(
+ data[key] * constraint_result * norm_loss_grad[key],
+ )

 # Return combined loss
 return loss + total_rescale_loss

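The new CSR expression replaces the old `sum(checks) - numel(checks) + relevant_count` arithmetic with an explicit relevance mask. A toy check of the formula, under the assumption that `check_constraint` returns a 0/1 per-sample `checks` tensor and a 0/1 relevance `mask`:

    # Toy check of the CSR formula used above:
    # csr = sum(checks * mask) / sum(mask)
    import torch

    checks = torch.tensor([1.0, 0.0, 1.0, 1.0])  # constraint satisfied per sample
    mask = torch.tensor([1.0, 1.0, 1.0, 0.0])    # sample 4 is irrelevant to the constraint
    csr = (torch.sum(checks * mask) / torch.sum(mask)).unsqueeze(0)
    print(csr)  # tensor([0.6667]): 2 of 3 relevant samples satisfied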
+ @staticmethod
 def valid_step(
- self,
- prediction: dict[str, Tensor],
+ data: dict[str, Tensor],
 loss: Tensor,
+ constraints: list[Constraint],
+ metric_manager: MetricManager,
 ) -> Tensor:
- """
- Evaluate constraints during validation and log satisfaction metrics.
+ """Evaluate constraints during validation and log constraint satisfaction metrics.
+
+ This method checks whether each constraint is satisfied for the given
+ data, computes the constraint satisfaction ratio (CSR),
+ and logs it using the metric manager. The base loss is not modified.

 Args:
- prediction (dict[str, Tensor]): Model predictions for
- variable layers.
+ data (dict[str, Tensor]): Dictionary containing the batch data, predictions and additional data.
 loss (Tensor): The base loss computed by the criterion.
+ constraints (list[Constraint]): List of constraints to evaluate.
+ metric_manager (MetricManager): Metric manager for logging CSR and per-constraint metrics.

 Returns:
- Tensor: The unchanged base loss.
+ Tensor: The original, unchanged base loss.
 """
-
 # For each constraint in this reference space, calculate directions
- for constraint in self.constraints:
-
+ for constraint in constraints:
 # Check if constraints are satisfied
- constraint_checks, relevant_constraint_count = (
- constraint.check_constraint(prediction)
- )
+ checks, mask = constraint.check_constraint(data)

 # Log constraint satisfaction ratio
- self.metric_manager.accumulate(
- f"{constraint.name}/valid",
- (
- (
- sum(constraint_checks)
- - numel(constraint_checks)
- + relevant_constraint_count
- )
- / relevant_constraint_count
- ).unsqueeze(0),
- )
- self.metric_manager.accumulate(
- "CSR/valid",
- (
- (
- sum(constraint_checks)
- - numel(constraint_checks)
- + relevant_constraint_count
- )
- / relevant_constraint_count
- ).unsqueeze(0),
- )
+ csr = (sum(checks * mask) / sum(mask)).unsqueeze(0)
+ metric_manager.accumulate(f"{constraint.name}/valid", csr)
+ metric_manager.accumulate("CSR/valid", csr)

- # Return loss
+ # Return original loss
 return loss

+ @staticmethod
 def test_step(
- self,
- prediction: dict[str, Tensor],
+ data: dict[str, Tensor],
 loss: Tensor,
+ constraints: list[Constraint],
+ metric_manager: MetricManager,
 ) -> Tensor:
- """
- Evaluate constraints during test and log satisfaction metrics.
+ """Evaluate constraints during testing and log constraint satisfaction metrics.
+
+ This method checks whether each constraint is satisfied for the given
+ data, computes the constraint satisfaction ratio (CSR),
+ and logs it using the metric manager. The base loss is not modified.

 Args:
- prediction (dict[str, Tensor]): Model predictions
- for variable layers.
+ data (dict[str, Tensor]): Dictionary containing the batch data, predictions and additional data.
 loss (Tensor): The base loss computed by the criterion.
+ constraints (list[Constraint]): List of constraints to evaluate.
+ metric_manager (MetricManager): Metric manager for logging CSR and per-constraint metrics.

 Returns:
- Tensor: The unchanged base loss.
+ Tensor: The original, unchanged base loss.
 """
-
 # For each constraint in this reference space, calculate directions
- for constraint in self.constraints:
-
+ for constraint in constraints:
 # Check if constraints are satisfied
- constraint_checks, relevant_constraint_count = (
- constraint.check_constraint(prediction)
- )
+ checks, mask = constraint.check_constraint(data)

 # Log constraint satisfaction ratio
- self.metric_manager.accumulate(
- f"{constraint.name}/test",
- (
- (
- sum(constraint_checks)
- - numel(constraint_checks)
- + relevant_constraint_count
- )
- / relevant_constraint_count
- ).unsqueeze(0),
- )
- self.metric_manager.accumulate(
- "CSR/test",
- (
- (
- sum(constraint_checks)
- - numel(constraint_checks)
- + relevant_constraint_count
- )
- / relevant_constraint_count
- ).unsqueeze(0),
- )
+ csr = (sum(checks * mask) / sum(mask)).unsqueeze(0)
+ metric_manager.accumulate(f"{constraint.name}/test", csr)
+ metric_manager.accumulate("CSR/test", csr)

- # Return loss
+ # Return original loss
 return loss
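A closing note on `constraint_aggregator`: it decides how the per-element rescale terms in `train_step` collapse into the scalar added to the loss. The 1.0.6 code hard-coded `.mean()`; 1.1.0 defaults to `torch.sum` but accepts any reduction mapping a Tensor to a scalar Tensor, so passing `torch.mean` to `CongradsCore` restores the old averaging behavior. A toy comparison of the two reductions:

    import torch

    rescale_terms = torch.tensor([[0.2, 0.0], [0.4, 0.1]])  # toy per-element terms
    print(torch.sum(rescale_terms))   # tensor(0.7000), 1.1.0 default
    print(torch.mean(rescale_terms))  # tensor(0.1750), 1.0.6-style averaging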