sleap-nn 0.0.5__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. sleap_nn/__init__.py +9 -2
  2. sleap_nn/architectures/convnext.py +5 -0
  3. sleap_nn/architectures/encoder_decoder.py +25 -6
  4. sleap_nn/architectures/swint.py +8 -0
  5. sleap_nn/cli.py +489 -46
  6. sleap_nn/config/data_config.py +51 -8
  7. sleap_nn/config/get_config.py +32 -24
  8. sleap_nn/config/trainer_config.py +88 -0
  9. sleap_nn/data/augmentation.py +61 -200
  10. sleap_nn/data/custom_datasets.py +433 -61
  11. sleap_nn/data/instance_cropping.py +71 -6
  12. sleap_nn/data/normalization.py +45 -2
  13. sleap_nn/data/providers.py +26 -0
  14. sleap_nn/data/resizing.py +2 -2
  15. sleap_nn/data/skia_augmentation.py +414 -0
  16. sleap_nn/data/utils.py +135 -17
  17. sleap_nn/evaluation.py +177 -42
  18. sleap_nn/export/__init__.py +21 -0
  19. sleap_nn/export/cli.py +1778 -0
  20. sleap_nn/export/exporters/__init__.py +51 -0
  21. sleap_nn/export/exporters/onnx_exporter.py +80 -0
  22. sleap_nn/export/exporters/tensorrt_exporter.py +291 -0
  23. sleap_nn/export/metadata.py +225 -0
  24. sleap_nn/export/predictors/__init__.py +63 -0
  25. sleap_nn/export/predictors/base.py +22 -0
  26. sleap_nn/export/predictors/onnx.py +154 -0
  27. sleap_nn/export/predictors/tensorrt.py +312 -0
  28. sleap_nn/export/utils.py +307 -0
  29. sleap_nn/export/wrappers/__init__.py +25 -0
  30. sleap_nn/export/wrappers/base.py +96 -0
  31. sleap_nn/export/wrappers/bottomup.py +243 -0
  32. sleap_nn/export/wrappers/bottomup_multiclass.py +195 -0
  33. sleap_nn/export/wrappers/centered_instance.py +56 -0
  34. sleap_nn/export/wrappers/centroid.py +58 -0
  35. sleap_nn/export/wrappers/single_instance.py +83 -0
  36. sleap_nn/export/wrappers/topdown.py +180 -0
  37. sleap_nn/export/wrappers/topdown_multiclass.py +304 -0
  38. sleap_nn/inference/__init__.py +6 -0
  39. sleap_nn/inference/bottomup.py +86 -20
  40. sleap_nn/inference/peak_finding.py +93 -16
  41. sleap_nn/inference/postprocessing.py +284 -0
  42. sleap_nn/inference/predictors.py +339 -137
  43. sleap_nn/inference/provenance.py +292 -0
  44. sleap_nn/inference/topdown.py +55 -47
  45. sleap_nn/legacy_models.py +65 -11
  46. sleap_nn/predict.py +224 -19
  47. sleap_nn/system_info.py +443 -0
  48. sleap_nn/tracking/tracker.py +8 -1
  49. sleap_nn/train.py +138 -44
  50. sleap_nn/training/callbacks.py +1258 -5
  51. sleap_nn/training/lightning_modules.py +902 -220
  52. sleap_nn/training/model_trainer.py +424 -111
  53. sleap_nn/training/schedulers.py +191 -0
  54. sleap_nn/training/utils.py +367 -2
  55. {sleap_nn-0.0.5.dist-info → sleap_nn-0.1.0.dist-info}/METADATA +35 -33
  56. sleap_nn-0.1.0.dist-info/RECORD +88 -0
  57. {sleap_nn-0.0.5.dist-info → sleap_nn-0.1.0.dist-info}/WHEEL +1 -1
  58. sleap_nn-0.0.5.dist-info/RECORD +0 -63
  59. {sleap_nn-0.0.5.dist-info → sleap_nn-0.1.0.dist-info}/entry_points.txt +0 -0
  60. {sleap_nn-0.0.5.dist-info → sleap_nn-0.1.0.dist-info}/licenses/LICENSE +0 -0
  61. {sleap_nn-0.0.5.dist-info → sleap_nn-0.1.0.dist-info}/top_level.txt +0 -0
sleap_nn/training/lightning_modules.py
@@ -1,6 +1,6 @@
 """This module has the LightningModule classes for all model types."""
 
-from typing import Optional, Union, Dict, Any
+from typing import Optional, Union, Dict, Any, List
 import time
 from torch import nn
 import numpy as np
@@ -33,6 +33,7 @@ from sleap_nn.inference.bottomup import (
 )
 from sleap_nn.inference.paf_grouping import PAFScorer
 from sleap_nn.architectures.model import Model
+from sleap_nn.data.normalization import normalize_on_gpu
 from sleap_nn.training.losses import compute_ohkm_loss
 from loguru import logger
 from sleap_nn.training.utils import (
@@ -40,14 +41,26 @@ from sleap_nn.training.utils import (
     plot_confmaps,
     plot_img,
     plot_peaks,
+    VisualizationData,
 )
+import matplotlib
+
+matplotlib.use(
+    "Agg"
+)  # Use non-interactive backend to avoid tkinter issues on Windows CI
 import matplotlib.pyplot as plt
 from sleap_nn.config.utils import get_backbone_type_from_cfg, get_model_type_from_cfg
 from sleap_nn.config.trainer_config import (
+    CosineAnnealingWarmupConfig,
+    LinearWarmupLinearDecayConfig,
     LRSchedulerConfig,
     ReduceLROnPlateauConfig,
     StepLRConfig,
 )
+from sleap_nn.training.schedulers import (
+    LinearWarmupCosineAnnealingLR,
+    LinearWarmupLinearDecayLR,
+)
 from sleap_nn.config.get_config import get_backbone_config
 from sleap_nn.legacy_models import (
     load_legacy_model_weights,
@@ -177,6 +190,15 @@ class LightningModel(L.LightningModule):
         self.val_loss = {}
         self.learning_rate = {}
 
+        # For epoch-averaged loss tracking
+        self._epoch_loss_sum = 0.0
+        self._epoch_loss_count = 0
+
+        # For epoch-end evaluation
+        self.val_predictions: List[Dict] = []
+        self.val_ground_truth: List[Dict] = []
+        self._collect_val_predictions: bool = False
+
         # Initialization for encoder and decoder stacks.
         if self.init_weights == "xavier":
             self.model.apply(xavier_init_weights)
@@ -213,7 +235,9 @@ class LightningModel(L.LightningModule):
         elif self.pretrained_backbone_weights.endswith(".h5"):
             # load from sleap model weights
             load_legacy_model_weights(
-                self.model.backbone, self.pretrained_backbone_weights
+                self.model.backbone,
+                self.pretrained_backbone_weights,
+                component="backbone",
             )
 
         else:
@@ -242,7 +266,9 @@ class LightningModel(L.LightningModule):
         elif self.pretrained_head_weights.endswith(".h5"):
             # load from sleap model weights
             load_legacy_model_weights(
-                self.model.head_layers, self.pretrained_head_weights
+                self.model.head_layers,
+                self.pretrained_head_weights,
+                component="head",
             )
 
         else:
@@ -298,34 +324,82 @@ class LightningModel(L.LightningModule):
     def on_train_epoch_start(self):
         """Configure the train timer at the beginning of each epoch."""
         self.train_start_time = time.time()
+        # Reset epoch loss tracking
+        self._epoch_loss_sum = 0.0
+        self._epoch_loss_count = 0
+
+    def _accumulate_loss(self, loss: torch.Tensor):
+        """Accumulate loss for epoch-averaged logging. Call this in training_step."""
+        self._epoch_loss_sum += loss.detach().item()
+        self._epoch_loss_count += 1
 
     def on_train_epoch_end(self):
         """Configure the train timer at the end of every epoch."""
         train_time = time.time() - self.train_start_time
         self.log(
-            "train_time",
+            "train/time",
             train_time,
             prog_bar=False,
             on_step=False,
             on_epoch=True,
-            logger=True,
             sync_dist=True,
         )
+        # Log epoch explicitly for custom x-axis support in wandb
+        self.log(
+            "epoch",
+            float(self.current_epoch),
+            on_step=False,
+            on_epoch=True,
+            sync_dist=True,
+        )
+        # Log epoch-averaged training loss
+        if self._epoch_loss_count > 0:
+            avg_loss = self._epoch_loss_sum / self._epoch_loss_count
+            self.log(
+                "train/loss",
+                avg_loss,
+                prog_bar=False,
+                on_step=False,
+                on_epoch=True,
+                sync_dist=True,
+            )
+        # Log current learning rate (useful for monitoring LR schedulers)
+        if self.trainer.optimizers:
+            lr = self.trainer.optimizers[0].param_groups[0]["lr"]
+            self.log(
+                "train/lr",
+                lr,
+                prog_bar=False,
+                on_step=False,
+                on_epoch=True,
+                sync_dist=True,
+            )
 
     def on_validation_epoch_start(self):
         """Configure the val timer at the beginning of each epoch."""
         self.val_start_time = time.time()
+        # Clear accumulated predictions for new epoch
+        self.val_predictions = []
+        self.val_ground_truth = []
 
     def on_validation_epoch_end(self):
         """Configure the val timer at the end of every epoch."""
         val_time = time.time() - self.val_start_time
         self.log(
-            "val_time",
+            "val/time",
             val_time,
             prog_bar=False,
             on_step=False,
             on_epoch=True,
-            logger=True,
+            sync_dist=True,
+        )
+        # Log epoch explicitly so val/* metrics can use it as x-axis in wandb
+        # (mirrors what on_train_epoch_end does for train/* metrics)
+        self.log(
+            "epoch",
+            float(self.current_epoch),
+            on_step=False,
+            on_epoch=True,
             sync_dist=True,
         )
 
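Note: training loss is now logged twice on purpose — "loss" per step (global_step x-axis) and "train/loss" as an epoch average accumulated via _accumulate_loss. A minimal sketch of the same pattern in a generic LightningModule (hypothetical compute_loss helper, standard Lightning API):

    import lightning as L

    class MyModule(L.LightningModule):
        def on_train_epoch_start(self):
            self._sum, self._count = 0.0, 0

        def training_step(self, batch, batch_idx):
            loss = self.compute_loss(batch)  # hypothetical helper
            self.log("loss", loss, on_step=True, on_epoch=False)  # step-level
            self._sum += loss.detach().item()  # detach: no graph kept alive
            self._count += 1
            return loss

        def on_train_epoch_end(self):
            if self._count > 0:
                self.log("train/loss", self._sum / self._count)  # epoch average

Averaging on the Python side keeps only two scalars per epoch and pairs naturally with the explicit "epoch" metric logged above.
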
@@ -362,13 +436,51 @@ class LightningModel(L.LightningModule):
             lr_scheduler_cfg.step_lr = StepLRConfig()
         elif self.lr_scheduler == "reduce_lr_on_plateau":
             lr_scheduler_cfg.reduce_lr_on_plateau = ReduceLROnPlateauConfig()
+        elif self.lr_scheduler == "cosine_annealing_warmup":
+            lr_scheduler_cfg.cosine_annealing_warmup = CosineAnnealingWarmupConfig()
+        elif self.lr_scheduler == "linear_warmup_linear_decay":
+            lr_scheduler_cfg.linear_warmup_linear_decay = (
+                LinearWarmupLinearDecayConfig()
+            )
 
         elif isinstance(self.lr_scheduler, dict):
             lr_scheduler_cfg = self.lr_scheduler
 
             for k, v in self.lr_scheduler.items():
                 if v is not None:
-                    if k == "step_lr":
+                    if k == "cosine_annealing_warmup":
+                        cfg = self.lr_scheduler.cosine_annealing_warmup
+                        # Use trainer's max_epochs if not specified in config
+                        max_epochs = (
+                            cfg.max_epochs
+                            if cfg.max_epochs is not None
+                            else self.trainer.max_epochs
+                        )
+                        scheduler = LinearWarmupCosineAnnealingLR(
+                            optimizer=optimizer,
+                            warmup_epochs=cfg.warmup_epochs,
+                            max_epochs=max_epochs,
+                            warmup_start_lr=cfg.warmup_start_lr,
+                            eta_min=cfg.eta_min,
+                        )
+                        break
+                    elif k == "linear_warmup_linear_decay":
+                        cfg = self.lr_scheduler.linear_warmup_linear_decay
+                        # Use trainer's max_epochs if not specified in config
+                        max_epochs = (
+                            cfg.max_epochs
+                            if cfg.max_epochs is not None
+                            else self.trainer.max_epochs
+                        )
+                        scheduler = LinearWarmupLinearDecayLR(
+                            optimizer=optimizer,
+                            warmup_epochs=cfg.warmup_epochs,
+                            max_epochs=max_epochs,
+                            warmup_start_lr=cfg.warmup_start_lr,
+                            end_lr=cfg.end_lr,
+                        )
+                        break
+                    elif k == "step_lr":
                         scheduler = torch.optim.lr_scheduler.StepLR(
                             optimizer=optimizer,
                             step_size=self.lr_scheduler.step_lr.step_size,
@@ -396,7 +508,7 @@ class LightningModel(L.LightningModule):
             "optimizer": optimizer,
             "lr_scheduler": {
                 "scheduler": scheduler,
-                "monitor": "val_loss",
+                "monitor": "val/loss",
             },
         }
 
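Note: LinearWarmupCosineAnnealingLR and LinearWarmupLinearDecayLR live in the new sleap_nn/training/schedulers.py (+191 lines, not shown in this file's diff). A rough sketch of the warmup-plus-cosine schedule expressed with torch.optim.lr_scheduler.LambdaLR, assuming per-epoch stepping; parameter names mirror the config fields above, but the real implementation may differ:

    import math
    from torch.optim.lr_scheduler import LambdaLR

    def warmup_cosine(optimizer, warmup_epochs, max_epochs, base_lr,
                      warmup_start_lr=1e-8, eta_min=0.0):
        def factor(epoch):
            if epoch < warmup_epochs:
                # linear ramp: warmup_start_lr -> base_lr
                lr = warmup_start_lr + (base_lr - warmup_start_lr) * epoch / max(1, warmup_epochs)
            else:
                # cosine decay: base_lr -> eta_min over the remaining epochs
                t = (epoch - warmup_epochs) / max(1, max_epochs - warmup_epochs)
                lr = eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * t))
            return lr / base_lr  # LambdaLR multiplies the optimizer's base lr
        return LambdaLR(optimizer, factor)

Falling back to self.trainer.max_epochs when the config leaves max_epochs unset keeps the decay horizon in sync with the actual run length.
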
@@ -493,8 +605,15 @@ class SingleInstanceLightningModule(LightningModel):
         )
         self.node_names = self.head_configs.single_instance.confmaps.part_names
 
-    def visualize_example(self, sample):
-        """Visualize predictions during training (used with callbacks)."""
+    def get_visualization_data(self, sample) -> VisualizationData:
+        """Extract visualization data from a sample.
+
+        Args:
+            sample: A sample dictionary from the data pipeline.
+
+        Returns:
+            VisualizationData containing image, confmaps, peaks, etc.
+        """
         ex = sample.copy()
         ex["eff_scale"] = torch.tensor([1.0])
         for k, v in ex.items():
@@ -502,27 +621,41 @@ class SingleInstanceLightningModule(LightningModel):
             ex[k] = v.to(device=self.device)
         ex["image"] = ex["image"].unsqueeze(dim=0)
         output = self.single_instance_inf_layer(ex)[0]
+
         peaks = output["pred_instance_peaks"].cpu().numpy()
-        img = (
-            output["image"][0, 0].cpu().numpy().transpose(1, 2, 0)
-        )  # convert from (C, H, W) to (H, W, C)
+        peak_values = output["pred_peak_values"].cpu().numpy()
+        img = output["image"][0, 0].cpu().numpy().transpose(1, 2, 0)
         gt_instances = ex["instances"][0].cpu().numpy()
-        confmaps = (
-            output["pred_confmaps"][0].cpu().numpy().transpose(1, 2, 0)
-        )  # convert from (C, H, W) to (H, W, C)
+        confmaps = output["pred_confmaps"][0].cpu().numpy().transpose(1, 2, 0)
+
+        return VisualizationData(
+            image=img,
+            pred_confmaps=confmaps,
+            pred_peaks=peaks,
+            pred_peak_values=peak_values,
+            gt_instances=gt_instances,
+            node_names=list(self.node_names) if self.node_names else [],
+            output_scale=confmaps.shape[0] / img.shape[0],
+            is_paired=True,
+        )
+
+    def visualize_example(self, sample):
+        """Visualize predictions during training (used with callbacks)."""
+        data = self.get_visualization_data(sample)
         scale = 1.0
-        if img.shape[0] < 512:
+        if data.image.shape[0] < 512:
             scale = 2.0
-        if img.shape[0] < 256:
+        if data.image.shape[0] < 256:
             scale = 4.0
-        fig = plot_img(img, dpi=72 * scale, scale=scale)
-        plot_confmaps(confmaps, output_scale=confmaps.shape[0] / img.shape[0])
-        plot_peaks(gt_instances, peaks, paired=True)
+        fig = plot_img(data.image, dpi=72 * scale, scale=scale)
+        plot_confmaps(data.pred_confmaps, output_scale=data.output_scale)
+        plot_peaks(data.gt_instances, data.pred_peaks, paired=data.is_paired)
         return fig
 
     def forward(self, img):
         """Forward pass of the model."""
         img = torch.squeeze(img, dim=1).to(self.device)
+        img = normalize_on_gpu(img)
         return self.model(img)["SingleInstanceConfmapsHead"]
 
     def training_step(self, batch, batch_idx):
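Note: VisualizationData is defined in sleap_nn/training/utils.py, whose hunks are not shown on this page. Its fields can be read off the call sites in this diff; a plausible sketch, not the actual definition:

    from dataclasses import dataclass
    from typing import List, Optional
    import numpy as np

    @dataclass
    class VisualizationData:
        image: np.ndarray              # (H, W, C), from the inference output
        pred_confmaps: np.ndarray      # (h, w, n_nodes)
        pred_peaks: np.ndarray         # predicted keypoints
        pred_peak_values: np.ndarray   # peak confidences
        gt_instances: np.ndarray       # ground-truth keypoints
        node_names: List[str]
        output_scale: float            # confmap height / image height
        is_paired: bool                # True when GT and predictions align 1:1
        pred_pafs: Optional[np.ndarray] = None        # bottom-up only
        pred_class_maps: Optional[np.ndarray] = None  # multi-class bottom-up only

Splitting extraction (get_visualization_data) from rendering (visualize_example) lets callbacks reuse the same arrays for wandb tables or custom plots without rerunning inference.
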
@@ -531,6 +664,7 @@ class SingleInstanceLightningModule(LightningModel):
             torch.squeeze(batch["image"], dim=1),
             torch.squeeze(batch["confidence_maps"], dim=1),
         )
+        X = normalize_on_gpu(X)
 
         y_preds = self.model(X)["SingleInstanceConfmapsHead"]
 
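Note: normalization now happens on-device inside forward/training_step/validation_step rather than in the data pipeline, so frames can reach the GPU before conversion. normalize_on_gpu comes from sleap_nn/data/normalization.py (+45 -2 in this release); a minimal sketch of the assumed behavior:

    import torch

    def normalize_on_gpu(img: torch.Tensor) -> torch.Tensor:
        # Convert uint8 [0, 255] frames to float32 [0, 1] without leaving the device.
        if img.dtype == torch.uint8:
            img = img.to(torch.float32) / 255.0
        return img

Shipping uint8 tensors through the dataloader cuts host-to-device transfer to a quarter of the float32 size.
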
@@ -554,23 +688,24 @@ class SingleInstanceLightningModule(LightningModel):
         channel_wise_loss = torch.sum(mse, dim=(0, 2, 3)) / (batch_size * h * w)
         for node_idx, name in enumerate(self.node_names):
             self.log(
-                f"{name}",
+                f"train/confmaps/{name}",
                 channel_wise_loss[node_idx],
-                prog_bar=True,
-                on_step=True,
+                prog_bar=False,
+                on_step=False,
                 on_epoch=True,
-                logger=True,
                 sync_dist=True,
             )
+        # Log step-level loss (every batch, uses global_step x-axis)
         self.log(
-            "train_loss",
+            "loss",
             loss,
             prog_bar=True,
             on_step=True,
-            on_epoch=True,
-            logger=True,
+            on_epoch=False,
             sync_dist=True,
         )
+        # Accumulate for epoch-averaged loss (logged in on_train_epoch_end)
+        self._accumulate_loss(loss)
         return loss
 
     def validation_step(self, batch, batch_idx):
@@ -579,6 +714,7 @@ class SingleInstanceLightningModule(LightningModel):
             torch.squeeze(batch["image"], dim=1),
             torch.squeeze(batch["confidence_maps"], dim=1),
         )
+        X = normalize_on_gpu(X)
 
         y_preds = self.model(X)["SingleInstanceConfmapsHead"]
         val_loss = nn.MSELoss()(y_preds, y)
@@ -592,26 +728,60 @@ class SingleInstanceLightningModule(LightningModel):
             loss_scale=self.loss_scale,
         )
         val_loss = val_loss + ohkm_loss
-        lr = self.optimizers().optimizer.param_groups[0]["lr"]
         self.log(
-            "learning_rate",
-            lr,
-            prog_bar=True,
-            on_step=True,
-            on_epoch=True,
-            logger=True,
-            sync_dist=True,
-        )
-        self.log(
-            "val_loss",
+            "val/loss",
             val_loss,
             prog_bar=True,
-            on_step=True,
+            on_step=False,
             on_epoch=True,
-            logger=True,
             sync_dist=True,
         )
 
+        # Collect predictions for epoch-end evaluation if enabled
+        if self._collect_val_predictions:
+            with torch.no_grad():
+                # Squeeze n_samples dim from image for inference (batch, 1, C, H, W) -> (batch, C, H, W)
+                inference_batch = {k: v for k, v in batch.items()}
+                if inference_batch["image"].ndim == 5:
+                    inference_batch["image"] = inference_batch["image"].squeeze(1)
+                inference_output = self.single_instance_inf_layer(inference_batch)
+                if isinstance(inference_output, list):
+                    inference_output = inference_output[0]
+
+            batch_size = len(batch["frame_idx"])
+            for i in range(batch_size):
+                eff = batch["eff_scale"][i].cpu().numpy()
+
+                # Predictions are already in original image space (inference divides by eff_scale)
+                pred_peaks = inference_output["pred_instance_peaks"][i].cpu().numpy()
+                pred_scores = inference_output["pred_peak_values"][i].cpu().numpy()
+
+                # Transform GT from preprocessed to original image space
+                # Note: instances have shape (1, max_inst, n_nodes, 2) - squeeze n_samples dim
+                gt_prep = batch["instances"][i].cpu().numpy()
+                if gt_prep.ndim == 4:
+                    gt_prep = gt_prep.squeeze(0)  # (max_inst, n_nodes, 2)
+                gt_orig = gt_prep / eff
+                num_inst = batch["num_instances"][i].item()
+                gt_orig = gt_orig[:num_inst]  # Only valid instances
+
+                self.val_predictions.append(
+                    {
+                        "video_idx": batch["video_idx"][i].item(),
+                        "frame_idx": batch["frame_idx"][i].item(),
+                        "pred_peaks": pred_peaks,
+                        "pred_scores": pred_scores,
+                    }
+                )
+                self.val_ground_truth.append(
+                    {
+                        "video_idx": batch["video_idx"][i].item(),
+                        "frame_idx": batch["frame_idx"][i].item(),
+                        "gt_instances": gt_orig,
+                        "num_instances": num_inst,
+                    }
+                )
+
 
 class TopDownCenteredInstanceLightningModule(LightningModel):
     """Lightning Module for TopDownCenteredInstance Model.
@@ -705,8 +875,8 @@ class TopDownCenteredInstanceLightningModule(LightningModel):
 
         self.node_names = self.head_configs.centered_instance.confmaps.part_names
 
-    def visualize_example(self, sample):
-        """Visualize predictions during training (used with callbacks)."""
+    def get_visualization_data(self, sample) -> VisualizationData:
+        """Extract visualization data from a sample."""
         ex = sample.copy()
         ex["eff_scale"] = torch.tensor([1.0])
         for k, v in ex.items():
@@ -714,27 +884,41 @@ class TopDownCenteredInstanceLightningModule(LightningModel):
             ex[k] = v.to(device=self.device)
         ex["instance_image"] = ex["instance_image"].unsqueeze(dim=0)
         output = self.instance_peaks_inf_layer(ex)
+
         peaks = output["pred_instance_peaks"].cpu().numpy()
-        img = (
-            output["instance_image"][0, 0].cpu().numpy().transpose(1, 2, 0)
-        )  # convert from (C, H, W) to (H, W, C)
+        peak_values = output["pred_peak_values"].cpu().numpy()
+        img = output["instance_image"][0, 0].cpu().numpy().transpose(1, 2, 0)
         gt_instances = ex["instance"].cpu().numpy()
-        confmaps = (
-            output["pred_confmaps"][0].cpu().numpy().transpose(1, 2, 0)
-        )  # convert from (C, H, W) to (H, W, C)
+        confmaps = output["pred_confmaps"][0].cpu().numpy().transpose(1, 2, 0)
+
+        return VisualizationData(
+            image=img,
+            pred_confmaps=confmaps,
+            pred_peaks=peaks,
+            pred_peak_values=peak_values,
+            gt_instances=gt_instances,
+            node_names=list(self.node_names) if self.node_names else [],
+            output_scale=confmaps.shape[0] / img.shape[0],
+            is_paired=True,
+        )
+
+    def visualize_example(self, sample):
+        """Visualize predictions during training (used with callbacks)."""
+        data = self.get_visualization_data(sample)
         scale = 1.0
-        if img.shape[0] < 512:
+        if data.image.shape[0] < 512:
             scale = 2.0
-        if img.shape[0] < 256:
+        if data.image.shape[0] < 256:
             scale = 4.0
-        fig = plot_img(img, dpi=72 * scale, scale=scale)
-        plot_confmaps(confmaps, output_scale=confmaps.shape[0] / img.shape[0])
-        plot_peaks(gt_instances, peaks, paired=True)
+        fig = plot_img(data.image, dpi=72 * scale, scale=scale)
+        plot_confmaps(data.pred_confmaps, output_scale=data.output_scale)
+        plot_peaks(data.gt_instances, data.pred_peaks, paired=data.is_paired)
         return fig
 
     def forward(self, img):
         """Forward pass of the model."""
         img = torch.squeeze(img, dim=1).to(self.device)
+        img = normalize_on_gpu(img)
         return self.model(img)["CenteredInstanceConfmapsHead"]
 
     def training_step(self, batch, batch_idx):
@@ -743,6 +927,7 @@ class TopDownCenteredInstanceLightningModule(LightningModel):
             torch.squeeze(batch["instance_image"], dim=1),
             torch.squeeze(batch["confidence_maps"], dim=1),
         )
+        X = normalize_on_gpu(X)
 
         y_preds = self.model(X)["CenteredInstanceConfmapsHead"]
 
@@ -766,24 +951,25 @@ class TopDownCenteredInstanceLightningModule(LightningModel):
         channel_wise_loss = torch.sum(mse, dim=(0, 2, 3)) / (batch_size * h * w)
         for node_idx, name in enumerate(self.node_names):
             self.log(
-                f"{name}",
+                f"train/confmaps/{name}",
                 channel_wise_loss[node_idx],
-                prog_bar=True,
-                on_step=True,
+                prog_bar=False,
+                on_step=False,
                 on_epoch=True,
-                logger=True,
                 sync_dist=True,
             )
 
+        # Log step-level loss (every batch, uses global_step x-axis)
         self.log(
-            "train_loss",
+            "loss",
             loss,
             prog_bar=True,
             on_step=True,
-            on_epoch=True,
-            logger=True,
+            on_epoch=False,
             sync_dist=True,
         )
+        # Accumulate for epoch-averaged loss (logged in on_train_epoch_end)
+        self._accumulate_loss(loss)
         return loss
 
     def validation_step(self, batch, batch_idx):
@@ -792,6 +978,7 @@ class TopDownCenteredInstanceLightningModule(LightningModel):
             torch.squeeze(batch["instance_image"], dim=1),
             torch.squeeze(batch["confidence_maps"], dim=1),
         )
+        X = normalize_on_gpu(X)
 
         y_preds = self.model(X)["CenteredInstanceConfmapsHead"]
         val_loss = nn.MSELoss()(y_preds, y)
@@ -805,26 +992,71 @@ class TopDownCenteredInstanceLightningModule(LightningModel):
             loss_scale=self.loss_scale,
         )
         val_loss = val_loss + ohkm_loss
-        lr = self.optimizers().optimizer.param_groups[0]["lr"]
-        self.log(
-            "learning_rate",
-            lr,
-            prog_bar=True,
-            on_step=True,
-            on_epoch=True,
-            logger=True,
-            sync_dist=True,
-        )
         self.log(
-            "val_loss",
+            "val/loss",
             val_loss,
             prog_bar=True,
-            on_step=True,
+            on_step=False,
             on_epoch=True,
-            logger=True,
             sync_dist=True,
         )
 
+        # Collect predictions for epoch-end evaluation if enabled
+        if self._collect_val_predictions:
+            # SAVE bbox BEFORE inference (it modifies in-place!)
+            bbox_prep_saved = batch["instance_bbox"].clone()
+
+            with torch.no_grad():
+                inference_output = self.instance_peaks_inf_layer(batch)
+
+            batch_size = len(batch["frame_idx"])
+            for i in range(batch_size):
+                eff = batch["eff_scale"][i].cpu().numpy()
+
+                # Predictions from inference (crop-relative, original scale)
+                pred_peaks_crop = (
+                    inference_output["pred_instance_peaks"][i].cpu().numpy()
+                )
+                pred_scores = inference_output["pred_peak_values"][i].cpu().numpy()
+
+                # Compute bbox offset in original space from SAVED prep bbox
+                # bbox has shape (n_samples=1, 4, 2) where 4 corners
+                bbox_prep = bbox_prep_saved[i].squeeze(0).cpu().numpy()  # (4, 2)
+                bbox_top_left_orig = (
+                    bbox_prep[0] / eff
+                )  # Top-left corner in original space
+
+                # Full image coordinates (original space)
+                pred_peaks_full = pred_peaks_crop + bbox_top_left_orig
+
+                # GT transform: crop-relative preprocessed -> full image original
+                gt_crop_prep = (
+                    batch["instance"][i].squeeze(0).cpu().numpy()
+                )  # (n_nodes, 2)
+                gt_crop_orig = gt_crop_prep / eff
+                gt_full_orig = gt_crop_orig + bbox_top_left_orig
+
+                self.val_predictions.append(
+                    {
+                        "video_idx": batch["video_idx"][i].item(),
+                        "frame_idx": batch["frame_idx"][i].item(),
+                        "pred_peaks": pred_peaks_full.reshape(
+                            1, -1, 2
+                        ),  # (1, n_nodes, 2)
+                        "pred_scores": pred_scores.reshape(1, -1),  # (1, n_nodes)
+                    }
+                )
+                self.val_ground_truth.append(
+                    {
+                        "video_idx": batch["video_idx"][i].item(),
+                        "frame_idx": batch["frame_idx"][i].item(),
+                        "gt_instances": gt_full_orig.reshape(
+                            1, -1, 2
+                        ),  # (1, n_nodes, 2)
+                        "num_instances": 1,
+                    }
+                )
+
 
 class CentroidLightningModule(LightningModel):
     """Lightning Module for Centroid Model.
@@ -916,9 +1148,10 @@ class CentroidLightningModule(LightningModel):
             output_stride=self.head_configs.centroid.confmaps.output_stride,
             input_scale=1.0,
         )
+        self.node_names = ["centroid"]
 
-    def visualize_example(self, sample):
-        """Visualize predictions during training (used with callbacks)."""
+    def get_visualization_data(self, sample) -> VisualizationData:
+        """Extract visualization data from a sample."""
         ex = sample.copy()
         ex["eff_scale"] = torch.tensor([1.0])
         for k, v in ex.items():
@@ -927,26 +1160,40 @@ class CentroidLightningModule(LightningModel):
         ex["image"] = ex["image"].unsqueeze(dim=0)
         gt_centroids = ex["centroids"].cpu().numpy()
         output = self.centroid_inf_layer(ex)
+
         peaks = output["centroids"][0].cpu().numpy()
-        img = (
-            output["image"][0, 0].cpu().numpy().transpose(1, 2, 0)
-        )  # convert from (C, H, W) to (H, W, C)
-        confmaps = (
-            output["pred_centroid_confmaps"][0].cpu().numpy().transpose(1, 2, 0)
-        )  # convert from (C, H, W) to (H, W, C)
+        centroid_vals = output["centroid_vals"][0].cpu().numpy()
+        img = output["image"][0, 0].cpu().numpy().transpose(1, 2, 0)
+        confmaps = output["pred_centroid_confmaps"][0].cpu().numpy().transpose(1, 2, 0)
+
+        return VisualizationData(
+            image=img,
+            pred_confmaps=confmaps,
+            pred_peaks=peaks,
+            pred_peak_values=centroid_vals,
+            gt_instances=gt_centroids,
+            node_names=self.node_names,
+            output_scale=confmaps.shape[0] / img.shape[0],
+            is_paired=False,
+        )
+
+    def visualize_example(self, sample):
+        """Visualize predictions during training (used with callbacks)."""
+        data = self.get_visualization_data(sample)
         scale = 1.0
-        if img.shape[0] < 512:
+        if data.image.shape[0] < 512:
             scale = 2.0
-        if img.shape[0] < 256:
+        if data.image.shape[0] < 256:
             scale = 4.0
-        fig = plot_img(img, dpi=72 * scale, scale=scale)
-        plot_confmaps(confmaps, output_scale=confmaps.shape[0] / img.shape[0])
-        plot_peaks(gt_centroids, peaks, paired=False)
+        fig = plot_img(data.image, dpi=72 * scale, scale=scale)
+        plot_confmaps(data.pred_confmaps, output_scale=data.output_scale)
+        plot_peaks(data.gt_instances, data.pred_peaks, paired=data.is_paired)
         return fig
 
     def forward(self, img):
         """Forward pass of the model."""
         img = torch.squeeze(img, dim=1).to(self.device)
+        img = normalize_on_gpu(img)
         return self.model(img)["CentroidConfmapsHead"]
 
     def training_step(self, batch, batch_idx):
@@ -955,18 +1202,21 @@ class CentroidLightningModule(LightningModel):
             torch.squeeze(batch["image"], dim=1),
             torch.squeeze(batch["centroids_confidence_maps"], dim=1),
         )
+        X = normalize_on_gpu(X)
 
         y_preds = self.model(X)["CentroidConfmapsHead"]
         loss = nn.MSELoss()(y_preds, y)
+        # Log step-level loss (every batch, uses global_step x-axis)
        self.log(
-            "train_loss",
+            "loss",
             loss,
             prog_bar=True,
             on_step=True,
-            on_epoch=True,
-            logger=True,
+            on_epoch=False,
             sync_dist=True,
         )
+        # Accumulate for epoch-averaged loss (logged in on_train_epoch_end)
+        self._accumulate_loss(loss)
         return loss
 
     def validation_step(self, batch, batch_idx):
@@ -975,29 +1225,74 @@ class CentroidLightningModule(LightningModel):
             torch.squeeze(batch["image"], dim=1),
             torch.squeeze(batch["centroids_confidence_maps"], dim=1),
         )
+        X = normalize_on_gpu(X)
 
         y_preds = self.model(X)["CentroidConfmapsHead"]
         val_loss = nn.MSELoss()(y_preds, y)
-        lr = self.optimizers().optimizer.param_groups[0]["lr"]
         self.log(
-            "learning_rate",
-            lr,
-            prog_bar=True,
-            on_step=True,
-            on_epoch=True,
-            logger=True,
-            sync_dist=True,
-        )
-        self.log(
-            "val_loss",
+            "val/loss",
             val_loss,
             prog_bar=True,
-            on_step=True,
+            on_step=False,
             on_epoch=True,
-            logger=True,
             sync_dist=True,
         )
 
+        # Collect predictions for epoch-end evaluation if enabled
+        if self._collect_val_predictions:
+            # Save GT centroids before inference (inference overwrites batch["centroids"])
+            batch["gt_centroids"] = batch["centroids"].clone()
+
+            with torch.no_grad():
+                inference_output = self.centroid_inf_layer(batch)
+
+            batch_size = len(batch["frame_idx"])
+            for i in range(batch_size):
+                eff = batch["eff_scale"][i].cpu().numpy()
+
+                # Predictions are in original image space (inference divides by eff_scale)
+                # centroids shape: (batch, 1, max_instances, 2) - squeeze to (max_instances, 2)
+                pred_centroids = (
+                    inference_output["centroids"][i].squeeze(0).cpu().numpy()
+                )
+                pred_vals = inference_output["centroid_vals"][i].cpu().numpy()
+
+                # Transform GT centroids from preprocessed to original image space
+                # Use "gt_centroids" since inference overwrites "centroids" with predictions
+                gt_centroids_prep = (
+                    batch["gt_centroids"][i].cpu().numpy()
+                )  # (n_samples=1, max_inst, 2)
+                gt_centroids_orig = gt_centroids_prep.squeeze(0) / eff  # (max_inst, 2)
+                num_inst = batch["num_instances"][i].item()
+
+                # Filter to valid instances (non-NaN)
+                valid_pred_mask = ~np.isnan(pred_centroids).any(axis=1)
+                pred_centroids = pred_centroids[valid_pred_mask]
+                pred_vals = pred_vals[valid_pred_mask]
+
+                gt_centroids_valid = gt_centroids_orig[:num_inst]
+
+                self.val_predictions.append(
+                    {
+                        "video_idx": batch["video_idx"][i].item(),
+                        "frame_idx": batch["frame_idx"][i].item(),
+                        "pred_peaks": pred_centroids.reshape(
+                            -1, 1, 2
+                        ),  # (n_inst, 1, 2)
+                        "pred_scores": pred_vals.reshape(-1, 1),  # (n_inst, 1)
+                    }
+                )
+                self.val_ground_truth.append(
+                    {
+                        "video_idx": batch["video_idx"][i].item(),
+                        "frame_idx": batch["frame_idx"][i].item(),
+                        "gt_instances": gt_centroids_valid.reshape(
+                            -1, 1, 2
+                        ),  # (n_inst, 1, 2)
+                        "num_instances": num_inst,
+                    }
+                )
+
 
 class BottomUpLightningModule(LightningModel):
     """Lightning Module for BottomUp Model.
@@ -1090,16 +1385,20 @@ class BottomUpLightningModule(LightningModel):
         self.bottomup_inf_layer = BottomUpInferenceModel(
             torch_model=self.forward,
             paf_scorer=paf_scorer,
-            peak_threshold=0.2,
+            peak_threshold=0.1,  # Lower threshold for epoch-end eval during training
             input_scale=1.0,
             return_confmaps=True,
             return_pafs=True,
             cms_output_stride=self.head_configs.bottomup.confmaps.output_stride,
             pafs_output_stride=self.head_configs.bottomup.pafs.output_stride,
+            max_peaks_per_node=100,  # Prevents combinatorial explosion in early training
         )
+        self.node_names = list(self.head_configs.bottomup.confmaps.part_names)
 
-    def visualize_example(self, sample):
-        """Visualize predictions during training (used with callbacks)."""
+    def get_visualization_data(
+        self, sample, include_pafs: bool = False
+    ) -> VisualizationData:
+        """Extract visualization data from a sample."""
         ex = sample.copy()
         ex["eff_scale"] = torch.tensor([1.0])
         for k, v in ex.items():
@@ -1107,54 +1406,65 @@ class BottomUpLightningModule(LightningModel):
             ex[k] = v.to(device=self.device)
         ex["image"] = ex["image"].unsqueeze(dim=0)
         output = self.bottomup_inf_layer(ex)[0]
+
         peaks = output["pred_instance_peaks"][0].cpu().numpy()
-        img = (
-            output["image"][0, 0].cpu().numpy().transpose(1, 2, 0)
-        )  # convert from (C, H, W) to (H, W, C)
+        peak_values = output["pred_peak_values"][0].cpu().numpy()
+        img = output["image"][0, 0].cpu().numpy().transpose(1, 2, 0)
         gt_instances = ex["instances"][0].cpu().numpy()
-        confmaps = (
-            output["pred_confmaps"][0].cpu().numpy().transpose(1, 2, 0)
-        )  # convert from (C, H, W) to (H, W, C)
+        confmaps = output["pred_confmaps"][0].cpu().numpy().transpose(1, 2, 0)
+
+        pred_pafs = None
+        if include_pafs:
+            pafs = output["pred_part_affinity_fields"].cpu().numpy()[0]
+            pred_pafs = pafs  # (h, w, 2*edges)
+
+        return VisualizationData(
+            image=img,
+            pred_confmaps=confmaps,
+            pred_peaks=peaks,
+            pred_peak_values=peak_values,
+            gt_instances=gt_instances,
+            node_names=self.node_names,
+            output_scale=confmaps.shape[0] / img.shape[0],
+            is_paired=False,
+            pred_pafs=pred_pafs,
+        )
+
+    def visualize_example(self, sample):
+        """Visualize predictions during training (used with callbacks)."""
+        data = self.get_visualization_data(sample)
         scale = 1.0
-        if img.shape[0] < 512:
+        if data.image.shape[0] < 512:
             scale = 2.0
-        if img.shape[0] < 256:
+        if data.image.shape[0] < 256:
             scale = 4.0
-        fig = plot_img(img, dpi=72 * scale, scale=scale)
-        plot_confmaps(confmaps, output_scale=confmaps.shape[0] / img.shape[0])
+        fig = plot_img(data.image, dpi=72 * scale, scale=scale)
+        plot_confmaps(data.pred_confmaps, output_scale=data.output_scale)
         plt.xlim(plt.xlim())
         plt.ylim(plt.ylim())
-        plot_peaks(gt_instances, peaks, paired=False)
+        plot_peaks(data.gt_instances, data.pred_peaks, paired=data.is_paired)
         return fig
 
     def visualize_pafs_example(self, sample):
-        """Visualize predictions during training (used with callbacks)."""
-        ex = sample.copy()
-        ex["eff_scale"] = torch.tensor([1.0])
-        for k, v in ex.items():
-            if isinstance(v, torch.Tensor):
-                ex[k] = v.to(device=self.device)
-        ex["image"] = ex["image"].unsqueeze(dim=0)
-        output = self.bottomup_inf_layer(ex)[0]
-        img = (
-            output["image"][0, 0].cpu().numpy().transpose(1, 2, 0)
-        )  # convert from (C, H, W) to (H, W, C)
-        pafs = output["pred_part_affinity_fields"].cpu().numpy()[0]  # (h, w, 2*edges)
+        """Visualize PAF predictions during training (used with callbacks)."""
+        data = self.get_visualization_data(sample, include_pafs=True)
         scale = 1.0
-        if img.shape[0] < 512:
+        if data.image.shape[0] < 512:
             scale = 2.0
-        if img.shape[0] < 256:
+        if data.image.shape[0] < 256:
             scale = 4.0
-        fig = plot_img(img, dpi=72 * scale, scale=scale)
+        fig = plot_img(data.image, dpi=72 * scale, scale=scale)
 
+        pafs = data.pred_pafs
         pafs = pafs.reshape((pafs.shape[0], pafs.shape[1], -1, 2))
         pafs_mag = np.sqrt(pafs[..., 0] ** 2 + pafs[..., 1] ** 2)
-        plot_confmaps(pafs_mag, output_scale=pafs_mag.shape[0] / img.shape[0])
+        plot_confmaps(pafs_mag, output_scale=pafs_mag.shape[0] / data.image.shape[0])
         return fig
 
     def forward(self, img):
         """Forward pass of the model."""
         img = torch.squeeze(img, dim=1).to(self.device)
+        img = normalize_on_gpu(img)
         output = self.model(img)
         return {
             "MultiInstanceConfmapsHead": output["MultiInstanceConfmapsHead"],
@@ -1166,6 +1476,7 @@ class BottomUpLightningModule(LightningModel):
         X = torch.squeeze(batch["image"], dim=1)
         y_confmap = torch.squeeze(batch["confidence_maps"], dim=1)
         y_paf = batch["part_affinity_fields"]
+        X = normalize_on_gpu(X)
         preds = self.model(X)
         pafs = preds["PartAffinityFieldsHead"]
         confmaps = preds["MultiInstanceConfmapsHead"]
@@ -1198,13 +1509,29 @@ class BottomUpLightningModule(LightningModel):
             "PartAffinityFieldsHead": pafs_loss,
         }
         loss = sum([s * losses[t] for s, t in zip(self.loss_weights, losses)])
+        # Log step-level loss (every batch, uses global_step x-axis)
         self.log(
-            "train_loss",
+            "loss",
             loss,
             prog_bar=True,
             on_step=True,
+            on_epoch=False,
+            sync_dist=True,
+        )
+        # Accumulate for epoch-averaged loss (logged in on_train_epoch_end)
+        self._accumulate_loss(loss)
+        self.log(
+            "train/confmaps_loss",
+            confmap_loss,
+            on_step=False,
+            on_epoch=True,
+            sync_dist=True,
+        )
+        self.log(
+            "train/paf_loss",
+            pafs_loss,
+            on_step=False,
             on_epoch=True,
-            logger=True,
             sync_dist=True,
         )
         return loss
@@ -1214,6 +1541,7 @@ class BottomUpLightningModule(LightningModel):
         X = torch.squeeze(batch["image"], dim=1)
         y_confmap = torch.squeeze(batch["confidence_maps"], dim=1)
         y_paf = batch["part_affinity_fields"]
+        X = normalize_on_gpu(X)
 
         preds = self.model(X)
         pafs = preds["PartAffinityFieldsHead"]
@@ -1248,25 +1576,75 @@ class BottomUpLightningModule(LightningModel):
         }
 
         val_loss = sum([s * losses[t] for s, t in zip(self.loss_weights, losses)])
-        lr = self.optimizers().optimizer.param_groups[0]["lr"]
         self.log(
-            "learning_rate",
-            lr,
+            "val/loss",
+            val_loss,
             prog_bar=True,
-            on_step=True,
+            on_step=False,
             on_epoch=True,
-            logger=True,
             sync_dist=True,
         )
         self.log(
-            "val_loss",
-            val_loss,
-            prog_bar=True,
-            on_step=True,
+            "val/confmaps_loss",
+            confmap_loss,
+            on_step=False,
             on_epoch=True,
-            logger=True,
             sync_dist=True,
         )
+        self.log(
+            "val/paf_loss",
+            pafs_loss,
+            on_step=False,
+            on_epoch=True,
+            sync_dist=True,
+        )
+
+        # Collect predictions for epoch-end evaluation if enabled
+        if self._collect_val_predictions:
+            with torch.no_grad():
+                # Note: Do NOT squeeze the image here - the forward() method expects
+                # (batch, n_samples, C, H, W) and handles the n_samples squeeze internally
+                inference_output = self.bottomup_inf_layer(batch)
+                if isinstance(inference_output, list):
+                    inference_output = inference_output[0]
+
+            batch_size = len(batch["frame_idx"])
+            for i in range(batch_size):
+                eff = batch["eff_scale"][i].cpu().numpy()
+
+                # Predictions are already in original space (variable number of instances)
+                pred_peaks = inference_output["pred_instance_peaks"][i]
+                pred_scores = inference_output["pred_peak_values"][i]
+                if torch.is_tensor(pred_peaks):
+                    pred_peaks = pred_peaks.cpu().numpy()
+                if torch.is_tensor(pred_scores):
+                    pred_scores = pred_scores.cpu().numpy()
+
+                # Transform GT to original space
+                # Note: instances have shape (1, max_inst, n_nodes, 2) - squeeze n_samples dim
+                gt_prep = batch["instances"][i].cpu().numpy()
+                if gt_prep.ndim == 4:
+                    gt_prep = gt_prep.squeeze(0)  # (max_inst, n_nodes, 2)
+                gt_orig = gt_prep / eff
+                num_inst = batch["num_instances"][i].item()
+                gt_orig = gt_orig[:num_inst]  # Only valid instances
+
+                self.val_predictions.append(
+                    {
+                        "video_idx": batch["video_idx"][i].item(),
+                        "frame_idx": batch["frame_idx"][i].item(),
+                        "pred_peaks": pred_peaks,  # Original space, variable instances
+                        "pred_scores": pred_scores,
+                    }
+                )
+                self.val_ground_truth.append(
+                    {
+                        "video_idx": batch["video_idx"][i].item(),
+                        "frame_idx": batch["frame_idx"][i].item(),
+                        "gt_instances": gt_orig,  # Original space
+                        "num_instances": num_inst,
+                    }
+                )
 
 
 class BottomUpMultiClassLightningModule(LightningModel):
@@ -1361,9 +1739,14 @@ class BottomUpMultiClassLightningModule(LightningModel):
             cms_output_stride=self.head_configs.multi_class_bottomup.confmaps.output_stride,
             class_maps_output_stride=self.head_configs.multi_class_bottomup.class_maps.output_stride,
         )
+        self.node_names = list(
+            self.head_configs.multi_class_bottomup.confmaps.part_names
+        )
 
-    def visualize_example(self, sample):
-        """Visualize predictions during training (used with callbacks)."""
+    def get_visualization_data(
+        self, sample, include_class_maps: bool = False
+    ) -> VisualizationData:
+        """Extract visualization data from a sample."""
         ex = sample.copy()
         ex["eff_scale"] = torch.tensor([1.0])
         for k, v in ex.items():
@@ -1371,54 +1754,65 @@ class BottomUpMultiClassLightningModule(LightningModel):
             ex[k] = v.to(device=self.device)
         ex["image"] = ex["image"].unsqueeze(dim=0)
         output = self.bottomup_inf_layer(ex)[0]
+
         peaks = output["pred_instance_peaks"][0].cpu().numpy()
-        img = (
-            output["image"][0, 0].cpu().numpy().transpose(1, 2, 0)
-        )  # convert from (C, H, W) to (H, W, C)
+        peak_values = output["pred_peak_values"][0].cpu().numpy()
+        img = output["image"][0, 0].cpu().numpy().transpose(1, 2, 0)
         gt_instances = ex["instances"][0].cpu().numpy()
-        confmaps = (
-            output["pred_confmaps"][0].cpu().numpy().transpose(1, 2, 0)
-        )  # convert from (C, H, W) to (H, W, C)
+        confmaps = output["pred_confmaps"][0].cpu().numpy().transpose(1, 2, 0)
+
+        pred_class_maps = None
+        if include_class_maps:
+            pred_class_maps = (
+                output["pred_class_maps"].cpu().numpy()[0].transpose(1, 2, 0)
+            )
+
+        return VisualizationData(
+            image=img,
+            pred_confmaps=confmaps,
+            pred_peaks=peaks,
+            pred_peak_values=peak_values,
+            gt_instances=gt_instances,
+            node_names=self.node_names,
+            output_scale=confmaps.shape[0] / img.shape[0],
+            is_paired=False,
+            pred_class_maps=pred_class_maps,
+        )
+
+    def visualize_example(self, sample):
+        """Visualize predictions during training (used with callbacks)."""
+        data = self.get_visualization_data(sample)
         scale = 1.0
-        if img.shape[0] < 512:
+        if data.image.shape[0] < 512:
             scale = 2.0
-        if img.shape[0] < 256:
+        if data.image.shape[0] < 256:
             scale = 4.0
-        fig = plot_img(img, dpi=72 * scale, scale=scale)
-        plot_confmaps(confmaps, output_scale=confmaps.shape[0] / img.shape[0])
+        fig = plot_img(data.image, dpi=72 * scale, scale=scale)
+        plot_confmaps(data.pred_confmaps, output_scale=data.output_scale)
         plt.xlim(plt.xlim())
         plt.ylim(plt.ylim())
-        plot_peaks(gt_instances, peaks, paired=False)
+        plot_peaks(data.gt_instances, data.pred_peaks, paired=data.is_paired)
         return fig
 
     def visualize_class_maps_example(self, sample):
-        """Visualize predictions during training (used with callbacks)."""
-        ex = sample.copy()
-        ex["eff_scale"] = torch.tensor([1.0])
-        for k, v in ex.items():
-            if isinstance(v, torch.Tensor):
-                ex[k] = v.to(device=self.device)
-        ex["image"] = ex["image"].unsqueeze(dim=0)
-        output = self.bottomup_inf_layer(ex)[0]
-        img = (
-            output["image"][0, 0].cpu().numpy().transpose(1, 2, 0)
-        )  # convert from (C, H, W) to (H, W, C)
-        classmaps = (
-            output["pred_class_maps"].cpu().numpy()[0].transpose(1, 2, 0)
-        )  # (n_classes, h, w)
+        """Visualize class map predictions during training (used with callbacks)."""
+        data = self.get_visualization_data(sample, include_class_maps=True)
         scale = 1.0
-        if img.shape[0] < 512:
+        if data.image.shape[0] < 512:
             scale = 2.0
-        if img.shape[0] < 256:
+        if data.image.shape[0] < 256:
             scale = 4.0
-        fig = plot_img(img, dpi=72 * scale, scale=scale)
-
-        plot_confmaps(classmaps, output_scale=classmaps.shape[0] / img.shape[0])
+        fig = plot_img(data.image, dpi=72 * scale, scale=scale)
+        plot_confmaps(
+            data.pred_class_maps,
+            output_scale=data.pred_class_maps.shape[0] / data.image.shape[0],
+        )
         return fig
 
     def forward(self, img):
         """Forward pass of the model."""
         img = torch.squeeze(img, dim=1).to(self.device)
+        img = normalize_on_gpu(img)
         output = self.model(img)
         return {
             "MultiInstanceConfmapsHead": output["MultiInstanceConfmapsHead"],
@@ -1430,6 +1824,7 @@ class BottomUpMultiClassLightningModule(LightningModel):
         X = torch.squeeze(batch["image"], dim=1)
         y_confmap = torch.squeeze(batch["confidence_maps"], dim=1)
         y_classmap = torch.squeeze(batch["class_maps"], dim=1)
+        X = normalize_on_gpu(X)
         preds = self.model(X)
         classmaps = preds["ClassMapsHead"]
         confmaps = preds["MultiInstanceConfmapsHead"]
@@ -1453,15 +1848,84 @@ class BottomUpMultiClassLightningModule(LightningModel):
             "ClassMapsHead": classmaps_loss,
         }
         loss = sum([s * losses[t] for s, t in zip(self.loss_weights, losses)])
+        # Log step-level loss (every batch, uses global_step x-axis)
         self.log(
-            "train_loss",
+            "loss",
             loss,
             prog_bar=True,
             on_step=True,
+            on_epoch=False,
+            sync_dist=True,
+        )
+        # Accumulate for epoch-averaged loss (logged in on_train_epoch_end)
+        self._accumulate_loss(loss)
+        self.log(
+            "train/confmaps_loss",
+            confmap_loss,
+            on_step=False,
+            on_epoch=True,
+            sync_dist=True,
+        )
+        self.log(
+            "train/classmap_loss",
+            classmaps_loss,
+            on_step=False,
             on_epoch=True,
-            logger=True,
             sync_dist=True,
         )
+
+        # Compute classification accuracy at GT keypoint locations
+        with torch.no_grad():
+            # Get output stride for class maps
+            cms_stride = self.head_configs.multi_class_bottomup.class_maps.output_stride
+
+            # Get GT instances and sample class maps at those locations
+            instances = batch["instances"]  # (batch, n_samples, max_inst, n_nodes, 2)
+            if instances.dim() == 5:
+                instances = instances.squeeze(1)  # (batch, max_inst, n_nodes, 2)
+            num_instances = batch["num_instances"]  # (batch,)
+
+            correct = 0
+            total = 0
+            for b in range(instances.shape[0]):
+                n_inst = num_instances[b].item()
+                for inst_idx in range(n_inst):
+                    for node_idx in range(instances.shape[2]):
+                        # Get keypoint location (in input image space)
+                        kp = instances[b, inst_idx, node_idx]  # (2,) = (x, y)
+                        if torch.isnan(kp).any():
+                            continue
+
+                        # Convert to class map space
+                        x_cm = (
+                            (kp[0] / cms_stride)
+                            .long()
+                            .clamp(0, classmaps.shape[-1] - 1)
+                        )
+                        y_cm = (
+                            (kp[1] / cms_stride)
+                            .long()
+                            .clamp(0, classmaps.shape[-2] - 1)
+                        )
+
+                        # Sample predicted and GT class at this location
+                        pred_class = classmaps[b, :, y_cm, x_cm].argmax()
+                        gt_class = y_classmap[b, :, y_cm, x_cm].argmax()
+
+                        if pred_class == gt_class:
+                            correct += 1
+                        total += 1
+
+            if total > 0:
+                class_accuracy = torch.tensor(correct / total, device=X.device)
+                self.log(
+                    "train/class_accuracy",
+                    class_accuracy,
+                    on_step=False,
+                    on_epoch=True,
+                    sync_dist=True,
+                )
+
         return loss
 
     def validation_step(self, batch, batch_idx):
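The accuracy check above samples class maps one keypoint at a time in nested Python loops. An equivalent vectorized form for a single sample — a sketch with the same semantics for valid keypoints; tensor shapes in the comments are assumptions drawn from this diff:

    import torch

    def class_accuracy_at_keypoints(classmaps, gt_classmaps, kps, stride):
        # classmaps, gt_classmaps: (n_classes, H, W); kps: (n_kp, 2) as (x, y)
        valid = ~torch.isnan(kps).any(dim=1)
        kps = kps[valid]
        x = (kps[:, 0] / stride).long().clamp(0, classmaps.shape[-1] - 1)
        y = (kps[:, 1] / stride).long().clamp(0, classmaps.shape[-2] - 1)
        pred = classmaps[:, y, x].argmax(dim=0)   # gather all locations at once
        gt = gt_classmaps[:, y, x].argmax(dim=0)
        return (pred == gt).float().mean()

On GPU this avoids a host sync per keypoint from the per-element checks in the inner loop.
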
@@ -1469,6 +1933,7 @@ class BottomUpMultiClassLightningModule(LightningModel):
         X = torch.squeeze(batch["image"], dim=1)
         y_confmap = torch.squeeze(batch["confidence_maps"], dim=1)
         y_classmap = torch.squeeze(batch["class_maps"], dim=1)
+        X = normalize_on_gpu(X)
 
         preds = self.model(X)
         classmaps = preds["ClassMapsHead"]
@@ -1494,26 +1959,128 @@ class BottomUpMultiClassLightningModule(LightningModel):
         }
 
         val_loss = sum([s * losses[t] for s, t in zip(self.loss_weights, losses)])
-        lr = self.optimizers().optimizer.param_groups[0]["lr"]
         self.log(
-            "learning_rate",
-            lr,
+            "val/loss",
+            val_loss,
             prog_bar=True,
-            on_step=True,
+            on_step=False,
             on_epoch=True,
-            logger=True,
             sync_dist=True,
         )
         self.log(
-            "val_loss",
-            val_loss,
-            prog_bar=True,
-            on_step=True,
+            "val/confmaps_loss",
+            confmap_loss,
+            on_step=False,
+            on_epoch=True,
+            sync_dist=True,
+        )
+        self.log(
+            "val/classmap_loss",
+            classmaps_loss,
+            on_step=False,
             on_epoch=True,
-            logger=True,
             sync_dist=True,
         )
 
+        # Compute classification accuracy at GT keypoint locations
+        with torch.no_grad():
+            # Get output stride for class maps
+            cms_stride = self.head_configs.multi_class_bottomup.class_maps.output_stride
+
+            # Get GT instances and sample class maps at those locations
+            instances = batch["instances"]  # (batch, n_samples, max_inst, n_nodes, 2)
+            if instances.dim() == 5:
+                instances = instances.squeeze(1)  # (batch, max_inst, n_nodes, 2)
+            num_instances = batch["num_instances"]  # (batch,)
+
+            correct = 0
+            total = 0
+            for b in range(instances.shape[0]):
+                n_inst = num_instances[b].item()
+                for inst_idx in range(n_inst):
+                    for node_idx in range(instances.shape[2]):
+                        # Get keypoint location (in input image space)
+                        kp = instances[b, inst_idx, node_idx]  # (2,) = (x, y)
+                        if torch.isnan(kp).any():
+                            continue
+
+                        # Convert to class map space
+                        x_cm = (
+                            (kp[0] / cms_stride)
+                            .long()
+                            .clamp(0, classmaps.shape[-1] - 1)
+                        )
+                        y_cm = (
+                            (kp[1] / cms_stride)
+                            .long()
+                            .clamp(0, classmaps.shape[-2] - 1)
+                        )
+
+                        # Sample predicted and GT class at this location
+                        pred_class = classmaps[b, :, y_cm, x_cm].argmax()
+                        gt_class = y_classmap[b, :, y_cm, x_cm].argmax()
+
+                        if pred_class == gt_class:
+                            correct += 1
+                        total += 1
+
+            if total > 0:
+                class_accuracy = torch.tensor(correct / total, device=X.device)
+                self.log(
+                    "val/class_accuracy",
+                    class_accuracy,
+                    on_step=False,
+                    on_epoch=True,
+                    sync_dist=True,
+                )
+
+        # Collect predictions for epoch-end evaluation if enabled
+        if self._collect_val_predictions:
+            with torch.no_grad():
+                # Note: Do NOT squeeze the image here - the forward() method expects
+                # (batch, n_samples, C, H, W) and handles the n_samples squeeze internally
+                inference_output = self.bottomup_inf_layer(batch)
+                if isinstance(inference_output, list):
+                    inference_output = inference_output[0]
+
+            batch_size = len(batch["frame_idx"])
+            for i in range(batch_size):
+                eff = batch["eff_scale"][i].cpu().numpy()
+
+                # Predictions are already in original space (variable number of instances)
+                pred_peaks = inference_output["pred_instance_peaks"][i]
+                pred_scores = inference_output["pred_peak_values"][i]
+                if torch.is_tensor(pred_peaks):
+                    pred_peaks = pred_peaks.cpu().numpy()
+                if torch.is_tensor(pred_scores):
+                    pred_scores = pred_scores.cpu().numpy()
+
+                # Transform GT to original space
+                # Note: instances have shape (1, max_inst, n_nodes, 2) - squeeze n_samples dim
+                gt_prep = batch["instances"][i].cpu().numpy()
+                if gt_prep.ndim == 4:
+                    gt_prep = gt_prep.squeeze(0)  # (max_inst, n_nodes, 2)
+                gt_orig = gt_prep / eff
+                num_inst = batch["num_instances"][i].item()
+                gt_orig = gt_orig[:num_inst]  # Only valid instances
+
+                self.val_predictions.append(
+                    {
+                        "video_idx": batch["video_idx"][i].item(),
+                        "frame_idx": batch["frame_idx"][i].item(),
+                        "pred_peaks": pred_peaks,  # Original space, variable instances
+                        "pred_scores": pred_scores,
+                    }
+                )
+                self.val_ground_truth.append(
+                    {
+                        "video_idx": batch["video_idx"][i].item(),
+                        "frame_idx": batch["frame_idx"][i].item(),
+                        "gt_instances": gt_orig,  # Original space
+                        "num_instances": num_inst,
+                    }
+                )
+
 
 class TopDownCenteredInstanceMultiClassLightningModule(LightningModel):
     """Lightning Module for TopDownCenteredInstance ID Model.
@@ -1607,8 +2174,8 @@ class TopDownCenteredInstanceMultiClassLightningModule(LightningModel):
 
         self.node_names = self.head_configs.multi_class_topdown.confmaps.part_names
 
-    def visualize_example(self, sample):
-        """Visualize predictions during training (used with callbacks)."""
+    def get_visualization_data(self, sample) -> VisualizationData:
+        """Extract visualization data from a sample."""
         ex = sample.copy()
         ex["eff_scale"] = torch.tensor([1.0])
         for k, v in ex.items():
@@ -1616,27 +2183,41 @@ class TopDownCenteredInstanceMultiClassLightningModule(LightningModel):
             ex[k] = v.to(device=self.device)
         ex["instance_image"] = ex["instance_image"].unsqueeze(dim=0)
         output = self.instance_peaks_inf_layer(ex)
+
         peaks = output["pred_instance_peaks"].cpu().numpy()
-        img = (
-            output["instance_image"][0, 0].cpu().numpy().transpose(1, 2, 0)
-        )  # convert from (C, H, W) to (H, W, C)
+        peak_values = output["pred_peak_values"].cpu().numpy()
+        img = output["instance_image"][0, 0].cpu().numpy().transpose(1, 2, 0)
         gt_instances = ex["instance"].cpu().numpy()
-        confmaps = (
-            output["pred_confmaps"][0].cpu().numpy().transpose(1, 2, 0)
-        )  # convert from (C, H, W) to (H, W, C)
+        confmaps = output["pred_confmaps"][0].cpu().numpy().transpose(1, 2, 0)
+
+        return VisualizationData(
+            image=img,
+            pred_confmaps=confmaps,
+            pred_peaks=peaks,
+            pred_peak_values=peak_values,
+            gt_instances=gt_instances,
+            node_names=list(self.node_names) if self.node_names else [],
+            output_scale=confmaps.shape[0] / img.shape[0],
+            is_paired=True,
+        )
+
+    def visualize_example(self, sample):
+        """Visualize predictions during training (used with callbacks)."""
+        data = self.get_visualization_data(sample)
         scale = 1.0
-        if img.shape[0] < 512:
+        if data.image.shape[0] < 512:
             scale = 2.0
-        if img.shape[0] < 256:
+        if data.image.shape[0] < 256:
             scale = 4.0
-        fig = plot_img(img, dpi=72 * scale, scale=scale)
-        plot_confmaps(confmaps, output_scale=confmaps.shape[0] / img.shape[0])
-        plot_peaks(gt_instances, peaks, paired=True)
+        fig = plot_img(data.image, dpi=72 * scale, scale=scale)
+        plot_confmaps(data.pred_confmaps, output_scale=data.output_scale)
+        plot_peaks(data.gt_instances, data.pred_peaks, paired=data.is_paired)
         return fig
 
     def forward(self, img):
         """Forward pass of the model."""
         img = torch.squeeze(img, dim=1).to(self.device)
+        img = normalize_on_gpu(img)
         output = self.model(img)
         return {
             "CenteredInstanceConfmapsHead": output["CenteredInstanceConfmapsHead"],
@@ -1648,6 +2229,7 @@ class TopDownCenteredInstanceMultiClassLightningModule(LightningModel):
         X = torch.squeeze(batch["instance_image"], dim=1)
         y_confmap = torch.squeeze(batch["confidence_maps"], dim=1)
         y_classvector = batch["class_vectors"]
+        X = normalize_on_gpu(X)
         preds = self.model(X)
         classvector = preds["ClassVectorsHead"]
         confmaps = preds["CenteredInstanceConfmapsHead"]
@@ -1679,22 +2261,50 @@ class TopDownCenteredInstanceMultiClassLightningModule(LightningModel):
         channel_wise_loss = torch.sum(mse, dim=(0, 2, 3)) / (batch_size * h * w)
         for node_idx, name in enumerate(self.node_names):
             self.log(
-                f"{name}",
+                f"train/confmaps/{name}",
                 channel_wise_loss[node_idx],
-                prog_bar=True,
-                on_step=True,
+                prog_bar=False,
+                on_step=False,
                 on_epoch=True,
-                logger=True,
                 sync_dist=True,
             )
 
+        # Log step-level loss (every batch, uses global_step x-axis)
         self.log(
-            "train_loss",
+            "loss",
             loss,
             prog_bar=True,
             on_step=True,
+            on_epoch=False,
+            sync_dist=True,
+        )
+        # Accumulate for epoch-averaged loss (logged in on_train_epoch_end)
+        self._accumulate_loss(loss)
+        self.log(
+            "train/confmaps_loss",
+            confmap_loss,
+            on_step=False,
+            on_epoch=True,
+            sync_dist=True,
+        )
+        self.log(
+            "train/classvector_loss",
+            classvector_loss,
+            on_step=False,
+            on_epoch=True,
+            sync_dist=True,
+        )
+
+        # Compute classification accuracy
+        with torch.no_grad():
+            pred_classes = torch.argmax(classvector, dim=1)
+            gt_classes = torch.argmax(y_classvector, dim=1)
+            class_accuracy = (pred_classes == gt_classes).float().mean()
+        self.log(
+            "train/class_accuracy",
+            class_accuracy,
+            on_step=False,
             on_epoch=True,
-            logger=True,
             sync_dist=True,
         )
         return loss
@@ -1704,6 +2314,7 @@ class TopDownCenteredInstanceMultiClassLightningModule(LightningModel):
         X = torch.squeeze(batch["instance_image"], dim=1)
         y_confmap = torch.squeeze(batch["confidence_maps"], dim=1)
         y_classvector = batch["class_vectors"]
+        X = normalize_on_gpu(X)
         preds = self.model(X)
         classvector = preds["ClassVectorsHead"]
         confmaps = preds["CenteredInstanceConfmapsHead"]
@@ -1727,23 +2338,94 @@ class TopDownCenteredInstanceMultiClassLightningModule(LightningModel):
             "ClassVectorsHead": classvector_loss,
         }
         val_loss = sum([s * losses[t] for s, t in zip(self.loss_weights, losses)])
-
-        lr = self.optimizers().optimizer.param_groups[0]["lr"]
         self.log(
-            "learning_rate",
-            lr,
+            "val/loss",
+            val_loss,
             prog_bar=True,
-            on_step=True,
+            on_step=False,
             on_epoch=True,
-            logger=True,
             sync_dist=True,
         )
         self.log(
-            "val_loss",
-            val_loss,
-            prog_bar=True,
-            on_step=True,
+            "val/confmaps_loss",
+            confmap_loss,
+            on_step=False,
             on_epoch=True,
-            logger=True,
             sync_dist=True,
         )
+        self.log(
+            "val/classvector_loss",
+            classvector_loss,
+            on_step=False,
+            on_epoch=True,
+            sync_dist=True,
+        )
+
+        # Compute classification accuracy
+        with torch.no_grad():
+            pred_classes = torch.argmax(classvector, dim=1)
+            gt_classes = torch.argmax(y_classvector, dim=1)
+            class_accuracy = (pred_classes == gt_classes).float().mean()
+        self.log(
+            "val/class_accuracy",
+            class_accuracy,
+            on_step=False,
+            on_epoch=True,
+            sync_dist=True,
+        )
+
+        # Collect predictions for epoch-end evaluation if enabled
+        if self._collect_val_predictions:
+            # SAVE bbox BEFORE inference (it modifies in-place!)
+            bbox_prep_saved = batch["instance_bbox"].clone()
+
+            with torch.no_grad():
+                inference_output = self.instance_peaks_inf_layer(batch)
+
+            batch_size = len(batch["frame_idx"])
+            for i in range(batch_size):
+                eff = batch["eff_scale"][i].cpu().numpy()
+
+                # Predictions from inference (crop-relative, original scale)
+                pred_peaks_crop = (
+                    inference_output["pred_instance_peaks"][i].cpu().numpy()
+                )
+                pred_scores = inference_output["pred_peak_values"][i].cpu().numpy()
+
+                # Compute bbox offset in original space from SAVED prep bbox
+                # bbox has shape (n_samples=1, 4, 2) where 4 corners
+                bbox_prep = bbox_prep_saved[i].squeeze(0).cpu().numpy()  # (4, 2)
+                bbox_top_left_orig = (
+                    bbox_prep[0] / eff
+                )  # Top-left corner in original space
+
+                # Full image coordinates (original space)
+                pred_peaks_full = pred_peaks_crop + bbox_top_left_orig
+
+                # GT transform: crop-relative preprocessed -> full image original
+                gt_crop_prep = (
+                    batch["instance"][i].squeeze(0).cpu().numpy()
+                )  # (n_nodes, 2)
+                gt_crop_orig = gt_crop_prep / eff
+                gt_full_orig = gt_crop_orig + bbox_top_left_orig
+
+                self.val_predictions.append(
+                    {
+                        "video_idx": batch["video_idx"][i].item(),
+                        "frame_idx": batch["frame_idx"][i].item(),
+                        "pred_peaks": pred_peaks_full.reshape(
+                            1, -1, 2
+                        ),  # (1, n_nodes, 2)
+                        "pred_scores": pred_scores.reshape(1, -1),  # (1, n_nodes)
+                    }
+                )
+                self.val_ground_truth.append(
+                    {
+                        "video_idx": batch["video_idx"][i].item(),
+                        "frame_idx": batch["frame_idx"][i].item(),
+                        "gt_instances": gt_full_orig.reshape(
+                            1, -1, 2
+                        ),  # (1, n_nodes, 2)
+                        "num_instances": 1,
+                    }
+                )