sleap-nn 0.0.5__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (61)
  1. sleap_nn/__init__.py +9 -2
  2. sleap_nn/architectures/convnext.py +5 -0
  3. sleap_nn/architectures/encoder_decoder.py +25 -6
  4. sleap_nn/architectures/swint.py +8 -0
  5. sleap_nn/cli.py +489 -46
  6. sleap_nn/config/data_config.py +51 -8
  7. sleap_nn/config/get_config.py +32 -24
  8. sleap_nn/config/trainer_config.py +88 -0
  9. sleap_nn/data/augmentation.py +61 -200
  10. sleap_nn/data/custom_datasets.py +433 -61
  11. sleap_nn/data/instance_cropping.py +71 -6
  12. sleap_nn/data/normalization.py +45 -2
  13. sleap_nn/data/providers.py +26 -0
  14. sleap_nn/data/resizing.py +2 -2
  15. sleap_nn/data/skia_augmentation.py +414 -0
  16. sleap_nn/data/utils.py +135 -17
  17. sleap_nn/evaluation.py +177 -42
  18. sleap_nn/export/__init__.py +21 -0
  19. sleap_nn/export/cli.py +1778 -0
  20. sleap_nn/export/exporters/__init__.py +51 -0
  21. sleap_nn/export/exporters/onnx_exporter.py +80 -0
  22. sleap_nn/export/exporters/tensorrt_exporter.py +291 -0
  23. sleap_nn/export/metadata.py +225 -0
  24. sleap_nn/export/predictors/__init__.py +63 -0
  25. sleap_nn/export/predictors/base.py +22 -0
  26. sleap_nn/export/predictors/onnx.py +154 -0
  27. sleap_nn/export/predictors/tensorrt.py +312 -0
  28. sleap_nn/export/utils.py +307 -0
  29. sleap_nn/export/wrappers/__init__.py +25 -0
  30. sleap_nn/export/wrappers/base.py +96 -0
  31. sleap_nn/export/wrappers/bottomup.py +243 -0
  32. sleap_nn/export/wrappers/bottomup_multiclass.py +195 -0
  33. sleap_nn/export/wrappers/centered_instance.py +56 -0
  34. sleap_nn/export/wrappers/centroid.py +58 -0
  35. sleap_nn/export/wrappers/single_instance.py +83 -0
  36. sleap_nn/export/wrappers/topdown.py +180 -0
  37. sleap_nn/export/wrappers/topdown_multiclass.py +304 -0
  38. sleap_nn/inference/__init__.py +6 -0
  39. sleap_nn/inference/bottomup.py +86 -20
  40. sleap_nn/inference/peak_finding.py +93 -16
  41. sleap_nn/inference/postprocessing.py +284 -0
  42. sleap_nn/inference/predictors.py +339 -137
  43. sleap_nn/inference/provenance.py +292 -0
  44. sleap_nn/inference/topdown.py +55 -47
  45. sleap_nn/legacy_models.py +65 -11
  46. sleap_nn/predict.py +224 -19
  47. sleap_nn/system_info.py +443 -0
  48. sleap_nn/tracking/tracker.py +8 -1
  49. sleap_nn/train.py +138 -44
  50. sleap_nn/training/callbacks.py +1258 -5
  51. sleap_nn/training/lightning_modules.py +902 -220
  52. sleap_nn/training/model_trainer.py +424 -111
  53. sleap_nn/training/schedulers.py +191 -0
  54. sleap_nn/training/utils.py +367 -2
  55. {sleap_nn-0.0.5.dist-info → sleap_nn-0.1.0.dist-info}/METADATA +35 -33
  56. sleap_nn-0.1.0.dist-info/RECORD +88 -0
  57. {sleap_nn-0.0.5.dist-info → sleap_nn-0.1.0.dist-info}/WHEEL +1 -1
  58. sleap_nn-0.0.5.dist-info/RECORD +0 -63
  59. {sleap_nn-0.0.5.dist-info → sleap_nn-0.1.0.dist-info}/entry_points.txt +0 -0
  60. {sleap_nn-0.0.5.dist-info → sleap_nn-0.1.0.dist-info}/licenses/LICENSE +0 -0
  61. {sleap_nn-0.0.5.dist-info → sleap_nn-0.1.0.dist-info}/top_level.txt +0 -0
sleap_nn/training/model_trainer.py
@@ -2,7 +2,6 @@
 
 import os
 import shutil
-import copy
 import attrs
 import torch
 import random
@@ -16,11 +15,14 @@ import yaml
 from pathlib import Path
 from typing import List, Optional
 from datetime import datetime
-from itertools import cycle, count
+from itertools import count
 from omegaconf import DictConfig, OmegaConf
 from lightning.pytorch.loggers import WandbLogger
 from sleap_nn.data.utils import check_cache_memory
-from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
+from lightning.pytorch.callbacks import (
+    ModelCheckpoint,
+    EarlyStopping,
+)
 from lightning.pytorch.profilers import (
     SimpleProfiler,
     AdvancedProfiler,
@@ -28,7 +30,11 @@ from lightning.pytorch.profilers import (
     PassThroughProfiler,
 )
 from sleap_io.io.skeleton import SkeletonYAMLEncoder
-from sleap_nn.data.instance_cropping import find_instance_crop_size
+from sleap_nn.data.instance_cropping import (
+    find_instance_crop_size,
+    find_max_instance_bbox_size,
+    compute_augmentation_padding,
+)
 from sleap_nn.data.providers import get_max_height_width
 from sleap_nn.data.custom_datasets import (
     get_train_val_dataloaders,
@@ -47,9 +53,11 @@ from sleap_nn.config.training_job_config import verify_training_cfg
 from sleap_nn.training.callbacks import (
     ProgressReporterZMQ,
     TrainingControllerZMQ,
-    MatplotlibSaver,
-    WandBPredImageLogger,
     CSVLoggerCallback,
+    SleapProgressBar,
+    EpochEndEvaluationCallback,
+    CentroidEvaluationCallback,
+    UnifiedVizCallback,
 )
 from sleap_nn import RANK
 from sleap_nn.legacy_models import get_keras_first_layer_channels
@@ -207,6 +215,52 @@ class ModelTrainer:
             trainer_devices = 1
         return trainer_devices
 
+    def _count_labeled_frames(
+        self, labels_list: List[sio.Labels], user_only: bool = True
+    ) -> int:
+        """Count labeled frames, optionally filtering to user-labeled only.
+
+        Args:
+            labels_list: List of Labels objects to count frames from.
+            user_only: If True, count only frames with user instances.
+
+        Returns:
+            Total count of labeled frames.
+        """
+        total = 0
+        for label in labels_list:
+            if user_only:
+                total += sum(1 for lf in label if lf.has_user_instances)
+            else:
+                total += len(label)
+        return total
+
+    def _filter_to_user_labeled(self, labels: sio.Labels) -> sio.Labels:
+        """Filter a Labels object to only include user-labeled frames.
+
+        Args:
+            labels: Labels object to filter.
+
+        Returns:
+            New Labels object containing only frames with user instances.
+        """
+        # Filter labeled frames to only those with user instances
+        user_lfs = [lf for lf in labels if lf.has_user_instances]
+
+        # Set instances to user instances only
+        for lf in user_lfs:
+            lf.instances = lf.user_instances
+
+        # Create new Labels with filtered frames
+        return sio.Labels(
+            labeled_frames=user_lfs,
+            videos=labels.videos,
+            skeletons=labels.skeletons,
+            tracks=labels.tracks,
+            suggestions=labels.suggestions,
+            provenance=labels.provenance,
+        )
+
     def _setup_train_val_labels(
         self,
         labels: Optional[List[sio.Labels]] = None,
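The two helpers above centralize how user-labeled frames are handled: counts, splits, and the saved ground-truth files consider only frames carrying user instances when `data_config.user_instances_only` is enabled. A minimal sketch of the same filtering outside the trainer, using only sleap_io calls that already appear in this diff (the label file path is a placeholder):

    import sleap_io as sio

    labels = sio.load_slp("labels.pkg.slp")  # placeholder path
    # Same predicate used by _count_labeled_frames / _filter_to_user_labeled above.
    user_lfs = [lf for lf in labels if lf.has_user_instances]
    print(f"{len(user_lfs)} of {len(labels)} labeled frames have user instances")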
@@ -218,21 +272,35 @@ class ModelTrainer:
         total_val_lfs = 0
         self.skeletons = labels[0].skeletons
 
+        # Check if we should count only user-labeled frames
+        user_instances_only = OmegaConf.select(
+            self.config, "data_config.user_instances_only", default=True
+        )
+
         # check if all `.slp` file shave same skeleton structure (if multiple slp file paths are provided)
         skeleton = self.skeletons[0]
         for index, train_label in enumerate(labels):
             skel_temp = train_label.skeletons[0]
             skeletons_equal = skeleton.matches(skel_temp)
-            if skeletons_equal:
-                total_train_lfs += len(train_label)
-            else:
+            if not skeletons_equal:
                 message = f"The skeletons in the training labels: {index + 1} do not match the skeleton in the first training label file."
                 logger.error(message)
                 raise ValueError(message)
 
-        if val_labels is None or not len(val_labels):
+        # Check for same-data mode (train = val, for intentional overfitting)
+        use_same = OmegaConf.select(
+            self.config, "data_config.use_same_data_for_val", default=False
+        )
+
+        if use_same:
+            # Same mode: use identical data for train and val (for overfitting)
+            logger.info("Using same data for train and val (overfit mode)")
+            self.train_labels = labels
+            self.val_labels = labels
+            total_train_lfs = self._count_labeled_frames(labels, user_instances_only)
+            total_val_lfs = total_train_lfs
+        elif val_labels is None or not len(val_labels):
             # if val labels are not provided, split from train
-            total_train_lfs = 0
             val_fraction = OmegaConf.select(
                 self.config, "data_config.validation_fraction", default=0.1
             )
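Both switches in this hunk are plain config reads, so an overfit run needs no code changes. A hedged override sketch (key names come from the `OmegaConf.select` calls above; the surrounding schema is assumed):

    from omegaconf import OmegaConf

    overrides = OmegaConf.create(
        {
            "data_config": {
                "user_instances_only": True,    # count/save only user-labeled frames
                "use_same_data_for_val": True,  # reuse the training labels as validation (overfit mode)
            }
        }
    )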
@@ -250,13 +318,14 @@
             )
             self.train_labels.append(train_split)
             self.val_labels.append(val_split)
+            # make_training_splits returns only user-labeled frames
             total_train_lfs += len(train_split)
             total_val_lfs += len(val_split)
         else:
             self.train_labels = labels
             self.val_labels = val_labels
-            for val_l in self.val_labels:
-                total_val_lfs += len(val_l)
+            total_train_lfs = self._count_labeled_frames(labels, user_instances_only)
+            total_val_lfs = self._count_labeled_frames(val_labels, user_instances_only)
 
         logger.info(f"# Train Labeled frames: {total_train_lfs}")
         logger.info(f"# Val Labeled frames: {total_val_lfs}")
@@ -291,13 +360,70 @@
         ):
             # compute crop size if not provided in config
             if crop_size is None:
+                # Get padding from config or auto-compute from augmentation settings
+                padding = self.config.data_config.preprocessing.crop_padding
+                if padding is None:
+                    # Auto-compute padding based on augmentation settings
+                    aug_config = self.config.data_config.augmentation_config
+                    if (
+                        self.config.data_config.use_augmentations_train
+                        and aug_config is not None
+                        and aug_config.geometric is not None
+                    ):
+                        geo = aug_config.geometric
+                        # Check if rotation is enabled (via rotation_p or affine_p)
+                        rotation_enabled = (
+                            geo.rotation_p is not None and geo.rotation_p > 0
+                        ) or (
+                            geo.rotation_p is None
+                            and geo.scale_p is None
+                            and geo.translate_p is None
+                            and geo.affine_p > 0
+                        )
+                        # Check if scale is enabled (via scale_p or affine_p)
+                        scale_enabled = (
+                            geo.scale_p is not None and geo.scale_p > 0
+                        ) or (
+                            geo.rotation_p is None
+                            and geo.scale_p is None
+                            and geo.translate_p is None
+                            and geo.affine_p > 0
+                        )
+
+                        if rotation_enabled or scale_enabled:
+                            # First find the actual max bbox size from labels
+                            bbox_size = find_max_instance_bbox_size(train_label)
+                            bbox_size = max(
+                                bbox_size,
+                                self.config.data_config.preprocessing.min_crop_size
+                                or 100,
+                            )
+                            rotation_max = (
+                                max(
+                                    abs(geo.rotation_min),
+                                    abs(geo.rotation_max),
+                                )
+                                if rotation_enabled
+                                else 0.0
+                            )
+                            scale_max = geo.scale_max if scale_enabled else 1.0
+                            padding = compute_augmentation_padding(
+                                bbox_size=bbox_size,
+                                rotation_max=rotation_max,
+                                scale_max=scale_max,
+                            )
+                        else:
+                            padding = 0
+                    else:
+                        padding = 0
+
                 crop_sz = find_instance_crop_size(
                     labels=train_label,
+                    padding=padding,
                     maximum_stride=self.config.model_config.backbone_config[
                         f"{self.backbone_type}"
                     ]["max_stride"],
                     min_crop_size=self.config.data_config.preprocessing.min_crop_size,
-                    input_scaling=self.config.data_config.preprocessing.scale,
                 )
 
             if crop_sz > max_crop_size:
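`compute_augmentation_padding` itself is not shown in this diff, so its exact formula is unknown; the intuition is that rotating and scaling a box of side `bbox_size` grows its axis-aligned extent, and the crop padding must cover that growth. A hypothetical back-of-envelope version of that bound (illustration only, not the library's implementation):

    import math

    def rough_padding(bbox_size: float, rotation_max: float, scale_max: float) -> int:
        # Hypothetical: worst-case axis-aligned extent of a square box of side bbox_size
        # after scaling by scale_max and rotating by rotation_max degrees.
        t = math.radians(rotation_max)
        expanded = bbox_size * scale_max * (abs(math.cos(t)) + abs(math.sin(t)))
        # Extra margin needed on each side of the original box.
        return math.ceil((expanded - bbox_size) / 2)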
@@ -361,16 +487,36 @@
             ckpt_dir = "."
         self.config.trainer_config.ckpt_dir = ckpt_dir
         run_name = self.config.trainer_config.run_name
-        if run_name is None or run_name == "" or run_name == "None":
+        run_name_is_empty = run_name is None or run_name == "" or run_name == "None"
+
+        # Validate: multi-GPU + disk cache requires explicit run_name
+        if run_name_is_empty:
+            is_disk_caching = (
+                self.config.data_config.data_pipeline_fw
+                == "torch_dataset_cache_img_disk"
+            )
+            num_devices = self._get_trainer_devices()
+
+            if is_disk_caching and num_devices > 1:
+                raise ValueError(
+                    f"Multi-GPU training with disk caching requires an explicit `run_name`.\n\n"
+                    f"Detected {num_devices} device(s) with "
+                    f"`data_pipeline_fw='torch_dataset_cache_img_disk'`.\n"
+                    f"Without an explicit run_name, each GPU worker generates a different "
+                    f"timestamp-based directory, causing cache synchronization failures.\n\n"
+                    f"Please provide a run_name using one of these methods:\n"
+                    f" - CLI: sleap-nn train config.yaml trainer_config.run_name=my_experiment\n"
+                    f" - Config file: Set `trainer_config.run_name: my_experiment`\n"
+                    f" - Python API: train(..., run_name='my_experiment')"
+                )
+
+            # Auto-generate timestamp-based run_name (safe for single GPU or non-disk-cache)
             sum_train_lfs = sum([len(train_label) for train_label in self.train_labels])
             sum_val_lfs = sum([len(val_label) for val_label in self.val_labels])
-            if self._get_trainer_devices() > 1:
-                run_name = f"{self.model_type}.n={sum_train_lfs + sum_val_lfs}"
-            else:
-                run_name = (
-                    datetime.now().strftime("%y%m%d_%H%M%S")
-                    + f".{self.model_type}.n={sum_train_lfs + sum_val_lfs}"
-                )
+            run_name = (
+                datetime.now().strftime("%y%m%d_%H%M%S")
+                + f".{self.model_type}.n={sum_train_lfs + sum_val_lfs}"
+            )
 
         # If checkpoint path already exists, add suffix to prevent overwriting
         if (Path(ckpt_dir) / run_name).exists() and (
@@ -509,6 +655,10 @@
         """Compute config parameters."""
         logger.info("Setting up config...")
 
+        # Normalize empty strings to None for optional wandb fields
+        if self.config.trainer_config.wandb.prv_runid == "":
+            self.config.trainer_config.wandb.prv_runid = None
+
         # compute preprocessing parameters from the labels objects and fill in the config
         self._setup_preprocessing_config()
 
@@ -558,39 +708,89 @@
             )
         )
 
-        # setup checkpoint path
+        # setup checkpoint path (generates run_name if not specified)
         self._setup_ckpt_path()
 
+        # Default wandb run name to trainer run_name if not specified
+        # Note: This must come after _setup_ckpt_path() which generates run_name
+        if self.config.trainer_config.wandb.name is None:
+            self.config.trainer_config.wandb.name = self.config.trainer_config.run_name
+
         # verify input_channels in model_config based on input image and pretrained model weights
         self._verify_model_input_channels()
 
     def _setup_model_ckpt_dir(self):
-        """Create the model ckpt folder."""
+        """Create the model ckpt folder and save ground truth labels."""
         ckpt_path = (
             Path(self.config.trainer_config.ckpt_dir)
             / self.config.trainer_config.run_name
         ).as_posix()
         logger.info(f"Setting up model ckpt dir: `{ckpt_path}`...")
 
-        if not Path(ckpt_path).exists():
-            try:
-                Path(ckpt_path).mkdir(parents=True, exist_ok=True)
-            except OSError as e:
-                message = f"Cannot create a new folder in {ckpt_path}.\n {e}"
-                logger.error(message)
-                raise OSError(message)
-
+        # Only rank 0 (or non-distributed) should create directories and save files
         if RANK in [0, -1]:
+            if not Path(ckpt_path).exists():
+                try:
+                    Path(ckpt_path).mkdir(parents=True, exist_ok=True)
+                except OSError as e:
+                    message = f"Cannot create a new folder in {ckpt_path}.\n {e}"
+                    logger.error(message)
+                    raise OSError(message)
+            # Check if we should filter to user-labeled frames only
+            user_instances_only = OmegaConf.select(
+                self.config, "data_config.user_instances_only", default=True
+            )
+
+            # Save train and val ground truth labels
             for idx, (train, val) in enumerate(zip(self.train_labels, self.val_labels)):
-                train.save(
-                    Path(ckpt_path) / f"labels_train_gt_{idx}.slp",
+                # Filter to user-labeled frames if needed (for evaluation)
+                if user_instances_only:
+                    train_filtered = self._filter_to_user_labeled(train)
+                    val_filtered = self._filter_to_user_labeled(val)
+                else:
+                    train_filtered = train
+                    val_filtered = val
+
+                train_filtered.save(
+                    Path(ckpt_path) / f"labels_gt.train.{idx}.slp",
                     restore_original_videos=False,
                 )
-                val.save(
-                    Path(ckpt_path) / f"labels_val_gt_{idx}.slp",
+                val_filtered.save(
+                    Path(ckpt_path) / f"labels_gt.val.{idx}.slp",
                     restore_original_videos=False,
                 )
 
+            # Save test ground truth labels if test paths are provided
+            test_file_path = OmegaConf.select(
+                self.config, "data_config.test_file_path", default=None
+            )
+            if test_file_path is not None:
+                # Normalize to list of strings
+                if isinstance(test_file_path, str):
+                    test_paths = [test_file_path]
+                else:
+                    test_paths = list(test_file_path)
+
+                for idx, test_path in enumerate(test_paths):
+                    # Only save if it's a .slp file (not a video file)
+                    if test_path.endswith(".slp") or test_path.endswith(".pkg.slp"):
+                        try:
+                            test_labels = sio.load_slp(test_path)
+                            if user_instances_only:
+                                test_filtered = self._filter_to_user_labeled(
+                                    test_labels
+                                )
+                            else:
+                                test_filtered = test_labels
+                            test_filtered.save(
+                                Path(ckpt_path) / f"labels_gt.test.{idx}.slp",
+                                restore_original_videos=False,
+                            )
+                        except Exception as e:
+                            logger.warning(
+                                f"Could not save test ground truth for {test_path}: {e}"
+                            )
+
     def _setup_viz_datasets(self):
         """Setup dataloaders."""
         data_viz_config = self.config.copy()
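With this change the run directory stores the (optionally user-filtered) ground truth for every split under the new `labels_gt.<split>.<idx>.slp` naming, replacing the old `labels_train_gt_{idx}.slp` / `labels_val_gt_{idx}.slp` files. A small sketch of loading them back for a sanity check (the run directory path is a placeholder):

    from pathlib import Path
    import sleap_io as sio

    run_dir = Path("models/my_experiment")  # placeholder run directory
    train_gt = sio.load_slp(run_dir / "labels_gt.train.0.slp")
    val_gt = sio.load_slp(run_dir / "labels_gt.val.0.slp")
    print(len(train_gt), len(val_gt))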
@@ -608,10 +808,40 @@
         base_cache_img_path = None
         if self.config.data_config.data_pipeline_fw == "torch_dataset_cache_img_memory":
             # check available memory. If insufficient memory, default to disk caching.
+            # Account for DataLoader worker memory overhead
+            train_num_workers = self.config.trainer_config.train_data_loader.num_workers
+            val_num_workers = self.config.trainer_config.val_data_loader.num_workers
+            max_num_workers = max(train_num_workers, val_num_workers)
+
             mem_available = check_cache_memory(
-                self.train_labels, self.val_labels, memory_buffer=MEMORY_BUFFER
+                self.train_labels,
+                self.val_labels,
+                memory_buffer=MEMORY_BUFFER,
+                num_workers=max_num_workers,
             )
             if not mem_available:
+                # Validate: multi-GPU + auto-generated run_name + fallback to disk cache
+                original_run_name = self._initial_config.trainer_config.run_name
+                run_name_was_auto = (
+                    original_run_name is None
+                    or original_run_name == ""
+                    or original_run_name == "None"
+                )
+                if run_name_was_auto and self.trainer.num_devices > 1:
+                    raise ValueError(
+                        f"Memory caching failed and disk caching fallback requires an "
+                        f"explicit `run_name` for multi-GPU training.\n\n"
+                        f"Detected {self.trainer.num_devices} device(s) with insufficient "
+                        f"memory for in-memory caching.\n"
+                        f"Without an explicit run_name, each GPU worker generates a different "
+                        f"timestamp-based directory, causing cache synchronization failures.\n\n"
+                        f"Please provide a run_name using one of these methods:\n"
+                        f" - CLI: sleap-nn train config.yaml trainer_config.run_name=my_experiment\n"
+                        f" - Config file: Set `trainer_config.run_name: my_experiment`\n"
+                        f" - Python API: train(..., run_name='my_experiment')\n\n"
+                        f"Alternatively, use `data_pipeline_fw='torch_dataset'` to disable caching."
+                    )
+
                 self.config.data_config.data_pipeline_fw = (
                     "torch_dataset_cache_img_disk"
                 )
@@ -655,7 +885,7 @@
                 / self.config.trainer_config.run_name
             ).as_posix(),
             filename="best",
-            monitor="val_loss",
+            monitor="val/loss",
             mode="min",
         )
         callbacks.append(checkpoint_callback)
@@ -663,18 +893,52 @@
         # csv log callback
         csv_log_keys = [
             "epoch",
-            "train_loss",
-            "val_loss",
+            "train/loss",
+            "val/loss",
             "learning_rate",
-            "train_time",
-            "val_time",
+            "train/time",
+            "val/time",
         ]
+        # Add model-specific keys for wandb parity
         if self.model_type in [
             "single_instance",
             "centered_instance",
             "multi_class_topdown",
         ]:
-            csv_log_keys.extend(self.skeletons[0].node_names)
+            csv_log_keys.extend(
+                [f"train/confmaps/{name}" for name in self.skeletons[0].node_names]
+            )
+        if self.model_type == "bottomup":
+            csv_log_keys.extend(
+                [
+                    "train/confmaps_loss",
+                    "train/paf_loss",
+                    "val/confmaps_loss",
+                    "val/paf_loss",
+                ]
+            )
+        if self.model_type == "multi_class_bottomup":
+            csv_log_keys.extend(
+                [
+                    "train/confmaps_loss",
+                    "train/classmap_loss",
+                    "train/class_accuracy",
+                    "val/confmaps_loss",
+                    "val/classmap_loss",
+                    "val/class_accuracy",
+                ]
+            )
+        if self.model_type == "multi_class_topdown":
+            csv_log_keys.extend(
+                [
+                    "train/confmaps_loss",
+                    "train/classvector_loss",
+                    "train/class_accuracy",
+                    "val/confmaps_loss",
+                    "val/classvector_loss",
+                    "val/class_accuracy",
+                ]
+            )
         csv_logger = CSVLoggerCallback(
             filepath=Path(self.config.trainer_config.ckpt_dir)
             / self.config.trainer_config.run_name
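The CSV logger now uses the same slash-namespaced metric names as wandb (`train/loss`, `val/confmaps_loss`, `train/confmaps/<node>`, ...), so the columns can be consumed directly. A sketch of reading them back with pandas; the CSV filename is cut off in this hunk, so the name below is a placeholder:

    import pandas as pd

    log = pd.read_csv("models/my_experiment/training_log.csv")  # placeholder path/filename
    print(log[["epoch", "train/loss", "val/loss", "learning_rate"]].tail())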
@@ -687,7 +951,7 @@
         # early stopping callback
         callbacks.append(
             EarlyStopping(
-                monitor="val_loss",
+                monitor="val/loss",
                 mode="min",
                 verbose=False,
                 min_delta=self.config.trainer_config.early_stopping.min_delta,
@@ -716,6 +980,17 @@
             )
             loggers.append(wandb_logger)
 
+            # Log message about wandb local logs cleanup
+            should_delete_wandb_logs = wandb_config.delete_local_logs is True or (
+                wandb_config.delete_local_logs is None
+                and wandb_config.wandb_mode != "offline"
+            )
+            if should_delete_wandb_logs:
+                logger.info(
+                    "WandB local logs will be deleted after training completes. "
+                    "To keep logs, set trainer_config.wandb.delete_local_logs=false"
+                )
+
         # save the configs as yaml in the checkpoint dir
         # Mask API key in both configs to prevent saving to disk
         self.config.trainer_config.wandb.api_key = ""
@@ -734,11 +1009,8 @@
             )
             callbacks.append(ProgressReporterZMQ(address=publish_address))
 
-        # viz callbacks
+        # viz callbacks - use unified callback for all visualization outputs
         if self.config.trainer_config.visualize_preds_during_training:
-            train_viz_pipeline = cycle(viz_train_dataset)
-            val_viz_pipeline = cycle(viz_val_dataset)
-
             viz_dir = (
                 Path(self.config.trainer_config.ckpt_dir)
                 / self.config.trainer_config.run_name
@@ -748,77 +1020,74 @@
             if RANK in [0, -1]:
                 Path(viz_dir).mkdir(parents=True, exist_ok=True)
 
-            callbacks.append(
-                MatplotlibSaver(
-                    save_folder=viz_dir,
-                    plot_fn=lambda: self.lightning_model.visualize_example(
-                        next(train_viz_pipeline)
-                    ),
-                    prefix="train",
-                )
+            # Get wandb viz config options
+            log_wandb = self.config.trainer_config.use_wandb and OmegaConf.select(
+                self.config, "trainer_config.wandb.save_viz_imgs_wandb", default=False
             )
+            wandb_modes = []
+            if log_wandb:
+                if OmegaConf.select(
+                    self.config, "trainer_config.wandb.viz_enabled", default=True
+                ):
+                    wandb_modes.append("direct")
+                if OmegaConf.select(
+                    self.config, "trainer_config.wandb.viz_boxes", default=False
+                ):
+                    wandb_modes.append("boxes")
+                if OmegaConf.select(
+                    self.config, "trainer_config.wandb.viz_masks", default=False
+                ):
+                    wandb_modes.append("masks")
+
+            # Single unified callback handles all visualization outputs
             callbacks.append(
-                MatplotlibSaver(
-                    save_folder=viz_dir,
-                    plot_fn=lambda: self.lightning_model.visualize_example(
-                        next(val_viz_pipeline)
+                UnifiedVizCallback(
+                    model_trainer=self,
+                    train_dataset=viz_train_dataset,
+                    val_dataset=viz_val_dataset,
+                    model_type=self.model_type,
+                    save_local=self.config.trainer_config.save_ckpt,
+                    local_save_dir=viz_dir,
+                    log_wandb=log_wandb,
+                    wandb_modes=wandb_modes if wandb_modes else ["direct"],
+                    wandb_box_size=OmegaConf.select(
+                        self.config, "trainer_config.wandb.viz_box_size", default=5.0
+                    ),
+                    wandb_confmap_threshold=OmegaConf.select(
+                        self.config,
+                        "trainer_config.wandb.viz_confmap_threshold",
+                        default=0.1,
+                    ),
+                    log_wandb_table=OmegaConf.select(
+                        self.config, "trainer_config.wandb.log_viz_table", default=False
                     ),
-                    prefix="validation",
                 )
             )
 
-            if self.model_type == "bottomup":
-                train_viz_pipeline1 = cycle(copy.deepcopy(viz_train_dataset))
-                val_viz_pipeline1 = cycle(copy.deepcopy(viz_val_dataset))
-                callbacks.append(
-                    MatplotlibSaver(
-                        save_folder=viz_dir,
-                        plot_fn=lambda: self.lightning_model.visualize_pafs_example(
-                            next(train_viz_pipeline1)
-                        ),
-                        prefix="train.pafs_magnitude",
-                    )
-                )
-                callbacks.append(
-                    MatplotlibSaver(
-                        save_folder=viz_dir,
-                        plot_fn=lambda: self.lightning_model.visualize_pafs_example(
-                            next(val_viz_pipeline1)
-                        ),
-                        prefix="validation.pafs_magnitude",
-                    )
-                )
+        # Add custom progress bar with better metric formatting
+        if self.config.trainer_config.enable_progress_bar:
+            callbacks.append(SleapProgressBar())
 
-            if self.model_type == "multi_class_bottomup":
-                train_viz_pipeline1 = cycle(copy.deepcopy(viz_train_dataset))
-                val_viz_pipeline1 = cycle(copy.deepcopy(viz_val_dataset))
+        # Add epoch-end evaluation callback if enabled
+        if self.config.trainer_config.eval.enabled:
+            if self.model_type == "centroid":
+                # Use centroid-specific evaluation with distance-based metrics
                 callbacks.append(
-                    MatplotlibSaver(
-                        save_folder=viz_dir,
-                        plot_fn=lambda: self.lightning_model.visualize_class_maps_example(
-                            next(train_viz_pipeline1)
-                        ),
-                        prefix="train.class_maps",
+                    CentroidEvaluationCallback(
+                        videos=self.val_labels[0].videos,
+                        eval_frequency=self.config.trainer_config.eval.frequency,
+                        match_threshold=self.config.trainer_config.eval.match_threshold,
                     )
                 )
+            else:
+                # Use standard OKS/PCK evaluation for pose models
                 callbacks.append(
-                    MatplotlibSaver(
-                        save_folder=viz_dir,
-                        plot_fn=lambda: self.lightning_model.visualize_class_maps_example(
-                            next(val_viz_pipeline1)
-                        ),
-                        prefix="validation.class_maps",
-                    )
-                )
-
-            if self.config.trainer_config.use_wandb and OmegaConf.select(
-                self.config, "trainer_config.wandb.save_viz_imgs_wandb", default=False
-            ):
-                callbacks.append(
-                    WandBPredImageLogger(
-                        viz_folder=viz_dir,
-                        wandb_run_name=self.config.trainer_config.wandb.name,
-                        is_bottomup=(self.model_type == "bottomup"),
+                    EpochEndEvaluationCallback(
+                        skeleton=self.skeletons[0],
+                        videos=self.val_labels[0].videos,
+                        eval_frequency=self.config.trainer_config.eval.frequency,
+                        oks_stddev=self.config.trainer_config.eval.oks_stddev,
+                        oks_scale=self.config.trainer_config.eval.oks_scale,
                     )
                 )
 
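All of the per-model MatplotlibSaver/WandBPredImageLogger wiring collapses into a single UnifiedVizCallback, and epoch-end evaluation becomes opt-in; both are driven by config keys read via `OmegaConf.select` above. A hedged override sketch (only keys visible in this hunk are listed; values other than the defaults shown above are assumptions):

    from omegaconf import OmegaConf

    viz_eval_overrides = OmegaConf.create(
        {
            "trainer_config": {
                "visualize_preds_during_training": True,
                "wandb": {
                    "save_viz_imgs_wandb": True,  # read into log_wandb above
                    "viz_enabled": True,          # adds the "direct" wandb mode
                    "viz_boxes": False,
                    "viz_masks": False,
                    "viz_box_size": 5.0,
                    "viz_confmap_threshold": 0.1,
                    "log_viz_table": False,
                },
                "eval": {"enabled": True},  # frequency / oks_* / match_threshold keys also exist per this hunk
            }
        }
    )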
@@ -910,6 +1179,11 @@
                     : self.config.trainer_config.trainer_devices
                 ]
             ]
+            # Sort device indices in ascending order for NCCL compatibility.
+            # NCCL expects devices in consistent ascending order across ranks
+            # to properly set up communication rings. Without sorting, DDP may
+            # assign multiple ranks to the same GPU, causing "Duplicate GPU detected" errors.
+            devices.sort()
             logger.info(f"Using GPUs with most available memory: {devices}")
 
         # create lightning.Trainer instance.
@@ -931,6 +1205,10 @@
         # setup datasets
         train_dataset, val_dataset = self._setup_datasets()
 
+        # Barrier after dataset creation to ensure all workers wait for disk caching
+        # (rank 0 caches to disk, others must wait before reading cached files)
+        self.trainer.strategy.barrier()
+
         # set-up steps per epoch
         train_steps_per_epoch = self.config.trainer_config.train_steps_per_epoch
         if train_steps_per_epoch is None:
@@ -959,7 +1237,7 @@
         logger.info(f"Backbone model: {self.lightning_model.model.backbone}")
         logger.info(f"Head model: {self.lightning_model.model.head_layers}")
         total_params = sum(p.numel() for p in self.lightning_model.parameters())
-        logger.info(f"Total model parameters: {total_params}")
+        logger.info(f"Total model parameters: {total_params:,}")
         self.config.model_config.total_params = total_params
 
         # setup dataloaders
@@ -1000,6 +1278,26 @@
                 id=self.config.trainer_config.wandb.prv_runid,
                 group=self.config.trainer_config.wandb.group,
             )
+
+            # Define custom x-axes for wandb metrics
+            # Epoch-level metrics use epoch as x-axis, step-level use default global_step
+            wandb.define_metric("epoch")
+
+            # Training metrics (train/ prefix for grouping) - all use epoch x-axis
+            wandb.define_metric("train/*", step_metric="epoch")
+            wandb.define_metric("train/confmaps/*", step_metric="epoch")
+
+            # Validation metrics (val/ prefix for grouping)
+            wandb.define_metric("val/*", step_metric="epoch")
+
+            # Evaluation metrics (eval/ prefix for grouping)
+            wandb.define_metric("eval/*", step_metric="epoch")
+
+            # Visualization images (need explicit nested paths)
+            wandb.define_metric("viz/*", step_metric="epoch")
+            wandb.define_metric("viz/train/*", step_metric="epoch")
+            wandb.define_metric("viz/val/*", step_metric="epoch")
+
             self.config.trainer_config.wandb.current_run_id = wandb.run.id
             wandb.config["run_name"] = self.config.trainer_config.wandb.name
             wandb.config["run_config"] = OmegaConf.to_container(
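`wandb.define_metric` only tells the wandb UI which x-axis to use; the epoch-level series still need an `epoch` value logged alongside them for the mapping to apply. A minimal self-contained sketch of the mechanism (offline mode and placeholder values, independent of the trainer):

    import wandb

    run = wandb.init(mode="offline")  # offline so this runs without a wandb account
    wandb.define_metric("epoch")
    wandb.define_metric("train/*", step_metric="epoch")
    wandb.log({"epoch": 0, "train/loss": 0.12})  # placeholder values
    run.finish()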
@@ -1017,6 +1315,9 @@
 
         self.trainer.strategy.barrier()
 
+        # Flag to track if training was interrupted (not completed normally)
+        training_interrupted = False
+
         try:
             logger.info(
                 f"Finished trainer set up. [{time.time() - start_setup_time:.1f}s]"
@@ -1032,13 +1333,13 @@
 
         except KeyboardInterrupt:
             logger.info("Stopping training...")
+            training_interrupted = True
 
         finally:
             logger.info(
                 f"Finished training loop. [{(time.time() - start_train_time) / 60:.1f} min]"
             )
-            if self.trainer.global_rank == 0 and self.config.trainer_config.use_wandb:
-                wandb.finish()
+            # Note: wandb.finish() is called in train.py after post-training evaluation
 
             # delete image disk caching
             if (
@@ -1063,3 +1364,15 @@
                 if viz_dir.exists():
                     logger.info(f"Deleting viz folder at {viz_dir}...")
                     shutil.rmtree(viz_dir, ignore_errors=True)
+
+            # Clean up entire run folder if training was interrupted (KeyboardInterrupt)
+            if training_interrupted and self.trainer.global_rank == 0:
+                run_dir = (
+                    Path(self.config.trainer_config.ckpt_dir)
+                    / self.config.trainer_config.run_name
+                )
+                if run_dir.exists():
+                    logger.info(
+                        f"Training canceled - cleaning up run folder at {run_dir}..."
+                    )
+                    shutil.rmtree(run_dir, ignore_errors=True)