sleap-nn 0.1.0__py3-none-any.whl → 0.1.0a1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (53)
  1. sleap_nn/__init__.py +1 -1
  2. sleap_nn/architectures/convnext.py +0 -5
  3. sleap_nn/architectures/encoder_decoder.py +6 -25
  4. sleap_nn/architectures/swint.py +0 -8
  5. sleap_nn/cli.py +60 -364
  6. sleap_nn/config/data_config.py +5 -11
  7. sleap_nn/config/get_config.py +4 -5
  8. sleap_nn/config/trainer_config.py +0 -71
  9. sleap_nn/data/augmentation.py +241 -50
  10. sleap_nn/data/custom_datasets.py +34 -364
  11. sleap_nn/data/instance_cropping.py +1 -1
  12. sleap_nn/data/resizing.py +2 -2
  13. sleap_nn/data/utils.py +17 -135
  14. sleap_nn/evaluation.py +22 -81
  15. sleap_nn/inference/bottomup.py +20 -86
  16. sleap_nn/inference/peak_finding.py +19 -88
  17. sleap_nn/inference/predictors.py +117 -224
  18. sleap_nn/legacy_models.py +11 -65
  19. sleap_nn/predict.py +9 -37
  20. sleap_nn/train.py +4 -69
  21. sleap_nn/training/callbacks.py +105 -1046
  22. sleap_nn/training/lightning_modules.py +65 -602
  23. sleap_nn/training/model_trainer.py +204 -201
  24. {sleap_nn-0.1.0.dist-info → sleap_nn-0.1.0a1.dist-info}/METADATA +3 -15
  25. sleap_nn-0.1.0a1.dist-info/RECORD +65 -0
  26. {sleap_nn-0.1.0.dist-info → sleap_nn-0.1.0a1.dist-info}/WHEEL +1 -1
  27. sleap_nn/data/skia_augmentation.py +0 -414
  28. sleap_nn/export/__init__.py +0 -21
  29. sleap_nn/export/cli.py +0 -1778
  30. sleap_nn/export/exporters/__init__.py +0 -51
  31. sleap_nn/export/exporters/onnx_exporter.py +0 -80
  32. sleap_nn/export/exporters/tensorrt_exporter.py +0 -291
  33. sleap_nn/export/metadata.py +0 -225
  34. sleap_nn/export/predictors/__init__.py +0 -63
  35. sleap_nn/export/predictors/base.py +0 -22
  36. sleap_nn/export/predictors/onnx.py +0 -154
  37. sleap_nn/export/predictors/tensorrt.py +0 -312
  38. sleap_nn/export/utils.py +0 -307
  39. sleap_nn/export/wrappers/__init__.py +0 -25
  40. sleap_nn/export/wrappers/base.py +0 -96
  41. sleap_nn/export/wrappers/bottomup.py +0 -243
  42. sleap_nn/export/wrappers/bottomup_multiclass.py +0 -195
  43. sleap_nn/export/wrappers/centered_instance.py +0 -56
  44. sleap_nn/export/wrappers/centroid.py +0 -58
  45. sleap_nn/export/wrappers/single_instance.py +0 -83
  46. sleap_nn/export/wrappers/topdown.py +0 -180
  47. sleap_nn/export/wrappers/topdown_multiclass.py +0 -304
  48. sleap_nn/inference/postprocessing.py +0 -284
  49. sleap_nn/training/schedulers.py +0 -191
  50. sleap_nn-0.1.0.dist-info/RECORD +0 -88
  51. {sleap_nn-0.1.0.dist-info → sleap_nn-0.1.0a1.dist-info}/entry_points.txt +0 -0
  52. {sleap_nn-0.1.0.dist-info → sleap_nn-0.1.0a1.dist-info}/licenses/LICENSE +0 -0
  53. {sleap_nn-0.1.0.dist-info → sleap_nn-0.1.0a1.dist-info}/top_level.txt +0 -0
sleap_nn/training/model_trainer.py
@@ -2,6 +2,7 @@
 
 import os
 import shutil
+import copy
 import attrs
 import torch
 import random
@@ -15,13 +16,14 @@ import yaml
 from pathlib import Path
 from typing import List, Optional
 from datetime import datetime
-from itertools import count
+from itertools import cycle, count
 from omegaconf import DictConfig, OmegaConf
 from lightning.pytorch.loggers import WandbLogger
 from sleap_nn.data.utils import check_cache_memory
 from lightning.pytorch.callbacks import (
     ModelCheckpoint,
     EarlyStopping,
+    LearningRateMonitor,
 )
 from lightning.pytorch.profilers import (
     SimpleProfiler,
@@ -53,11 +55,12 @@ from sleap_nn.config.training_job_config import verify_training_cfg
 from sleap_nn.training.callbacks import (
     ProgressReporterZMQ,
     TrainingControllerZMQ,
+    MatplotlibSaver,
+    WandBPredImageLogger,
+    WandBVizCallback,
+    WandBVizCallbackWithPAFs,
     CSVLoggerCallback,
     SleapProgressBar,
-    EpochEndEvaluationCallback,
-    CentroidEvaluationCallback,
-    UnifiedVizCallback,
 )
 from sleap_nn import RANK
 from sleap_nn.legacy_models import get_keras_first_layer_channels
@@ -487,36 +490,16 @@ class ModelTrainer:
         ckpt_dir = "."
         self.config.trainer_config.ckpt_dir = ckpt_dir
         run_name = self.config.trainer_config.run_name
-        run_name_is_empty = run_name is None or run_name == "" or run_name == "None"
-
-        # Validate: multi-GPU + disk cache requires explicit run_name
-        if run_name_is_empty:
-            is_disk_caching = (
-                self.config.data_config.data_pipeline_fw
-                == "torch_dataset_cache_img_disk"
-            )
-            num_devices = self._get_trainer_devices()
-
-            if is_disk_caching and num_devices > 1:
-                raise ValueError(
-                    f"Multi-GPU training with disk caching requires an explicit `run_name`.\n\n"
-                    f"Detected {num_devices} device(s) with "
-                    f"`data_pipeline_fw='torch_dataset_cache_img_disk'`.\n"
-                    f"Without an explicit run_name, each GPU worker generates a different "
-                    f"timestamp-based directory, causing cache synchronization failures.\n\n"
-                    f"Please provide a run_name using one of these methods:\n"
-                    f" - CLI: sleap-nn train config.yaml trainer_config.run_name=my_experiment\n"
-                    f" - Config file: Set `trainer_config.run_name: my_experiment`\n"
-                    f" - Python API: train(..., run_name='my_experiment')"
-                )
-
-            # Auto-generate timestamp-based run_name (safe for single GPU or non-disk-cache)
+        if run_name is None or run_name == "" or run_name == "None":
             sum_train_lfs = sum([len(train_label) for train_label in self.train_labels])
             sum_val_lfs = sum([len(val_label) for val_label in self.val_labels])
-            run_name = (
-                datetime.now().strftime("%y%m%d_%H%M%S")
-                + f".{self.model_type}.n={sum_train_lfs + sum_val_lfs}"
-            )
+            if self._get_trainer_devices() > 1:
+                run_name = f"{self.model_type}.n={sum_train_lfs + sum_val_lfs}"
+            else:
+                run_name = (
+                    datetime.now().strftime("%y%m%d_%H%M%S")
+                    + f".{self.model_type}.n={sum_train_lfs + sum_val_lfs}"
+                )
 
         # If checkpoint path already exists, add suffix to prevent overwriting
         if (Path(ckpt_dir) / run_name).exists() and (
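
For reference, the auto-generated run name on the 0.1.0a1 side combines a timestamp with the model type and the number of labeled frames (the timestamp is dropped in the multi-device case). A minimal sketch of that naming scheme; the model type and frame count below are illustrative placeholders, not values from the diff:

    from datetime import datetime

    model_type = "centroid"   # hypothetical model type
    n_frames = 1200           # hypothetical train + val labeled-frame count

    # Single-device case, e.g. "250611_142530.centroid.n=1200"
    single_device = datetime.now().strftime("%y%m%d_%H%M%S") + f".{model_type}.n={n_frames}"
    # Multi-device case, e.g. "centroid.n=1200"
    multi_device = f"{model_type}.n={n_frames}"
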
@@ -659,6 +642,10 @@ class ModelTrainer:
         if self.config.trainer_config.wandb.prv_runid == "":
             self.config.trainer_config.wandb.prv_runid = None
 
+        # Default wandb run name to trainer run_name if not specified
+        if self.config.trainer_config.wandb.name is None:
+            self.config.trainer_config.wandb.name = self.config.trainer_config.run_name
+
         # compute preprocessing parameters from the labels objects and fill in the config
         self._setup_preprocessing_config()
 
@@ -708,14 +695,9 @@ class ModelTrainer:
             )
         )
 
-        # setup checkpoint path (generates run_name if not specified)
+        # setup checkpoint path
         self._setup_ckpt_path()
 
-        # Default wandb run name to trainer run_name if not specified
-        # Note: This must come after _setup_ckpt_path() which generates run_name
-        if self.config.trainer_config.wandb.name is None:
-            self.config.trainer_config.wandb.name = self.config.trainer_config.run_name
-
         # verify input_channels in model_config based on input image and pretrained model weights
         self._verify_model_input_channels()
 
@@ -727,15 +709,15 @@ class ModelTrainer:
         ).as_posix()
         logger.info(f"Setting up model ckpt dir: `{ckpt_path}`...")
 
-        # Only rank 0 (or non-distributed) should create directories and save files
+        if not Path(ckpt_path).exists():
+            try:
+                Path(ckpt_path).mkdir(parents=True, exist_ok=True)
+            except OSError as e:
+                message = f"Cannot create a new folder in {ckpt_path}.\n {e}"
+                logger.error(message)
+                raise OSError(message)
+
         if RANK in [0, -1]:
-            if not Path(ckpt_path).exists():
-                try:
-                    Path(ckpt_path).mkdir(parents=True, exist_ok=True)
-                except OSError as e:
-                    message = f"Cannot create a new folder in {ckpt_path}.\n {e}"
-                    logger.error(message)
-                    raise OSError(message)
             # Check if we should filter to user-labeled frames only
             user_instances_only = OmegaConf.select(
                 self.config, "data_config.user_instances_only", default=True
@@ -808,40 +790,10 @@ class ModelTrainer:
         base_cache_img_path = None
         if self.config.data_config.data_pipeline_fw == "torch_dataset_cache_img_memory":
             # check available memory. If insufficient memory, default to disk caching.
-            # Account for DataLoader worker memory overhead
-            train_num_workers = self.config.trainer_config.train_data_loader.num_workers
-            val_num_workers = self.config.trainer_config.val_data_loader.num_workers
-            max_num_workers = max(train_num_workers, val_num_workers)
-
             mem_available = check_cache_memory(
-                self.train_labels,
-                self.val_labels,
-                memory_buffer=MEMORY_BUFFER,
-                num_workers=max_num_workers,
+                self.train_labels, self.val_labels, memory_buffer=MEMORY_BUFFER
             )
             if not mem_available:
-                # Validate: multi-GPU + auto-generated run_name + fallback to disk cache
-                original_run_name = self._initial_config.trainer_config.run_name
-                run_name_was_auto = (
-                    original_run_name is None
-                    or original_run_name == ""
-                    or original_run_name == "None"
-                )
-                if run_name_was_auto and self.trainer.num_devices > 1:
-                    raise ValueError(
-                        f"Memory caching failed and disk caching fallback requires an "
-                        f"explicit `run_name` for multi-GPU training.\n\n"
-                        f"Detected {self.trainer.num_devices} device(s) with insufficient "
-                        f"memory for in-memory caching.\n"
-                        f"Without an explicit run_name, each GPU worker generates a different "
-                        f"timestamp-based directory, causing cache synchronization failures.\n\n"
-                        f"Please provide a run_name using one of these methods:\n"
-                        f" - CLI: sleap-nn train config.yaml trainer_config.run_name=my_experiment\n"
-                        f" - Config file: Set `trainer_config.run_name: my_experiment`\n"
-                        f" - Python API: train(..., run_name='my_experiment')\n\n"
-                        f"Alternatively, use `data_pipeline_fw='torch_dataset'` to disable caching."
-                    )
-
                 self.config.data_config.data_pipeline_fw = (
                     "torch_dataset_cache_img_disk"
                 )
@@ -885,7 +837,7 @@ class ModelTrainer:
                     / self.config.trainer_config.run_name
                 ).as_posix(),
                 filename="best",
-                monitor="val/loss",
+                monitor="val_loss",
                 mode="min",
             )
             callbacks.append(checkpoint_callback)
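
Both sides monitor a validation loss, but the key changes from "val/loss" to "val_loss"; in Lightning the `monitor` argument of ModelCheckpoint (and EarlyStopping, changed the same way in a later hunk) has to match a metric name logged by the LightningModule. A minimal sketch, with an illustrative module that is not taken from sleap-nn:

    import lightning.pytorch as pl
    from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping

    class LitModule(pl.LightningModule):
        def validation_step(self, batch, batch_idx):
            loss = self._loss(batch)    # hypothetical loss helper
            self.log("val_loss", loss)  # key must match `monitor` below

    checkpoint = ModelCheckpoint(filename="best", monitor="val_loss", mode="min")
    early_stop = EarlyStopping(monitor="val_loss", mode="min")
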
@@ -893,52 +845,18 @@ class ModelTrainer:
         # csv log callback
         csv_log_keys = [
             "epoch",
-            "train/loss",
-            "val/loss",
+            "train_loss",
+            "val_loss",
             "learning_rate",
-            "train/time",
-            "val/time",
+            "train_time",
+            "val_time",
         ]
-        # Add model-specific keys for wandb parity
         if self.model_type in [
             "single_instance",
             "centered_instance",
             "multi_class_topdown",
         ]:
-            csv_log_keys.extend(
-                [f"train/confmaps/{name}" for name in self.skeletons[0].node_names]
-            )
-        if self.model_type == "bottomup":
-            csv_log_keys.extend(
-                [
-                    "train/confmaps_loss",
-                    "train/paf_loss",
-                    "val/confmaps_loss",
-                    "val/paf_loss",
-                ]
-            )
-        if self.model_type == "multi_class_bottomup":
-            csv_log_keys.extend(
-                [
-                    "train/confmaps_loss",
-                    "train/classmap_loss",
-                    "train/class_accuracy",
-                    "val/confmaps_loss",
-                    "val/classmap_loss",
-                    "val/class_accuracy",
-                ]
-            )
-        if self.model_type == "multi_class_topdown":
-            csv_log_keys.extend(
-                [
-                    "train/confmaps_loss",
-                    "train/classvector_loss",
-                    "train/class_accuracy",
-                    "val/confmaps_loss",
-                    "val/classvector_loss",
-                    "val/class_accuracy",
-                ]
-            )
+            csv_log_keys.extend(self.skeletons[0].node_names)
         csv_logger = CSVLoggerCallback(
             filepath=Path(self.config.trainer_config.ckpt_dir)
             / self.config.trainer_config.run_name
@@ -951,7 +869,7 @@ class ModelTrainer:
         # early stopping callback
         callbacks.append(
             EarlyStopping(
-                monitor="val/loss",
+                monitor="val_loss",
                 mode="min",
                 verbose=False,
                 min_delta=self.config.trainer_config.early_stopping.min_delta,
@@ -991,6 +909,10 @@ class ModelTrainer:
                     "To keep logs, set trainer_config.wandb.delete_local_logs=false"
                 )
 
+            # Learning rate monitor callback - logs LR at each step for dynamic schedulers
+            # Only added when wandb is enabled since it requires a logger
+            callbacks.append(LearningRateMonitor(logging_interval="step"))
+
         # save the configs as yaml in the checkpoint dir
         # Mask API key in both configs to prevent saving to disk
         self.config.trainer_config.wandb.api_key = ""
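
LearningRateMonitor is a stock Lightning callback: it reads the current learning rate from the optimizer and records it through whatever logger is attached to the Trainer, which is why the comment above notes it is only added when wandb logging is on. A minimal usage sketch; the project name is a placeholder:

    from lightning.pytorch import Trainer
    from lightning.pytorch.callbacks import LearningRateMonitor
    from lightning.pytorch.loggers import WandbLogger

    trainer = Trainer(
        callbacks=[LearningRateMonitor(logging_interval="step")],  # log LR every step
        logger=WandbLogger(project="my-project"),  # a logger must be attached
    )
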
@@ -1009,8 +931,11 @@ class ModelTrainer:
             )
             callbacks.append(ProgressReporterZMQ(address=publish_address))
 
-        # viz callbacks - use unified callback for all visualization outputs
+        # viz callbacks
         if self.config.trainer_config.visualize_preds_during_training:
+            train_viz_pipeline = cycle(viz_train_dataset)
+            val_viz_pipeline = cycle(viz_val_dataset)
+
             viz_dir = (
                 Path(self.config.trainer_config.ckpt_dir)
                 / self.config.trainer_config.run_name
@@ -1020,77 +945,147 @@ class ModelTrainer:
             if RANK in [0, -1]:
                 Path(viz_dir).mkdir(parents=True, exist_ok=True)
 
-            # Get wandb viz config options
-            log_wandb = self.config.trainer_config.use_wandb and OmegaConf.select(
-                self.config, "trainer_config.wandb.save_viz_imgs_wandb", default=False
-            )
-            wandb_modes = []
-            if log_wandb:
-                if OmegaConf.select(
-                    self.config, "trainer_config.wandb.viz_enabled", default=True
-                ):
-                    wandb_modes.append("direct")
-                if OmegaConf.select(
-                    self.config, "trainer_config.wandb.viz_boxes", default=False
-                ):
-                    wandb_modes.append("boxes")
-                if OmegaConf.select(
-                    self.config, "trainer_config.wandb.viz_masks", default=False
-                ):
-                    wandb_modes.append("masks")
-
-            # Single unified callback handles all visualization outputs
             callbacks.append(
-                UnifiedVizCallback(
-                    model_trainer=self,
-                    train_dataset=viz_train_dataset,
-                    val_dataset=viz_val_dataset,
-                    model_type=self.model_type,
-                    save_local=self.config.trainer_config.save_ckpt,
-                    local_save_dir=viz_dir,
-                    log_wandb=log_wandb,
-                    wandb_modes=wandb_modes if wandb_modes else ["direct"],
-                    wandb_box_size=OmegaConf.select(
-                        self.config, "trainer_config.wandb.viz_box_size", default=5.0
+                MatplotlibSaver(
+                    save_folder=viz_dir,
+                    plot_fn=lambda: self.lightning_model.visualize_example(
+                        next(train_viz_pipeline)
                     ),
-                    wandb_confmap_threshold=OmegaConf.select(
-                        self.config,
-                        "trainer_config.wandb.viz_confmap_threshold",
-                        default=0.1,
-                    ),
-                    log_wandb_table=OmegaConf.select(
-                        self.config, "trainer_config.wandb.log_viz_table", default=False
+                    prefix="train",
+                )
+            )
+            callbacks.append(
+                MatplotlibSaver(
+                    save_folder=viz_dir,
+                    plot_fn=lambda: self.lightning_model.visualize_example(
+                        next(val_viz_pipeline)
                     ),
+                    prefix="validation",
                 )
             )
 
-        # Add custom progress bar with better metric formatting
-        if self.config.trainer_config.enable_progress_bar:
-            callbacks.append(SleapProgressBar())
+            if self.model_type == "bottomup":
+                train_viz_pipeline1 = cycle(copy.deepcopy(viz_train_dataset))
+                val_viz_pipeline1 = cycle(copy.deepcopy(viz_val_dataset))
+                callbacks.append(
+                    MatplotlibSaver(
+                        save_folder=viz_dir,
+                        plot_fn=lambda: self.lightning_model.visualize_pafs_example(
+                            next(train_viz_pipeline1)
+                        ),
+                        prefix="train.pafs_magnitude",
+                    )
+                )
+                callbacks.append(
+                    MatplotlibSaver(
+                        save_folder=viz_dir,
+                        plot_fn=lambda: self.lightning_model.visualize_pafs_example(
+                            next(val_viz_pipeline1)
+                        ),
+                        prefix="validation.pafs_magnitude",
+                    )
+                )
 
-        # Add epoch-end evaluation callback if enabled
-        if self.config.trainer_config.eval.enabled:
-            if self.model_type == "centroid":
-                # Use centroid-specific evaluation with distance-based metrics
+            if self.model_type == "multi_class_bottomup":
+                train_viz_pipeline1 = cycle(copy.deepcopy(viz_train_dataset))
+                val_viz_pipeline1 = cycle(copy.deepcopy(viz_val_dataset))
                 callbacks.append(
-                    CentroidEvaluationCallback(
-                        videos=self.val_labels[0].videos,
-                        eval_frequency=self.config.trainer_config.eval.frequency,
-                        match_threshold=self.config.trainer_config.eval.match_threshold,
+                    MatplotlibSaver(
+                        save_folder=viz_dir,
+                        plot_fn=lambda: self.lightning_model.visualize_class_maps_example(
+                            next(train_viz_pipeline1)
+                        ),
+                        prefix="train.class_maps",
                     )
                 )
-            else:
-                # Use standard OKS/PCK evaluation for pose models
                 callbacks.append(
-                    EpochEndEvaluationCallback(
-                        skeleton=self.skeletons[0],
-                        videos=self.val_labels[0].videos,
-                        eval_frequency=self.config.trainer_config.eval.frequency,
-                        oks_stddev=self.config.trainer_config.eval.oks_stddev,
-                        oks_scale=self.config.trainer_config.eval.oks_scale,
+                    MatplotlibSaver(
+                        save_folder=viz_dir,
+                        plot_fn=lambda: self.lightning_model.visualize_class_maps_example(
+                            next(val_viz_pipeline1)
+                        ),
+                        prefix="validation.class_maps",
                     )
                 )
 
+            if self.config.trainer_config.use_wandb and OmegaConf.select(
+                self.config, "trainer_config.wandb.save_viz_imgs_wandb", default=False
+            ):
+                # Get wandb viz config options
+                viz_enabled = OmegaConf.select(
+                    self.config, "trainer_config.wandb.viz_enabled", default=True
+                )
+                viz_boxes = OmegaConf.select(
+                    self.config, "trainer_config.wandb.viz_boxes", default=False
+                )
+                viz_masks = OmegaConf.select(
+                    self.config, "trainer_config.wandb.viz_masks", default=False
+                )
+                viz_box_size = OmegaConf.select(
+                    self.config, "trainer_config.wandb.viz_box_size", default=5.0
+                )
+                viz_confmap_threshold = OmegaConf.select(
+                    self.config,
+                    "trainer_config.wandb.viz_confmap_threshold",
+                    default=0.1,
+                )
+                log_viz_table = OmegaConf.select(
+                    self.config, "trainer_config.wandb.log_viz_table", default=False
+                )
+
+                # Create viz data pipelines for wandb callback
+                wandb_train_viz_pipeline = cycle(copy.deepcopy(viz_train_dataset))
+                wandb_val_viz_pipeline = cycle(copy.deepcopy(viz_val_dataset))
+
+                if self.model_type == "bottomup":
+                    # Bottom-up model needs PAF visualizations
+                    wandb_train_pafs_pipeline = cycle(copy.deepcopy(viz_train_dataset))
+                    wandb_val_pafs_pipeline = cycle(copy.deepcopy(viz_val_dataset))
+                    callbacks.append(
+                        WandBVizCallbackWithPAFs(
+                            train_viz_fn=lambda: self.lightning_model.get_visualization_data(
+                                next(wandb_train_viz_pipeline)
+                            ),
+                            val_viz_fn=lambda: self.lightning_model.get_visualization_data(
+                                next(wandb_val_viz_pipeline)
+                            ),
+                            train_pafs_viz_fn=lambda: self.lightning_model.get_visualization_data(
+                                next(wandb_train_pafs_pipeline), include_pafs=True
+                            ),
+                            val_pafs_viz_fn=lambda: self.lightning_model.get_visualization_data(
+                                next(wandb_val_pafs_pipeline), include_pafs=True
+                            ),
+                            viz_enabled=viz_enabled,
+                            viz_boxes=viz_boxes,
+                            viz_masks=viz_masks,
+                            box_size=viz_box_size,
+                            confmap_threshold=viz_confmap_threshold,
+                            log_table=log_viz_table,
+                        )
+                    )
+                else:
+                    # Standard models
+                    callbacks.append(
+                        WandBVizCallback(
+                            train_viz_fn=lambda: self.lightning_model.get_visualization_data(
+                                next(wandb_train_viz_pipeline)
+                            ),
+                            val_viz_fn=lambda: self.lightning_model.get_visualization_data(
+                                next(wandb_val_viz_pipeline)
+                            ),
+                            viz_enabled=viz_enabled,
+                            viz_boxes=viz_boxes,
+                            viz_masks=viz_masks,
+                            box_size=viz_box_size,
+                            confmap_threshold=viz_confmap_threshold,
+                            log_table=log_viz_table,
+                        )
+                    )
+
+        # Add custom progress bar with better metric formatting
+        if self.config.trainer_config.enable_progress_bar:
+            callbacks.append(SleapProgressBar())
+
         return loggers, callbacks
 
     def _delete_cache_imgs(self):
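
The MatplotlibSaver callbacks above all use the same pattern: wrap the finite visualization dataset in itertools.cycle so the plot_fn lambda can keep calling next() every epoch, and deep-copy the dataset whenever a second callback needs its own independent iterator. A minimal sketch of that pattern with a stand-in dataset:

    import copy
    from itertools import cycle

    viz_dataset = [{"frame": 0}, {"frame": 1}]          # stand-in for the viz dataset

    confmaps_pipeline = cycle(viz_dataset)              # used by one callback
    pafs_pipeline = cycle(copy.deepcopy(viz_dataset))   # independent iterator state

    plot_confmaps = lambda: next(confmaps_pipeline)     # called once per epoch
    plot_pafs = lambda: next(pafs_pipeline)

    assert plot_confmaps()["frame"] == 0
    assert plot_confmaps()["frame"] == 1                # wraps back to 0 afterwards
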
@@ -1179,11 +1174,6 @@ class ModelTrainer:
                     : self.config.trainer_config.trainer_devices
                 ]
             ]
-            # Sort device indices in ascending order for NCCL compatibility.
-            # NCCL expects devices in consistent ascending order across ranks
-            # to properly set up communication rings. Without sorting, DDP may
-            # assign multiple ranks to the same GPU, causing "Duplicate GPU detected" errors.
-            devices.sort()
             logger.info(f"Using GPUs with most available memory: {devices}")
 
         # create lightning.Trainer instance.
@@ -1205,10 +1195,6 @@ class ModelTrainer:
         # setup datasets
         train_dataset, val_dataset = self._setup_datasets()
 
-        # Barrier after dataset creation to ensure all workers wait for disk caching
-        # (rank 0 caches to disk, others must wait before reading cached files)
-        self.trainer.strategy.barrier()
-
         # set-up steps per epoch
         train_steps_per_epoch = self.config.trainer_config.train_steps_per_epoch
         if train_steps_per_epoch is None:
@@ -1282,21 +1268,18 @@ class ModelTrainer:
             # Define custom x-axes for wandb metrics
             # Epoch-level metrics use epoch as x-axis, step-level use default global_step
             wandb.define_metric("epoch")
-
-            # Training metrics (train/ prefix for grouping) - all use epoch x-axis
-            wandb.define_metric("train/*", step_metric="epoch")
-            wandb.define_metric("train/confmaps/*", step_metric="epoch")
-
-            # Validation metrics (val/ prefix for grouping)
-            wandb.define_metric("val/*", step_metric="epoch")
-
-            # Evaluation metrics (eval/ prefix for grouping)
-            wandb.define_metric("eval/*", step_metric="epoch")
-
-            # Visualization images (need explicit nested paths)
-            wandb.define_metric("viz/*", step_metric="epoch")
-            wandb.define_metric("viz/train/*", step_metric="epoch")
-            wandb.define_metric("viz/val/*", step_metric="epoch")
+            wandb.define_metric("val_loss", step_metric="epoch")
+            wandb.define_metric("val_time", step_metric="epoch")
+            wandb.define_metric("train_time", step_metric="epoch")
+            # Per-node losses use epoch as x-axis
+            for node_name in self.skeletons[0].node_names:
+                wandb.define_metric(node_name, step_metric="epoch")
+
+            # Visualization images use epoch as x-axis
+            wandb.define_metric("train_predictions*", step_metric="epoch")
+            wandb.define_metric("val_predictions*", step_metric="epoch")
+            wandb.define_metric("train_pafs*", step_metric="epoch")
+            wandb.define_metric("val_pafs*", step_metric="epoch")
 
             self.config.trainer_config.wandb.current_run_id = wandb.run.id
             wandb.config["run_name"] = self.config.trainer_config.wandb.name
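
wandb.define_metric ties a logged key (or glob pattern) to a custom x-axis, so the epoch-level metrics above are plotted against "epoch" rather than the default global step. A minimal standalone sketch; the project name and values are placeholders, and offline mode is used only so the snippet runs without credentials:

    import wandb

    run = wandb.init(project="my-project", mode="offline")
    wandb.define_metric("epoch")
    wandb.define_metric("val_loss", step_metric="epoch")   # plot val_loss against epoch
    wandb.log({"epoch": 0, "val_loss": 0.42})
    run.finish()
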
@@ -1339,7 +1322,27 @@ class ModelTrainer:
         logger.info(
             f"Finished training loop. [{(time.time() - start_train_time) / 60:.1f} min]"
         )
-        # Note: wandb.finish() is called in train.py after post-training evaluation
+        if self.trainer.global_rank == 0 and self.config.trainer_config.use_wandb:
+            wandb.finish()
+
+            # Delete local wandb logs if configured
+            wandb_config = self.config.trainer_config.wandb
+            should_delete_wandb_logs = wandb_config.delete_local_logs is True or (
+                wandb_config.delete_local_logs is None
+                and wandb_config.wandb_mode != "offline"
+            )
+            if should_delete_wandb_logs:
+                wandb_dir = (
+                    Path(self.config.trainer_config.ckpt_dir)
+                    / self.config.trainer_config.run_name
+                    / "wandb"
+                )
+                if wandb_dir.exists():
+                    logger.info(
+                        f"Deleting local wandb logs at {wandb_dir}... "
+                        "(set trainer_config.wandb.delete_local_logs=false to disable)"
+                    )
+                    shutil.rmtree(wandb_dir, ignore_errors=True)
 
         # delete image disk caching
         if (
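
The deletion rule added on the 0.1.0a1 side is a three-way switch: an explicit True always deletes, an explicit False never does, and None (unset) deletes only when wandb is not in offline mode. A small sketch of that condition as a standalone function:

    def should_delete(delete_local_logs, wandb_mode):
        # Mirrors the condition in the hunk above.
        return delete_local_logs is True or (
            delete_local_logs is None and wandb_mode != "offline"
        )

    assert should_delete(True, "offline") is True
    assert should_delete(False, "online") is False
    assert should_delete(None, "online") is True
    assert should_delete(None, "offline") is False
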
{sleap_nn-0.1.0.dist-info → sleap_nn-0.1.0a1.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sleap-nn
-Version: 0.1.0
+Version: 0.1.0a1
 Summary: Neural network backend for training and inference for animal pose estimation.
 Author-email: Divya Seshadri Murali <dimurali@salk.edu>, Elizabeth Berrigan <eberrigan@salk.edu>, Vincent Tu <vitu@ucsd.edu>, Liezl Maree <lmaree@salk.edu>, David Samy <davidasamy@gmail.com>, Talmo Pereira <talmo@salk.edu>
 License: BSD-3-Clause
@@ -13,10 +13,10 @@ Classifier: Programming Language :: Python :: 3.13
 Requires-Python: <3.14,>=3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: sleap-io<0.7.0,>=0.6.2
+Requires-Dist: sleap-io<0.7.0,>=0.6.0
 Requires-Dist: numpy
 Requires-Dist: lightning
-Requires-Dist: skia-python>=87.0
+Requires-Dist: kornia
 Requires-Dist: jsonpickle
 Requires-Dist: scipy
 Requires-Dist: attrs
@@ -32,7 +32,6 @@ Requires-Dist: hydra-core
 Requires-Dist: jupyter
 Requires-Dist: jupyterlab
 Requires-Dist: pyzmq
-Requires-Dist: rich-click>=1.9.5
 Provides-Extra: torch
 Requires-Dist: torch; extra == "torch"
 Requires-Dist: torchvision>=0.20.0; extra == "torch"
@@ -48,17 +47,6 @@ Requires-Dist: torchvision>=0.20.0; extra == "torch-cuda128"
 Provides-Extra: torch-cuda130
 Requires-Dist: torch; extra == "torch-cuda130"
 Requires-Dist: torchvision>=0.20.0; extra == "torch-cuda130"
-Provides-Extra: export
-Requires-Dist: onnx>=1.15.0; extra == "export"
-Requires-Dist: onnxruntime>=1.16.0; extra == "export"
-Requires-Dist: onnxscript>=0.1.0; extra == "export"
-Provides-Extra: export-gpu
-Requires-Dist: onnx>=1.15.0; extra == "export-gpu"
-Requires-Dist: onnxruntime-gpu>=1.16.0; extra == "export-gpu"
-Requires-Dist: onnxscript>=0.1.0; extra == "export-gpu"
-Provides-Extra: tensorrt
-Requires-Dist: tensorrt>=10.13.0; (sys_platform == "linux" or sys_platform == "win32") and extra == "tensorrt"
-Requires-Dist: torch-tensorrt>=2.5.0; (sys_platform == "linux" or sys_platform == "win32") and extra == "tensorrt"
 Dynamic: license-file
 
 # sleap-nn