wavedl 1.6.0__py3-none-any.whl → 1.6.1__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
wavedl/train.py CHANGED
@@ -1,1430 +1,1430 @@
1
- """
2
- WaveDL - Deep Learning for Wave-based Inverse Problems
3
- =======================================================
4
- Target Environment: NVIDIA HPC GPUs (Multi-GPU DDP) | PyTorch 2.x | Python 3.11+
5
-
6
- A modular training framework for wave-based inverse problems and regression:
7
- 1. HPC-Grade DDP Training: BF16/FP16 mixed precision with torch.compile support
8
- 2. Dynamic Model Selection: Use --model flag to select any registered architecture
9
- 3. Zero-Copy Data Engine: Memmap-backed datasets for large-scale training
10
- 4. Physics-Aware Metrics: Real-time physical MAE with proper unscaling
11
- 5. Robust Checkpointing: Resume training, periodic saves, and training curves
12
- 6. Deep Observability: WandB integration with scatter analysis
13
-
14
- Usage:
15
- # Recommended: Using the HPC launcher (handles accelerate configuration)
16
- wavedl-hpc --model cnn --batch_size 128 --mixed_precision bf16 --wandb
17
-
18
- # Or direct training module (use --precision, not --mixed_precision)
19
- accelerate launch -m wavedl.train --model cnn --batch_size 128 --precision bf16
20
-
21
- # Multi-GPU with explicit config
22
- wavedl-hpc --num_gpus 4 --mixed_precision bf16 --model cnn --wandb
23
-
24
- # Resume from checkpoint
25
- accelerate launch -m wavedl.train --model cnn --resume best_checkpoint --wandb
26
-
27
- # List available models
28
- wavedl-train --list_models
29
-
30
- Note:
31
- - wavedl-hpc: Uses --mixed_precision (passed to accelerate launch)
32
- - wavedl.train: Uses --precision (internal module flag)
33
- Both control the same behavior; use the appropriate flag for your entry point.
34
-
35
- Author: Ductho Le (ductho.le@outlook.com)
36
- """
37
-
38
- from __future__ import annotations
39
-
40
- # =============================================================================
41
- # HPC Environment Setup (MUST be before any library imports)
42
- # =============================================================================
43
- # Auto-configure writable cache directories when home is not writable.
44
- # Uses current working directory as fallback - works on HPC and local machines.
45
- import os
46
-
47
-
48
- def _setup_cache_dir(env_var: str, subdir: str) -> None:
49
- """Set cache directory to CWD if home is not writable."""
50
- if env_var in os.environ:
51
- return # User already set, respect their choice
52
-
53
- # Check if home is writable
54
- home = os.path.expanduser("~")
55
- if os.access(home, os.W_OK):
56
- return # Home is writable, let library use defaults
57
-
58
- # Home not writable - use current working directory
59
- cache_path = os.path.join(os.getcwd(), f".{subdir}")
60
- os.makedirs(cache_path, exist_ok=True)
61
- os.environ[env_var] = cache_path
62
-
63
-
64
- # Configure cache directories (before any library imports)
65
- _setup_cache_dir("TORCH_HOME", "torch_cache")
66
- _setup_cache_dir("MPLCONFIGDIR", "matplotlib")
67
- _setup_cache_dir("FONTCONFIG_CACHE", "fontconfig")
68
- _setup_cache_dir("XDG_DATA_HOME", "local/share")
69
- _setup_cache_dir("XDG_STATE_HOME", "local/state")
70
- _setup_cache_dir("XDG_CACHE_HOME", "cache")
71
-
72
-
73
- def _setup_per_rank_compile_cache() -> None:
74
- """Set per-GPU Triton/Inductor cache to prevent multi-process race warnings.
75
-
76
- When using torch.compile with multiple GPUs, all processes try to write to
77
- the same cache directory, causing 'Directory is not empty - skipping!' warnings.
78
- This gives each GPU rank its own isolated cache subdirectory.
79
- """
80
- # Get local rank from environment (set by accelerate/torchrun)
81
- local_rank = os.environ.get("LOCAL_RANK", "0")
82
-
83
- # Get cache base from environment or use CWD
84
- cache_base = os.environ.get(
85
- "TRITON_CACHE_DIR", os.path.join(os.getcwd(), ".triton_cache")
86
- )
87
-
88
- # Set per-rank cache directories
89
- os.environ["TRITON_CACHE_DIR"] = os.path.join(cache_base, f"rank_{local_rank}")
90
- os.environ["TORCHINDUCTOR_CACHE_DIR"] = os.path.join(
91
- os.environ.get(
92
- "TORCHINDUCTOR_CACHE_DIR", os.path.join(os.getcwd(), ".inductor_cache")
93
- ),
94
- f"rank_{local_rank}",
95
- )
96
-
97
- # Create directories
98
- os.makedirs(os.environ["TRITON_CACHE_DIR"], exist_ok=True)
99
- os.makedirs(os.environ["TORCHINDUCTOR_CACHE_DIR"], exist_ok=True)
100
-
101
-
102
- # Setup per-rank compile caches (before torch imports)
103
- _setup_per_rank_compile_cache()
104
-
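For orientation, a minimal sketch of where the per-rank caches end up with the defaults above (hypothetical example, assuming nothing is preset and LOCAL_RANK comes from the launcher):

import os

# Illustrative only: reproduce the path arithmetic of _setup_per_rank_compile_cache
# for a hypothetical rank-1 process started from the current working directory.
local_rank = "1"  # example value of LOCAL_RANK
triton_dir = os.path.join(os.getcwd(), ".triton_cache", f"rank_{local_rank}")
inductor_dir = os.path.join(os.getcwd(), ".inductor_cache", f"rank_{local_rank}")
print(triton_dir)    # e.g. /path/to/run/.triton_cache/rank_1
print(inductor_dir)  # e.g. /path/to/run/.inductor_cache/rank_1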
105
- # =============================================================================
106
- # Standard imports (after environment setup)
107
- # =============================================================================
108
- import argparse
109
- import logging
110
- import pickle
111
- import shutil
112
- import sys
113
- import time
114
- import warnings
115
- from typing import Any
116
-
117
-
118
- # Suppress Pydantic warnings from accelerate's internal Field() usage
119
- warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
120
-
121
- import matplotlib.pyplot as plt
122
- import numpy as np
123
- import pandas as pd
124
- import torch
125
- import torch.distributed as dist
126
- from accelerate import Accelerator
127
- from accelerate.utils import set_seed
128
- from sklearn.metrics import r2_score
129
- from tqdm.auto import tqdm
130
-
131
- from wavedl.models import build_model, get_model, list_models
132
- from wavedl.utils import (
133
- FIGURE_DPI,
134
- MetricTracker,
135
- broadcast_early_stop,
136
- calc_pearson,
137
- create_training_curves,
138
- get_loss,
139
- get_lr,
140
- get_optimizer,
141
- get_scheduler,
142
- is_epoch_based,
143
- list_losses,
144
- list_optimizers,
145
- list_schedulers,
146
- plot_scientific_scatter,
147
- prepare_data,
148
- )
149
-
150
-
151
- try:
152
- import wandb
153
-
154
- WANDB_AVAILABLE = True
155
- except ImportError:
156
- WANDB_AVAILABLE = False
157
-
158
- # ==============================================================================
159
- # RUNTIME CONFIGURATION (post-import)
160
- # ==============================================================================
161
- # Configure matplotlib paths for HPC systems without writable home directories
162
- os.environ.setdefault("MPLCONFIGDIR", os.getenv("TMPDIR", "/tmp") + "/matplotlib")
163
- os.environ.setdefault("FONTCONFIG_PATH", "/etc/fonts")
164
-
165
- # Suppress warnings from known-noisy libraries, but preserve legitimate warnings
166
- # from torch/numpy about NaN, dtype, and numerical issues.
167
- warnings.filterwarnings("ignore", category=FutureWarning)
168
- warnings.filterwarnings("ignore", category=DeprecationWarning)
169
- # Pydantic v1/v2 compatibility warnings
170
- warnings.filterwarnings("ignore", module="pydantic")
171
- warnings.filterwarnings("ignore", message=".*UnsupportedFieldAttributeWarning.*")
172
- # Transformer library warnings (loading configs, etc.)
173
- warnings.filterwarnings("ignore", module="transformers")
174
- # Accelerate verbose messages
175
- warnings.filterwarnings("ignore", module="accelerate")
176
- # torch.compile backend selection warnings
177
- warnings.filterwarnings("ignore", message=".*TorchDynamo.*")
178
- warnings.filterwarnings("ignore", message=".*Dynamo is not supported.*")
179
- # Note: UserWarning from torch/numpy core is NOT suppressed to preserve
180
- # legitimate warnings about NaN values, dtype mismatches, etc.
181
-
182
- # ==============================================================================
183
- # GPU PERFORMANCE OPTIMIZATIONS (Ampere/Hopper: A100, H100)
184
- # ==============================================================================
185
- # Enable TF32 for faster matmul (safe precision for training, ~2x speedup)
186
- torch.backends.cuda.matmul.allow_tf32 = True
187
- torch.backends.cudnn.allow_tf32 = True
188
- torch.set_float32_matmul_precision("high") # Use TF32 for float32 ops
189
-
190
- # Enable cuDNN autotuning for fixed-size inputs (CNN-like models benefit most)
191
- # Note: First few batches may be slower due to benchmarking
192
- torch.backends.cudnn.benchmark = True
193
-
194
-
195
- # ==============================================================================
196
- # LOGGING UTILITIES
197
- # ==============================================================================
198
- from contextlib import contextmanager
199
-
200
-
201
- @contextmanager
202
- def suppress_accelerate_logging():
203
- """Temporarily suppress accelerate's verbose checkpoint save messages."""
204
- accelerate_logger = logging.getLogger("accelerate.checkpointing")
205
- original_level = accelerate_logger.level
206
- accelerate_logger.setLevel(logging.WARNING)
207
- try:
208
- yield
209
- finally:
210
- accelerate_logger.setLevel(original_level)
211
-
212
-
213
- # ==============================================================================
214
- # ARGUMENT PARSING
215
- # ==============================================================================
216
- def parse_args() -> tuple[argparse.Namespace, argparse.ArgumentParser]:
217
- """Parse command-line arguments with comprehensive options."""
218
- parser = argparse.ArgumentParser(
219
- description="Universal DDP Training Pipeline",
220
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
221
- )
222
-
223
- # Model Selection
224
- parser.add_argument(
225
- "--model",
226
- type=str,
227
- default="cnn",
228
- help=f"Model architecture to train. Available: {list_models()}",
229
- )
230
- parser.add_argument(
231
- "--list_models", action="store_true", help="List all available models and exit"
232
- )
233
- parser.add_argument(
234
- "--import",
235
- dest="import_modules",
236
- type=str,
237
- nargs="+",
238
- default=[],
239
- help="Python modules to import before training (for custom models)",
240
- )
241
- parser.add_argument(
242
- "--no_pretrained",
243
- dest="pretrained",
244
- action="store_false",
245
- help="Train from scratch without pretrained weights (default: use pretrained)",
246
- )
247
- parser.set_defaults(pretrained=True)
248
-
249
- # Configuration File
250
- parser.add_argument(
251
- "--config",
252
- type=str,
253
- default=None,
254
- help="Path to YAML config file. CLI args override config values.",
255
- )
256
-
257
- # Hyperparameters
258
- parser.add_argument(
259
- "--batch_size", type=int, default=128, help="Batch size per GPU"
260
- )
261
- parser.add_argument("--lr", type=float, default=1e-3, help="Initial learning rate")
262
- parser.add_argument(
263
- "--epochs", type=int, default=1000, help="Maximum training epochs"
264
- )
265
- parser.add_argument(
266
- "--patience", type=int, default=20, help="Early stopping patience"
267
- )
268
- parser.add_argument("--weight_decay", type=float, default=1e-4, help="Weight decay")
269
- parser.add_argument(
270
- "--grad_clip", type=float, default=1.0, help="Gradient clipping norm"
271
- )
272
-
273
- # Loss Function
274
- parser.add_argument(
275
- "--loss",
276
- type=str,
277
- default="mse",
278
- choices=["mse", "mae", "huber", "smooth_l1", "log_cosh", "weighted_mse"],
279
- help=f"Loss function for training. Available: {list_losses()}",
280
- )
281
- parser.add_argument(
282
- "--huber_delta", type=float, default=1.0, help="Delta for Huber loss"
283
- )
284
- parser.add_argument(
285
- "--loss_weights",
286
- type=str,
287
- default=None,
288
- help="Comma-separated weights for weighted_mse (e.g., '1.0,2.0,1.0')",
289
- )
290
-
291
- # Optimizer
292
- parser.add_argument(
293
- "--optimizer",
294
- type=str,
295
- default="adamw",
296
- choices=["adamw", "adam", "sgd", "nadam", "radam", "rmsprop"],
297
- help=f"Optimizer for training. Available: {list_optimizers()}",
298
- )
299
- parser.add_argument(
300
- "--momentum", type=float, default=0.9, help="Momentum for SGD/RMSprop"
301
- )
302
- parser.add_argument(
303
- "--nesterov", action="store_true", help="Use Nesterov momentum (SGD)"
304
- )
305
- parser.add_argument(
306
- "--betas",
307
- type=str,
308
- default="0.9,0.999",
309
- help="Betas for Adam variants (comma-separated)",
310
- )
311
-
312
- # Learning Rate Scheduler
313
- parser.add_argument(
314
- "--scheduler",
315
- type=str,
316
- default="plateau",
317
- choices=[
318
- "plateau",
319
- "cosine",
320
- "cosine_restarts",
321
- "onecycle",
322
- "step",
323
- "multistep",
324
- "exponential",
325
- "linear_warmup",
326
- ],
327
- help=f"LR scheduler. Available: {list_schedulers()}",
328
- )
329
- parser.add_argument(
330
- "--scheduler_patience",
331
- type=int,
332
- default=10,
333
- help="Patience for ReduceLROnPlateau",
334
- )
335
- parser.add_argument(
336
- "--min_lr", type=float, default=1e-6, help="Minimum learning rate"
337
- )
338
- parser.add_argument(
339
- "--scheduler_factor", type=float, default=0.5, help="LR reduction factor"
340
- )
341
- parser.add_argument(
342
- "--warmup_epochs", type=int, default=5, help="Warmup epochs for linear_warmup"
343
- )
344
- parser.add_argument(
345
- "--step_size", type=int, default=30, help="Step size for StepLR"
346
- )
347
- parser.add_argument(
348
- "--milestones",
349
- type=str,
350
- default=None,
351
- help="Comma-separated epochs for MultiStepLR (e.g., '30,60,90')",
352
- )
353
-
354
- # Data
355
- parser.add_argument(
356
- "--data_path", type=str, default="train_data.npz", help="Path to NPZ dataset"
357
- )
358
- parser.add_argument(
359
- "--workers",
360
- type=int,
361
- default=-1,
362
- help="DataLoader workers per GPU (-1=auto-detect based on CPU cores)",
363
- )
364
- parser.add_argument("--seed", type=int, default=2025, help="Random seed")
365
- parser.add_argument(
366
- "--deterministic",
367
- action="store_true",
368
- help="Enable deterministic mode for reproducibility (slower, disables TF32/cuDNN benchmark)",
369
- )
370
- parser.add_argument(
371
- "--cache_validate",
372
- type=str,
373
- default="sha256",
374
- choices=["sha256", "fast", "size"],
375
- help="Cache validation mode: sha256 (full hash), fast (partial), size (quick)",
376
- )
377
- parser.add_argument(
378
- "--single_channel",
379
- action="store_true",
380
- help="Confirm data is single-channel (suppress ambiguous shape warnings for shallow 3D volumes)",
381
- )
382
-
383
- # Cross-Validation
384
- parser.add_argument(
385
- "--cv",
386
- type=int,
387
- default=0,
388
- help="Enable K-fold cross-validation with K folds (0=disabled)",
389
- )
390
- parser.add_argument(
391
- "--cv_stratify",
392
- action="store_true",
393
- help="Use stratified splitting for cross-validation",
394
- )
395
- parser.add_argument(
396
- "--cv_bins",
397
- type=int,
398
- default=10,
399
- help="Number of bins for stratified CV (only with --cv_stratify)",
400
- )
401
-
402
- # Checkpointing & Resume
403
- parser.add_argument(
404
- "--resume", type=str, default=None, help="Checkpoint directory to resume from"
405
- )
406
- parser.add_argument(
407
- "--save_every",
408
- type=int,
409
- default=50,
410
- help="Save checkpoint every N epochs (0=disable)",
411
- )
412
- parser.add_argument(
413
- "--output_dir", type=str, default=".", help="Output directory for checkpoints"
414
- )
415
- parser.add_argument(
416
- "--fresh",
417
- action="store_true",
418
- help="Force fresh training, ignore existing checkpoints",
419
- )
420
-
421
- # Performance
422
- parser.add_argument(
423
- "--compile", action="store_true", help="Enable torch.compile (PyTorch 2.x)"
424
- )
425
- parser.add_argument(
426
- "--precision",
427
- type=str,
428
- default="bf16",
429
- choices=["bf16", "fp16", "no"],
430
- help="Mixed precision mode",
431
- )
432
- # Alias for consistency with wavedl-hpc (--mixed_precision)
433
- parser.add_argument(
434
- "--mixed_precision",
435
- dest="precision",
436
- type=str,
437
- choices=["bf16", "fp16", "no"],
438
- help=argparse.SUPPRESS, # Hidden: use --precision instead
439
- )
440
-
441
- # Physical Constraints
442
- parser.add_argument(
443
- "--constraint",
444
- type=str,
445
- nargs="+",
446
- default=[],
447
- help="Soft constraint expressions: 'y0 - y1*y2' (penalize violations)",
448
- )
449
-
450
- parser.add_argument(
451
- "--constraint_file",
452
- type=str,
453
- default=None,
454
- help="Python file with constraint(pred, inputs) function",
455
- )
456
- parser.add_argument(
457
- "--constraint_weight",
458
- type=float,
459
- nargs="+",
460
- default=[0.1],
461
- help="Weight(s) for soft constraints (one per constraint, or single shared weight)",
462
- )
463
- parser.add_argument(
464
- "--constraint_reduction",
465
- type=str,
466
- default="mse",
467
- choices=["mse", "mae"],
468
- help="Reduction mode for constraint penalties",
469
- )
470
-
471
- # Logging
472
- parser.add_argument(
473
- "--wandb", action="store_true", help="Enable Weights & Biases logging"
474
- )
475
- parser.add_argument(
476
- "--wandb_watch",
477
- action="store_true",
478
- help="Enable WandB gradient watching (adds overhead, useful for debugging)",
479
- )
480
- parser.add_argument(
481
- "--project_name", type=str, default="DL-Training", help="WandB project name"
482
- )
483
- parser.add_argument("--run_name", type=str, default=None, help="WandB run name")
484
-
485
- args = parser.parse_args()
486
- return args, parser # Returns (Namespace, ArgumentParser)
487
-
488
-
489
- # ==============================================================================
490
- # MAIN TRAINING FUNCTION
491
- # ==============================================================================
492
- def main():
493
- args, parser = parse_args()
494
-
495
- # Import custom model modules if specified
496
- if args.import_modules:
497
- import importlib
498
-
499
- for module_name in args.import_modules:
500
- try:
501
- # Handle both module names (my_model) and file paths (./my_model.py)
502
- if module_name.endswith(".py"):
503
- # Import from file path with unique module name
504
- import importlib.util
505
-
506
- # Derive unique module name from filename to avoid collisions
507
- base_name = os.path.splitext(os.path.basename(module_name))[0]
508
- unique_name = f"wavedl_custom_{base_name}"
509
-
510
- spec = importlib.util.spec_from_file_location(
511
- unique_name, module_name
512
- )
513
- if spec and spec.loader:
514
- module = importlib.util.module_from_spec(spec)
515
- sys.modules[unique_name] = module
516
- spec.loader.exec_module(module)
517
- print(f"✓ Imported custom module from: {module_name}")
518
- else:
519
- # Import as regular module
520
- importlib.import_module(module_name)
521
- print(f"✓ Imported module: {module_name}")
522
- except (ImportError, FileNotFoundError, SyntaxError, PermissionError) as e:
523
- print(f"✗ Failed to import '{module_name}': {e}", file=sys.stderr)
524
- if isinstance(e, FileNotFoundError):
525
- print(" File does not exist. Check the path.", file=sys.stderr)
526
- elif isinstance(e, SyntaxError):
527
- print(
528
- f" Syntax error at line {e.lineno}: {e.msg}", file=sys.stderr
529
- )
530
- elif isinstance(e, PermissionError):
531
- print(
532
- " Permission denied. Check file permissions.", file=sys.stderr
533
- )
534
- else:
535
- print(
536
- " Make sure the module is in your Python path or current directory.",
537
- file=sys.stderr,
538
- )
539
- sys.exit(1)
540
-
541
- # Handle --list_models flag
542
- if args.list_models:
543
- print("Available models:")
544
- for name in list_models():
545
- ModelClass = get_model(name)
546
- # Get first non-empty docstring line
547
- if ModelClass.__doc__:
548
- lines = [
549
- l.strip() for l in ModelClass.__doc__.splitlines() if l.strip()
550
- ]
551
- doc_first_line = lines[0] if lines else "No description"
552
- else:
553
- doc_first_line = "No description"
554
- print(f" - {name}: {doc_first_line}")
555
- sys.exit(0)
556
-
557
- # Load and merge config file if provided
558
- if args.config:
559
- from wavedl.utils.config import (
560
- load_config,
561
- merge_config_with_args,
562
- validate_config,
563
- )
564
-
565
- print(f"📄 Loading config from: {args.config}")
566
- config = load_config(args.config)
567
-
568
- # Validate config values
569
- warnings_list = validate_config(config)
570
- for w in warnings_list:
571
- print(f" ⚠ {w}")
572
-
573
- # Merge config with CLI args (CLI takes precedence via parser defaults detection)
574
- args = merge_config_with_args(config, args, parser=parser)
575
-
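The "parser defaults detection" mentioned above can be pictured with the sketch below; it is only an illustration of the general idea, not the actual wavedl.utils.config.merge_config_with_args implementation:

import argparse

def merge_config_with_args_sketch(config: dict, args: argparse.Namespace,
                                  parser: argparse.ArgumentParser) -> argparse.Namespace:
    """Config fills values still at their parser defaults; explicit CLI flags win."""
    merged = vars(args).copy()
    for key, value in config.items():
        # A value equal to the parser default means the user did not pass the flag
        # explicitly, so the config is allowed to override it.
        if key in merged and merged[key] == parser.get_default(key):
            merged[key] = value
    return argparse.Namespace(**merged)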
576
- # Handle --cv flag (cross-validation mode)
577
- if args.cv > 0:
578
- print(f"🔄 Cross-Validation Mode: {args.cv} folds")
579
- from wavedl.utils.cross_validation import run_cross_validation
580
-
581
- # Load data for CV using memory-efficient loader
582
- from wavedl.utils.data import DataSource, get_data_source
583
-
584
- data_format = DataSource.detect_format(args.data_path)
585
- source = get_data_source(data_format)
586
-
587
- # Use memory-mapped loading when available (now returns LazyDataHandle for all formats)
588
- _cv_handle = None
589
- if hasattr(source, "load_mmap"):
590
- _cv_handle = source.load_mmap(args.data_path)
591
- X, y = _cv_handle.inputs, _cv_handle.outputs
592
- else:
593
- X, y = source.load(args.data_path)
594
-
595
- # Handle sparse matrices (must materialize for CV shuffling)
596
- if hasattr(X, "__getitem__") and len(X) > 0 and hasattr(X[0], "toarray"):
597
- X = np.stack([x.toarray() for x in X])
598
-
599
- # Normalize target shape: (N,) -> (N, 1) for consistency
600
- if y.ndim == 1:
601
- y = y.reshape(-1, 1)
602
-
603
- # Validate and determine input shape (consistent with prepare_data)
604
- # Check for ambiguous shapes that could be multi-channel or shallow 3D volume
605
- sample_shape = X.shape[1:] # Per-sample shape
606
-
607
- # Same heuristic as prepare_data: detect ambiguous 3D shapes
608
- is_ambiguous_shape = (
609
- len(sample_shape) == 3 # Exactly 3D: could be (C, H, W) or (D, H, W)
610
- and sample_shape[0] <= 16 # First dim looks like channels
611
- and sample_shape[1] > 16
612
- and sample_shape[2] > 16 # Both spatial dims are large
613
- )
614
-
615
- if is_ambiguous_shape and not args.single_channel:
616
- raise ValueError(
617
- f"Ambiguous input shape detected: sample shape {sample_shape}. "
618
- f"This could be either:\n"
619
- f" - Multi-channel 2D data (C={sample_shape[0]}, H={sample_shape[1]}, W={sample_shape[2]})\n"
620
- f" - Single-channel 3D volume (D={sample_shape[0]}, H={sample_shape[1]}, W={sample_shape[2]})\n\n"
621
- f"If this is single-channel 3D/shallow volume data, use --single_channel flag.\n"
622
- f"If this is multi-channel 2D data, reshape to (N*C, H, W) with adjusted targets."
623
- )
624
-
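A few concrete shapes run through the heuristic above (illustrative examples only):

def _looks_ambiguous(sample_shape: tuple) -> bool:
    # Same rule as above: exactly 3 dims, a small leading dim, two clearly spatial dims.
    return (
        len(sample_shape) == 3
        and sample_shape[0] <= 16
        and sample_shape[1] > 16
        and sample_shape[2] > 16
    )

print(_looks_ambiguous((3, 128, 128)))   # True  -> 3 channels or a 3-slice volume? needs --single_channel
print(_looks_ambiguous((64, 128, 128)))  # False -> leading dim too large to be channels
print(_looks_ambiguous((3, 8, 8)))       # False -> trailing dims too small to be spatial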
625
- # in_shape = spatial dimensions for model registry (channel added during training)
626
- in_shape = sample_shape
627
-
628
- # Run cross-validation
629
- try:
630
- run_cross_validation(
631
- X=X,
632
- y=y,
633
- model_name=args.model,
634
- in_shape=in_shape,
635
- out_size=y.shape[1],
636
- folds=args.cv,
637
- stratify=args.cv_stratify,
638
- stratify_bins=args.cv_bins,
639
- batch_size=args.batch_size,
640
- lr=args.lr,
641
- epochs=args.epochs,
642
- patience=args.patience,
643
- weight_decay=args.weight_decay,
644
- loss_name=args.loss,
645
- optimizer_name=args.optimizer,
646
- scheduler_name=args.scheduler,
647
- output_dir=args.output_dir,
648
- workers=args.workers,
649
- seed=args.seed,
650
- )
651
- finally:
652
- # Clean up file handle if HDF5/MAT
653
- if _cv_handle is not None and hasattr(_cv_handle, "close"):
654
- try:
655
- _cv_handle.close()
656
- except Exception as e:
657
- logging.debug(f"Failed to close CV data handle: {e}")
658
- return
659
-
660
- # ==========================================================================
661
- # SYSTEM INITIALIZATION
662
- # ==========================================================================
663
- # Initialize Accelerator for DDP and mixed precision
664
- accelerator = Accelerator(
665
- mixed_precision=args.precision,
666
- log_with="wandb" if args.wandb and WANDB_AVAILABLE else None,
667
- )
668
- set_seed(args.seed)
669
-
670
- # Deterministic mode for scientific reproducibility
671
- # Disables TF32 and cuDNN benchmark for exact reproducibility (slower)
672
- if args.deterministic:
673
- torch.backends.cudnn.benchmark = False
674
- torch.backends.cudnn.deterministic = True
675
- torch.backends.cuda.matmul.allow_tf32 = False
676
- torch.backends.cudnn.allow_tf32 = False
677
- torch.use_deterministic_algorithms(True, warn_only=True)
678
- if accelerator.is_main_process:
679
- print("🔒 Deterministic mode enabled (slower but reproducible)")
680
-
681
- # Configure logging (rank 0 only prints to console)
682
- logging.basicConfig(
683
- level=logging.INFO if accelerator.is_main_process else logging.ERROR,
684
- format="%(asctime)s | %(levelname)s | %(message)s",
685
- datefmt="%H:%M:%S",
686
- )
687
- logger = logging.getLogger("Trainer")
688
-
689
- # Ensure output directory exists (critical for cache files, checkpoints, etc.)
690
- os.makedirs(args.output_dir, exist_ok=True)
691
-
692
- # Auto-detect optimal DataLoader workers if not specified
693
- if args.workers < 0:
694
- cpu_count = os.cpu_count() or 4
695
- num_gpus = accelerator.num_processes
696
- # Heuristic: 2-16 workers per GPU, bounded by available CPU cores
697
- # Increased cap from 8 to 16 for high-throughput GPUs (H100, A100)
698
- args.workers = min(16, max(2, (cpu_count - 2) // num_gpus))
699
- if accelerator.is_main_process:
700
- logger.info(
701
- f"⚙️ Auto-detected workers: {args.workers} per GPU "
702
- f"(CPUs: {cpu_count}, GPUs: {num_gpus})"
703
- )
704
-
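Worked examples of the worker heuristic above (illustrative CPU/GPU counts):

def auto_workers(cpu_count: int, num_gpus: int) -> int:
    # Mirrors the formula above: clamp to the 2..16 range, sharing spare cores across GPUs.
    return min(16, max(2, (cpu_count - 2) // num_gpus))

print(auto_workers(64, 4))   # 15 -> (64 - 2) // 4 = 15, below the cap of 16
print(auto_workers(256, 8))  # 16 -> 31 exceeds the cap, clamped to 16
print(auto_workers(8, 4))    # 2  -> (8 - 2) // 4 = 1, raised to the floor of 2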
705
- if accelerator.is_main_process:
706
- logger.info(f"🚀 Cluster Status: {accelerator.num_processes}x GPUs detected")
707
- logger.info(
708
- f" Model: {args.model} | Precision: {args.precision} | Compile: {args.compile}"
709
- )
710
- logger.info(
711
- f" Loss: {args.loss} | Optimizer: {args.optimizer} | Scheduler: {args.scheduler}"
712
- )
713
- logger.info(f" Early Stopping Patience: {args.patience} epochs")
714
- if args.save_every > 0:
715
- logger.info(f" Periodic Checkpointing: Every {args.save_every} epochs")
716
- if args.resume:
717
- logger.info(f" 📂 Resuming from: {args.resume}")
718
-
719
- # Initialize WandB
720
- if args.wandb and WANDB_AVAILABLE:
721
- accelerator.init_trackers(
722
- project_name=args.project_name,
723
- config=vars(args),
724
- init_kwargs={"wandb": {"name": args.run_name or f"{args.model}_run"}},
725
- )
726
-
727
- # ==========================================================================
728
- # DATA & MODEL LOADING
729
- # ==========================================================================
730
- train_dl, val_dl, scaler, in_shape, out_dim = prepare_data(
731
- args, logger, accelerator, cache_dir=args.output_dir
732
- )
733
-
734
- # Build model using registry
735
- model = build_model(
736
- args.model, in_shape=in_shape, out_size=out_dim, pretrained=args.pretrained
737
- )
738
-
739
- if accelerator.is_main_process:
740
- param_info = model.parameter_summary()
741
- logger.info(
742
- f" Model Parameters: {param_info['trainable_parameters']:,} trainable"
743
- )
744
- logger.info(f" Model Size: {param_info['total_mb']:.2f} MB")
745
-
746
- # Optional WandB model watching (opt-in due to overhead on large models)
747
- if (
748
- args.wandb
749
- and args.wandb_watch
750
- and WANDB_AVAILABLE
751
- and accelerator.is_main_process
752
- ):
753
- wandb.watch(model, log="gradients", log_freq=100)
754
- logger.info(" 📊 WandB gradient watching enabled")
755
-
756
- # Torch 2.0 compilation (requires compatible Triton on GPU)
757
- if args.compile:
758
- try:
759
- # Test if Triton is available - just import the package
760
- # Different Triton versions have different internal APIs, so just check base import
761
- import triton
762
-
763
- model = torch.compile(model)
764
- if accelerator.is_main_process:
765
- logger.info(" ✔ torch.compile enabled (Triton backend)")
766
- except ImportError as e:
767
- if accelerator.is_main_process:
768
- if "triton" in str(e).lower():
769
- logger.warning(
770
- " ⚠ Triton not installed or incompatible version - torch.compile disabled. "
771
- "Training will proceed without compilation."
772
- )
773
- else:
774
- logger.warning(
775
- f" ⚠ torch.compile setup failed: {e}. Continuing without compilation."
776
- )
777
- except Exception as e:
778
- if accelerator.is_main_process:
779
- logger.warning(
780
- f" ⚠ torch.compile failed: {e}. Continuing without compilation."
781
- )
782
-
783
- # ==========================================================================
784
- # OPTIMIZER, SCHEDULER & LOSS CONFIGURATION
785
- # ==========================================================================
786
- # Parse comma-separated arguments with validation
787
- try:
788
- betas_list = [float(x.strip()) for x in args.betas.split(",")]
789
- if len(betas_list) != 2:
790
- raise ValueError(
791
- f"--betas must have exactly 2 values, got {len(betas_list)}"
792
- )
793
- if not all(0.0 <= b < 1.0 for b in betas_list):
794
- raise ValueError(f"--betas values must be in [0, 1), got {betas_list}")
795
- betas = tuple(betas_list)
796
- except ValueError as e:
797
- raise ValueError(
798
- f"Invalid --betas format '{args.betas}': {e}. Expected format: '0.9,0.999'"
799
- )
800
-
801
- loss_weights = None
802
- if args.loss_weights:
803
- loss_weights = [float(x.strip()) for x in args.loss_weights.split(",")]
804
- milestones = None
805
- if args.milestones:
806
- milestones = [int(x.strip()) for x in args.milestones.split(",")]
807
-
808
- # Create optimizer using factory
809
- optimizer = get_optimizer(
810
- name=args.optimizer,
811
- params=model.get_optimizer_groups(args.lr, args.weight_decay),
812
- lr=args.lr,
813
- weight_decay=args.weight_decay,
814
- momentum=args.momentum,
815
- nesterov=args.nesterov,
816
- betas=betas,
817
- )
818
-
819
- # Create loss function using factory
820
- criterion = get_loss(
821
- name=args.loss,
822
- weights=loss_weights,
823
- delta=args.huber_delta,
824
- )
825
- # Move criterion to device (important for WeightedMSELoss buffer)
826
- criterion = criterion.to(accelerator.device)
827
-
828
- # ==========================================================================
829
- # PHYSICAL CONSTRAINTS INTEGRATION
830
- # ==========================================================================
831
- from wavedl.utils.constraints import (
832
- PhysicsConstrainedLoss,
833
- build_constraints,
834
- )
835
-
836
- # Build soft constraints
837
- soft_constraints = build_constraints(
838
- expressions=args.constraint,
839
- file_path=args.constraint_file,
840
- reduction=args.constraint_reduction,
841
- )
842
-
843
- # Wrap criterion with PhysicsConstrainedLoss if we have soft constraints
844
- if soft_constraints:
845
- # Pass output scaler so constraints can be evaluated in physical space
846
- output_mean = scaler.mean_ if hasattr(scaler, "mean_") else None
847
- output_std = scaler.scale_ if hasattr(scaler, "scale_") else None
848
- criterion = PhysicsConstrainedLoss(
849
- criterion,
850
- soft_constraints,
851
- weights=args.constraint_weight,
852
- output_mean=output_mean,
853
- output_std=output_std,
854
- )
855
- if accelerator.is_main_process:
856
- logger.info(
857
- f" 🔬 Physical constraints: {len(soft_constraints)} constraint(s) "
858
- f"with weight(s) {args.constraint_weight}"
859
- )
860
- if output_mean is not None:
861
- logger.info(
862
- " 📐 Constraints evaluated in physical space (denormalized)"
863
- )
864
-
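For intuition, a simplified sketch of how a soft constraint such as 'y0 - y1*y2' becomes a penalty in physical units; this is a stand-in for illustration, not the actual wavedl.utils.constraints.PhysicsConstrainedLoss:

import torch

def soft_constraint_penalty(pred: torch.Tensor, mean: torch.Tensor, std: torch.Tensor,
                            weight: float = 0.1) -> torch.Tensor:
    """Penalize violations of y0 == y1 * y2 on denormalized predictions ('mse' reduction)."""
    phys = pred * std + mean                      # undo standard scaling -> physical space
    violation = phys[:, 0] - phys[:, 1] * phys[:, 2]
    return weight * torch.mean(violation ** 2)    # 'mae' reduction would use violation.abs()

# Illustrative use: total = base_criterion(pred, target) + soft_constraint_penalty(pred, mean, std)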
865
- # Track if scheduler should step per batch (OneCycleLR) or per epoch
866
- scheduler_step_per_batch = not is_epoch_based(args.scheduler)
867
-
868
- # ==========================================================================
869
- # DDP Preparation Strategy:
870
- # - For batch-based schedulers (OneCycleLR): prepare DataLoaders first to get
871
- # the correct sharded batch count, then create scheduler
872
- # - For epoch-based schedulers: create scheduler before prepare (no issue)
873
- # ==========================================================================
874
- if scheduler_step_per_batch:
875
- # BATCH-BASED SCHEDULER (e.g., OneCycleLR)
876
- # Prepare model, optimizer, dataloaders FIRST to get sharded loader length
877
- model, optimizer, train_dl, val_dl = accelerator.prepare(
878
- model, optimizer, train_dl, val_dl
879
- )
880
-
881
- # Now create scheduler with the CORRECT sharded steps_per_epoch
882
- steps_per_epoch = len(train_dl) # Post-DDP sharded length
883
- scheduler = get_scheduler(
884
- name=args.scheduler,
885
- optimizer=optimizer,
886
- epochs=args.epochs,
887
- steps_per_epoch=steps_per_epoch,
888
- min_lr=args.min_lr,
889
- patience=args.scheduler_patience,
890
- factor=args.scheduler_factor,
891
- gamma=args.scheduler_factor, # For Step/MultiStep/Exponential schedulers
892
- step_size=args.step_size,
893
- milestones=milestones,
894
- warmup_epochs=args.warmup_epochs,
895
- )
896
- # Prepare scheduler separately (Accelerator wraps it for state saving)
897
- scheduler = accelerator.prepare(scheduler)
898
- else:
899
- # EPOCH-BASED SCHEDULER (plateau, cosine, step, etc.)
900
- # No batch count dependency - create scheduler before prepare
901
- scheduler = get_scheduler(
902
- name=args.scheduler,
903
- optimizer=optimizer,
904
- epochs=args.epochs,
905
- steps_per_epoch=None,
906
- min_lr=args.min_lr,
907
- patience=args.scheduler_patience,
908
- factor=args.scheduler_factor,
909
- gamma=args.scheduler_factor, # For Step/MultiStep/Exponential schedulers
910
- step_size=args.step_size,
911
- milestones=milestones,
912
- warmup_epochs=args.warmup_epochs,
913
- )
914
-
915
- # For ReduceLROnPlateau: DON'T include scheduler in accelerator.prepare()
916
- # because accelerator wraps scheduler.step() to sync across processes,
917
- # which defeats our rank-0-only stepping for correct patience counting.
918
- # Other schedulers are safe to prepare (no internal state affected by multi-call).
919
- if args.scheduler == "plateau":
920
- model, optimizer, train_dl, val_dl = accelerator.prepare(
921
- model, optimizer, train_dl, val_dl
922
- )
923
- # Scheduler stays unwrapped - we handle sync manually in training loop
924
- # But register it for checkpointing so state is saved/loaded on resume
925
- accelerator.register_for_checkpointing(scheduler)
926
- else:
927
- model, optimizer, train_dl, val_dl, scheduler = accelerator.prepare(
928
- model, optimizer, train_dl, val_dl, scheduler
929
- )
930
-
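A rough worked example of why the preparation order matters for batch-stepped schedulers (illustrative numbers; exact loader lengths depend on the sampler and drop_last):

import math

n_train, batch_per_gpu, n_gpus = 100_000, 128, 4

steps_unsharded = math.ceil(n_train / batch_per_gpu)           # 782 optimizer steps per epoch
steps_sharded = math.ceil(n_train / (batch_per_gpu * n_gpus))  # 196 steps per epoch per rank

# A OneCycleLR built from the unsharded count would plan ~4x more steps than each
# rank actually takes, so its schedule would never finish; building it after
# accelerator.prepare() uses the correct per-rank length.
print(steps_unsharded, steps_sharded)  # 782 196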
931
- # ==========================================================================
932
- # AUTO-RESUME / RESUME FROM CHECKPOINT
933
- # ==========================================================================
934
- start_epoch = 0
935
- best_val_loss = float("inf")
936
- patience_ctr = 0
937
- history: list[dict[str, Any]] = []
938
-
939
- # Define checkpoint paths
940
- best_ckpt_path = os.path.join(args.output_dir, "best_checkpoint")
941
- complete_flag_path = os.path.join(args.output_dir, "training_complete.flag")
942
-
943
- # Auto-resume logic (if not --fresh and no explicit --resume)
944
- if not args.fresh and args.resume is None:
945
- if os.path.exists(complete_flag_path):
946
- # Training already completed
947
- if accelerator.is_main_process:
948
- logger.info(
949
- "✅ Training already completed (early stopping). Use --fresh to retrain."
950
- )
951
- return # Exit gracefully
952
- elif os.path.exists(best_ckpt_path):
953
- # Incomplete training found - auto-resume
954
- args.resume = best_ckpt_path
955
- if accelerator.is_main_process:
956
- logger.info(f"🔄 Auto-resuming from: {best_ckpt_path}")
957
-
958
- if args.resume:
959
- if os.path.exists(args.resume):
960
- logger.info(f"🔄 Loading checkpoint from: {args.resume}")
961
- accelerator.load_state(args.resume)
962
-
963
- # Restore training metadata
964
- meta_path = os.path.join(args.resume, "training_meta.pkl")
965
- if os.path.exists(meta_path):
966
- with open(meta_path, "rb") as f:
967
- meta = pickle.load(f)
968
- start_epoch = meta.get("epoch", 0)
969
- best_val_loss = meta.get("best_val_loss", float("inf"))
970
- patience_ctr = meta.get("patience_ctr", 0)
971
- logger.info(
972
- f" ✅ Restored: Epoch {start_epoch}, Best Loss: {best_val_loss:.6f}"
973
- )
974
- else:
975
- logger.warning(
976
- " ⚠️ training_meta.pkl not found, starting from epoch 0"
977
- )
978
-
979
- # Restore history
980
- history_path = os.path.join(args.output_dir, "training_history.csv")
981
- if os.path.exists(history_path):
982
- history = pd.read_csv(history_path).to_dict("records")
983
- logger.info(f" ✅ Loaded {len(history)} epochs from history")
984
- else:
985
- raise FileNotFoundError(f"Checkpoint not found: {args.resume}")
986
-
987
- # ==========================================================================
988
- # PHYSICAL METRIC SETUP
989
- # ==========================================================================
990
- # Physical MAE = normalized MAE * scaler.scale_
991
- phys_scale = torch.tensor(
992
- scaler.scale_, device=accelerator.device, dtype=torch.float32
993
- )
994
-
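A quick worked example of this unscaling (illustrative values; scaler.scale_ holds the per-target spread of a fitted scaler such as sklearn's StandardScaler):

import numpy as np

scale_ = np.array([200.0, 0.5])        # physical spread per target, e.g. m/s and mm
norm_abs_err = np.array([0.05, 0.02])  # mean |pred - target| in normalized units

phys_mae = norm_abs_err * scale_
print(phys_mae)                        # [10.    0.01] -> MAE expressed in physical units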
995
- # ==========================================================================
996
- # TRAINING LOOP
997
- # ==========================================================================
998
- # Dynamic console header
999
- if accelerator.is_main_process:
1000
- base_cols = ["Ep", "TrnLoss", "ValLoss", "R2", "PCC", "GradN", "LR", "MAE_Avg"]
1001
- param_cols = [f"MAE_P{i}" for i in range(out_dim)]
1002
- header = "{:<4} | {:<8} | {:<8} | {:<6} | {:<6} | {:<6} | {:<8} | {:<8}".format(
1003
- *base_cols
1004
- )
1005
- header += " | " + " | ".join([f"{c:<8}" for c in param_cols])
1006
- logger.info("=" * len(header))
1007
- logger.info(header)
1008
- logger.info("=" * len(header))
1009
-
1010
- try:
1011
- total_training_time = 0.0
1012
-
1013
- for epoch in range(start_epoch, args.epochs):
1014
- epoch_start_time = time.time()
1015
-
1016
- # ==================== TRAINING PHASE ====================
1017
- model.train()
1018
- # Use GPU tensor for loss accumulation to avoid .item() sync per batch
1019
- train_loss_sum = torch.tensor(0.0, device=accelerator.device)
1020
- train_samples = 0
1021
- grad_norm_tracker = MetricTracker()
1022
-
1023
- pbar = tqdm(
1024
- train_dl,
1025
- disable=not accelerator.is_main_process,
1026
- leave=False,
1027
- desc=f"Epoch {epoch + 1}",
1028
- )
1029
-
1030
- for x, y in pbar:
1031
- with accelerator.accumulate(model):
1032
- # Use mixed precision for forward pass (respects --precision flag)
1033
- with accelerator.autocast():
1034
- pred = model(x)
1035
- # Pass inputs for input-dependent constraints (x_mean, x[...], etc.)
1036
- if isinstance(criterion, PhysicsConstrainedLoss):
1037
- loss = criterion(pred, y, x)
1038
- else:
1039
- loss = criterion(pred, y)
1040
-
1041
- accelerator.backward(loss)
1042
-
1043
- if accelerator.sync_gradients:
1044
- grad_norm = accelerator.clip_grad_norm_(
1045
- model.parameters(), args.grad_clip
1046
- )
1047
- if grad_norm is not None:
1048
- grad_norm_tracker.update(grad_norm.item())
1049
-
1050
- optimizer.step()
1051
- optimizer.zero_grad(set_to_none=True) # Faster than zero_grad()
1052
-
1053
- # Per-batch LR scheduling (e.g., OneCycleLR)
1054
- if scheduler_step_per_batch:
1055
- scheduler.step()
1056
-
1057
- # Accumulate as tensors to avoid .item() sync per batch
1058
- train_loss_sum += loss.detach() * x.size(0)
1059
- train_samples += x.size(0)
1060
-
1061
- # Single .item() call at end of epoch (reduces GPU sync overhead)
1062
- train_loss_scalar = train_loss_sum.item()
1063
-
1064
- # Synchronize training metrics across GPUs
1065
- global_loss = accelerator.reduce(
1066
- torch.tensor([train_loss_scalar], device=accelerator.device),
1067
- reduction="sum",
1068
- ).item()
1069
- global_samples = accelerator.reduce(
1070
- torch.tensor([train_samples], device=accelerator.device),
1071
- reduction="sum",
1072
- ).item()
1073
- avg_train_loss = global_loss / global_samples
1074
-
1075
- # ==================== VALIDATION PHASE ====================
1076
- model.eval()
1077
- # Use GPU tensor for loss accumulation (consistent with training phase)
1078
- val_loss_sum = torch.tensor(0.0, device=accelerator.device)
1079
- val_mae_sum = torch.zeros(out_dim, device=accelerator.device)
1080
- val_samples = 0
1081
-
1082
- # Accumulate predictions locally ON CPU to prevent GPU OOM
1083
- local_preds = []
1084
- local_targets = []
1085
-
1086
- with torch.inference_mode():
1087
- for x, y in val_dl:
1088
- # Use mixed precision for validation (consistent with training)
1089
- with accelerator.autocast():
1090
- pred = model(x)
1091
- # Pass inputs for input-dependent constraints
1092
- if isinstance(criterion, PhysicsConstrainedLoss):
1093
- loss = criterion(pred, y, x)
1094
- else:
1095
- loss = criterion(pred, y)
1096
-
1097
- val_loss_sum += loss.detach() * x.size(0)
1098
- val_samples += x.size(0)
1099
-
1100
- # Physical MAE
1101
- mae_batch = torch.abs((pred - y) * phys_scale).sum(dim=0)
1102
- val_mae_sum += mae_batch
1103
-
1104
- # Store on CPU (critical for large val sets)
1105
- local_preds.append(pred.detach().cpu())
1106
- local_targets.append(y.detach().cpu())
1107
-
1108
- # Concatenate locally on CPU (moved back to GPU only for the cross-rank gather below)
1109
- local_preds_cat = torch.cat(local_preds)
1110
- local_targets_cat = torch.cat(local_targets)
1111
-
1112
- # Gather predictions and targets using Accelerate's CPU-efficient utility
1113
- # gather_for_metrics handles:
1114
- # - DDP padding removal (no need to trim manually)
1115
- # - Efficient cross-rank gathering without GPU memory spike
1116
- # - Results are moved back to CPU below for metric computation
1117
- if accelerator.num_processes > 1:
1118
- # Move to GPU for gather (required by NCCL), then back to CPU
1119
- # gather_for_metrics is more memory-efficient than manual gather
1120
- # as it processes in chunks internally
1121
- gathered_preds = accelerator.gather_for_metrics(
1122
- local_preds_cat.to(accelerator.device)
1123
- ).cpu()
1124
- gathered_targets = accelerator.gather_for_metrics(
1125
- local_targets_cat.to(accelerator.device)
1126
- ).cpu()
1127
- else:
1128
- # Single-GPU mode: no gathering needed
1129
- gathered_preds = local_preds_cat
1130
- gathered_targets = local_targets_cat
1131
-
1132
- # Synchronize validation metrics (scalars only - efficient)
1133
- val_loss_scalar = val_loss_sum.item()
1134
- val_metrics = torch.cat(
1135
- [
1136
- torch.tensor([val_loss_scalar], device=accelerator.device),
1137
- val_mae_sum,
1138
- ]
1139
- )
1140
- val_metrics_sync = accelerator.reduce(val_metrics, reduction="sum")
1141
-
1142
- total_val_samples = accelerator.reduce(
1143
- torch.tensor([val_samples], device=accelerator.device), reduction="sum"
1144
- ).item()
1145
-
1146
- avg_val_loss = val_metrics_sync[0].item() / total_val_samples
1147
- # Cast to float32 before numpy (bf16 tensors can't convert directly)
1148
- avg_mae_per_param = (
1149
- (val_metrics_sync[1:] / total_val_samples).float().cpu().numpy()
1150
- )
1151
- avg_mae = avg_mae_per_param.mean()
1152
-
1153
- # ==================== LOGGING & CHECKPOINTING ====================
1154
- if accelerator.is_main_process:
1155
- # Scientific metrics - cast to float32 before numpy
1156
- # gather_for_metrics already handles DDP padding removal
1157
- y_pred = gathered_preds.float().numpy()
1158
- y_true = gathered_targets.float().numpy()
1159
-
1160
- # Guard against tiny validation sets (R² undefined for <2 samples)
1161
- if len(y_true) >= 2:
1162
- r2 = r2_score(y_true, y_pred)
1163
- else:
1164
- r2 = float("nan")
1165
- pcc = calc_pearson(y_true, y_pred)
1166
- current_lr = get_lr(optimizer)
1167
-
1168
- # Update history
1169
- epoch_end_time = time.time()
1170
- epoch_time = epoch_end_time - epoch_start_time
1171
- total_training_time += epoch_time
1172
-
1173
- epoch_stats = {
1174
- "epoch": epoch + 1,
1175
- "train_loss": avg_train_loss,
1176
- "val_loss": avg_val_loss,
1177
- "val_r2": r2,
1178
- "val_pearson": pcc,
1179
- "val_mae_avg": avg_mae,
1180
- "grad_norm": grad_norm_tracker.avg,
1181
- "lr": current_lr,
1182
- "epoch_time": round(epoch_time, 2),
1183
- "total_time": round(total_training_time, 2),
1184
- }
1185
- for i, mae in enumerate(avg_mae_per_param):
1186
- epoch_stats[f"MAE_Phys_P{i}"] = mae
1187
-
1188
- history.append(epoch_stats)
1189
-
1190
- # Console display
1191
- base_str = f"{epoch + 1:<4} | {avg_train_loss:<8.4f} | {avg_val_loss:<8.4f} | {r2:<6.4f} | {pcc:<6.4f} | {grad_norm_tracker.avg:<6.4f} | {current_lr:<8.2e} | {avg_mae:<8.4f}"
1192
- param_str = " | ".join([f"{m:<8.4f}" for m in avg_mae_per_param])
1193
- logger.info(f"{base_str} | {param_str}")
1194
-
1195
- # WandB logging
1196
- if args.wandb and WANDB_AVAILABLE:
1197
- log_dict = {
1198
- "main/train_loss": avg_train_loss,
1199
- "main/val_loss": avg_val_loss,
1200
- "metrics/r2_score": r2,
1201
- "metrics/pearson_corr": pcc,
1202
- "metrics/mae_avg": avg_mae,
1203
- "system/grad_norm": grad_norm_tracker.avg,
1204
- "hyper/lr": current_lr,
1205
- }
1206
- for i, mae in enumerate(avg_mae_per_param):
1207
- log_dict[f"mae_detailed/P{i}"] = mae
1208
-
1209
- # Periodic scatter plots
1210
- if (epoch % 5 == 0) or (avg_val_loss < best_val_loss):
1211
- real_true = scaler.inverse_transform(y_true)
1212
- real_pred = scaler.inverse_transform(y_pred)
1213
- fig = plot_scientific_scatter(real_true, real_pred)
1214
- log_dict["plots/scatter_analysis"] = wandb.Image(fig)
1215
- plt.close(fig)
1216
-
1217
- accelerator.log(log_dict)
1218
-
1219
- # ==========================================================================
1220
- # DDP-SAFE CHECKPOINT LOGIC
1221
- # ==========================================================================
1222
- # Step 1: Determine if this is the best epoch (BEFORE updating best_val_loss)
1223
- is_best_epoch = False
1224
- if accelerator.is_main_process:
1225
- if avg_val_loss < best_val_loss:
1226
- is_best_epoch = True
1227
-
1228
- # Step 2: Broadcast decision to all ranks (required for save_state)
1229
- is_best_epoch = broadcast_early_stop(is_best_epoch, accelerator)
1230
-
1231
- # Step 3: Save checkpoint with all ranks participating
1232
- if is_best_epoch:
1233
- ckpt_dir = os.path.join(args.output_dir, "best_checkpoint")
1234
- with suppress_accelerate_logging():
1235
- accelerator.save_state(ckpt_dir, safe_serialization=False)
1236
-
1237
- # Step 4: Rank 0 handles metadata and updates tracking variables
1238
- if accelerator.is_main_process:
1239
- best_val_loss = avg_val_loss # Update AFTER checkpoint saved
1240
- patience_ctr = 0
1241
-
1242
- with open(os.path.join(ckpt_dir, "training_meta.pkl"), "wb") as f:
1243
- pickle.dump(
1244
- {
1245
- "epoch": epoch + 1,
1246
- "best_val_loss": best_val_loss,
1247
- "patience_ctr": patience_ctr,
1248
- # Model info for auto-detection during inference
1249
- "model_name": args.model,
1250
- "in_shape": in_shape,
1251
- "out_dim": out_dim,
1252
- },
1253
- f,
1254
- )
1255
-
1256
- # Unwrap model for saving (handle torch.compile compatibility)
1257
- try:
1258
- unwrapped = accelerator.unwrap_model(model)
1259
- except KeyError:
1260
- # torch.compile model may not have _orig_mod in expected location
1261
- # Fall back to getting the module directly
1262
- unwrapped = model.module if hasattr(model, "module") else model
1263
- # If still compiled, try to get the underlying model
1264
- if hasattr(unwrapped, "_orig_mod"):
1265
- unwrapped = unwrapped._orig_mod
1266
-
1267
- torch.save(
1268
- unwrapped.state_dict(),
1269
- os.path.join(args.output_dir, "best_model_weights.pth"),
1270
- )
1271
-
1272
- # Copy scaler to checkpoint for portability
1273
- scaler_src = os.path.join(args.output_dir, "scaler.pkl")
1274
- scaler_dst = os.path.join(ckpt_dir, "scaler.pkl")
1275
- if os.path.exists(scaler_src) and not os.path.exists(scaler_dst):
1276
- shutil.copy2(scaler_src, scaler_dst)
1277
-
1278
- logger.info(
1279
- f" 💾 Best model saved (val_loss: {best_val_loss:.6f})"
1280
- )
1281
-
1282
- # Also save CSV on best model (ensures progress is saved)
1283
- pd.DataFrame(history).to_csv(
1284
- os.path.join(args.output_dir, "training_history.csv"),
1285
- index=False,
1286
- )
1287
- else:
1288
- if accelerator.is_main_process:
1289
- patience_ctr += 1
1290
-
1291
- # Periodic checkpoint (all ranks participate in save_state)
1292
- periodic_checkpoint_needed = (
1293
- args.save_every > 0 and (epoch + 1) % args.save_every == 0
1294
- )
1295
- if periodic_checkpoint_needed:
1296
- ckpt_name = f"epoch_{epoch + 1}_checkpoint"
1297
- ckpt_dir = os.path.join(args.output_dir, ckpt_name)
1298
- with suppress_accelerate_logging():
1299
- accelerator.save_state(ckpt_dir, safe_serialization=False)
1300
-
1301
- if accelerator.is_main_process:
1302
- with open(os.path.join(ckpt_dir, "training_meta.pkl"), "wb") as f:
1303
- pickle.dump(
1304
- {
1305
- "epoch": epoch + 1,
1306
- "best_val_loss": best_val_loss,
1307
- "patience_ctr": patience_ctr,
1308
- # Model info for auto-detection during inference
1309
- "model_name": args.model,
1310
- "in_shape": in_shape,
1311
- "out_dim": out_dim,
1312
- },
1313
- f,
1314
- )
1315
- logger.info(f" 📁 Periodic checkpoint: {ckpt_name}")
1316
-
1317
- # Save CSV with each checkpoint (keeps logs in sync with model state)
1318
- pd.DataFrame(history).to_csv(
1319
- os.path.join(args.output_dir, "training_history.csv"),
1320
- index=False,
1321
- )
1322
-
1323
- # Learning rate scheduling (epoch-based schedulers only)
1324
- # NOTE: For ReduceLROnPlateau with DDP, we must step only on main process
1325
- # to avoid patience counter being incremented by all GPU processes.
1326
- # Then we sync the new LR to all processes to keep them consistent.
1327
- if not scheduler_step_per_batch:
1328
- if args.scheduler == "plateau":
1329
- # Step only on main process to avoid multi-GPU patience bug
1330
- if accelerator.is_main_process:
1331
- scheduler.step(avg_val_loss)
1332
-
1333
- # Sync LR across all processes after main process updates it
1334
- accelerator.wait_for_everyone()
1335
-
1336
- # Broadcast new LR from rank 0 to all processes
1337
- if dist.is_initialized():
1338
- if accelerator.is_main_process:
1339
- new_lr = optimizer.param_groups[0]["lr"]
1340
- else:
1341
- new_lr = 0.0
1342
- new_lr_tensor = torch.tensor(
1343
- new_lr, device=accelerator.device, dtype=torch.float32
1344
- )
1345
- dist.broadcast(new_lr_tensor, src=0)
1346
- # Update LR on non-main processes
1347
- if not accelerator.is_main_process:
1348
- for param_group in optimizer.param_groups:
1349
- param_group["lr"] = new_lr_tensor.item()
1350
- else:
1351
- scheduler.step()
1352
-
1353
- # DDP-safe early stopping
1354
- should_stop = (
1355
- patience_ctr >= args.patience if accelerator.is_main_process else False
1356
- )
1357
- if broadcast_early_stop(should_stop, accelerator):
1358
- if accelerator.is_main_process:
1359
- logger.info(
1360
- f"🛑 Early stopping at epoch {epoch + 1} (patience={args.patience})"
1361
- )
1362
- # Create completion flag to prevent auto-resume
1363
- with open(
1364
- os.path.join(args.output_dir, "training_complete.flag"), "w"
1365
- ) as f:
1366
- f.write(
1367
- f"Training completed via early stopping at epoch {epoch + 1}\n"
1368
- )
1369
- break
1370
-
1371
- except KeyboardInterrupt:
1372
- logger.warning("Training interrupted. Saving emergency checkpoint...")
1373
- with suppress_accelerate_logging():
1374
- accelerator.save_state(
1375
- os.path.join(args.output_dir, "interrupted_checkpoint"),
1376
- safe_serialization=False,
1377
- )
1378
-
1379
- except Exception as e:
1380
- logger.error(f"Critical error: {e}", exc_info=True)
1381
- raise
1382
-
1383
- else:
1384
- # Training completed normally (reached max epochs without early stopping)
1385
- # Create completion flag to prevent auto-resume on re-run
1386
- if accelerator.is_main_process:
1387
- if not os.path.exists(complete_flag_path):
1388
- with open(complete_flag_path, "w") as f:
1389
- f.write(f"Training completed normally after {args.epochs} epochs\n")
1390
- logger.info(f"✅ Training completed after {args.epochs} epochs")
1391
-
1392
- finally:
1393
- # Final CSV write to capture all epochs (covers epochs after the last checkpoint save)
1394
- if accelerator.is_main_process and len(history) > 0:
1395
- pd.DataFrame(history).to_csv(
1396
- os.path.join(args.output_dir, "training_history.csv"),
1397
- index=False,
1398
- )
1399
-
1400
- # Generate training curves plot (PNG + SVG)
1401
- if accelerator.is_main_process and len(history) > 0:
1402
- try:
1403
- fig = create_training_curves(history, show_lr=True)
1404
- for fmt in ["png", "svg"]:
1405
- fig.savefig(
1406
- os.path.join(args.output_dir, f"training_curves.{fmt}"),
1407
- dpi=FIGURE_DPI,
1408
- bbox_inches="tight",
1409
- )
1410
- plt.close(fig)
1411
- logger.info("✔ Saved: training_curves.png, training_curves.svg")
1412
- except Exception as e:
1413
- logger.warning(f"Could not generate training curves: {e}")
1414
-
1415
- if args.wandb and WANDB_AVAILABLE:
1416
- accelerator.end_training()
1417
-
1418
- # Clean up distributed process group to prevent resource leak warning
1419
- if torch.distributed.is_initialized():
1420
- torch.distributed.destroy_process_group()
1421
-
1422
- logger.info("Training completed.")
1423
-
1424
-
1425
- if __name__ == "__main__":
1426
- try:
1427
- torch.multiprocessing.set_start_method("spawn")
1428
- except RuntimeError:
1429
- pass
1430
- main()
1
+ """
2
+ WaveDL - Deep Learning for Wave-based Inverse Problems
3
+ =======================================================
4
+ Target Environment: NVIDIA HPC GPUs (Multi-GPU DDP) | PyTorch 2.x | Python 3.11+
5
+
6
+ A modular training framework for wave-based inverse problems and regression:
7
+ 1. HPC-Grade DDP Training: BF16/FP16 mixed precision with torch.compile support
8
+ 2. Dynamic Model Selection: Use --model flag to select any registered architecture
9
+ 3. Zero-Copy Data Engine: Memmap-backed datasets for large-scale training
10
+ 4. Physics-Aware Metrics: Real-time physical MAE with proper unscaling
11
+ 5. Robust Checkpointing: Resume training, periodic saves, and training curves
12
+ 6. Deep Observability: WandB integration with scatter analysis
13
+
14
+ Usage:
15
+ # Recommended: Using the HPC launcher (handles accelerate configuration)
16
+ wavedl-hpc --model cnn --batch_size 128 --mixed_precision bf16 --wandb
17
+
18
+ # Or direct training module (use --precision, not --mixed_precision)
19
+ accelerate launch -m wavedl.train --model cnn --batch_size 128 --precision bf16
20
+
21
+ # Multi-GPU with explicit config
22
+ wavedl-hpc --num_gpus 4 --mixed_precision bf16 --model cnn --wandb
23
+
24
+ # Resume from checkpoint
25
+ accelerate launch -m wavedl.train --model cnn --resume best_checkpoint --wandb
26
+
27
+ # List available models
28
+ wavedl-train --list_models
29
+
30
+ Note:
31
+ - wavedl-hpc: Uses --mixed_precision (passed to accelerate launch)
32
+ - wavedl.train: Uses --precision (internal module flag)
33
+ Both control the same behavior; use the appropriate flag for your entry point.
34
+
35
+ Author: Ductho Le (ductho.le@outlook.com)
36
+ """
37
+
38
+ from __future__ import annotations
39
+
40
+ # =============================================================================
41
+ # HPC Environment Setup (MUST be before any library imports)
42
+ # =============================================================================
43
+ # Auto-configure writable cache directories when home is not writable.
44
+ # Uses current working directory as fallback - works on HPC and local machines.
45
+ import os
46
+
47
+
48
+ def _setup_cache_dir(env_var: str, subdir: str) -> None:
49
+ """Set cache directory to CWD if home is not writable."""
50
+ if env_var in os.environ:
51
+ return # User already set, respect their choice
52
+
53
+ # Check if home is writable
54
+ home = os.path.expanduser("~")
55
+ if os.access(home, os.W_OK):
56
+ return # Home is writable, let library use defaults
57
+
58
+ # Home not writable - use current working directory
59
+ cache_path = os.path.join(os.getcwd(), f".{subdir}")
60
+ os.makedirs(cache_path, exist_ok=True)
61
+ os.environ[env_var] = cache_path
62
+
63
+
64
+ # Configure cache directories (before any library imports)
65
+ _setup_cache_dir("TORCH_HOME", "torch_cache")
66
+ _setup_cache_dir("MPLCONFIGDIR", "matplotlib")
67
+ _setup_cache_dir("FONTCONFIG_CACHE", "fontconfig")
68
+ _setup_cache_dir("XDG_DATA_HOME", "local/share")
69
+ _setup_cache_dir("XDG_STATE_HOME", "local/state")
70
+ _setup_cache_dir("XDG_CACHE_HOME", "cache")
71
+
72
+
73
+ def _setup_per_rank_compile_cache() -> None:
74
+ """Set per-GPU Triton/Inductor cache to prevent multi-process race warnings.
75
+
76
+ When using torch.compile with multiple GPUs, all processes try to write to
77
+ the same cache directory, causing 'Directory is not empty - skipping!' warnings.
78
+ This gives each GPU rank its own isolated cache subdirectory.
79
+ """
80
+ # Get local rank from environment (set by accelerate/torchrun)
81
+ local_rank = os.environ.get("LOCAL_RANK", "0")
82
+
83
+ # Get cache base from environment or use CWD
84
+ cache_base = os.environ.get(
85
+ "TRITON_CACHE_DIR", os.path.join(os.getcwd(), ".triton_cache")
86
+ )
87
+
88
+ # Set per-rank cache directories
89
+ os.environ["TRITON_CACHE_DIR"] = os.path.join(cache_base, f"rank_{local_rank}")
90
+ os.environ["TORCHINDUCTOR_CACHE_DIR"] = os.path.join(
91
+ os.environ.get(
92
+ "TORCHINDUCTOR_CACHE_DIR", os.path.join(os.getcwd(), ".inductor_cache")
93
+ ),
94
+ f"rank_{local_rank}",
95
+ )
96
+
97
+ # Create directories
98
+ os.makedirs(os.environ["TRITON_CACHE_DIR"], exist_ok=True)
99
+ os.makedirs(os.environ["TORCHINDUCTOR_CACHE_DIR"], exist_ok=True)
100
+
101
+
102
+ # Setup per-rank compile caches (before torch imports)
103
+ _setup_per_rank_compile_cache()
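+ # e.g., a process launched with LOCAL_RANK=2 and no cache env vars set ends up with
+ # ./.triton_cache/rank_2 and ./.inductor_cache/rank_2 (illustrative CWD-relative paths)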
104
+
105
+ # =============================================================================
106
+ # Standard imports (after environment setup)
107
+ # =============================================================================
108
+ import argparse
109
+ import logging
110
+ import pickle
111
+ import shutil
112
+ import sys
113
+ import time
114
+ import warnings
115
+ from typing import Any
116
+
117
+
118
+ # Suppress Pydantic warnings from accelerate's internal Field() usage
119
+ warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
120
+
121
+ import matplotlib.pyplot as plt
122
+ import numpy as np
123
+ import pandas as pd
124
+ import torch
125
+ import torch.distributed as dist
126
+ from accelerate import Accelerator
127
+ from accelerate.utils import set_seed
128
+ from sklearn.metrics import r2_score
129
+ from tqdm.auto import tqdm
130
+
131
+ from wavedl.models import build_model, get_model, list_models
132
+ from wavedl.utils import (
133
+ FIGURE_DPI,
134
+ MetricTracker,
135
+ broadcast_early_stop,
136
+ calc_pearson,
137
+ create_training_curves,
138
+ get_loss,
139
+ get_lr,
140
+ get_optimizer,
141
+ get_scheduler,
142
+ is_epoch_based,
143
+ list_losses,
144
+ list_optimizers,
145
+ list_schedulers,
146
+ plot_scientific_scatter,
147
+ prepare_data,
148
+ )
149
+
150
+
151
+ try:
152
+ import wandb
153
+
154
+ WANDB_AVAILABLE = True
155
+ except ImportError:
156
+ WANDB_AVAILABLE = False
157
+
158
+ # ==============================================================================
159
+ # RUNTIME CONFIGURATION (post-import)
160
+ # ==============================================================================
161
+ # Configure matplotlib paths for HPC systems without writable home directories
162
+ os.environ.setdefault("MPLCONFIGDIR", os.path.join(os.getenv("TMPDIR", "/tmp"), "matplotlib"))
163
+ os.environ.setdefault("FONTCONFIG_PATH", "/etc/fonts")
164
+
165
+ # Suppress warnings from known-noisy libraries, but preserve legitimate warnings
166
+ # from torch/numpy about NaN, dtype, and numerical issues.
167
+ warnings.filterwarnings("ignore", category=FutureWarning)
168
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
169
+ # Pydantic v1/v2 compatibility warnings
170
+ warnings.filterwarnings("ignore", module="pydantic")
171
+ warnings.filterwarnings("ignore", message=".*UnsupportedFieldAttributeWarning.*")
172
+ # Transformer library warnings (loading configs, etc.)
173
+ warnings.filterwarnings("ignore", module="transformers")
174
+ # Accelerate verbose messages
175
+ warnings.filterwarnings("ignore", module="accelerate")
176
+ # torch.compile backend selection warnings
177
+ warnings.filterwarnings("ignore", message=".*TorchDynamo.*")
178
+ warnings.filterwarnings("ignore", message=".*Dynamo is not supported.*")
179
+ # Note: UserWarning from torch/numpy core is NOT suppressed to preserve
180
+ # legitimate warnings about NaN values, dtype mismatches, etc.
181
+
182
+ # ==============================================================================
183
+ # GPU PERFORMANCE OPTIMIZATIONS (Ampere/Hopper: A100, H100)
184
+ # ==============================================================================
185
+ # Enable TF32 for faster matmul (safe precision for training, ~2x speedup)
186
+ torch.backends.cuda.matmul.allow_tf32 = True
187
+ torch.backends.cudnn.allow_tf32 = True
188
+ torch.set_float32_matmul_precision("high") # Use TF32 for float32 ops
189
+
190
+ # Enable cuDNN autotuning for fixed-size inputs (CNN-like models benefit most)
191
+ # Note: First few batches may be slower due to benchmarking
192
+ torch.backends.cudnn.benchmark = True
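+ # Note: passing --deterministic (handled in main) re-disables TF32 and cuDNN benchmarking
+ # for exact reproducibility.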
193
+
194
+
195
+ # ==============================================================================
196
+ # LOGGING UTILITIES
197
+ # ==============================================================================
198
+ from contextlib import contextmanager
199
+
200
+
201
+ @contextmanager
202
+ def suppress_accelerate_logging():
203
+ """Temporarily suppress accelerate's verbose checkpoint save messages."""
204
+ accelerate_logger = logging.getLogger("accelerate.checkpointing")
205
+ original_level = accelerate_logger.level
206
+ accelerate_logger.setLevel(logging.WARNING)
207
+ try:
208
+ yield
209
+ finally:
210
+ accelerate_logger.setLevel(original_level)
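+ # Used around the checkpoint saves below, e.g.:
+ #     with suppress_accelerate_logging():
+ #         accelerator.save_state(ckpt_dir, safe_serialization=False)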
211
+
212
+
213
+ # ==============================================================================
214
+ # ARGUMENT PARSING
215
+ # ==============================================================================
216
+ def parse_args() -> tuple[argparse.Namespace, argparse.ArgumentParser]:
217
+ """Parse command-line arguments with comprehensive options."""
218
+ parser = argparse.ArgumentParser(
219
+ description="Universal DDP Training Pipeline",
220
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
221
+ )
222
+
223
+ # Model Selection
224
+ parser.add_argument(
225
+ "--model",
226
+ type=str,
227
+ default="cnn",
228
+ help=f"Model architecture to train. Available: {list_models()}",
229
+ )
230
+ parser.add_argument(
231
+ "--list_models", action="store_true", help="List all available models and exit"
232
+ )
233
+ parser.add_argument(
234
+ "--import",
235
+ dest="import_modules",
236
+ type=str,
237
+ nargs="+",
238
+ default=[],
239
+ help="Python modules to import before training (for custom models)",
240
+ )
241
+ parser.add_argument(
242
+ "--no_pretrained",
243
+ dest="pretrained",
244
+ action="store_false",
245
+ help="Train from scratch without pretrained weights (default: use pretrained)",
246
+ )
247
+ parser.set_defaults(pretrained=True)
248
+
249
+ # Configuration File
250
+ parser.add_argument(
251
+ "--config",
252
+ type=str,
253
+ default=None,
254
+ help="Path to YAML config file. CLI args override config values.",
255
+ )
256
+
257
+ # Hyperparameters
258
+ parser.add_argument(
259
+ "--batch_size", type=int, default=128, help="Batch size per GPU"
260
+ )
261
+ parser.add_argument("--lr", type=float, default=1e-3, help="Initial learning rate")
262
+ parser.add_argument(
263
+ "--epochs", type=int, default=1000, help="Maximum training epochs"
264
+ )
265
+ parser.add_argument(
266
+ "--patience", type=int, default=20, help="Early stopping patience"
267
+ )
268
+ parser.add_argument("--weight_decay", type=float, default=1e-4, help="Weight decay")
269
+ parser.add_argument(
270
+ "--grad_clip", type=float, default=1.0, help="Gradient clipping norm"
271
+ )
272
+
273
+ # Loss Function
274
+ parser.add_argument(
275
+ "--loss",
276
+ type=str,
277
+ default="mse",
278
+ choices=["mse", "mae", "huber", "smooth_l1", "log_cosh", "weighted_mse"],
279
+ help=f"Loss function for training. Available: {list_losses()}",
280
+ )
281
+ parser.add_argument(
282
+ "--huber_delta", type=float, default=1.0, help="Delta for Huber loss"
283
+ )
284
+ parser.add_argument(
285
+ "--loss_weights",
286
+ type=str,
287
+ default=None,
288
+ help="Comma-separated weights for weighted_mse (e.g., '1.0,2.0,1.0')",
289
+ )
290
+
291
+ # Optimizer
292
+ parser.add_argument(
293
+ "--optimizer",
294
+ type=str,
295
+ default="adamw",
296
+ choices=["adamw", "adam", "sgd", "nadam", "radam", "rmsprop"],
297
+ help=f"Optimizer for training. Available: {list_optimizers()}",
298
+ )
299
+ parser.add_argument(
300
+ "--momentum", type=float, default=0.9, help="Momentum for SGD/RMSprop"
301
+ )
302
+ parser.add_argument(
303
+ "--nesterov", action="store_true", help="Use Nesterov momentum (SGD)"
304
+ )
305
+ parser.add_argument(
306
+ "--betas",
307
+ type=str,
308
+ default="0.9,0.999",
309
+ help="Betas for Adam variants (comma-separated)",
310
+ )
311
+
312
+ # Learning Rate Scheduler
313
+ parser.add_argument(
314
+ "--scheduler",
315
+ type=str,
316
+ default="plateau",
317
+ choices=[
318
+ "plateau",
319
+ "cosine",
320
+ "cosine_restarts",
321
+ "onecycle",
322
+ "step",
323
+ "multistep",
324
+ "exponential",
325
+ "linear_warmup",
326
+ ],
327
+ help=f"LR scheduler. Available: {list_schedulers()}",
328
+ )
329
+ parser.add_argument(
330
+ "--scheduler_patience",
331
+ type=int,
332
+ default=10,
333
+ help="Patience for ReduceLROnPlateau",
334
+ )
335
+ parser.add_argument(
336
+ "--min_lr", type=float, default=1e-6, help="Minimum learning rate"
337
+ )
338
+ parser.add_argument(
339
+ "--scheduler_factor", type=float, default=0.5, help="LR reduction factor"
340
+ )
341
+ parser.add_argument(
342
+ "--warmup_epochs", type=int, default=5, help="Warmup epochs for linear_warmup"
343
+ )
344
+ parser.add_argument(
345
+ "--step_size", type=int, default=30, help="Step size for StepLR"
346
+ )
347
+ parser.add_argument(
348
+ "--milestones",
349
+ type=str,
350
+ default=None,
351
+ help="Comma-separated epochs for MultiStepLR (e.g., '30,60,90')",
352
+ )
353
+
354
+ # Data
355
+ parser.add_argument(
356
+ "--data_path", type=str, default="train_data.npz", help="Path to NPZ dataset"
357
+ )
358
+ parser.add_argument(
359
+ "--workers",
360
+ type=int,
361
+ default=-1,
362
+ help="DataLoader workers per GPU (-1=auto-detect based on CPU cores)",
363
+ )
364
+ parser.add_argument("--seed", type=int, default=2025, help="Random seed")
365
+ parser.add_argument(
366
+ "--deterministic",
367
+ action="store_true",
368
+ help="Enable deterministic mode for reproducibility (slower, disables TF32/cuDNN benchmark)",
369
+ )
370
+ parser.add_argument(
371
+ "--cache_validate",
372
+ type=str,
373
+ default="sha256",
374
+ choices=["sha256", "fast", "size"],
375
+ help="Cache validation mode: sha256 (full hash), fast (partial), size (quick)",
376
+ )
377
+ parser.add_argument(
378
+ "--single_channel",
379
+ action="store_true",
380
+ help="Confirm data is single-channel (suppress ambiguous shape warnings for shallow 3D volumes)",
381
+ )
382
+
383
+ # Cross-Validation
384
+ parser.add_argument(
385
+ "--cv",
386
+ type=int,
387
+ default=0,
388
+ help="Enable K-fold cross-validation with K folds (0=disabled)",
389
+ )
390
+ parser.add_argument(
391
+ "--cv_stratify",
392
+ action="store_true",
393
+ help="Use stratified splitting for cross-validation",
394
+ )
395
+ parser.add_argument(
396
+ "--cv_bins",
397
+ type=int,
398
+ default=10,
399
+ help="Number of bins for stratified CV (only with --cv_stratify)",
400
+ )
401
+
402
+ # Checkpointing & Resume
403
+ parser.add_argument(
404
+ "--resume", type=str, default=None, help="Checkpoint directory to resume from"
405
+ )
406
+ parser.add_argument(
407
+ "--save_every",
408
+ type=int,
409
+ default=50,
410
+ help="Save checkpoint every N epochs (0=disable)",
411
+ )
412
+ parser.add_argument(
413
+ "--output_dir", type=str, default=".", help="Output directory for checkpoints"
414
+ )
415
+ parser.add_argument(
416
+ "--fresh",
417
+ action="store_true",
418
+ help="Force fresh training, ignore existing checkpoints",
419
+ )
420
+
421
+ # Performance
422
+ parser.add_argument(
423
+ "--compile", action="store_true", help="Enable torch.compile (PyTorch 2.x)"
424
+ )
425
+ parser.add_argument(
426
+ "--precision",
427
+ type=str,
428
+ default="bf16",
429
+ choices=["bf16", "fp16", "no"],
430
+ help="Mixed precision mode",
431
+ )
432
+ # Alias for consistency with wavedl-hpc (--mixed_precision)
433
+ parser.add_argument(
434
+ "--mixed_precision",
435
+ dest="precision",
436
+ type=str,
437
+ choices=["bf16", "fp16", "no"],
438
+ help=argparse.SUPPRESS, # Hidden: use --precision instead
439
+ )
440
+
441
+ # Physical Constraints
442
+ parser.add_argument(
443
+ "--constraint",
444
+ type=str,
445
+ nargs="+",
446
+ default=[],
447
+ help="Soft constraint expressions: 'y0 - y1*y2' (penalize violations)",
448
+ )
449
+
450
+ parser.add_argument(
451
+ "--constraint_file",
452
+ type=str,
453
+ default=None,
454
+ help="Python file with constraint(pred, inputs) function",
455
+ )
456
+ parser.add_argument(
457
+ "--constraint_weight",
458
+ type=float,
459
+ nargs="+",
460
+ default=[0.1],
461
+ help="Weight(s) for soft constraints (one per constraint, or single shared weight)",
462
+ )
463
+ parser.add_argument(
464
+ "--constraint_reduction",
465
+ type=str,
466
+ default="mse",
467
+ choices=["mse", "mae"],
468
+ help="Reduction mode for constraint penalties",
469
+ )
470
+
471
+ # Logging
472
+ parser.add_argument(
473
+ "--wandb", action="store_true", help="Enable Weights & Biases logging"
474
+ )
475
+ parser.add_argument(
476
+ "--wandb_watch",
477
+ action="store_true",
478
+ help="Enable WandB gradient watching (adds overhead, useful for debugging)",
479
+ )
480
+ parser.add_argument(
481
+ "--project_name", type=str, default="DL-Training", help="WandB project name"
482
+ )
483
+ parser.add_argument("--run_name", type=str, default=None, help="WandB run name")
484
+
485
+ args = parser.parse_args()
486
+ return args, parser # Returns (Namespace, ArgumentParser)
487
+
488
+
489
+ # ==============================================================================
490
+ # MAIN TRAINING FUNCTION
491
+ # ==============================================================================
492
+ def main():
493
+ args, parser = parse_args()
494
+
495
+ # Import custom model modules if specified
496
+ if args.import_modules:
497
+ import importlib
498
+
499
+ for module_name in args.import_modules:
500
+ try:
501
+ # Handle both module names (my_model) and file paths (./my_model.py)
502
+ if module_name.endswith(".py"):
503
+ # Import from file path with unique module name
504
+ import importlib.util
505
+
506
+ # Derive unique module name from filename to avoid collisions
507
+ base_name = os.path.splitext(os.path.basename(module_name))[0]
508
+ unique_name = f"wavedl_custom_{base_name}"
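+ # e.g., --import ./models/my_net.py is registered under the module name "wavedl_custom_my_net"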
509
+
510
+ spec = importlib.util.spec_from_file_location(
511
+ unique_name, module_name
512
+ )
513
+ if spec and spec.loader:
514
+ module = importlib.util.module_from_spec(spec)
515
+ sys.modules[unique_name] = module
516
+ spec.loader.exec_module(module)
517
+ print(f"✓ Imported custom module from: {module_name}")
518
+ else:
519
+ # Import as regular module
520
+ importlib.import_module(module_name)
521
+ print(f"✓ Imported module: {module_name}")
522
+ except (ImportError, FileNotFoundError, SyntaxError, PermissionError) as e:
523
+ print(f"✗ Failed to import '{module_name}': {e}", file=sys.stderr)
524
+ if isinstance(e, FileNotFoundError):
525
+ print(" File does not exist. Check the path.", file=sys.stderr)
526
+ elif isinstance(e, SyntaxError):
527
+ print(
528
+ f" Syntax error at line {e.lineno}: {e.msg}", file=sys.stderr
529
+ )
530
+ elif isinstance(e, PermissionError):
531
+ print(
532
+ " Permission denied. Check file permissions.", file=sys.stderr
533
+ )
534
+ else:
535
+ print(
536
+ " Make sure the module is in your Python path or current directory.",
537
+ file=sys.stderr,
538
+ )
539
+ sys.exit(1)
540
+
541
+ # Handle --list_models flag
542
+ if args.list_models:
543
+ print("Available models:")
544
+ for name in list_models():
545
+ ModelClass = get_model(name)
546
+ # Get first non-empty docstring line
547
+ if ModelClass.__doc__:
548
+ lines = [
549
+ l.strip() for l in ModelClass.__doc__.splitlines() if l.strip()
550
+ ]
551
+ doc_first_line = lines[0] if lines else "No description"
552
+ else:
553
+ doc_first_line = "No description"
554
+ print(f" - {name}: {doc_first_line}")
555
+ sys.exit(0)
556
+
557
+ # Load and merge config file if provided
558
+ if args.config:
559
+ from wavedl.utils.config import (
560
+ load_config,
561
+ merge_config_with_args,
562
+ validate_config,
563
+ )
564
+
565
+ print(f"📄 Loading config from: {args.config}")
566
+ config = load_config(args.config)
567
+
568
+ # Validate config values
569
+ warnings_list = validate_config(config)
570
+ for w in warnings_list:
571
+ print(f" ⚠ {w}")
572
+
573
+ # Merge config with CLI args (CLI takes precedence via parser defaults detection)
574
+ args = merge_config_with_args(config, args, parser=parser)
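+ # e.g., an "lr: 5e-4" entry in the YAML takes effect only if --lr was left at its parser default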
575
+
576
+ # Handle --cv flag (cross-validation mode)
577
+ if args.cv > 0:
578
+ print(f"🔄 Cross-Validation Mode: {args.cv} folds")
579
+ from wavedl.utils.cross_validation import run_cross_validation
580
+
581
+ # Load data for CV using memory-efficient loader
582
+ from wavedl.utils.data import DataSource, get_data_source
583
+
584
+ data_format = DataSource.detect_format(args.data_path)
585
+ source = get_data_source(data_format)
586
+
587
+ # Use memory-mapped loading when available (now returns LazyDataHandle for all formats)
588
+ _cv_handle = None
589
+ if hasattr(source, "load_mmap"):
590
+ _cv_handle = source.load_mmap(args.data_path)
591
+ X, y = _cv_handle.inputs, _cv_handle.outputs
592
+ else:
593
+ X, y = source.load(args.data_path)
594
+
595
+ # Handle sparse matrices (must materialize for CV shuffling)
596
+ if hasattr(X, "__getitem__") and len(X) > 0 and hasattr(X[0], "toarray"):
597
+ X = np.stack([x.toarray() for x in X])
598
+
599
+ # Normalize target shape: (N,) -> (N, 1) for consistency
600
+ if y.ndim == 1:
601
+ y = y.reshape(-1, 1)
602
+
603
+ # Validate and determine input shape (consistent with prepare_data)
604
+ # Check for ambiguous shapes that could be multi-channel or shallow 3D volume
605
+ sample_shape = X.shape[1:] # Per-sample shape
606
+
607
+ # Same heuristic as prepare_data: detect ambiguous 3D shapes
608
+ is_ambiguous_shape = (
609
+ len(sample_shape) == 3 # Exactly 3D: could be (C, H, W) or (D, H, W)
610
+ and sample_shape[0] <= 16 # First dim looks like channels
611
+ and sample_shape[1] > 16
612
+ and sample_shape[2] > 16 # Both spatial dims are large
613
+ )
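+ # e.g., a sample shape of (4, 64, 64) is flagged as ambiguous, while (64, 64, 64) is not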
614
+
615
+ if is_ambiguous_shape and not args.single_channel:
616
+ raise ValueError(
617
+ f"Ambiguous input shape detected: sample shape {sample_shape}. "
618
+ f"This could be either:\n"
619
+ f" - Multi-channel 2D data (C={sample_shape[0]}, H={sample_shape[1]}, W={sample_shape[2]})\n"
620
+ f" - Single-channel 3D volume (D={sample_shape[0]}, H={sample_shape[1]}, W={sample_shape[2]})\n\n"
621
+ f"If this is single-channel 3D/shallow volume data, use --single_channel flag.\n"
622
+ f"If this is multi-channel 2D data, reshape to (N*C, H, W) with adjusted targets."
623
+ )
624
+
625
+ # in_shape = spatial dimensions for model registry (channel added during training)
626
+ in_shape = sample_shape
627
+
628
+ # Run cross-validation
629
+ try:
630
+ run_cross_validation(
631
+ X=X,
632
+ y=y,
633
+ model_name=args.model,
634
+ in_shape=in_shape,
635
+ out_size=y.shape[1],
636
+ folds=args.cv,
637
+ stratify=args.cv_stratify,
638
+ stratify_bins=args.cv_bins,
639
+ batch_size=args.batch_size,
640
+ lr=args.lr,
641
+ epochs=args.epochs,
642
+ patience=args.patience,
643
+ weight_decay=args.weight_decay,
644
+ loss_name=args.loss,
645
+ optimizer_name=args.optimizer,
646
+ scheduler_name=args.scheduler,
647
+ output_dir=args.output_dir,
648
+ workers=args.workers,
649
+ seed=args.seed,
650
+ )
651
+ finally:
652
+ # Clean up file handle if HDF5/MAT
653
+ if _cv_handle is not None and hasattr(_cv_handle, "close"):
654
+ try:
655
+ _cv_handle.close()
656
+ except Exception as e:
657
+ logging.debug(f"Failed to close CV data handle: {e}")
658
+ return
659
+
660
+ # ==========================================================================
661
+ # SYSTEM INITIALIZATION
662
+ # ==========================================================================
663
+ # Initialize Accelerator for DDP and mixed precision
664
+ accelerator = Accelerator(
665
+ mixed_precision=args.precision,
666
+ log_with="wandb" if args.wandb and WANDB_AVAILABLE else None,
667
+ )
668
+ set_seed(args.seed)
669
+
670
+ # Deterministic mode for scientific reproducibility
671
+ # Disables TF32 and cuDNN benchmark for exact reproducibility (slower)
672
+ if args.deterministic:
673
+ torch.backends.cudnn.benchmark = False
674
+ torch.backends.cudnn.deterministic = True
675
+ torch.backends.cuda.matmul.allow_tf32 = False
676
+ torch.backends.cudnn.allow_tf32 = False
677
+ torch.use_deterministic_algorithms(True, warn_only=True)
678
+ if accelerator.is_main_process:
679
+ print("🔒 Deterministic mode enabled (slower but reproducible)")
680
+
681
+ # Configure logging (rank 0 only prints to console)
682
+ logging.basicConfig(
683
+ level=logging.INFO if accelerator.is_main_process else logging.ERROR,
684
+ format="%(asctime)s | %(levelname)s | %(message)s",
685
+ datefmt="%H:%M:%S",
686
+ )
687
+ logger = logging.getLogger("Trainer")
688
+
689
+ # Ensure output directory exists (critical for cache files, checkpoints, etc.)
690
+ os.makedirs(args.output_dir, exist_ok=True)
691
+
692
+ # Auto-detect optimal DataLoader workers if not specified
693
+ if args.workers < 0:
694
+ cpu_count = os.cpu_count() or 4
695
+ num_gpus = accelerator.num_processes
696
+ # Heuristic: 4-16 workers per GPU, bounded by available CPU cores
697
+ # Increased cap from 8 to 16 for high-throughput GPUs (H100, A100)
698
+ args.workers = min(16, max(2, (cpu_count - 2) // num_gpus))
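+ # e.g., 64 CPU cores across 4 GPUs gives min(16, max(2, 62 // 4)) = 15 workers per GPU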
699
+ if accelerator.is_main_process:
700
+ logger.info(
701
+ f"⚙️ Auto-detected workers: {args.workers} per GPU "
702
+ f"(CPUs: {cpu_count}, GPUs: {num_gpus})"
703
+ )
704
+
705
+ if accelerator.is_main_process:
706
+ logger.info(f"🚀 Cluster Status: {accelerator.num_processes}x GPUs detected")
707
+ logger.info(
708
+ f" Model: {args.model} | Precision: {args.precision} | Compile: {args.compile}"
709
+ )
710
+ logger.info(
711
+ f" Loss: {args.loss} | Optimizer: {args.optimizer} | Scheduler: {args.scheduler}"
712
+ )
713
+ logger.info(f" Early Stopping Patience: {args.patience} epochs")
714
+ if args.save_every > 0:
715
+ logger.info(f" Periodic Checkpointing: Every {args.save_every} epochs")
716
+ if args.resume:
717
+ logger.info(f" 📂 Resuming from: {args.resume}")
718
+
719
+ # Initialize WandB
720
+ if args.wandb and WANDB_AVAILABLE:
721
+ accelerator.init_trackers(
722
+ project_name=args.project_name,
723
+ config=vars(args),
724
+ init_kwargs={"wandb": {"name": args.run_name or f"{args.model}_run"}},
725
+ )
726
+
727
+ # ==========================================================================
728
+ # DATA & MODEL LOADING
729
+ # ==========================================================================
730
+ train_dl, val_dl, scaler, in_shape, out_dim = prepare_data(
731
+ args, logger, accelerator, cache_dir=args.output_dir
732
+ )
733
+
734
+ # Build model using registry
735
+ model = build_model(
736
+ args.model, in_shape=in_shape, out_size=out_dim, pretrained=args.pretrained
737
+ )
738
+
739
+ if accelerator.is_main_process:
740
+ param_info = model.parameter_summary()
741
+ logger.info(
742
+ f" Model Parameters: {param_info['trainable_parameters']:,} trainable"
743
+ )
744
+ logger.info(f" Model Size: {param_info['total_mb']:.2f} MB")
745
+
746
+ # Optional WandB model watching (opt-in due to overhead on large models)
747
+ if (
748
+ args.wandb
749
+ and args.wandb_watch
750
+ and WANDB_AVAILABLE
751
+ and accelerator.is_main_process
752
+ ):
753
+ wandb.watch(model, log="gradients", log_freq=100)
754
+ logger.info(" 📊 WandB gradient watching enabled")
755
+
756
+ # Torch 2.0 compilation (requires compatible Triton on GPU)
757
+ if args.compile:
758
+ try:
759
+ # Test if Triton is available - just import the package
760
+ # Different Triton versions have different internal APIs, so just check base import
761
+ import triton
762
+
763
+ model = torch.compile(model)
764
+ if accelerator.is_main_process:
765
+ logger.info(" ✔ torch.compile enabled (Triton backend)")
766
+ except ImportError as e:
767
+ if accelerator.is_main_process:
768
+ if "triton" in str(e).lower():
769
+ logger.warning(
770
+ " ⚠ Triton not installed or incompatible version - torch.compile disabled. "
771
+ "Training will proceed without compilation."
772
+ )
773
+ else:
774
+ logger.warning(
775
+ f" ⚠ torch.compile setup failed: {e}. Continuing without compilation."
776
+ )
777
+ except Exception as e:
778
+ if accelerator.is_main_process:
779
+ logger.warning(
780
+ f" ⚠ torch.compile failed: {e}. Continuing without compilation."
781
+ )
782
+
783
+ # ==========================================================================
784
+ # OPTIMIZER, SCHEDULER & LOSS CONFIGURATION
785
+ # ==========================================================================
786
+ # Parse comma-separated arguments with validation
787
+ try:
788
+ betas_list = [float(x.strip()) for x in args.betas.split(",")]
789
+ if len(betas_list) != 2:
790
+ raise ValueError(
791
+ f"--betas must have exactly 2 values, got {len(betas_list)}"
792
+ )
793
+ if not all(0.0 <= b < 1.0 for b in betas_list):
794
+ raise ValueError(f"--betas values must be in [0, 1), got {betas_list}")
795
+ betas = tuple(betas_list)
796
+ except ValueError as e:
797
+ raise ValueError(
798
+ f"Invalid --betas format '{args.betas}': {e}. Expected format: '0.9,0.999'"
799
+ ) from e
800
+
801
+ loss_weights = None
802
+ if args.loss_weights:
803
+ loss_weights = [float(x.strip()) for x in args.loss_weights.split(",")]
804
+ milestones = None
805
+ if args.milestones:
806
+ milestones = [int(x.strip()) for x in args.milestones.split(",")]
807
+
808
+ # Create optimizer using factory
809
+ optimizer = get_optimizer(
810
+ name=args.optimizer,
811
+ params=model.get_optimizer_groups(args.lr, args.weight_decay),
812
+ lr=args.lr,
813
+ weight_decay=args.weight_decay,
814
+ momentum=args.momentum,
815
+ nesterov=args.nesterov,
816
+ betas=betas,
817
+ )
818
+
819
+ # Create loss function using factory
820
+ criterion = get_loss(
821
+ name=args.loss,
822
+ weights=loss_weights,
823
+ delta=args.huber_delta,
824
+ )
825
+ # Move criterion to device (important for WeightedMSELoss buffer)
826
+ criterion = criterion.to(accelerator.device)
827
+
828
+ # ==========================================================================
829
+ # PHYSICAL CONSTRAINTS INTEGRATION
830
+ # ==========================================================================
831
+ from wavedl.utils.constraints import (
832
+ PhysicsConstrainedLoss,
833
+ build_constraints,
834
+ )
835
+
836
+ # Build soft constraints
837
+ soft_constraints = build_constraints(
838
+ expressions=args.constraint,
839
+ file_path=args.constraint_file,
840
+ reduction=args.constraint_reduction,
841
+ )
842
+
843
+ # Wrap criterion with PhysicsConstrainedLoss if we have soft constraints
844
+ if soft_constraints:
845
+ # Pass output scaler so constraints can be evaluated in physical space
846
+ output_mean = scaler.mean_ if hasattr(scaler, "mean_") else None
847
+ output_std = scaler.scale_ if hasattr(scaler, "scale_") else None
848
+ criterion = PhysicsConstrainedLoss(
849
+ criterion,
850
+ soft_constraints,
851
+ weights=args.constraint_weight,
852
+ output_mean=output_mean,
853
+ output_std=output_std,
854
+ )
855
+ if accelerator.is_main_process:
856
+ logger.info(
857
+ f" 🔬 Physical constraints: {len(soft_constraints)} constraint(s) "
858
+ f"with weight(s) {args.constraint_weight}"
859
+ )
860
+ if output_mean is not None:
861
+ logger.info(
862
+ " 📐 Constraints evaluated in physical space (denormalized)"
863
+ )
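+ # Example: --constraint "y0 - y1*y2" penalizes deviations from y0 = y1*y2 in the predictions,
+ # reduced with MSE or MAE per --constraint_reduction and scaled by --constraint_weight.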
864
+
865
+ # Track if scheduler should step per batch (OneCycleLR) or per epoch
866
+ scheduler_step_per_batch = not is_epoch_based(args.scheduler)
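+ # e.g., onecycle steps every batch, while plateau/cosine/step advance once per epoch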
867
+
868
+ # ==========================================================================
869
+ # DDP Preparation Strategy:
870
+ # - For batch-based schedulers (OneCycleLR): prepare DataLoaders first to get
871
+ # the correct sharded batch count, then create scheduler
872
+ # - For epoch-based schedulers: create scheduler before prepare (no issue)
873
+ # ==========================================================================
874
+ if scheduler_step_per_batch:
875
+ # BATCH-BASED SCHEDULER (e.g., OneCycleLR)
876
+ # Prepare model, optimizer, dataloaders FIRST to get sharded loader length
877
+ model, optimizer, train_dl, val_dl = accelerator.prepare(
878
+ model, optimizer, train_dl, val_dl
879
+ )
880
+
881
+ # Now create scheduler with the CORRECT sharded steps_per_epoch
882
+ steps_per_epoch = len(train_dl) # Post-DDP sharded length
883
+ scheduler = get_scheduler(
884
+ name=args.scheduler,
885
+ optimizer=optimizer,
886
+ epochs=args.epochs,
887
+ steps_per_epoch=steps_per_epoch,
888
+ min_lr=args.min_lr,
889
+ patience=args.scheduler_patience,
890
+ factor=args.scheduler_factor,
891
+ gamma=args.scheduler_factor, # For Step/MultiStep/Exponential schedulers
892
+ step_size=args.step_size,
893
+ milestones=milestones,
894
+ warmup_epochs=args.warmup_epochs,
895
+ )
896
+ # Prepare scheduler separately (Accelerator wraps it for state saving)
897
+ scheduler = accelerator.prepare(scheduler)
898
+ else:
899
+ # EPOCH-BASED SCHEDULER (plateau, cosine, step, etc.)
900
+ # No batch count dependency - create scheduler before prepare
901
+ scheduler = get_scheduler(
902
+ name=args.scheduler,
903
+ optimizer=optimizer,
904
+ epochs=args.epochs,
905
+ steps_per_epoch=None,
906
+ min_lr=args.min_lr,
907
+ patience=args.scheduler_patience,
908
+ factor=args.scheduler_factor,
909
+ gamma=args.scheduler_factor, # For Step/MultiStep/Exponential schedulers
910
+ step_size=args.step_size,
911
+ milestones=milestones,
912
+ warmup_epochs=args.warmup_epochs,
913
+ )
914
+
915
+ # For ReduceLROnPlateau: DON'T include scheduler in accelerator.prepare()
916
+ # because accelerator wraps scheduler.step() to sync across processes,
917
+ # which defeats our rank-0-only stepping for correct patience counting.
918
+ # Other schedulers are safe to prepare (no internal state affected by multi-call).
919
+ if args.scheduler == "plateau":
920
+ model, optimizer, train_dl, val_dl = accelerator.prepare(
921
+ model, optimizer, train_dl, val_dl
922
+ )
923
+ # Scheduler stays unwrapped - we handle sync manually in training loop
924
+ # But register it for checkpointing so state is saved/loaded on resume
925
+ accelerator.register_for_checkpointing(scheduler)
926
+ else:
927
+ model, optimizer, train_dl, val_dl, scheduler = accelerator.prepare(
928
+ model, optimizer, train_dl, val_dl, scheduler
929
+ )
930
+
931
+ # ==========================================================================
932
+ # AUTO-RESUME / RESUME FROM CHECKPOINT
933
+ # ==========================================================================
934
+ start_epoch = 0
935
+ best_val_loss = float("inf")
936
+ patience_ctr = 0
937
+ history: list[dict[str, Any]] = []
938
+
939
+ # Define checkpoint paths
940
+ best_ckpt_path = os.path.join(args.output_dir, "best_checkpoint")
941
+ complete_flag_path = os.path.join(args.output_dir, "training_complete.flag")
942
+
943
+ # Auto-resume logic (if not --fresh and no explicit --resume)
944
+ if not args.fresh and args.resume is None:
945
+ if os.path.exists(complete_flag_path):
946
+ # Training already completed
947
+ if accelerator.is_main_process:
948
+ logger.info(
949
+ "✅ Training already completed (early stopping). Use --fresh to retrain."
950
+ )
951
+ return # Exit gracefully
952
+ elif os.path.exists(best_ckpt_path):
953
+ # Incomplete training found - auto-resume
954
+ args.resume = best_ckpt_path
955
+ if accelerator.is_main_process:
956
+ logger.info(f"🔄 Auto-resuming from: {best_ckpt_path}")
957
+
958
+ if args.resume:
959
+ if os.path.exists(args.resume):
960
+ logger.info(f"🔄 Loading checkpoint from: {args.resume}")
961
+ accelerator.load_state(args.resume)
962
+
963
+ # Restore training metadata
964
+ meta_path = os.path.join(args.resume, "training_meta.pkl")
965
+ if os.path.exists(meta_path):
966
+ with open(meta_path, "rb") as f:
967
+ meta = pickle.load(f)
968
+ start_epoch = meta.get("epoch", 0)
969
+ best_val_loss = meta.get("best_val_loss", float("inf"))
970
+ patience_ctr = meta.get("patience_ctr", 0)
971
+ logger.info(
972
+ f" ✅ Restored: Epoch {start_epoch}, Best Loss: {best_val_loss:.6f}"
973
+ )
974
+ else:
975
+ logger.warning(
976
+ " ⚠️ training_meta.pkl not found, starting from epoch 0"
977
+ )
978
+
979
+ # Restore history
980
+ history_path = os.path.join(args.output_dir, "training_history.csv")
981
+ if os.path.exists(history_path):
982
+ history = pd.read_csv(history_path).to_dict("records")
983
+ logger.info(f" ✅ Loaded {len(history)} epochs from history")
984
+ else:
985
+ raise FileNotFoundError(f"Checkpoint not found: {args.resume}")
986
+
987
+ # ==========================================================================
988
+ # PHYSICAL METRIC SETUP
989
+ # ==========================================================================
990
+ # Physical MAE = normalized MAE * scaler.scale_
991
+ phys_scale = torch.tensor(
992
+ scaler.scale_, device=accelerator.device, dtype=torch.float32
993
+ )
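+ # Illustrative example: if scaler.scale_[0] is 1e3, a normalized MAE of 0.01 on target 0
+ # is reported as a physical MAE of 10 (in that target's original units).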
994
+
995
+ # ==========================================================================
996
+ # TRAINING LOOP
997
+ # ==========================================================================
998
+ # Dynamic console header
999
+ if accelerator.is_main_process:
1000
+ base_cols = ["Ep", "TrnLoss", "ValLoss", "R2", "PCC", "GradN", "LR", "MAE_Avg"]
1001
+ param_cols = [f"MAE_P{i}" for i in range(out_dim)]
1002
+ header = "{:<4} | {:<8} | {:<8} | {:<6} | {:<6} | {:<6} | {:<8} | {:<8}".format(
1003
+ *base_cols
1004
+ )
1005
+ header += " | " + " | ".join([f"{c:<8}" for c in param_cols])
1006
+ logger.info("=" * len(header))
1007
+ logger.info(header)
1008
+ logger.info("=" * len(header))
1009
+
1010
+ try:
1011
+ total_training_time = 0.0
1012
+
1013
+ for epoch in range(start_epoch, args.epochs):
1014
+ epoch_start_time = time.time()
1015
+
1016
+ # ==================== TRAINING PHASE ====================
1017
+ model.train()
1018
+ # Use GPU tensor for loss accumulation to avoid .item() sync per batch
1019
+ train_loss_sum = torch.tensor(0.0, device=accelerator.device)
1020
+ train_samples = 0
1021
+ grad_norm_tracker = MetricTracker()
1022
+
1023
+ pbar = tqdm(
1024
+ train_dl,
1025
+ disable=not accelerator.is_main_process,
1026
+ leave=False,
1027
+ desc=f"Epoch {epoch + 1}",
1028
+ )
1029
+
1030
+ for x, y in pbar:
1031
+ with accelerator.accumulate(model):
1032
+ # Use mixed precision for forward pass (respects --precision flag)
1033
+ with accelerator.autocast():
1034
+ pred = model(x)
1035
+ # Pass inputs for input-dependent constraints (x_mean, x[...], etc.)
1036
+ if isinstance(criterion, PhysicsConstrainedLoss):
1037
+ loss = criterion(pred, y, x)
1038
+ else:
1039
+ loss = criterion(pred, y)
1040
+
1041
+ accelerator.backward(loss)
1042
+
1043
+ if accelerator.sync_gradients:
1044
+ grad_norm = accelerator.clip_grad_norm_(
1045
+ model.parameters(), args.grad_clip
1046
+ )
1047
+ if grad_norm is not None:
1048
+ grad_norm_tracker.update(grad_norm.item())
1049
+
1050
+ optimizer.step()
1051
+ optimizer.zero_grad(set_to_none=True) # Faster than zero_grad()
1052
+
1053
+ # Per-batch LR scheduling (e.g., OneCycleLR)
1054
+ if scheduler_step_per_batch:
1055
+ scheduler.step()
1056
+
1057
+ # Accumulate as tensors to avoid .item() sync per batch
1058
+ train_loss_sum += loss.detach() * x.size(0)
1059
+ train_samples += x.size(0)
1060
+
1061
+ # Single .item() call at end of epoch (reduces GPU sync overhead)
1062
+ train_loss_scalar = train_loss_sum.item()
1063
+
1064
+ # Synchronize training metrics across GPUs
1065
+ global_loss = accelerator.reduce(
1066
+ torch.tensor([train_loss_scalar], device=accelerator.device),
1067
+ reduction="sum",
1068
+ ).item()
1069
+ global_samples = accelerator.reduce(
1070
+ torch.tensor([train_samples], device=accelerator.device),
1071
+ reduction="sum",
1072
+ ).item()
1073
+ avg_train_loss = global_loss / global_samples
1074
+
1075
+ # ==================== VALIDATION PHASE ====================
1076
+ model.eval()
1077
+ # Use GPU tensor for loss accumulation (consistent with training phase)
1078
+ val_loss_sum = torch.tensor(0.0, device=accelerator.device)
1079
+ val_mae_sum = torch.zeros(out_dim, device=accelerator.device)
1080
+ val_samples = 0
1081
+
1082
+ # Accumulate predictions locally ON CPU to prevent GPU OOM
1083
+ local_preds = []
1084
+ local_targets = []
1085
+
1086
+ with torch.inference_mode():
1087
+ for x, y in val_dl:
1088
+ # Use mixed precision for validation (consistent with training)
1089
+ with accelerator.autocast():
1090
+ pred = model(x)
1091
+ # Pass inputs for input-dependent constraints
1092
+ if isinstance(criterion, PhysicsConstrainedLoss):
1093
+ loss = criterion(pred, y, x)
1094
+ else:
1095
+ loss = criterion(pred, y)
1096
+
1097
+ val_loss_sum += loss.detach() * x.size(0)
1098
+ val_samples += x.size(0)
1099
+
1100
+ # Physical MAE
1101
+ mae_batch = torch.abs((pred - y) * phys_scale).sum(dim=0)
1102
+ val_mae_sum += mae_batch
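+ # Running per-target sum of absolute physical errors; after the cross-GPU reduction below
+ # it is divided by the global sample count to yield the per-parameter MAE columns.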
1103
+
1104
+ # Store on CPU (critical for large val sets)
1105
+ local_preds.append(pred.detach().cpu())
1106
+ local_targets.append(y.detach().cpu())
1107
+
1108
+ # Concatenate locally on CPU; tensors are moved to the GPU only for the cross-rank gather below
1109
+ local_preds_cat = torch.cat(local_preds)
1110
+ local_targets_cat = torch.cat(local_targets)
1111
+
1112
+ # Gather predictions and targets with Accelerate's gather_for_metrics utility
1113
+ # gather_for_metrics handles:
1114
+ # - DDP padding removal (no need to trim manually)
1115
+ # - Efficient cross-rank gathering without GPU memory spike
1116
+ # - Gathered tensors are moved back to CPU here for metric computation
1117
+ if accelerator.num_processes > 1:
1118
+ # Move to GPU for gather (required by NCCL), then back to CPU
1119
+ # gather_for_metrics is more memory-efficient than manual gather
1120
+ # as it processes in chunks internally
1121
+ gathered_preds = accelerator.gather_for_metrics(
1122
+ local_preds_cat.to(accelerator.device)
1123
+ ).cpu()
1124
+ gathered_targets = accelerator.gather_for_metrics(
1125
+ local_targets_cat.to(accelerator.device)
1126
+ ).cpu()
1127
+ else:
1128
+ # Single-GPU mode: no gathering needed
1129
+ gathered_preds = local_preds_cat
1130
+ gathered_targets = local_targets_cat
1131
+
1132
+ # Synchronize validation metrics (scalars only - efficient)
1133
+ val_loss_scalar = val_loss_sum.item()
1134
+ val_metrics = torch.cat(
1135
+ [
1136
+ torch.tensor([val_loss_scalar], device=accelerator.device),
1137
+ val_mae_sum,
1138
+ ]
1139
+ )
1140
+ val_metrics_sync = accelerator.reduce(val_metrics, reduction="sum")
1141
+
1142
+ total_val_samples = accelerator.reduce(
1143
+ torch.tensor([val_samples], device=accelerator.device), reduction="sum"
1144
+ ).item()
1145
+
1146
+ avg_val_loss = val_metrics_sync[0].item() / total_val_samples
1147
+ # Cast to float32 before numpy (bf16 tensors can't convert directly)
1148
+ avg_mae_per_param = (
1149
+ (val_metrics_sync[1:] / total_val_samples).float().cpu().numpy()
1150
+ )
1151
+ avg_mae = avg_mae_per_param.mean()
1152
+
1153
+ # ==================== LOGGING & CHECKPOINTING ====================
1154
+ if accelerator.is_main_process:
1155
+ # Scientific metrics - cast to float32 before numpy
1156
+ # gather_for_metrics already handles DDP padding removal
1157
+ y_pred = gathered_preds.float().numpy()
1158
+ y_true = gathered_targets.float().numpy()
1159
+
1160
+ # Guard against tiny validation sets (R² undefined for <2 samples)
1161
+ if len(y_true) >= 2:
1162
+ r2 = r2_score(y_true, y_pred)
1163
+ else:
1164
+ r2 = float("nan")
1165
+ pcc = calc_pearson(y_true, y_pred)
1166
+ current_lr = get_lr(optimizer)
1167
+
1168
+ # Update history
1169
+ epoch_end_time = time.time()
1170
+ epoch_time = epoch_end_time - epoch_start_time
1171
+ total_training_time += epoch_time
1172
+
1173
+ epoch_stats = {
1174
+ "epoch": epoch + 1,
1175
+ "train_loss": avg_train_loss,
1176
+ "val_loss": avg_val_loss,
1177
+ "val_r2": r2,
1178
+ "val_pearson": pcc,
1179
+ "val_mae_avg": avg_mae,
1180
+ "grad_norm": grad_norm_tracker.avg,
1181
+ "lr": current_lr,
1182
+ "epoch_time": round(epoch_time, 2),
1183
+ "total_time": round(total_training_time, 2),
1184
+ }
1185
+ for i, mae in enumerate(avg_mae_per_param):
1186
+ epoch_stats[f"MAE_Phys_P{i}"] = mae
1187
+
1188
+ history.append(epoch_stats)
1189
+
1190
+ # Console display
1191
+ base_str = f"{epoch + 1:<4} | {avg_train_loss:<8.4f} | {avg_val_loss:<8.4f} | {r2:<6.4f} | {pcc:<6.4f} | {grad_norm_tracker.avg:<6.4f} | {current_lr:<8.2e} | {avg_mae:<8.4f}"
1192
+ param_str = " | ".join([f"{m:<8.4f}" for m in avg_mae_per_param])
1193
+ logger.info(f"{base_str} | {param_str}")
1194
+
1195
+ # WandB logging
1196
+ if args.wandb and WANDB_AVAILABLE:
1197
+ log_dict = {
1198
+ "main/train_loss": avg_train_loss,
1199
+ "main/val_loss": avg_val_loss,
1200
+ "metrics/r2_score": r2,
1201
+ "metrics/pearson_corr": pcc,
1202
+ "metrics/mae_avg": avg_mae,
1203
+ "system/grad_norm": grad_norm_tracker.avg,
1204
+ "hyper/lr": current_lr,
1205
+ }
1206
+ for i, mae in enumerate(avg_mae_per_param):
1207
+ log_dict[f"mae_detailed/P{i}"] = mae
1208
+
1209
+ # Periodic scatter plots
1210
+ if (epoch % 5 == 0) or (avg_val_loss < best_val_loss):
1211
+ real_true = scaler.inverse_transform(y_true)
1212
+ real_pred = scaler.inverse_transform(y_pred)
1213
+ fig = plot_scientific_scatter(real_true, real_pred)
1214
+ log_dict["plots/scatter_analysis"] = wandb.Image(fig)
1215
+ plt.close(fig)
1216
+
1217
+ accelerator.log(log_dict)
1218
+
1219
+ # ==========================================================================
1220
+ # DDP-SAFE CHECKPOINT LOGIC
1221
+ # ==========================================================================
1222
+ # Step 1: Determine if this is the best epoch (BEFORE updating best_val_loss)
1223
+ is_best_epoch = False
1224
+ if accelerator.is_main_process:
1225
+ if avg_val_loss < best_val_loss:
1226
+ is_best_epoch = True
1227
+
1228
+ # Step 2: Broadcast decision to all ranks (required for save_state)
1229
+ is_best_epoch = broadcast_early_stop(is_best_epoch, accelerator)
1230
+
1231
+ # Step 3: Save checkpoint with all ranks participating
1232
+ if is_best_epoch:
1233
+ ckpt_dir = os.path.join(args.output_dir, "best_checkpoint")
1234
+ with suppress_accelerate_logging():
1235
+ accelerator.save_state(ckpt_dir, safe_serialization=False)
1236
+
1237
+ # Step 4: Rank 0 handles metadata and updates tracking variables
1238
+ if accelerator.is_main_process:
1239
+ best_val_loss = avg_val_loss # Update AFTER checkpoint saved
1240
+ patience_ctr = 0
1241
+
1242
+ with open(os.path.join(ckpt_dir, "training_meta.pkl"), "wb") as f:
1243
+ pickle.dump(
1244
+ {
1245
+ "epoch": epoch + 1,
1246
+ "best_val_loss": best_val_loss,
1247
+ "patience_ctr": patience_ctr,
1248
+ # Model info for auto-detection during inference
1249
+ "model_name": args.model,
1250
+ "in_shape": in_shape,
1251
+ "out_dim": out_dim,
1252
+ },
1253
+ f,
1254
+ )
1255
+
1256
+ # Unwrap model for saving (handle torch.compile compatibility)
1257
+ try:
1258
+ unwrapped = accelerator.unwrap_model(model)
1259
+ except KeyError:
1260
+ # torch.compile model may not have _orig_mod in expected location
1261
+ # Fall back to getting the module directly
1262
+ unwrapped = model.module if hasattr(model, "module") else model
1263
+ # If still compiled, try to get the underlying model
1264
+ if hasattr(unwrapped, "_orig_mod"):
1265
+ unwrapped = unwrapped._orig_mod
1266
+
1267
+ torch.save(
1268
+ unwrapped.state_dict(),
1269
+ os.path.join(args.output_dir, "best_model_weights.pth"),
1270
+ )
1271
+
1272
+ # Copy scaler to checkpoint for portability
1273
+ scaler_src = os.path.join(args.output_dir, "scaler.pkl")
1274
+ scaler_dst = os.path.join(ckpt_dir, "scaler.pkl")
1275
+ if os.path.exists(scaler_src) and not os.path.exists(scaler_dst):
1276
+ shutil.copy2(scaler_src, scaler_dst)
1277
+
1278
+ logger.info(
1279
+ f" 💾 Best model saved (val_loss: {best_val_loss:.6f})"
1280
+ )
1281
+
1282
+ # Also save CSV on best model (ensures progress is saved)
1283
+ pd.DataFrame(history).to_csv(
1284
+ os.path.join(args.output_dir, "training_history.csv"),
1285
+ index=False,
1286
+ )
1287
+ else:
1288
+ if accelerator.is_main_process:
1289
+ patience_ctr += 1
1290
+
1291
+ # Periodic checkpoint (all ranks participate in save_state)
1292
+ periodic_checkpoint_needed = (
1293
+ args.save_every > 0 and (epoch + 1) % args.save_every == 0
1294
+ )
1295
+ if periodic_checkpoint_needed:
1296
+ ckpt_name = f"epoch_{epoch + 1}_checkpoint"
1297
+ ckpt_dir = os.path.join(args.output_dir, ckpt_name)
1298
+ with suppress_accelerate_logging():
1299
+ accelerator.save_state(ckpt_dir, safe_serialization=False)
1300
+
1301
+ if accelerator.is_main_process:
1302
+ with open(os.path.join(ckpt_dir, "training_meta.pkl"), "wb") as f:
1303
+ pickle.dump(
1304
+ {
1305
+ "epoch": epoch + 1,
1306
+ "best_val_loss": best_val_loss,
1307
+ "patience_ctr": patience_ctr,
1308
+ # Model info for auto-detection during inference
1309
+ "model_name": args.model,
1310
+ "in_shape": in_shape,
1311
+ "out_dim": out_dim,
1312
+ },
1313
+ f,
1314
+ )
1315
+ logger.info(f" 📁 Periodic checkpoint: {ckpt_name}")
1316
+
1317
+ # Save CSV with each checkpoint (keeps logs in sync with model state)
1318
+ pd.DataFrame(history).to_csv(
1319
+ os.path.join(args.output_dir, "training_history.csv"),
1320
+ index=False,
1321
+ )
1322
+
1323
+ # Learning rate scheduling (epoch-based schedulers only)
1324
+ # NOTE: For ReduceLROnPlateau with DDP, we must step only on main process
1325
+ # to avoid patience counter being incremented by all GPU processes.
1326
+ # Then we sync the new LR to all processes to keep them consistent.
1327
+ if not scheduler_step_per_batch:
1328
+ if args.scheduler == "plateau":
1329
+ # Step only on main process to avoid multi-GPU patience bug
1330
+ if accelerator.is_main_process:
1331
+ scheduler.step(avg_val_loss)
1332
+
1333
+ # Sync LR across all processes after main process updates it
1334
+ accelerator.wait_for_everyone()
1335
+
1336
+ # Broadcast new LR from rank 0 to all processes
1337
+ if dist.is_initialized():
1338
+ if accelerator.is_main_process:
1339
+ new_lr = optimizer.param_groups[0]["lr"]
1340
+ else:
1341
+ new_lr = 0.0
1342
+ new_lr_tensor = torch.tensor(
1343
+ new_lr, device=accelerator.device, dtype=torch.float32
1344
+ )
1345
+ dist.broadcast(new_lr_tensor, src=0)
1346
+ # Update LR on non-main processes
1347
+ if not accelerator.is_main_process:
1348
+ for param_group in optimizer.param_groups:
1349
+ param_group["lr"] = new_lr_tensor.item()
1350
+ else:
1351
+ scheduler.step()
1352
+
1353
+ # DDP-safe early stopping
1354
+ should_stop = (
1355
+ patience_ctr >= args.patience if accelerator.is_main_process else False
1356
+ )
1357
+ if broadcast_early_stop(should_stop, accelerator):
1358
+ if accelerator.is_main_process:
1359
+ logger.info(
1360
+ f"🛑 Early stopping at epoch {epoch + 1} (patience={args.patience})"
1361
+ )
1362
+ # Create completion flag to prevent auto-resume
1363
+ with open(
1364
+ os.path.join(args.output_dir, "training_complete.flag"), "w"
1365
+ ) as f:
1366
+ f.write(
1367
+ f"Training completed via early stopping at epoch {epoch + 1}\n"
1368
+ )
1369
+ break
1370
+
1371
+ except KeyboardInterrupt:
1372
+ logger.warning("Training interrupted. Saving emergency checkpoint...")
1373
+ with suppress_accelerate_logging():
1374
+ accelerator.save_state(
1375
+ os.path.join(args.output_dir, "interrupted_checkpoint"),
1376
+ safe_serialization=False,
1377
+ )
1378
+
1379
+ except Exception as e:
1380
+ logger.error(f"Critical error: {e}", exc_info=True)
1381
+ raise
1382
+
1383
+ else:
1384
+ # Loop exited without an exception (max epochs reached, or early stopping already wrote the flag)
1385
+ # Create completion flag to prevent auto-resume on re-run
1386
+ if accelerator.is_main_process:
1387
+ if not os.path.exists(complete_flag_path):
1388
+ with open(complete_flag_path, "w") as f:
1389
+ f.write(f"Training completed normally after {args.epochs} epochs\n")
1390
+ logger.info(f"✅ Training completed after {args.epochs} epochs")
1391
+
1392
+ finally:
1393
+ # Final CSV write to capture all epochs (covers runs that end between checkpoint saves)
1394
+ if accelerator.is_main_process and len(history) > 0:
1395
+ pd.DataFrame(history).to_csv(
1396
+ os.path.join(args.output_dir, "training_history.csv"),
1397
+ index=False,
1398
+ )
1399
+
1400
+ # Generate training curves plot (PNG + SVG)
1401
+ if accelerator.is_main_process and len(history) > 0:
1402
+ try:
1403
+ fig = create_training_curves(history, show_lr=True)
1404
+ for fmt in ["png", "svg"]:
1405
+ fig.savefig(
1406
+ os.path.join(args.output_dir, f"training_curves.{fmt}"),
1407
+ dpi=FIGURE_DPI,
1408
+ bbox_inches="tight",
1409
+ )
1410
+ plt.close(fig)
1411
+ logger.info("✔ Saved: training_curves.png, training_curves.svg")
1412
+ except Exception as e:
1413
+ logger.warning(f"Could not generate training curves: {e}")
1414
+
1415
+ if args.wandb and WANDB_AVAILABLE:
1416
+ accelerator.end_training()
1417
+
1418
+ # Clean up distributed process group to prevent resource leak warning
1419
+ if torch.distributed.is_initialized():
1420
+ torch.distributed.destroy_process_group()
1421
+
1422
+ logger.info("Training completed.")
1423
+
1424
+
1425
+ if __name__ == "__main__":
1426
+ try:
1427
+ torch.multiprocessing.set_start_method("spawn")
1428
+ except RuntimeError:
1429
+ pass
1430
+ main()