wavedl 1.5.1.tar.gz → 1.5.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. {wavedl-1.5.1/src/wavedl.egg-info → wavedl-1.5.3}/PKG-INFO +23 -19
  2. {wavedl-1.5.1 → wavedl-1.5.3}/README.md +22 -18
  3. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/__init__.py +1 -1
  4. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/hpc.py +22 -18
  5. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/models/resnet.py +38 -9
  6. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/train.py +57 -55
  7. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/utils/config.py +3 -1
  8. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/utils/cross_validation.py +11 -0
  9. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/utils/data.py +87 -17
  10. {wavedl-1.5.1 → wavedl-1.5.3/src/wavedl.egg-info}/PKG-INFO +23 -19
  11. {wavedl-1.5.1 → wavedl-1.5.3}/LICENSE +0 -0
  12. {wavedl-1.5.1 → wavedl-1.5.3}/pyproject.toml +0 -0
  13. {wavedl-1.5.1 → wavedl-1.5.3}/setup.cfg +0 -0
  14. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/hpo.py +0 -0
  15. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/models/__init__.py +0 -0
  16. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/models/_template.py +0 -0
  17. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/models/base.py +0 -0
  18. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/models/cnn.py +0 -0
  19. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/models/convnext.py +0 -0
  20. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/models/densenet.py +0 -0
  21. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/models/efficientnet.py +0 -0
  22. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/models/efficientnetv2.py +0 -0
  23. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/models/mobilenetv3.py +0 -0
  24. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/models/registry.py +0 -0
  25. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/models/regnet.py +0 -0
  26. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/models/resnet3d.py +0 -0
  27. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/models/swin.py +0 -0
  28. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/models/tcn.py +0 -0
  29. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/models/unet.py +0 -0
  30. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/models/vit.py +0 -0
  31. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/test.py +0 -0
  32. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/utils/__init__.py +0 -0
  33. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/utils/constraints.py +0 -0
  34. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/utils/distributed.py +0 -0
  35. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/utils/losses.py +0 -0
  36. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/utils/metrics.py +0 -0
  37. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/utils/optimizers.py +0 -0
  38. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/utils/schedulers.py +0 -0
  39. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl.egg-info/SOURCES.txt +0 -0
  40. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl.egg-info/dependency_links.txt +0 -0
  41. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl.egg-info/entry_points.txt +0 -0
  42. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl.egg-info/requires.txt +0 -0
  43. {wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl.egg-info/top_level.txt +0 -0
{wavedl-1.5.1/src/wavedl.egg-info → wavedl-1.5.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: wavedl
-Version: 1.5.1
+Version: 1.5.3
 Summary: A Scalable Deep Learning Framework for Wave-Based Inverse Problems
 Author: Ductho Le
 License: MIT

@@ -301,8 +301,8 @@ python -m wavedl.test --checkpoint <checkpoint_folder> --data_path <test_data> \
 
 **Requirements** (your model must):
 1. Inherit from `BaseModel`
-2. Accept `in_channels`, `num_outputs`, `input_shape` in `__init__`
-3. Return a tensor of shape `(batch, num_outputs)` from `forward()`
+2. Accept `in_shape`, `out_size` in `__init__`
+3. Return a tensor of shape `(batch, out_size)` from `forward()`
 
 ---
 

@@ -315,23 +315,22 @@ from wavedl.models import BaseModel, register_model
 
 @register_model("my_model")  # This name is used with --model flag
 class MyModel(BaseModel):
-    def __init__(self, in_channels, num_outputs, input_shape):
-        # in_channels: number of input channels (auto-detected from data)
-        # num_outputs: number of parameters to predict (auto-detected from data)
-        # input_shape: spatial dimensions, e.g., (128,) or (64, 64) or (32, 32, 32)
-        super().__init__(in_channels, num_outputs, input_shape)
-
-        # Define your layers (this is just an example)
-        self.conv1 = nn.Conv2d(in_channels, 64, 3, padding=1)
+    def __init__(self, in_shape, out_size, **kwargs):
+        # in_shape: spatial dimensions, e.g., (128,) or (64, 64) or (32, 32, 32)
+        # out_size: number of parameters to predict (auto-detected from data)
+        super().__init__(in_shape, out_size)
+
+        # Define your layers (this is just an example for 2D)
+        self.conv1 = nn.Conv2d(1, 64, 3, padding=1)  # Input always has 1 channel
         self.conv2 = nn.Conv2d(64, 128, 3, padding=1)
-        self.fc = nn.Linear(128, num_outputs)
+        self.fc = nn.Linear(128, out_size)
 
     def forward(self, x):
-        # Input x has shape: (batch, in_channels, *input_shape)
+        # Input x has shape: (batch, 1, *in_shape)
         x = F.relu(self.conv1(x))
         x = F.relu(self.conv2(x))
         x = x.mean(dim=[-2, -1])  # Global average pooling
-        return self.fc(x)  # Output shape: (batch, num_outputs)
+        return self.fc(x)  # Output shape: (batch, out_size)
 ```
 

@@ -573,14 +572,19 @@ WaveDL automatically enables performance optimizations for modern GPUs:
 </details>
 
 <details>
-<summary><b>Environment Variables (wavedl-hpc)</b></summary>
+<summary><b>HPC CLI Arguments (wavedl-hpc)</b></summary>
+
+| Argument | Default | Description |
+|----------|---------|-------------|
+| `--num_gpus` | **Auto-detected** | Number of GPUs to use. By default, automatically detected via `nvidia-smi`. Set explicitly to override |
+| `--num_machines` | `1` | Number of machines in distributed setup |
+| `--mixed_precision` | `bf16` | Precision mode: `bf16`, `fp16`, or `no` |
+| `--dynamo_backend` | `no` | PyTorch Dynamo backend |
+
+**Environment Variables (for logging):**
 
 | Variable | Default | Description |
 |----------|---------|-------------|
-| `NUM_GPUS` | **Auto-detected** | Number of GPUs to use. By default, automatically detected via `nvidia-smi`. Set explicitly to override (e.g., `NUM_GPUS=2`) |
-| `NUM_MACHINES` | `1` | Number of machines in distributed setup |
-| `MIXED_PRECISION` | `bf16` | Precision mode: `bf16`, `fp16`, or `no` |
-| `DYNAMO_BACKEND` | `no` | PyTorch Dynamo backend |
 | `WANDB_MODE` | `offline` | WandB mode: `offline` or `online` |
 
 </details>
{wavedl-1.5.1 → wavedl-1.5.3}/README.md

@@ -256,8 +256,8 @@ python -m wavedl.test --checkpoint <checkpoint_folder> --data_path <test_data> \
 
 **Requirements** (your model must):
 1. Inherit from `BaseModel`
-2. Accept `in_channels`, `num_outputs`, `input_shape` in `__init__`
-3. Return a tensor of shape `(batch, num_outputs)` from `forward()`
+2. Accept `in_shape`, `out_size` in `__init__`
+3. Return a tensor of shape `(batch, out_size)` from `forward()`
 
 ---
 

@@ -270,23 +270,22 @@ from wavedl.models import BaseModel, register_model
 
 @register_model("my_model")  # This name is used with --model flag
 class MyModel(BaseModel):
-    def __init__(self, in_channels, num_outputs, input_shape):
-        # in_channels: number of input channels (auto-detected from data)
-        # num_outputs: number of parameters to predict (auto-detected from data)
-        # input_shape: spatial dimensions, e.g., (128,) or (64, 64) or (32, 32, 32)
-        super().__init__(in_channels, num_outputs, input_shape)
-
-        # Define your layers (this is just an example)
-        self.conv1 = nn.Conv2d(in_channels, 64, 3, padding=1)
+    def __init__(self, in_shape, out_size, **kwargs):
+        # in_shape: spatial dimensions, e.g., (128,) or (64, 64) or (32, 32, 32)
+        # out_size: number of parameters to predict (auto-detected from data)
+        super().__init__(in_shape, out_size)
+
+        # Define your layers (this is just an example for 2D)
+        self.conv1 = nn.Conv2d(1, 64, 3, padding=1)  # Input always has 1 channel
         self.conv2 = nn.Conv2d(64, 128, 3, padding=1)
-        self.fc = nn.Linear(128, num_outputs)
+        self.fc = nn.Linear(128, out_size)
 
     def forward(self, x):
-        # Input x has shape: (batch, in_channels, *input_shape)
+        # Input x has shape: (batch, 1, *in_shape)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = x.mean(dim=[-2, -1])  # Global average pooling
-        return self.fc(x)  # Output shape: (batch, num_outputs)
+        return self.fc(x)  # Output shape: (batch, out_size)
 ```
 
 **Step 2: Train**
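Note on the API change above: `in_channels` disappears because inputs are now fixed at a single channel, while `input_shape`/`num_outputs` become `in_shape`/`out_size`. A self-contained version of the README snippet under the new signature, assuming a 2D `(64, 64)` input and that `BaseModel` requires nothing beyond the `super().__init__` call shown in the diff (the model name here is hypothetical):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

from wavedl.models import BaseModel, register_model


@register_model("my_model_demo")  # hypothetical registry name for this sketch
class MyModelDemo(BaseModel):
    def __init__(self, in_shape, out_size, **kwargs):
        super().__init__(in_shape, out_size)
        self.conv1 = nn.Conv2d(1, 64, 3, padding=1)   # single input channel
        self.conv2 = nn.Conv2d(64, 128, 3, padding=1)
        self.fc = nn.Linear(128, out_size)

    def forward(self, x):
        # x: (batch, 1, *in_shape)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = x.mean(dim=[-2, -1])                      # global average pooling
        return self.fc(x)                             # (batch, out_size)


# Smoke test: batch of 2, one channel, 64x64 input, 3 regression targets
model = MyModelDemo((64, 64), 3)
print(model(torch.randn(2, 1, 64, 64)).shape)  # torch.Size([2, 3])
```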
@@ -528,14 +527,19 @@ WaveDL automatically enables performance optimizations for modern GPUs:
 </details>
 
 <details>
-<summary><b>Environment Variables (wavedl-hpc)</b></summary>
+<summary><b>HPC CLI Arguments (wavedl-hpc)</b></summary>
+
+| Argument | Default | Description |
+|----------|---------|-------------|
+| `--num_gpus` | **Auto-detected** | Number of GPUs to use. By default, automatically detected via `nvidia-smi`. Set explicitly to override |
+| `--num_machines` | `1` | Number of machines in distributed setup |
+| `--mixed_precision` | `bf16` | Precision mode: `bf16`, `fp16`, or `no` |
+| `--dynamo_backend` | `no` | PyTorch Dynamo backend |
+
+**Environment Variables (for logging):**
 
 | Variable | Default | Description |
 |----------|---------|-------------|
-| `NUM_GPUS` | **Auto-detected** | Number of GPUs to use. By default, automatically detected via `nvidia-smi`. Set explicitly to override (e.g., `NUM_GPUS=2`) |
-| `NUM_MACHINES` | `1` | Number of machines in distributed setup |
-| `MIXED_PRECISION` | `bf16` | Precision mode: `bf16`, `fp16`, or `no` |
-| `DYNAMO_BACKEND` | `no` | PyTorch Dynamo backend |
 | `WANDB_MODE` | `offline` | WandB mode: `offline` or `online` |
 
 </details>
{wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/__init__.py

@@ -18,7 +18,7 @@ For inference:
 # or: python -m wavedl.test --checkpoint best_checkpoint --data_path test.npz
 """
 
-__version__ = "1.5.1"
+__version__ = "1.5.3"
 __author__ = "Ductho Le"
 __email__ = "ductho.le@outlook.com"
 
{wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/hpc.py

@@ -57,30 +57,35 @@ def setup_hpc_environment() -> None:
     """Configure environment variables for HPC systems.
 
     Handles restricted home directories (e.g., Compute Canada) and
-    offline logging configurations.
+    offline logging configurations. Always uses CWD-based TORCH_HOME
+    since compute nodes typically lack internet access.
     """
-    # Check if home is writable
+    # Use CWD for cache base since HPC compute nodes typically lack internet
+    cache_base = os.getcwd()
+
+    # TORCH_HOME always set to CWD - compute nodes need pre-cached weights
+    os.environ.setdefault("TORCH_HOME", f"{cache_base}/.torch_cache")
+    Path(os.environ["TORCH_HOME"]).mkdir(parents=True, exist_ok=True)
+
+    # Triton/Inductor caches - prevents permission errors with --compile
+    # These MUST be set before any torch.compile calls
+    os.environ.setdefault("TRITON_CACHE_DIR", f"{cache_base}/.triton_cache")
+    os.environ.setdefault("TORCHINDUCTOR_CACHE_DIR", f"{cache_base}/.inductor_cache")
+    Path(os.environ["TRITON_CACHE_DIR"]).mkdir(parents=True, exist_ok=True)
+    Path(os.environ["TORCHINDUCTOR_CACHE_DIR"]).mkdir(parents=True, exist_ok=True)
+
+    # Check if home is writable for other caches
     home = os.path.expanduser("~")
     home_writable = os.access(home, os.W_OK)
 
-    # Use SLURM_TMPDIR if available, otherwise CWD for HPC, or system temp
-    if home_writable:
-        # Local machine - let libraries use defaults
-        cache_base = None
-    else:
-        # HPC with restricted home - use CWD for persistent caches
-        cache_base = os.getcwd()
-
-    # Only set environment variables if home is not writable
-    if cache_base:
-        os.environ.setdefault("TORCH_HOME", f"{cache_base}/.torch_cache")
+    # Other caches only if home is not writable
+    if not home_writable:
         os.environ.setdefault("MPLCONFIGDIR", f"{cache_base}/.matplotlib")
         os.environ.setdefault("FONTCONFIG_CACHE", f"{cache_base}/.fontconfig")
         os.environ.setdefault("XDG_CACHE_HOME", f"{cache_base}/.cache")
 
         # Ensure directories exist
         for env_var in [
-            "TORCH_HOME",
             "MPLCONFIGDIR",
             "FONTCONFIG_CACHE",
             "XDG_CACHE_HOME",

@@ -89,10 +94,9 @@ def setup_hpc_environment() -> None:
 
     # WandB configuration (offline by default for HPC)
     os.environ.setdefault("WANDB_MODE", "offline")
-    if cache_base:
-        os.environ.setdefault("WANDB_DIR", f"{cache_base}/.wandb")
-        os.environ.setdefault("WANDB_CACHE_DIR", f"{cache_base}/.wandb_cache")
-        os.environ.setdefault("WANDB_CONFIG_DIR", f"{cache_base}/.wandb_config")
+    os.environ.setdefault("WANDB_DIR", f"{cache_base}/.wandb")
+    os.environ.setdefault("WANDB_CACHE_DIR", f"{cache_base}/.wandb_cache")
+    os.environ.setdefault("WANDB_CONFIG_DIR", f"{cache_base}/.wandb_config")
 
     # Suppress non-critical warnings
     os.environ.setdefault(
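Note: after this refactor, `cache_base` is unconditionally `os.getcwd()`, so the WandB and compile caches are redirected on every machine, not only when `$HOME` is read-only. A minimal sketch of the resulting behavior, assuming `setup_hpc_environment` is importable from `wavedl.hpc` as the hunk header suggests:

```python
import os

from wavedl.hpc import setup_hpc_environment

# Must run before anything that triggers torch.compile, per the diff's comment
setup_hpc_environment()

# Regardless of whether $HOME is writable, these now point at CWD-based caches:
for var in ("TORCH_HOME", "TRITON_CACHE_DIR", "TORCHINDUCTOR_CACHE_DIR", "WANDB_DIR"):
    print(var, "->", os.environ.get(var))
```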
{wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/models/resnet.py

@@ -49,6 +49,36 @@ def _get_conv_layers(
     raise ValueError(f"Unsupported dimensionality: {dim}D. Supported: 1D, 2D, 3D.")
 
 
+def _get_num_groups(num_channels: int, preferred_groups: int = 32) -> int:
+    """
+    Get valid num_groups for GroupNorm that divides num_channels evenly.
+
+    Args:
+        num_channels: Number of channels to normalize
+        preferred_groups: Preferred number of groups (default: 32)
+
+    Returns:
+        Valid num_groups that divides num_channels
+
+    Raises:
+        ValueError: If no valid divisor found (shouldn't happen with power-of-2 channels)
+    """
+    # Try preferred groups first, then decrease
+    for groups in [preferred_groups, 16, 8, 4, 2, 1]:
+        if groups <= num_channels and num_channels % groups == 0:
+            return groups
+
+    # Fallback: find any valid divisor
+    for groups in range(min(32, num_channels), 0, -1):
+        if num_channels % groups == 0:
+            return groups
+
+    raise ValueError(
+        f"Cannot find valid num_groups for {num_channels} channels. "
+        f"Consider using base_width that is a power of 2 (e.g., 32, 64, 128)."
+    )
+
+
 class BasicBlock(nn.Module):
     """
     Basic residual block for ResNet-18/34.
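The helper fixes a real construction-time failure: `nn.GroupNorm(num_groups, num_channels)` requires `num_groups` to divide `num_channels`, so the old `min(32, out_channels)` pattern breaks for widths like 48. A short standalone sketch of the failure and the divisor the helper would pick:

```python
import torch
import torch.nn as nn

# Old pattern: min(32, 48) == 32, but 32 does not divide 48, so GroupNorm
# raises at construction time.
try:
    nn.GroupNorm(min(32, 48), 48)
except ValueError as e:
    print(e)  # num_channels must be divisible by num_groups

# _get_num_groups(48) walks [32, 16, 8, 4, 2, 1] and returns 16:
gn = nn.GroupNorm(16, 48)
print(gn(torch.randn(2, 48, 8, 8)).shape)  # torch.Size([2, 48, 8, 8])
```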
@@ -77,12 +107,12 @@ class BasicBlock(nn.Module):
             padding=1,
             bias=False,
         )
-        self.gn1 = nn.GroupNorm(min(32, out_channels), out_channels)
+        self.gn1 = nn.GroupNorm(_get_num_groups(out_channels), out_channels)
         self.relu = nn.ReLU(inplace=True)
         self.conv2 = Conv(
             out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False
         )
-        self.gn2 = nn.GroupNorm(min(32, out_channels), out_channels)
+        self.gn2 = nn.GroupNorm(_get_num_groups(out_channels), out_channels)
         self.downsample = downsample
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:

@@ -125,7 +155,7 @@ class Bottleneck(nn.Module):
 
         # 1x1 reduce
         self.conv1 = Conv(in_channels, out_channels, kernel_size=1, bias=False)
-        self.gn1 = nn.GroupNorm(min(32, out_channels), out_channels)
+        self.gn1 = nn.GroupNorm(_get_num_groups(out_channels), out_channels)
 
         # 3x3 conv
         self.conv2 = Conv(

@@ -136,15 +166,14 @@ class Bottleneck(nn.Module):
             padding=1,
             bias=False,
         )
-        self.gn2 = nn.GroupNorm(min(32, out_channels), out_channels)
+        self.gn2 = nn.GroupNorm(_get_num_groups(out_channels), out_channels)
 
         # 1x1 expand
         self.conv3 = Conv(
             out_channels, out_channels * self.expansion, kernel_size=1, bias=False
         )
-        self.gn3 = nn.GroupNorm(
-            min(32, out_channels * self.expansion), out_channels * self.expansion
-        )
+        expanded_channels = out_channels * self.expansion
+        self.gn3 = nn.GroupNorm(_get_num_groups(expanded_channels), expanded_channels)
 
         self.relu = nn.ReLU(inplace=True)
         self.downsample = downsample

@@ -200,7 +229,7 @@ class ResNetBase(BaseModel):
 
         # Stem: 7x7 conv (or equivalent for 1D/3D)
         self.conv1 = Conv(1, base_width, kernel_size=7, stride=2, padding=3, bias=False)
-        self.gn1 = nn.GroupNorm(min(32, base_width), base_width)
+        self.gn1 = nn.GroupNorm(_get_num_groups(base_width), base_width)
         self.relu = nn.ReLU(inplace=True)
         self.maxpool = MaxPool(kernel_size=3, stride=2, padding=1)
 

@@ -246,7 +275,7 @@ class ResNetBase(BaseModel):
                     bias=False,
                 ),
                 nn.GroupNorm(
-                    min(32, out_channels * block.expansion),
+                    _get_num_groups(out_channels * block.expansion),
                     out_channels * block.expansion,
                 ),
             )
{wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/train.py

@@ -69,6 +69,39 @@ _setup_cache_dir("XDG_DATA_HOME", "local/share")
 _setup_cache_dir("XDG_STATE_HOME", "local/state")
 _setup_cache_dir("XDG_CACHE_HOME", "cache")
 
+
+def _setup_per_rank_compile_cache() -> None:
+    """Set per-GPU Triton/Inductor cache to prevent multi-process race warnings.
+
+    When using torch.compile with multiple GPUs, all processes try to write to
+    the same cache directory, causing 'Directory is not empty - skipping!' warnings.
+    This gives each GPU rank its own isolated cache subdirectory.
+    """
+    # Get local rank from environment (set by accelerate/torchrun)
+    local_rank = os.environ.get("LOCAL_RANK", "0")
+
+    # Get cache base from environment or use CWD
+    cache_base = os.environ.get(
+        "TRITON_CACHE_DIR", os.path.join(os.getcwd(), ".triton_cache")
+    )
+
+    # Set per-rank cache directories
+    os.environ["TRITON_CACHE_DIR"] = os.path.join(cache_base, f"rank_{local_rank}")
+    os.environ["TORCHINDUCTOR_CACHE_DIR"] = os.path.join(
+        os.environ.get(
+            "TORCHINDUCTOR_CACHE_DIR", os.path.join(os.getcwd(), ".inductor_cache")
+        ),
+        f"rank_{local_rank}",
+    )
+
+    # Create directories
+    os.makedirs(os.environ["TRITON_CACHE_DIR"], exist_ok=True)
+    os.makedirs(os.environ["TORCHINDUCTOR_CACHE_DIR"], exist_ok=True)
+
+
+# Setup per-rank compile caches (before torch imports)
+_setup_per_rank_compile_cache()
+
 # =============================================================================
 # Standard imports (after environment setup)
 # =============================================================================
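The path arithmetic is easiest to see in isolation. A standalone sketch (with `LOCAL_RANK` set by hand here; in practice the launcher, e.g. `accelerate` or `torchrun`, sets it, and the cache base is invented):

```python
import os

os.environ["LOCAL_RANK"] = "1"                        # simulated launcher value
os.environ["TRITON_CACHE_DIR"] = "/scratch/.triton_cache"  # hypothetical base

local_rank = os.environ.get("LOCAL_RANK", "0")
cache_base = os.environ.get(
    "TRITON_CACHE_DIR", os.path.join(os.getcwd(), ".triton_cache")
)
# Each rank gets its own subdirectory, so concurrent compiles don't collide:
print(os.path.join(cache_base, f"rank_{local_rank}"))
# /scratch/.triton_cache/rank_1
```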
@@ -908,7 +941,6 @@ def main():
     logger.info("=" * len(header))
 
     try:
-        time.time()
         total_training_time = 0.0
 
         for epoch in range(start_epoch, args.epochs):

@@ -1002,49 +1034,29 @@ def main():
                     local_preds.append(pred.detach().cpu())
                     local_targets.append(y.detach().cpu())
 
-            # Concatenate locally on CPU (no GPU memory spike)
-            cpu_preds = torch.cat(local_preds)
-            cpu_targets = torch.cat(local_targets)
-
-            # Gather predictions and targets to rank 0 only (memory-efficient)
-            # Avoids duplicating full validation set on every GPU
-            if torch.distributed.is_initialized():
-                # DDP mode: gather only to rank 0
-                # NCCL backend requires CUDA tensors for collective ops
-                gpu_preds = cpu_preds.to(accelerator.device)
-                gpu_targets = cpu_targets.to(accelerator.device)
-
-                if accelerator.is_main_process:
-                    # Rank 0: allocate gather buffers on GPU
-                    all_preds_list = [
-                        torch.zeros_like(gpu_preds)
-                        for _ in range(accelerator.num_processes)
-                    ]
-                    all_targets_list = [
-                        torch.zeros_like(gpu_targets)
-                        for _ in range(accelerator.num_processes)
-                    ]
-                    torch.distributed.gather(
-                        gpu_preds, gather_list=all_preds_list, dst=0
-                    )
-                    torch.distributed.gather(
-                        gpu_targets, gather_list=all_targets_list, dst=0
-                    )
-                    # Move back to CPU for metric computation
-                    gathered = [
-                        (
-                            torch.cat(all_preds_list).cpu(),
-                            torch.cat(all_targets_list).cpu(),
-                        )
-                    ]
-                else:
-                    # Other ranks: send to rank 0, don't allocate gather buffers
-                    torch.distributed.gather(gpu_preds, gather_list=None, dst=0)
-                    torch.distributed.gather(gpu_targets, gather_list=None, dst=0)
-                    gathered = [(cpu_preds, cpu_targets)]  # Placeholder, not used
+            # Concatenate locally (keep on GPU for gather_for_metrics compatibility)
+            local_preds_cat = torch.cat(local_preds)
+            local_targets_cat = torch.cat(local_targets)
+
+            # Gather predictions and targets using Accelerate's CPU-efficient utility
+            # gather_for_metrics handles:
+            # - DDP padding removal (no need to trim manually)
+            # - Efficient cross-rank gathering without GPU memory spike
+            # - Returns concatenated tensors on CPU for metric computation
+            if accelerator.num_processes > 1:
+                # Move to GPU for gather (required by NCCL), then back to CPU
+                # gather_for_metrics is more memory-efficient than manual gather
+                # as it processes in chunks internally
+                gathered_preds = accelerator.gather_for_metrics(
+                    local_preds_cat.to(accelerator.device)
+                ).cpu()
+                gathered_targets = accelerator.gather_for_metrics(
+                    local_targets_cat.to(accelerator.device)
+                ).cpu()
             else:
                 # Single-GPU mode: no gathering needed
-                gathered = [(cpu_preds, cpu_targets)]
+                gathered_preds = local_preds_cat
+                gathered_targets = local_targets_cat
 
             # Synchronize validation metrics (scalars only - efficient)
             val_loss_scalar = val_loss_sum.item()

@@ -1069,20 +1081,10 @@ def main():
 
             # ==================== LOGGING & CHECKPOINTING ====================
             if accelerator.is_main_process:
-                # Concatenate gathered tensors from all ranks (only on rank 0)
-                # gathered is list of tuples: [(preds_rank0, targs_rank0), (preds_rank1, targs_rank1), ...]
-                all_preds = torch.cat([item[0] for item in gathered])
-                all_targets = torch.cat([item[1] for item in gathered])
-
                 # Scientific metrics - cast to float32 before numpy
-                y_pred = all_preds.float().numpy()
-                y_true = all_targets.float().numpy()
-
-                # Trim DDP padding
-                real_len = len(val_dl.dataset)
-                if len(y_pred) > real_len:
-                    y_pred = y_pred[:real_len]
-                    y_true = y_true[:real_len]
+                # gather_for_metrics already handles DDP padding removal
+                y_pred = gathered_preds.float().numpy()
+                y_true = gathered_targets.float().numpy()
 
                 # Guard against tiny validation sets (R² undefined for <2 samples)
                 if len(y_true) >= 2:
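The net effect of these two hunks: roughly forty lines of manual `torch.distributed.gather` bookkeeping collapse into `Accelerator.gather_for_metrics`, which also makes the rank-0 padding trim unnecessary. A minimal sketch of the utility in isolation (`gather_for_metrics` is a public `accelerate.Accelerator` method; the tensor shapes here are invented):

```python
import torch
from accelerate import Accelerator

accelerator = Accelerator()
local_preds = torch.randn(8, 3, device=accelerator.device)  # per-rank outputs

if accelerator.num_processes > 1:
    # Gathers across ranks and drops samples duplicated by the DDP sampler
    all_preds = accelerator.gather_for_metrics(local_preds).cpu()
else:
    all_preds = local_preds.cpu()

print(all_preds.shape)  # (total samples across ranks, minus DDP padding, 3)
```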
{wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/utils/config.py

@@ -183,9 +183,11 @@ def save_config(
         config[key] = value
 
     # Add metadata
+    from wavedl import __version__
+
     config["_metadata"] = {
         "saved_at": datetime.now().isoformat(),
-        "wavedl_version": "1.0.0",
+        "wavedl_version": __version__,
     }
 
     output_path = Path(output_path)
{wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/utils/cross_validation.py

@@ -337,6 +337,17 @@ def run_cross_validation(
         torch.cuda.manual_seed_all(seed)
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    # Auto-detect optimal DataLoader workers if not specified (matches train.py behavior)
+    if workers < 0:
+        cpu_count = os.cpu_count() or 4
+        num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 1
+        # Heuristic: 4-16 workers per GPU, bounded by available CPU cores
+        workers = min(16, max(2, (cpu_count - 2) // max(1, num_gpus)))
+        logger.info(
+            f"⚙️ Auto-detected workers: {workers} (CPUs: {cpu_count}, GPUs: {num_gpus})"
+        )
+
     logger.info(f"🚀 K-Fold Cross-Validation ({folds} folds)")
     logger.info(f"   Model: {model_name} | Device: {device}")
     logger.info(
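Worked through on a few machine shapes, the heuristic `min(16, max(2, (cpu_count - 2) // max(1, num_gpus)))` behaves like this (the hardware combinations are invented for illustration):

```python
for cpu_count, num_gpus in [(32, 4), (8, 1), (4, 1), (128, 8)]:
    workers = min(16, max(2, (cpu_count - 2) // max(1, num_gpus)))
    print(f"{cpu_count} CPUs / {num_gpus} GPU(s) -> {workers} workers")
# 32 CPUs / 4 GPU(s) -> 7 workers
# 8 CPUs / 1 GPU(s) -> 6 workers
# 4 CPUs / 1 GPU(s) -> 2 workers    (floor of 2)
# 128 CPUs / 8 GPU(s) -> 15 workers (just under the cap of 16)
```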
{wavedl-1.5.1 → wavedl-1.5.3}/src/wavedl/utils/data.py

@@ -202,18 +202,31 @@ class NPZSource(DataSource):
     """Load data from NumPy .npz archives."""
 
     @staticmethod
-    def _safe_load(path: str, mmap_mode: str | None = None):
-        """Load NPZ with pickle only if needed (sparse matrix support)."""
+    def _safe_load(path: str, keys_to_probe: list[str], mmap_mode: str | None = None):
+        """Load NPZ with pickle only if needed (sparse matrix support).
+
+        The error for object arrays happens at ACCESS time, not load time.
+        So we need to probe the keys to detect if pickle is required.
+        """
+        data = np.load(path, allow_pickle=False, mmap_mode=mmap_mode)
         try:
-            return np.load(path, allow_pickle=False, mmap_mode=mmap_mode)
-        except ValueError:
-            # Fallback for sparse matrices stored as object arrays
-            return np.load(path, allow_pickle=True, mmap_mode=mmap_mode)
+            # Probe keys to trigger error if object arrays exist
+            for key in keys_to_probe:
+                if key in data:
+                    _ = data[key]  # This raises ValueError for object arrays
+            return data
+        except ValueError as e:
+            if "allow_pickle=False" in str(e):
+                # Fallback for sparse matrices stored as object arrays
+                data.close() if hasattr(data, "close") else None
+                return np.load(path, allow_pickle=True, mmap_mode=mmap_mode)
+            raise
 
     def load(self, path: str) -> tuple[np.ndarray, np.ndarray]:
         """Load NPZ file (pickle enabled only for sparse matrices)."""
-        data = self._safe_load(path)
-        keys = list(data.keys())
+        # First pass to find keys without loading data
+        with np.load(path, allow_pickle=False) as probe:
+            keys = list(probe.keys())
 
         input_key = self._find_key(keys, INPUT_KEYS)
         output_key = self._find_key(keys, OUTPUT_KEYS)
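The docstring's claim is easy to confirm: `np.load` on an object-array `.npz` succeeds even with `allow_pickle=False`, and the `ValueError` only fires when the offending key is read. A small reproduction (the file name and payload are invented):

```python
import numpy as np

arr = np.empty(1, dtype=object)
arr[0] = {"sparse": "stand-in"}          # any object payload
np.savez("demo.npz", y=arr)

data = np.load("demo.npz", allow_pickle=False)  # no error at load time
try:
    _ = data["y"]                        # error surfaces at access time
except ValueError as e:
    print(e)  # mentions allow_pickle=False, which the fallback checks for
```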
@@ -225,6 +238,7 @@ class NPZSource(DataSource):
                 f"Found: {keys}"
             )
 
+        data = self._safe_load(path, [input_key, output_key])
         inp = data[input_key]
         outp = data[output_key]
 

@@ -243,8 +257,9 @@ class NPZSource(DataSource):
 
         Note: Returns memory-mapped arrays - do NOT modify them.
         """
-        data = self._safe_load(path, mmap_mode="r")
-        keys = list(data.keys())
+        # First pass to find keys without loading data
+        with np.load(path, allow_pickle=False) as probe:
+            keys = list(probe.keys())
 
         input_key = self._find_key(keys, INPUT_KEYS)
         output_key = self._find_key(keys, OUTPUT_KEYS)

@@ -256,6 +271,7 @@ class NPZSource(DataSource):
                 f"Found: {keys}"
             )
 
+        data = self._safe_load(path, [input_key, output_key], mmap_mode="r")
         inp = data[input_key]
         outp = data[output_key]
 

@@ -263,8 +279,9 @@ class NPZSource(DataSource):
 
     def load_outputs_only(self, path: str) -> np.ndarray:
         """Load only targets from NPZ (avoids loading large input arrays)."""
-        data = self._safe_load(path)
-        keys = list(data.keys())
+        # First pass to find keys without loading data
+        with np.load(path, allow_pickle=False) as probe:
+            keys = list(probe.keys())
 
         output_key = self._find_key(keys, OUTPUT_KEYS)
         if output_key is None:

@@ -273,6 +290,7 @@ class NPZSource(DataSource):
                 f"Supported keys: {OUTPUT_KEYS}. Found: {keys}"
             )
 
+        data = self._safe_load(path, [output_key])
         return data[output_key]
 
 

@@ -745,25 +763,77 @@ def load_test_data(
             k for k in OUTPUT_KEYS if k != "output_test"
         ]
 
-    # Load data using appropriate source
+    # Load data using appropriate source with test-key priority
+    # We detect keys first to ensure input_test/output_test are used when present
     try:
-        inp, outp = source.load(path)
+        if format == "npz":
+            with np.load(path, allow_pickle=False) as probe:
+                keys = list(probe.keys())
+            inp_key = DataSource._find_key(keys, custom_input_keys)
+            out_key = DataSource._find_key(keys, custom_output_keys)
+            if inp_key is None:
+                raise KeyError(
+                    f"Input key not found. Tried: {custom_input_keys}. Found: {keys}"
+                )
+            data = NPZSource._safe_load(
+                path, [inp_key] + ([out_key] if out_key else [])
+            )
+            inp = data[inp_key]
+            if inp.dtype == object:
+                inp = np.array(
+                    [x.toarray() if hasattr(x, "toarray") else x for x in inp]
+                )
+            outp = data[out_key] if out_key else None
+        elif format == "hdf5":
+            with h5py.File(path, "r") as f:
+                keys = list(f.keys())
+                inp_key = DataSource._find_key(keys, custom_input_keys)
+                out_key = DataSource._find_key(keys, custom_output_keys)
+                if inp_key is None:
+                    raise KeyError(
+                        f"Input key not found. Tried: {custom_input_keys}. Found: {keys}"
+                    )
+                inp = f[inp_key][:]
+                outp = f[out_key][:] if out_key else None
+        elif format == "mat":
+            mat_source = MATSource()
+            with h5py.File(path, "r") as f:
+                keys = list(f.keys())
+                inp_key = DataSource._find_key(keys, custom_input_keys)
+                out_key = DataSource._find_key(keys, custom_output_keys)
+                if inp_key is None:
+                    raise KeyError(
+                        f"Input key not found. Tried: {custom_input_keys}. Found: {keys}"
+                    )
+                inp = mat_source._load_dataset(f, inp_key)
+                if out_key:
+                    outp = mat_source._load_dataset(f, out_key)
+                    if outp.ndim == 2 and outp.shape[0] == 1:
+                        outp = outp.T
+                else:
+                    outp = None
+        else:
+            # Fallback to default source.load() for unknown formats
+            inp, outp = source.load(path)
     except KeyError:
         # Try with just inputs if outputs not found (inference-only mode)
         if format == "npz":
-            data = NPZSource._safe_load(path)
-            keys = list(data.keys())
+            # First pass to find keys
+            with np.load(path, allow_pickle=False) as probe:
+                keys = list(probe.keys())
             inp_key = DataSource._find_key(keys, custom_input_keys)
             if inp_key is None:
                 raise KeyError(
                     f"Input key not found. Tried: {custom_input_keys}. Found: {keys}"
                 )
+            out_key = DataSource._find_key(keys, custom_output_keys)
+            keys_to_probe = [inp_key] + ([out_key] if out_key else [])
+            data = NPZSource._safe_load(path, keys_to_probe)
             inp = data[inp_key]
             if inp.dtype == object:
                 inp = np.array(
                     [x.toarray() if hasattr(x, "toarray") else x for x in inp]
                 )
-            out_key = DataSource._find_key(keys, custom_output_keys)
             outp = data[out_key] if out_key else None
         elif format == "hdf5":
             # HDF5: input-only loading for inference
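To see the test-key priority in action, a brief sketch: given an NPZ that carries `*_test` keys, the cheap probe pass used throughout this refactor finds them before any array data is materialized. File name and shapes are invented:

```python
import numpy as np

np.savez(
    "test_set.npz",
    input_test=np.random.rand(10, 64, 64).astype(np.float32),
    output_test=np.random.rand(10, 3).astype(np.float32),
)

# First pass: keys only, no arrays read; these keys are then handed to
# NPZSource._safe_load as in the hunk above.
with np.load("test_set.npz", allow_pickle=False) as probe:
    keys = list(probe.keys())
print(keys)  # ['input_test', 'output_test']
```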
{wavedl-1.5.1 → wavedl-1.5.3/src/wavedl.egg-info}/PKG-INFO

(Identical to the PKG-INFO diff at the top of this page; the sdist carries the same PKG-INFO both at the package root and under src/wavedl.egg-info.)
The remaining files listed above are unchanged between 1.5.1 and 1.5.3 (+0 -0).