wavedl 1.6.3__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wavedl/__init__.py +1 -1
- wavedl/hpo.py +115 -9
- wavedl/models/_pretrained_utils.py +72 -0
- wavedl/models/_template.py +7 -6
- wavedl/models/cnn.py +20 -0
- wavedl/models/convnext.py +3 -70
- wavedl/models/convnext_v2.py +1 -18
- wavedl/models/mamba.py +126 -38
- wavedl/models/resnet3d.py +23 -5
- wavedl/models/unireplknet.py +1 -18
- wavedl/models/vit.py +18 -8
- wavedl/test.py +5 -23
- wavedl/train.py +492 -26
- wavedl/utils/__init__.py +49 -9
- wavedl/utils/config.py +6 -8
- wavedl/utils/cross_validation.py +17 -4
- wavedl/utils/data.py +140 -174
- wavedl/utils/metrics.py +26 -5
- wavedl/utils/schedulers.py +2 -2
- {wavedl-1.6.3.dist-info → wavedl-1.7.0.dist-info}/METADATA +35 -14
- wavedl-1.7.0.dist-info/RECORD +46 -0
- wavedl-1.6.3.dist-info/RECORD +0 -46
- {wavedl-1.6.3.dist-info → wavedl-1.7.0.dist-info}/LICENSE +0 -0
- {wavedl-1.6.3.dist-info → wavedl-1.7.0.dist-info}/WHEEL +0 -0
- {wavedl-1.6.3.dist-info → wavedl-1.7.0.dist-info}/entry_points.txt +0 -0
- {wavedl-1.6.3.dist-info → wavedl-1.7.0.dist-info}/top_level.txt +0 -0
wavedl/models/mamba.py CHANGED

@@ -56,6 +56,14 @@ __all__ = [
 # SELECTIVE SSM CORE (Pure PyTorch Implementation)
 # =============================================================================

+# Maximum sequence length for stable parallel scan without chunking
+# Beyond this, the chunked implementation is used automatically
+MAX_SAFE_SEQUENCE_LENGTH = 512
+
+# Recommended maximum for this pure-PyTorch implementation
+# For longer sequences, consider using the optimized mamba-ssm package
+MAX_RECOMMENDED_SEQUENCE_LENGTH = 2048
+

 class SelectiveSSM(nn.Module):
     """
@@ -64,8 +72,17 @@ class SelectiveSSM(nn.Module):
     The key innovation is making the SSM parameters (B, C, Δ) input-dependent,
     allowing the model to selectively focus on or ignore inputs.

-    This is a
-
+    This is a pure-PyTorch implementation with chunked parallel scan for
+    numerical stability. For sequences > 2048 or production use, consider
+    the optimized mamba-ssm package with CUDA kernels.
+
+    Args:
+        d_model: Model dimension
+        d_state: SSM state dimension (default: 16)
+        d_conv: Local convolution width (default: 4)
+        expand: Expansion factor for inner dimension (default: 2)
+        chunk_size: Chunk size for parallel scan (default: 256).
+            Smaller = more stable but slower. Larger = faster but may overflow.
     """

     def __init__(
@@ -74,12 +91,14 @@ class SelectiveSSM(nn.Module):
         d_state: int = 16,
         d_conv: int = 4,
         expand: int = 2,
+        chunk_size: int = 256,
     ):
         super().__init__()

         self.d_model = d_model
         self.d_state = d_state
         self.d_inner = d_model * expand
+        self.chunk_size = chunk_size

         # Input projection
         self.in_proj = nn.Linear(d_model, self.d_inner * 2, bias=False)
@@ -116,6 +135,17 @@ class SelectiveSSM(nn.Module):
         """
         _B, L, _D = x.shape

+        # Warn for very long sequences
+        if L > MAX_RECOMMENDED_SEQUENCE_LENGTH and self.training:
+            import warnings
+
+            warnings.warn(
+                f"Sequence length {L} > {MAX_RECOMMENDED_SEQUENCE_LENGTH}. "
+                "Consider using mamba-ssm package for better performance.",
+                UserWarning,
+                stacklevel=2,
+            )
+
         # Input projection and split
         xz = self.in_proj(x)  # (B, L, 2*d_inner)
         x, z = xz.chunk(2, dim=-1)  # Each: (B, L, d_inner)
@@ -135,8 +165,11 @@ class SelectiveSSM(nn.Module):
         # Discretize A
         A = -torch.exp(self.A_log)  # (d_state,)

-        #
-
+        # Use chunked scan for long sequences, direct scan for short
+        if L > MAX_SAFE_SEQUENCE_LENGTH:
+            y = self._chunked_selective_scan(x, delta, A, B_param, C_param, self.D)
+        else:
+            y = self._selective_scan_single(x, delta, A, B_param, C_param, self.D)

         # Gating
         y = y * F.silu(z)
@@ -144,7 +177,7 @@ class SelectiveSSM(nn.Module):
         # Output projection
         return self.out_proj(y)

-    def
+    def _selective_scan_single(
         self,
         x: torch.Tensor,
         delta: torch.Tensor,
@@ -154,54 +187,109 @@ class SelectiveSSM(nn.Module):
         D: torch.Tensor,
     ) -> torch.Tensor:
         """
-
+        Single-chunk parallel scan for short sequences (L <= MAX_SAFE_SEQUENCE_LENGTH).

-
-        all timesteps in parallel using cumulative products and sums.
-        ~100x faster than the naive sequential implementation.
+        Uses log-space cumsum which is stable for short sequences.
         """
+        # Compute discretized A_bar: (B, L, d_inner, d_state)
+        A_bar = torch.exp(delta.unsqueeze(-1) * A)

-        #
-        A_bar = torch.exp(delta.unsqueeze(-1) * A)  # (B, L, d_inner, d_state)
-
-        # Compute input contribution: delta * B * x for all timesteps
-        # B: (B, L, d_state), x: (B, L, d_inner), delta: (B, L, d_inner)
-        # Result: (B, L, d_inner, d_state)
+        # Input contribution: (B, L, d_inner, d_state)
         BX = delta.unsqueeze(-1) * B.unsqueeze(2) * x.unsqueeze(-1)

-        #
-        # For SSM: h[t] = A_bar[t] * h[t-1] + BX[t]
-        # This is a linear recurrence that can be solved with associative scan
-
-        # Use chunked approach for memory efficiency with parallel scan
-        # Compute cumulative product of A_bar (in log space for stability)
+        # Log-space parallel scan
         log_A_bar = torch.log(A_bar.clamp(min=1e-10))
-        log_A_cumsum = torch.cumsum(log_A_bar, dim=1)
-        A_cumsum = torch.exp(log_A_cumsum)
+        log_A_cumsum = torch.cumsum(log_A_bar, dim=1)
+        A_cumsum = torch.exp(log_A_cumsum.clamp(max=80))  # Prevent overflow

-        #
-        # = sum_{s=0}^{t} (A_cumsum[t] / A_cumsum[s]) * BX[s]
-        # = A_cumsum[t] * sum_{s=0}^{t} (BX[s] / A_cumsum[s])
-
-        # Compute BX / A_cumsum (use A_cumsum shifted by 1 for proper indexing)
-        # A_cumsum[s] represents prod_{k=0}^{s} A_bar[k], but we need prod_{k=0}^{s-1}
-        # So we shift: use A_cumsum from previous timestep
+        # Shifted cumsum for proper indexing
         A_cumsum_shifted = F.pad(A_cumsum[:, :-1], (0, 0, 0, 0, 1, 0), value=1.0)

-        # Weighted input
+        # Weighted input and cumsum
         weighted_BX = BX / A_cumsum_shifted.clamp(min=1e-10)
-
-        # Cumulative sum of weighted inputs
         weighted_BX_cumsum = torch.cumsum(weighted_BX, dim=1)

-        # Final state
-        # But A_cumsum includes A_bar[0], so adjust
+        # Final state
         h = A_cumsum * weighted_BX_cumsum / A_bar.clamp(min=1e-10)

-        # Output
-
-        y
+        # Output
+        y = (C.unsqueeze(2) * h).sum(-1) + D * x
+        return y
+
+    def _chunked_selective_scan(
+        self,
+        x: torch.Tensor,
+        delta: torch.Tensor,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        C: torch.Tensor,
+        D: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Chunked parallel scan for long sequences.
+
+        Processes in chunks of self.chunk_size, carrying state between chunks.
+        This prevents log-cumsum from growing unbounded while maintaining
+        reasonable parallelism within each chunk.
+        """
+        batch_size, seq_len, d_inner = x.shape
+        d_state = self.d_state
+        chunk_size = self.chunk_size
+
+        # Initialize output and state
+        y_chunks = []
+        h_state = torch.zeros(
+            batch_size, d_inner, d_state, device=x.device, dtype=x.dtype
+        )
+
+        # Process in chunks
+        for start in range(0, seq_len, chunk_size):
+            end = min(start + chunk_size, seq_len)
+
+            # Extract chunk
+            x_chunk = x[:, start:end]
+            delta_chunk = delta[:, start:end]
+            B_chunk = B[:, start:end]
+            C_chunk = C[:, start:end]
+
+            # Compute A_bar for chunk: (B, chunk_len, d_inner, d_state)
+            A_bar = torch.exp(delta_chunk.unsqueeze(-1) * A)
+
+            # Input contribution
+            BX = (
+                delta_chunk.unsqueeze(-1) * B_chunk.unsqueeze(2) * x_chunk.unsqueeze(-1)
+            )
+
+            # Within-chunk parallel scan (short enough to be stable)
+            log_A_bar = torch.log(A_bar.clamp(min=1e-10))
+            log_A_cumsum = torch.cumsum(log_A_bar, dim=1)
+            A_cumsum = torch.exp(log_A_cumsum.clamp(max=80))
+
+            A_cumsum_shifted = F.pad(A_cumsum[:, :-1], (0, 0, 0, 0, 1, 0), value=1.0)
+            weighted_BX = BX / A_cumsum_shifted.clamp(min=1e-10)
+            weighted_BX_cumsum = torch.cumsum(weighted_BX, dim=1)
+
+            # Chunk-internal state (without carry-over)
+            h_chunk_internal = A_cumsum * weighted_BX_cumsum / A_bar.clamp(min=1e-10)
+
+            # Add contribution from previous state
+            # h_state: (B, d_inner, d_state) -> (B, 1, d_inner, d_state)
+            # A_cumsum: (B, chunk_len, d_inner, d_state)
+            h_state_contribution = h_state.unsqueeze(1) * A_cumsum
+
+            # Total state for this chunk
+            h_chunk = h_chunk_internal + h_state_contribution
+
+            # Output for this chunk
+            y_chunk = (C_chunk.unsqueeze(2) * h_chunk).sum(-1) + D * x_chunk
+            y_chunks.append(y_chunk)
+
+            # Update carry-over state for next chunk
+            # Final state of this chunk: h_chunk[:, -1]
+            h_state = h_chunk[:, -1]

+        # Concatenate all chunks
+        y = torch.cat(y_chunks, dim=1)
         return y

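Note on the scan rewrite: the removed comments stated the identity the parallel scan exploits. For the recurrence h[t] = A_bar[t] * h[t-1] + BX[t], unrolling gives h[t] = A_cumsum[t] * sum_{s<=t} BX[s] / A_cumsum[s], where A_cumsum[t] = prod_{k<=t} A_bar[k], so one torch.cumsum replaces the O(L) loop. The 1.7.0 change limits how far that cumulative product can drift by clamping the log-cumsum and chunking past 512 steps. A minimal standalone check of the identity (illustrative only, not wavedl's exact code; the mild decay factors and float64 here deliberately sidestep the over/underflow the chunked variant guards against):

```python
import torch

torch.manual_seed(0)
B, L, d = 2, 32, 8
# Decay factors in (0.9, 1.0) keep cumulative products well-scaled;
# float64 avoids the precision loss that motivates the chunked scan.
A_bar = 0.9 + 0.1 * torch.rand(B, L, d, dtype=torch.float64)
BX = torch.randn(B, L, d, dtype=torch.float64)

# Sequential reference: h[t] = A_bar[t] * h[t-1] + BX[t]
h, hs = torch.zeros(B, d, dtype=torch.float64), []
for t in range(L):
    h = A_bar[:, t] * h + BX[:, t]
    hs.append(h)
h_seq = torch.stack(hs, dim=1)

# Parallel form: h[t] = P[t] * sum_{s<=t} BX[s] / P[s], P[t] = prod_{k<=t} A_bar[k]
P = torch.exp(torch.cumsum(torch.log(A_bar), dim=1))
h_par = P * torch.cumsum(BX / P, dim=1)

print(torch.allclose(h_seq, h_par))  # True
```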
wavedl/models/resnet3d.py CHANGED

@@ -136,6 +136,28 @@ class ResNet3DBase(BaseModel):
         if freeze_backbone:
             self._freeze_backbone()

+        # Adapt first conv for single-channel input (instead of expand in forward)
+        self._adapt_stem_for_single_channel()
+
+    def _adapt_stem_for_single_channel(self):
+        """Modify stem conv to accept 1 channel, averaging pretrained RGB weights."""
+        old_conv = self.backbone.stem[0]
+        new_conv = nn.Conv3d(
+            1,
+            old_conv.out_channels,
+            kernel_size=old_conv.kernel_size,
+            stride=old_conv.stride,
+            padding=old_conv.padding,
+            bias=old_conv.bias is not None,
+        )
+        if self.pretrained:
+            with torch.no_grad():
+                # Average RGB weights for grayscale initialization
+                new_conv.weight.copy_(old_conv.weight.mean(dim=1, keepdim=True))
+                if old_conv.bias is not None:
+                    new_conv.bias.copy_(old_conv.bias)
+        self.backbone.stem[0] = new_conv
+
     def _freeze_backbone(self):
         """Freeze all backbone parameters except the fc head."""
         for name, param in self.backbone.named_parameters():
@@ -147,15 +169,11 @@ class ResNet3DBase(BaseModel):
         Forward pass.

         Args:
-            x: Input tensor of shape (B,
+            x: Input tensor of shape (B, 1, D, H, W)

         Returns:
             Output tensor of shape (B, out_size)
         """
-        # Expand single channel to 3 channels for pretrained weights compatibility
-        if x.size(1) == 1:
-            x = x.expand(-1, 3, -1, -1, -1)
-
         return self.backbone(x)

     @classmethod
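The stem adaptation replaces the old expand-to-RGB forward path with a one-time surgery on the first conv. One subtlety: averaging the RGB kernels (rather than summing them) scales the first layer's pre-activations by 1/3 relative to the 1.6.3 expand path, since the three expanded channels were identical copies; downstream normalization can absorb this during fine-tuning. A standalone sketch with hypothetical layer sizes (not wavedl's actual stem) making that relationship explicit:

```python
import torch
import torch.nn as nn

torch.manual_seed(0)
old_conv = nn.Conv3d(3, 8, kernel_size=3, padding=1, bias=False)

# 1.7.0-style adaptation: average the RGB kernels into one input channel
new_conv = nn.Conv3d(1, 8, kernel_size=3, padding=1, bias=False)
with torch.no_grad():
    new_conv.weight.copy_(old_conv.weight.mean(dim=1, keepdim=True))

x = torch.randn(2, 1, 4, 8, 8)  # (B, 1, D, H, W) single-channel volume

out_old = old_conv(x.expand(-1, 3, -1, -1, -1))  # 1.6.3 path: expand to RGB
out_new = new_conv(x)                            # 1.7.0 path: adapted stem

# With identical input channels, summed RGB kernels equal 3x their mean:
print(torch.allclose(out_old, 3 * out_new, atol=1e-5))  # True
```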
wavedl/models/unireplknet.py CHANGED

@@ -37,6 +37,7 @@ import torch
 import torch.nn as nn

 from wavedl.models._pretrained_utils import (
+    DropPath,
     LayerNormNd,
     get_conv_layer,
     get_grn_layer,
@@ -133,24 +134,6 @@ class SEBlock(nn.Module):
         return x * scale


-class DropPath(nn.Module):
-    """Stochastic Depth (drop path) regularization."""
-
-    def __init__(self, drop_prob: float = 0.0):
-        super().__init__()
-        self.drop_prob = drop_prob
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        if self.drop_prob == 0.0 or not self.training:
-            return x
-
-        keep_prob = 1 - self.drop_prob
-        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
-        random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
-        random_tensor.floor_()
-        return x.div(keep_prob) * random_tensor
-
-
 # =============================================================================
 # UNIREPLKNET BLOCK
 # =============================================================================
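DropPath now lives in wavedl.models._pretrained_utils (part of the new +72-line module listed above), deduplicating the copies previously embedded in the backbone files. A quick behavioral check of the relocated module, based on the implementation visible in the removed lines:

```python
import torch

from wavedl.models._pretrained_utils import DropPath  # new shared location

dp = DropPath(drop_prob=0.5)
x = torch.ones(8, 4)

dp.eval()
assert torch.equal(dp(x), x)  # identity at inference

dp.train()
out = dp(x)
# Whole samples are dropped or rescaled by 1/keep_prob = 2.0:
print(out[:, 0])  # each entry is 0.0 or 2.0
```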
wavedl/models/vit.py CHANGED

@@ -150,17 +150,22 @@ class PatchEmbed(nn.Module):


 class MultiHeadAttention(nn.Module):
-    """
+    """
+    Multi-head self-attention mechanism.
+
+    Uses F.scaled_dot_product_attention (PyTorch 2.0+) for efficient,
+    fused attention with automatic Flash Attention support when available.
+    """

     def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0):
         super().__init__()
         self.num_heads = num_heads
         self.head_dim = embed_dim // num_heads
-        self.
+        self.dropout_p = dropout

         self.qkv = nn.Linear(embed_dim, embed_dim * 3, bias=True)
         self.proj = nn.Linear(embed_dim, embed_dim)
-        self.
+        self.proj_dropout = nn.Dropout(dropout)

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         B, N, C = x.shape
@@ -169,13 +174,18 @@ class MultiHeadAttention(nn.Module):
         qkv = qkv.permute(2, 0, 3, 1, 4)  # (3, B, heads, N, head_dim)
         q, k, v = qkv[0], qkv[1], qkv[2]

-
-
-
+        # Use fused SDPA (PyTorch 2.0+) for efficiency + Flash Attention
+        x = torch.nn.functional.scaled_dot_product_attention(
+            q,
+            k,
+            v,
+            dropout_p=self.dropout_p if self.training else 0.0,
+            is_causal=False,
+        )

-        x =
+        x = x.transpose(1, 2).reshape(B, N, C)
         x = self.proj(x)
-        x = self.
+        x = self.proj_dropout(x)
         return x

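The attention rewrite replaces the previous formulation (elided in this diff's removed lines) with the fused kernel, which dispatches to Flash Attention or memory-efficient backends when available. At inference, the fused call is numerically interchangeable with the classic explicit softmax(QK^T / sqrt(d))V computation, which a standalone check confirms (illustrative, not wavedl's module):

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
B, H, N, d = 2, 4, 16, 32
q, k, v = (torch.randn(B, H, N, d) for _ in range(3))

# Fused kernel now used in MultiHeadAttention.forward (dropout off, as at eval)
out_fused = F.scaled_dot_product_attention(q, k, v, dropout_p=0.0, is_causal=False)

# The classic explicit formulation
attn = torch.softmax(q @ k.transpose(-2, -1) / d**0.5, dim=-1)
out_manual = attn @ v

print(torch.allclose(out_fused, out_manual, atol=1e-5))  # True
```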
wavedl/test.py CHANGED

@@ -33,35 +33,17 @@ Author: Ductho Le (ductho.le@outlook.com)
 # Uses current working directory as fallback - works on HPC and local machines.
 import os

+# Import and call HPC cache setup before any library imports
+from wavedl.utils import setup_hpc_cache_dirs

-def _setup_cache_dir(env_var: str, subdir: str) -> None:
-    """Set cache directory to CWD if home is not writable."""
-    if env_var in os.environ:
-        return  # User already set, respect their choice

-
-    home = os.path.expanduser("~")
-    if os.access(home, os.W_OK):
-        return  # Home is writable, let library use defaults
-
-    # Home not writable - use current working directory
-    cache_path = os.path.join(os.getcwd(), f".{subdir}")
-    os.makedirs(cache_path, exist_ok=True)
-    os.environ[env_var] = cache_path
-
-
-# Configure cache directories (before any library imports)
-_setup_cache_dir("TORCH_HOME", "torch_cache")
-_setup_cache_dir("MPLCONFIGDIR", "matplotlib")
-_setup_cache_dir("FONTCONFIG_CACHE", "fontconfig")
-_setup_cache_dir("XDG_DATA_HOME", "local/share")
-_setup_cache_dir("XDG_STATE_HOME", "local/state")
-_setup_cache_dir("XDG_CACHE_HOME", "cache")
+setup_hpc_cache_dirs()

 import argparse  # noqa: E402
 import logging  # noqa: E402
 import pickle  # noqa: E402
 from pathlib import Path  # noqa: E402
+from typing import Any  # noqa: E402

 import matplotlib.pyplot as plt  # noqa: E402
 import numpy as np  # noqa: E402
@@ -314,7 +296,7 @@ def load_checkpoint(
     in_shape: tuple[int, ...],
     out_size: int,
     model_name: str | None = None,
-) -> tuple[nn.Module,
+) -> tuple[nn.Module, Any]:
     """
     Load model and scaler from Accelerate checkpoint directory.

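The inline cache-directory logic is consolidated into wavedl.utils.setup_hpc_cache_dirs, presumably among the +49 lines in utils/__init__.py above, so it can be shared across entry points. The helper's body is not shown in this diff; the following is a sketch assuming it mirrors the logic removed from test.py:

```python
import os


def setup_hpc_cache_dirs() -> None:
    """Redirect library caches to CWD when $HOME is read-only (sketch).

    The real implementation lives in wavedl/utils and is not shown in this
    diff; this mirrors the per-variable logic removed from test.py above.
    """
    if os.access(os.path.expanduser("~"), os.W_OK):
        return  # home is writable; let libraries use their defaults
    for env_var, subdir in [
        ("TORCH_HOME", "torch_cache"),
        ("MPLCONFIGDIR", "matplotlib"),
        ("FONTCONFIG_CACHE", "fontconfig"),
        ("XDG_DATA_HOME", "local/share"),
        ("XDG_STATE_HOME", "local/state"),
        ("XDG_CACHE_HOME", "cache"),
    ]:
        if env_var in os.environ:
            continue  # user already set it; respect their choice
        cache_path = os.path.join(os.getcwd(), f".{subdir}")
        os.makedirs(cache_path, exist_ok=True)
        os.environ[env_var] = cache_path
```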
|