monai-weekly 1.4.dev2431__py3-none-any.whl → 1.4.dev2435__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (49)
  1. monai/__init__.py +1 -1
  2. monai/_version.py +3 -3
  3. monai/apps/generation/maisi/networks/autoencoderkl_maisi.py +43 -25
  4. monai/apps/generation/maisi/networks/controlnet_maisi.py +15 -18
  5. monai/apps/generation/maisi/networks/diffusion_model_unet_maisi.py +18 -18
  6. monai/apps/vista3d/inferer.py +177 -0
  7. monai/apps/vista3d/sampler.py +179 -0
  8. monai/apps/vista3d/transforms.py +224 -0
  9. monai/bundle/scripts.py +29 -17
  10. monai/data/utils.py +1 -1
  11. monai/data/wsi_datasets.py +3 -3
  12. monai/inferers/utils.py +1 -0
  13. monai/losses/__init__.py +1 -0
  14. monai/losses/dice.py +10 -1
  15. monai/losses/nacl_loss.py +139 -0
  16. monai/networks/blocks/crossattention.py +48 -26
  17. monai/networks/blocks/mlp.py +16 -4
  18. monai/networks/blocks/selfattention.py +75 -23
  19. monai/networks/blocks/spatialattention.py +16 -1
  20. monai/networks/blocks/transformerblock.py +17 -2
  21. monai/networks/layers/filtering.py +6 -2
  22. monai/networks/nets/__init__.py +2 -1
  23. monai/networks/nets/autoencoderkl.py +55 -22
  24. monai/networks/nets/cell_sam_wrapper.py +92 -0
  25. monai/networks/nets/controlnet.py +24 -22
  26. monai/networks/nets/diffusion_model_unet.py +159 -19
  27. monai/networks/nets/segresnet_ds.py +127 -1
  28. monai/networks/nets/spade_autoencoderkl.py +22 -0
  29. monai/networks/nets/spade_diffusion_model_unet.py +39 -2
  30. monai/networks/nets/transformer.py +17 -17
  31. monai/networks/nets/vista3d.py +946 -0
  32. monai/networks/utils.py +4 -4
  33. monai/transforms/__init__.py +13 -2
  34. monai/transforms/io/array.py +59 -3
  35. monai/transforms/io/dictionary.py +29 -2
  36. monai/transforms/spatial/functional.py +1 -1
  37. monai/transforms/transform.py +2 -2
  38. monai/transforms/utility/dictionary.py +4 -0
  39. monai/transforms/utils.py +230 -1
  40. monai/{apps/generation/maisi/utils/morphological_ops.py → transforms/utils_morphological_ops.py} +2 -0
  41. monai/transforms/utils_pytorch_numpy_unification.py +2 -2
  42. monai/utils/enums.py +1 -0
  43. monai/utils/module.py +7 -6
  44. {monai_weekly-1.4.dev2431.dist-info → monai_weekly-1.4.dev2435.dist-info}/METADATA +84 -81
  45. {monai_weekly-1.4.dev2431.dist-info → monai_weekly-1.4.dev2435.dist-info}/RECORD +49 -43
  46. {monai_weekly-1.4.dev2431.dist-info → monai_weekly-1.4.dev2435.dist-info}/WHEEL +1 -1
  47. /monai/apps/{generation/maisi/utils → vista3d}/__init__.py +0 -0
  48. {monai_weekly-1.4.dev2431.dist-info → monai_weekly-1.4.dev2435.dist-info}/LICENSE +0 -0
  49. {monai_weekly-1.4.dev2431.dist-info → monai_weekly-1.4.dev2435.dist-info}/top_level.txt +0 -0
monai/losses/nacl_loss.py (new file)
@@ -0,0 +1,139 @@
+ # Copyright (c) MONAI Consortium
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from __future__ import annotations
+
+ from typing import Any
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.nn.modules.loss import _Loss
+
+ from monai.networks.layers import GaussianFilter, MeanFilter
+
+
+ class NACLLoss(_Loss):
+     """
+     Neighbor-Aware Calibration Loss (NACL) is primarily developed for developing calibrated models in image segmentation.
+     NACL computes standard cross-entropy loss with a linear penalty that enforces the logit distributions
+     to match a soft class proportion of surrounding pixel.
+
+     Murugesan, Balamurali, et al.
+     "Trust your neighbours: Penalty-based constraints for model calibration."
+     International Conference on Medical Image Computing and Computer-Assisted Intervention, MICCAI 2023.
+     https://arxiv.org/abs/2303.06268
+
+     Murugesan, Balamurali, et al.
+     "Neighbor-Aware Calibration of Segmentation Networks with Penalty-Based Constraints."
+     https://arxiv.org/abs/2401.14487
+     """
+
+     def __init__(
+         self,
+         classes: int,
+         dim: int,
+         kernel_size: int = 3,
+         kernel_ops: str = "mean",
+         distance_type: str = "l1",
+         alpha: float = 0.1,
+         sigma: float = 1.0,
+     ) -> None:
+         """
+         Args:
+             classes: number of classes
+             dim: dimension of data (supports 2d and 3d)
+             kernel_size: size of the spatial kernel
+             distance_type: l1/l2 distance between spatial kernel and predicted logits
+             alpha: weightage between cross entropy and logit constraint
+             sigma: sigma of gaussian
+         """
+
+         super().__init__()
+
+         if kernel_ops not in ["mean", "gaussian"]:
+             raise ValueError("Kernel ops must be either mean or gaussian")
+
+         if dim not in [2, 3]:
+             raise ValueError(f"Support 2d and 3d, got dim={dim}.")
+
+         if distance_type not in ["l1", "l2"]:
+             raise ValueError(f"Distance type must be either L1 or L2, got {distance_type}")
+
+         self.nc = classes
+         self.dim = dim
+         self.cross_entropy = nn.CrossEntropyLoss()
+         self.distance_type = distance_type
+         self.alpha = alpha
+         self.ks = kernel_size
+         self.svls_layer: Any
+
+         if kernel_ops == "mean":
+             self.svls_layer = MeanFilter(spatial_dims=dim, size=kernel_size)
+             self.svls_layer.filter = self.svls_layer.filter / (kernel_size**dim)
+         if kernel_ops == "gaussian":
+             self.svls_layer = GaussianFilter(spatial_dims=dim, sigma=sigma)
+
+     def get_constr_target(self, mask: torch.Tensor) -> torch.Tensor:
+         """
+         Converts the mask to one hot represenation and is smoothened with the selected spatial filter.
+
+         Args:
+             mask: the shape should be BH[WD].
+
+         Returns:
+             torch.Tensor: the shape would be BNH[WD], N being number of classes.
+         """
+         rmask: torch.Tensor
+
+         if self.dim == 2:
+             oh_labels = F.one_hot(mask.to(torch.int64), num_classes=self.nc).permute(0, 3, 1, 2).contiguous().float()
+             rmask = self.svls_layer(oh_labels)
+
+         if self.dim == 3:
+             oh_labels = F.one_hot(mask.to(torch.int64), num_classes=self.nc).permute(0, 4, 1, 2, 3).contiguous().float()
+             rmask = self.svls_layer(oh_labels)
+
+         return rmask
+
+     def forward(self, inputs: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
+         """
+         Computes standard cross-entropy loss and constraints it neighbor aware logit penalty.
+
+         Args:
+             inputs: the shape should be BNH[WD], where N is the number of classes.
+             targets: the shape should be BH[WD].
+
+         Returns:
+             torch.Tensor: value of the loss.
+
+         Example:
+             >>> import torch
+             >>> from monai.losses import NACLLoss
+             >>> B, N, H, W = 8, 3, 64, 64
+             >>> input = torch.rand(B, N, H, W)
+             >>> target = torch.randint(0, N, (B, H, W))
+             >>> criterion = NACLLoss(classes = N, dim = 2)
+             >>> loss = criterion(input, target)
+         """
+
+         loss_ce = self.cross_entropy(inputs, targets)
+
+         utargets = self.get_constr_target(targets)
+
+         if self.distance_type == "l1":
+             loss_conf = utargets.sub(inputs).abs_().mean()
+         elif self.distance_type == "l2":
+             loss_conf = utargets.sub(inputs).pow_(2).abs_().mean()
+
+         loss: torch.Tensor = loss_ce + self.alpha * loss_conf
+
+         return loss
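A minimal usage sketch for the 3D case, mirroring the 2D doctest in the docstring above; the shapes and hyper-parameters below are illustrative and not taken from the package:

    import torch
    from monai.losses import NACLLoss

    # illustrative shapes: batch of 2, 4 classes, 32^3 volume
    B, N, H, W, D = 2, 4, 32, 32, 32
    logits = torch.rand(B, N, H, W, D, requires_grad=True)  # network outputs, BNHWD
    labels = torch.randint(0, N, (B, H, W, D))              # integer mask, BHWD

    # use the gaussian neighbourhood filter instead of the default mean filter
    criterion = NACLLoss(classes=N, dim=3, kernel_ops="gaussian", sigma=1.0, alpha=0.1)
    loss = criterion(logits, labels)
    loss.backward()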
monai/networks/blocks/crossattention.py
@@ -17,7 +17,7 @@ import torch
  import torch.nn as nn
 
  from monai.networks.layers.utils import get_rel_pos_embedding_layer
- from monai.utils import optional_import
+ from monai.utils import optional_import, pytorch_after
 
  Rearrange, _ = optional_import("einops.layers.torch", name="Rearrange")
 
@@ -44,6 +44,7 @@ class CrossAttentionBlock(nn.Module):
          rel_pos_embedding: Optional[str] = None,
          input_size: Optional[Tuple] = None,
          attention_dtype: Optional[torch.dtype] = None,
+         use_flash_attention: bool = False,
      ) -> None:
          """
          Args:
@@ -55,13 +56,15 @@ class CrossAttentionBlock(nn.Module):
              dim_head (int, optional): dimension of each head. Defaults to hidden_size // num_heads.
              qkv_bias (bool, optional): bias term for the qkv linear layer. Defaults to False.
              save_attn (bool, optional): to make accessible the attention matrix. Defaults to False.
-             causal: whether to use causal attention.
-             sequence_length: if causal is True, it is necessary to specify the sequence length.
-             rel_pos_embedding (str, optional): Add relative positional embeddings to the attention map.
-                 For now only "decomposed" is supported (see https://arxiv.org/abs/2112.01526). 2D and 3D are supported.
-             input_size (tuple(spatial_dim), optional): Input resolution for calculating the relative
-                 positional parameter size.
+             causal (bool, optional): whether to use causal attention.
+             sequence_length (int, optional): if causal is True, it is necessary to specify the sequence length.
+             rel_pos_embedding (str, optional): Add relative positional embeddings to the attention map. For now only
+                 "decomposed" is supported (see https://arxiv.org/abs/2112.01526). 2D and 3D are supported.
+             input_size (tuple(spatial_dim), optional): Input resolution for calculating the relative positional
+                 parameter size.
              attention_dtype: cast attention operations to this dtype.
+             use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism
+                 (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html).
          """
 
          super().__init__()
@@ -81,6 +84,20 @@ class CrossAttentionBlock(nn.Module):
          if causal and sequence_length is None:
              raise ValueError("sequence_length is necessary for causal attention.")
 
+         if use_flash_attention and not pytorch_after(minor=13, major=1, patch=0):
+             raise ValueError(
+                 "use_flash_attention is only supported for PyTorch versions >= 2.0."
+                 "Upgrade your PyTorch or set the flag to False."
+             )
+         if use_flash_attention and save_attn:
+             raise ValueError(
+                 "save_attn has been set to True, but use_flash_attention is also set"
+                 "to True. save_attn can only be used if use_flash_attention is False"
+             )
+
+         if use_flash_attention and rel_pos_embedding is not None:
+             raise ValueError("rel_pos_embedding must be None if you are using flash_attention.")
+
          self.num_heads = num_heads
          self.hidden_input_size = hidden_input_size if hidden_input_size else hidden_size
          self.context_input_size = context_input_size if context_input_size else hidden_size
@@ -91,9 +108,10 @@ class CrossAttentionBlock(nn.Module):
          self.to_v = nn.Linear(self.context_input_size, inner_size, bias=qkv_bias)
          self.input_rearrange = Rearrange("b h (l d) -> b l h d", l=num_heads)
 
-         self.out_rearrange = Rearrange("b h l d -> b l (h d)")
+         self.out_rearrange = Rearrange("b l h d -> b h (l d)")
          self.drop_output = nn.Dropout(dropout_rate)
          self.drop_weights = nn.Dropout(dropout_rate)
+         self.dropout_rate = dropout_rate
 
          self.scale = self.head_dim**-0.5
          self.save_attn = save_attn
@@ -101,6 +119,7 @@ class CrossAttentionBlock(nn.Module):
 
          self.causal = causal
          self.sequence_length = sequence_length
+         self.use_flash_attention = use_flash_attention
 
          if causal and sequence_length is not None:
              # causal mask to ensure that attention is only applied to the left in the input sequence
@@ -132,36 +151,39 @@ class CrossAttentionBlock(nn.Module):
          # calculate query, key, values for all heads in batch and move head forward to be the batch dim
          b, t, c = x.size()  # batch size, sequence length, embedding dimensionality (hidden_size)
 
-         q = self.to_q(x)
+         q = self.input_rearrange(self.to_q(x))
          kv = context if context is not None else x
          _, kv_t, _ = kv.size()
-         k = self.to_k(kv)
-         v = self.to_v(kv)
+         k = self.input_rearrange(self.to_k(kv))
+         v = self.input_rearrange(self.to_v(kv))
 
          if self.attention_dtype is not None:
              q = q.to(self.attention_dtype)
              k = k.to(self.attention_dtype)
 
-         q = q.view(b, t, self.num_heads, c // self.num_heads).transpose(1, 2)  # (b, nh, t, hs)
-         k = k.view(b, kv_t, self.num_heads, c // self.num_heads).transpose(1, 2)  # (b, nh, kv_t, hs)
-         v = v.view(b, kv_t, self.num_heads, c // self.num_heads).transpose(1, 2)  # (b, nh, kv_t, hs)
-         att_mat = torch.einsum("blxd,blyd->blxy", q, k) * self.scale
+         if self.use_flash_attention:
+             x = torch.nn.functional.scaled_dot_product_attention(
+                 query=q, key=k, value=v, scale=self.scale, dropout_p=self.dropout_rate, is_causal=self.causal
+             )
+         else:
+             att_mat = torch.einsum("blxd,blyd->blxy", q, k) * self.scale
+             # apply relative positional embedding if defined
+             if self.rel_positional_embedding is not None:
+                 att_mat = self.rel_positional_embedding(x, att_mat, q)
 
-         # apply relative positional embedding if defined
-         att_mat = self.rel_positional_embedding(x, att_mat, q) if self.rel_positional_embedding is not None else att_mat
+             if self.causal:
+                 att_mat = att_mat.masked_fill(self.causal_mask[:, :, :t, :kv_t] == 0, float("-inf"))
 
-         if self.causal:
-             att_mat = att_mat.masked_fill(self.causal_mask[:, :, :t, :kv_t] == 0, float("-inf"))
+             att_mat = att_mat.softmax(dim=-1)
 
-         att_mat = att_mat.softmax(dim=-1)
+             if self.save_attn:
+                 # no gradients and new tensor;
+                 # https://pytorch.org/docs/stable/generated/torch.Tensor.detach.html
+                 self.att_mat = att_mat.detach()
 
-         if self.save_attn:
-             # no gradients and new tensor;
-             # https://pytorch.org/docs/stable/generated/torch.Tensor.detach.html
-             self.att_mat = att_mat.detach()
+             att_mat = self.drop_weights(att_mat)
+             x = torch.einsum("bhxy,bhyd->bhxd", att_mat, v)
 
-         att_mat = self.drop_weights(att_mat)
-         x = torch.einsum("bhxy,bhyd->bhxd", att_mat, v)
          x = self.out_rearrange(x)
          x = self.out_proj(x)
          x = self.drop_output(x)
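A short sketch of how the new flag might be used, assuming a PyTorch build that provides torch.nn.functional.scaled_dot_product_attention; the tensor shapes are illustrative:

    import torch
    from monai.networks.blocks.crossattention import CrossAttentionBlock

    x = torch.randn(2, 64, 256)    # (batch, query tokens, hidden_size)
    ctx = torch.randn(2, 77, 256)  # (batch, context tokens, hidden_size)

    # flash attention replaces the explicit einsum/softmax path inside forward()
    block = CrossAttentionBlock(hidden_size=256, num_heads=8, use_flash_attention=True)
    out = block(x, context=ctx)    # -> (2, 64, 256)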
monai/networks/blocks/mlp.py
@@ -11,12 +11,15 @@
 
  from __future__ import annotations
 
+ from typing import Union
+
  import torch.nn as nn
 
  from monai.networks.layers import get_act_layer
+ from monai.networks.layers.factories import split_args
  from monai.utils import look_up_option
 
- SUPPORTED_DROPOUT_MODE = {"vit", "swin"}
+ SUPPORTED_DROPOUT_MODE = {"vit", "swin", "vista3d"}
 
 
  class MLPBlock(nn.Module):
@@ -39,7 +42,7 @@ class MLPBlock(nn.Module):
                  https://github.com/google-research/vision_transformer/blob/main/vit_jax/models.py#L87
                  "swin" corresponds to one instance as implemented in
                  https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_mlp.py#L23
-
+                 "vista3d" mode does not use dropout.
 
          """
 
@@ -48,15 +51,24 @@ class MLPBlock(nn.Module):
          if not (0 <= dropout_rate <= 1):
              raise ValueError("dropout_rate should be between 0 and 1.")
          mlp_dim = mlp_dim or hidden_size
-         self.linear1 = nn.Linear(hidden_size, mlp_dim) if act != "GEGLU" else nn.Linear(hidden_size, mlp_dim * 2)
+         act_name, _ = split_args(act)
+         self.linear1 = nn.Linear(hidden_size, mlp_dim) if act_name != "GEGLU" else nn.Linear(hidden_size, mlp_dim * 2)
          self.linear2 = nn.Linear(mlp_dim, hidden_size)
          self.fn = get_act_layer(act)
-         self.drop1 = nn.Dropout(dropout_rate)
+         # Use Union[nn.Dropout, nn.Identity] for type annotations
+         self.drop1: Union[nn.Dropout, nn.Identity]
+         self.drop2: Union[nn.Dropout, nn.Identity]
+
          dropout_opt = look_up_option(dropout_mode, SUPPORTED_DROPOUT_MODE)
          if dropout_opt == "vit":
+             self.drop1 = nn.Dropout(dropout_rate)
              self.drop2 = nn.Dropout(dropout_rate)
          elif dropout_opt == "swin":
+             self.drop1 = nn.Dropout(dropout_rate)
              self.drop2 = self.drop1
+         elif dropout_opt == "vista3d":
+             self.drop1 = nn.Identity()
+             self.drop2 = nn.Identity()
          else:
              raise ValueError(f"dropout_mode should be one of {SUPPORTED_DROPOUT_MODE}")
 
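A brief sketch of the two behaviours this change enables; the shapes and the ("GELU", {"approximate": "tanh"}) activation tuple are illustrative assumptions, not values from the package:

    import torch
    from monai.networks.blocks.mlp import MLPBlock

    x = torch.randn(2, 64, 256)

    # the new "vista3d" mode keeps the block dropout-free regardless of dropout_rate
    mlp = MLPBlock(hidden_size=256, mlp_dim=1024, dropout_rate=0.1, dropout_mode="vista3d")
    y = mlp(x)  # -> (2, 64, 256)

    # act may now be passed as a (name, args) tuple; split_args extracts the name
    # so the GEGLU width check still works for parameterized activations
    mlp_gelu = MLPBlock(hidden_size=256, mlp_dim=1024, act=("GELU", {"approximate": "tanh"}))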
monai/networks/blocks/selfattention.py
@@ -11,13 +11,14 @@
 
  from __future__ import annotations
 
- from typing import Optional, Tuple
+ from typing import Tuple, Union
 
  import torch
  import torch.nn as nn
+ import torch.nn.functional as F
 
  from monai.networks.layers.utils import get_rel_pos_embedding_layer
- from monai.utils import optional_import
+ from monai.utils import optional_import, pytorch_after
 
  Rearrange, _ = optional_import("einops.layers.torch", name="Rearrange")
 
@@ -39,9 +40,12 @@ class SABlock(nn.Module):
          hidden_input_size: int | None = None,
          causal: bool = False,
          sequence_length: int | None = None,
-         rel_pos_embedding: Optional[str] = None,
-         input_size: Optional[Tuple] = None,
-         attention_dtype: Optional[torch.dtype] = None,
+         rel_pos_embedding: str | None = None,
+         input_size: Tuple | None = None,
+         attention_dtype: torch.dtype | None = None,
+         include_fc: bool = True,
+         use_combined_linear: bool = True,
+         use_flash_attention: bool = False,
      ) -> None:
          """
          Args:
@@ -59,6 +63,10 @@ class SABlock(nn.Module):
              input_size (tuple(spatial_dim), optional): Input resolution for calculating the relative
                  positional parameter size.
              attention_dtype: cast attention operations to this dtype.
+             include_fc: whether to include the final linear layer. Default to True.
+             use_combined_linear: whether to use a single linear layer for qkv projection, default to True.
+             use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism
+                 (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html).
 
          """
 
@@ -82,21 +90,52 @@ class SABlock(nn.Module):
          if causal and sequence_length is None:
              raise ValueError("sequence_length is necessary for causal attention.")
 
+         if use_flash_attention and not pytorch_after(minor=13, major=1, patch=0):
+             raise ValueError(
+                 "use_flash_attention is only supported for PyTorch versions >= 2.0."
+                 "Upgrade your PyTorch or set the flag to False."
+             )
+         if use_flash_attention and save_attn:
+             raise ValueError(
+                 "save_attn has been set to True, but use_flash_attention is also set"
+                 "to True. save_attn can only be used if use_flash_attention is False."
+             )
+
+         if use_flash_attention and rel_pos_embedding is not None:
+             raise ValueError("rel_pos_embedding must be None if you are using flash_attention.")
+
          self.num_heads = num_heads
          self.hidden_input_size = hidden_input_size if hidden_input_size else hidden_size
          self.out_proj = nn.Linear(self.inner_dim, self.hidden_input_size)
 
-         self.qkv = nn.Linear(self.hidden_input_size, self.inner_dim * 3, bias=qkv_bias)
-         self.input_rearrange = Rearrange("b h (qkv l d) -> qkv b l h d", qkv=3, l=num_heads)
-         self.out_rearrange = Rearrange("b h l d -> b l (h d)")
+         self.qkv: Union[nn.Linear, nn.Identity]
+         self.to_q: Union[nn.Linear, nn.Identity]
+         self.to_k: Union[nn.Linear, nn.Identity]
+         self.to_v: Union[nn.Linear, nn.Identity]
+
+         if use_combined_linear:
+             self.qkv = nn.Linear(self.hidden_input_size, self.inner_dim * 3, bias=qkv_bias)
+             self.to_q = self.to_k = self.to_v = nn.Identity()  # add to enable torchscript
+             self.input_rearrange = Rearrange("b h (qkv l d) -> qkv b l h d", qkv=3, l=num_heads)
+         else:
+             self.to_q = nn.Linear(self.hidden_input_size, self.inner_dim, bias=qkv_bias)
+             self.to_k = nn.Linear(self.hidden_input_size, self.inner_dim, bias=qkv_bias)
+             self.to_v = nn.Linear(self.hidden_input_size, self.inner_dim, bias=qkv_bias)
+             self.qkv = nn.Identity()  # add to enable torchscript
+             self.input_rearrange = Rearrange("b h (l d) -> b l h d", l=num_heads)
+         self.out_rearrange = Rearrange("b l h d -> b h (l d)")
          self.drop_output = nn.Dropout(dropout_rate)
          self.drop_weights = nn.Dropout(dropout_rate)
+         self.dropout_rate = dropout_rate
          self.scale = self.dim_head**-0.5
          self.save_attn = save_attn
          self.att_mat = torch.Tensor()
          self.attention_dtype = attention_dtype
          self.causal = causal
          self.sequence_length = sequence_length
+         self.include_fc = include_fc
+         self.use_combined_linear = use_combined_linear
+         self.use_flash_attention = use_flash_attention
 
          if causal and sequence_length is not None:
              # causal mask to ensure that attention is only applied to the left in the input sequence
@@ -123,31 +162,44 @@ class SABlock(nn.Module):
          Return:
              torch.Tensor: B x (s_dim_1 * ... * s_dim_n) x C
          """
-         output = self.input_rearrange(self.qkv(x))
-         q, k, v = output[0], output[1], output[2]
+         if self.use_combined_linear:
+             output = self.input_rearrange(self.qkv(x))
+             q, k, v = output[0], output[1], output[2]
+         else:
+             q = self.input_rearrange(self.to_q(x))
+             k = self.input_rearrange(self.to_k(x))
+             v = self.input_rearrange(self.to_v(x))
 
          if self.attention_dtype is not None:
              q = q.to(self.attention_dtype)
             k = k.to(self.attention_dtype)
 
-         att_mat = torch.einsum("blxd,blyd->blxy", q, k) * self.scale
+         if self.use_flash_attention:
+             x = F.scaled_dot_product_attention(
+                 query=q, key=k, value=v, scale=self.scale, dropout_p=self.dropout_rate, is_causal=self.causal
+             )
+         else:
+             att_mat = torch.einsum("blxd,blyd->blxy", q, k) * self.scale
+
+             # apply relative positional embedding if defined
+             if self.rel_positional_embedding is not None:
+                 att_mat = self.rel_positional_embedding(x, att_mat, q)
 
-         # apply relative positional embedding if defined
-         att_mat = self.rel_positional_embedding(x, att_mat, q) if self.rel_positional_embedding is not None else att_mat
+             if self.causal:
+                 att_mat = att_mat.masked_fill(self.causal_mask[:, :, : x.shape[-2], : x.shape[-2]] == 0, float("-inf"))
 
-         if self.causal:
-             att_mat = att_mat.masked_fill(self.causal_mask[:, :, : x.shape[1], : x.shape[1]] == 0, float("-inf"))
+             att_mat = att_mat.softmax(dim=-1)
 
-         att_mat = att_mat.softmax(dim=-1)
+             if self.save_attn:
+                 # no gradients and new tensor;
+                 # https://pytorch.org/docs/stable/generated/torch.Tensor.detach.html
+                 self.att_mat = att_mat.detach()
 
-         if self.save_attn:
-             # no gradients and new tensor;
-             # https://pytorch.org/docs/stable/generated/torch.Tensor.detach.html
-             self.att_mat = att_mat.detach()
+             att_mat = self.drop_weights(att_mat)
+             x = torch.einsum("bhxy,bhyd->bhxd", att_mat, v)
 
-         att_mat = self.drop_weights(att_mat)
-         x = torch.einsum("bhxy,bhyd->bhxd", att_mat, v)
          x = self.out_rearrange(x)
-         x = self.out_proj(x)
+         if self.include_fc:
+             x = self.out_proj(x)
          x = self.drop_output(x)
          return x
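A minimal sketch combining the new SABlock options; the hidden size and token count are illustrative assumptions:

    import torch
    from monai.networks.blocks.selfattention import SABlock

    x = torch.randn(2, 196, 384)  # (batch, tokens, hidden_size)

    # separate q/k/v projections, skip the final projection layer, use flash attention
    attn = SABlock(
        hidden_size=384,
        num_heads=6,
        use_combined_linear=False,
        include_fc=False,
        use_flash_attention=True,
    )
    y = attn(x)  # -> (2, 196, 384)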
monai/networks/blocks/spatialattention.py
@@ -32,7 +32,13 @@ class SpatialAttentionBlock(nn.Module):
          spatial_dims: number of spatial dimensions, could be 1, 2, or 3.
          num_channels: number of input channels. Must be divisible by num_head_channels.
          num_head_channels: number of channels per head.
+         norm_num_groups: Number of groups for the group norm layer.
+         norm_eps: Epsilon for the normalization.
          attention_dtype: cast attention operations to this dtype.
+         include_fc: whether to include the final linear layer. Default to True.
+         use_combined_linear: whether to use a single linear layer for qkv projection, default to False.
+         use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism
+             (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html).
 
      """
 
@@ -44,6 +50,9 @@ class SpatialAttentionBlock(nn.Module):
          norm_num_groups: int = 32,
          norm_eps: float = 1e-6,
          attention_dtype: Optional[torch.dtype] = None,
+         include_fc: bool = True,
+         use_combined_linear: bool = False,
+         use_flash_attention: bool = False,
      ) -> None:
          super().__init__()
 
@@ -54,7 +63,13 @@ class SpatialAttentionBlock(nn.Module):
              raise ValueError("num_channels must be divisible by num_head_channels")
          num_heads = num_channels // num_head_channels if num_head_channels is not None else 1
          self.attn = SABlock(
-             hidden_size=num_channels, num_heads=num_heads, qkv_bias=True, attention_dtype=attention_dtype
+             hidden_size=num_channels,
+             num_heads=num_heads,
+             qkv_bias=True,
+             attention_dtype=attention_dtype,
+             include_fc=include_fc,
+             use_combined_linear=use_combined_linear,
+             use_flash_attention=use_flash_attention,
          )
 
      def forward(self, x: torch.Tensor):
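A small illustrative sketch of the block with the forwarded options; the channel and grid sizes are assumptions:

    import torch
    from monai.networks.blocks.spatialattention import SpatialAttentionBlock

    x = torch.randn(1, 64, 32, 32, 32)  # (batch, channels, D, H, W)

    block = SpatialAttentionBlock(
        spatial_dims=3,
        num_channels=64,
        num_head_channels=32,      # 2 heads
        use_flash_attention=True,  # passed straight through to the inner SABlock
    )
    y = block(x)  # residual attention over the flattened spatial grid, same shape as x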
monai/networks/blocks/transformerblock.py
@@ -36,6 +36,9 @@ class TransformerBlock(nn.Module):
          causal: bool = False,
          sequence_length: int | None = None,
          with_cross_attention: bool = False,
+         use_flash_attention: bool = False,
+         include_fc: bool = True,
+         use_combined_linear: bool = True,
      ) -> None:
          """
          Args:
@@ -43,8 +46,12 @@ class TransformerBlock(nn.Module):
              mlp_dim (int): dimension of feedforward layer.
              num_heads (int): number of attention heads.
              dropout_rate (float, optional): fraction of the input units to drop. Defaults to 0.0.
-             qkv_bias (bool, optional): apply bias term for the qkv linear layer. Defaults to False.
+             qkv_bias(bool, optional): apply bias term for the qkv linear layer. Defaults to False.
              save_attn (bool, optional): to make accessible the attention matrix. Defaults to False.
+             use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism
+                 (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html).
+             include_fc: whether to include the final linear layer. Default to True.
+             use_combined_linear: whether to use a single linear layer for qkv projection, default to True.
 
          """
 
@@ -66,13 +73,21 @@ class TransformerBlock(nn.Module):
              save_attn=save_attn,
              causal=causal,
              sequence_length=sequence_length,
+             include_fc=include_fc,
+             use_combined_linear=use_combined_linear,
+             use_flash_attention=use_flash_attention,
          )
          self.norm2 = nn.LayerNorm(hidden_size)
          self.with_cross_attention = with_cross_attention
 
          self.norm_cross_attn = nn.LayerNorm(hidden_size)
          self.cross_attn = CrossAttentionBlock(
-             hidden_size=hidden_size, num_heads=num_heads, dropout_rate=dropout_rate, qkv_bias=qkv_bias, causal=False
+             hidden_size=hidden_size,
+             num_heads=num_heads,
+             dropout_rate=dropout_rate,
+             qkv_bias=qkv_bias,
+             causal=False,
+             use_flash_attention=use_flash_attention,
          )
 
      def forward(self, x: torch.Tensor, context: Optional[torch.Tensor] = None) -> torch.Tensor:
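A short sketch showing the flag being propagated from TransformerBlock to both its self-attention and cross-attention sub-blocks; shapes are illustrative:

    import torch
    from monai.networks.blocks.transformerblock import TransformerBlock

    x = torch.randn(2, 100, 256)   # token sequence
    ctx = torch.randn(2, 77, 256)  # conditioning sequence

    block = TransformerBlock(
        hidden_size=256,
        mlp_dim=1024,
        num_heads=8,
        with_cross_attention=True,
        use_flash_attention=True,  # forwarded to both SABlock and CrossAttentionBlock
    )
    y = block(x, context=ctx)      # -> (2, 100, 256)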
monai/networks/layers/filtering.py
@@ -51,6 +51,8 @@ class BilateralFilter(torch.autograd.Function):
          ctx.cs = color_sigma
          ctx.fa = fast_approx
          output_data = _C.bilateral_filter(input, spatial_sigma, color_sigma, fast_approx)
+         if torch.cuda.is_available():
+             torch.cuda.synchronize()
          return output_data
 
      @staticmethod
@@ -139,7 +141,8 @@ class TrainableBilateralFilterFunction(torch.autograd.Function):
              do_dsig_y,
              do_dsig_z,
          )
-
+         if torch.cuda.is_available():
+             torch.cuda.synchronize()
          return output_tensor
 
      @staticmethod
@@ -301,7 +304,8 @@ class TrainableJointBilateralFilterFunction(torch.autograd.Function):
              do_dsig_z,
              guidance_img,
          )
-
+         if torch.cuda.is_available():
+             torch.cuda.synchronize()
          return output_tensor
 
      @staticmethod
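These filtering changes insert an explicit device synchronization after each call into the compiled C++/CUDA extension, so the returned tensor is fully materialized before autograd continues. A minimal sketch of that guard pattern in isolation; run_custom_kernel below is a hypothetical stand-in, not a MONAI API:

    import torch

    def call_and_sync(run_custom_kernel, *args):
        """Run a potentially asynchronous CUDA kernel and block until its result is ready."""
        output = run_custom_kernel(*args)
        # same guard added in filtering.py: only synchronize when CUDA is present
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        return output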
monai/networks/nets/__init__.py
@@ -76,7 +76,7 @@ from .resnet import (
      resnet200,
  )
  from .segresnet import SegResNet, SegResNetVAE
- from .segresnet_ds import SegResNetDS
+ from .segresnet_ds import SegResNetDS, SegResNetDS2
  from .senet import (
      SENet,
      SEnet,
@@ -118,6 +118,7 @@ from .transformer import DecoderOnlyTransformer
  from .unet import UNet, Unet
  from .unetr import UNETR
  from .varautoencoder import VarAutoEncoder
+ from .vista3d import VISTA3D, vista3d132
  from .vit import ViT
  from .vitautoenc import ViTAutoEnc
  from .vnet import VNet
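A quick smoke test that the new re-exports resolve at package level; constructor arguments are intentionally omitted, since the corresponding vista3d.py and segresnet_ds.py diffs are not shown in full here:

    # the new symbols are available directly from monai.networks.nets
    from monai.networks.nets import VISTA3D, SegResNetDS2, vista3d132

    print(VISTA3D, SegResNetDS2, vista3d132)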