monai-weekly 1.4.dev2431__py3-none-any.whl → 1.4.dev2435__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registries and is provided for informational purposes only.
- monai/__init__.py +1 -1
- monai/_version.py +3 -3
- monai/apps/generation/maisi/networks/autoencoderkl_maisi.py +43 -25
- monai/apps/generation/maisi/networks/controlnet_maisi.py +15 -18
- monai/apps/generation/maisi/networks/diffusion_model_unet_maisi.py +18 -18
- monai/apps/vista3d/inferer.py +177 -0
- monai/apps/vista3d/sampler.py +179 -0
- monai/apps/vista3d/transforms.py +224 -0
- monai/bundle/scripts.py +29 -17
- monai/data/utils.py +1 -1
- monai/data/wsi_datasets.py +3 -3
- monai/inferers/utils.py +1 -0
- monai/losses/__init__.py +1 -0
- monai/losses/dice.py +10 -1
- monai/losses/nacl_loss.py +139 -0
- monai/networks/blocks/crossattention.py +48 -26
- monai/networks/blocks/mlp.py +16 -4
- monai/networks/blocks/selfattention.py +75 -23
- monai/networks/blocks/spatialattention.py +16 -1
- monai/networks/blocks/transformerblock.py +17 -2
- monai/networks/layers/filtering.py +6 -2
- monai/networks/nets/__init__.py +2 -1
- monai/networks/nets/autoencoderkl.py +55 -22
- monai/networks/nets/cell_sam_wrapper.py +92 -0
- monai/networks/nets/controlnet.py +24 -22
- monai/networks/nets/diffusion_model_unet.py +159 -19
- monai/networks/nets/segresnet_ds.py +127 -1
- monai/networks/nets/spade_autoencoderkl.py +22 -0
- monai/networks/nets/spade_diffusion_model_unet.py +39 -2
- monai/networks/nets/transformer.py +17 -17
- monai/networks/nets/vista3d.py +946 -0
- monai/networks/utils.py +4 -4
- monai/transforms/__init__.py +13 -2
- monai/transforms/io/array.py +59 -3
- monai/transforms/io/dictionary.py +29 -2
- monai/transforms/spatial/functional.py +1 -1
- monai/transforms/transform.py +2 -2
- monai/transforms/utility/dictionary.py +4 -0
- monai/transforms/utils.py +230 -1
- monai/{apps/generation/maisi/utils/morphological_ops.py → transforms/utils_morphological_ops.py} +2 -0
- monai/transforms/utils_pytorch_numpy_unification.py +2 -2
- monai/utils/enums.py +1 -0
- monai/utils/module.py +7 -6
- {monai_weekly-1.4.dev2431.dist-info → monai_weekly-1.4.dev2435.dist-info}/METADATA +84 -81
- {monai_weekly-1.4.dev2431.dist-info → monai_weekly-1.4.dev2435.dist-info}/RECORD +49 -43
- {monai_weekly-1.4.dev2431.dist-info → monai_weekly-1.4.dev2435.dist-info}/WHEEL +1 -1
- /monai/apps/{generation/maisi/utils → vista3d}/__init__.py +0 -0
- {monai_weekly-1.4.dev2431.dist-info → monai_weekly-1.4.dev2435.dist-info}/LICENSE +0 -0
- {monai_weekly-1.4.dev2431.dist-info → monai_weekly-1.4.dev2435.dist-info}/top_level.txt +0 -0
@@ -157,6 +157,10 @@ class Encoder(nn.Module):
         norm_eps: epsilon for the normalization.
         attention_levels: indicate which level from num_channels contain an attention block.
         with_nonlocal_attn: if True use non-local attention block.
+        include_fc: whether to include the final linear layer. Default to True.
+        use_combined_linear: whether to use a single linear layer for qkv projection, default to False.
+        use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism
+            (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html).
     """

     def __init__(
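The three new options are ultimately consumed by the attention blocks (see the selfattention.py and spatialattention.py entries in the file list above). The following is an illustrative sketch only, assuming the updated SABlock accepts the same flag names used throughout this diff; it is not taken from the package itself.

# Illustrative sketch, not part of the diff: passing the new flags to the self-attention block.
# The flag names mirror this diff; other values are assumptions for demonstration.
import torch
from monai.networks.blocks.selfattention import SABlock

block = SABlock(
    hidden_size=64,
    num_heads=4,
    include_fc=True,            # keep the final linear (output projection) layer
    use_combined_linear=False,  # separate to_q/to_k/to_v instead of one fused qkv linear
    use_flash_attention=False,  # True routes through torch.nn.functional.scaled_dot_product_attention
)
out = block(torch.randn(2, 16, 64))  # (batch, sequence, hidden_size)
print(out.shape)  # expected: torch.Size([2, 16, 64])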
@@ -170,6 +174,9 @@ class Encoder(nn.Module):
         norm_eps: float,
         attention_levels: Sequence[bool],
         with_nonlocal_attn: bool = True,
+        include_fc: bool = True,
+        use_combined_linear: bool = False,
+        use_flash_attention: bool = False,
     ) -> None:
         super().__init__()
         self.spatial_dims = spatial_dims
@@ -220,6 +227,9 @@ class Encoder(nn.Module):
                         num_channels=input_channel,
                         norm_num_groups=norm_num_groups,
                         norm_eps=norm_eps,
+                        include_fc=include_fc,
+                        use_combined_linear=use_combined_linear,
+                        use_flash_attention=use_flash_attention,
                     )
                 )

@@ -243,6 +253,9 @@ class Encoder(nn.Module):
                     num_channels=channels[-1],
                     norm_num_groups=norm_num_groups,
                     norm_eps=norm_eps,
+                    include_fc=include_fc,
+                    use_combined_linear=use_combined_linear,
+                    use_flash_attention=use_flash_attention,
                 )
             )
             blocks.append(
@@ -291,6 +304,10 @@ class Decoder(nn.Module):
         attention_levels: indicate which level from num_channels contain an attention block.
         with_nonlocal_attn: if True use non-local attention block.
         use_convtranspose: if True, use ConvTranspose to upsample feature maps in decoder.
+        include_fc: whether to include the final linear layer. Default to True.
+        use_combined_linear: whether to use a single linear layer for qkv projection, default to False.
+        use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism
+            (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html).
     """

     def __init__(
@@ -305,6 +322,9 @@ class Decoder(nn.Module):
         attention_levels: Sequence[bool],
         with_nonlocal_attn: bool = True,
         use_convtranspose: bool = False,
+        include_fc: bool = True,
+        use_combined_linear: bool = False,
+        use_flash_attention: bool = False,
     ) -> None:
         super().__init__()
         self.spatial_dims = spatial_dims
@@ -350,6 +370,9 @@ class Decoder(nn.Module):
                     num_channels=reversed_block_out_channels[0],
                     norm_num_groups=norm_num_groups,
                     norm_eps=norm_eps,
+                    include_fc=include_fc,
+                    use_combined_linear=use_combined_linear,
+                    use_flash_attention=use_flash_attention,
                 )
             )
             blocks.append(
@@ -389,6 +412,9 @@ class Decoder(nn.Module):
                         num_channels=block_in_ch,
                         norm_num_groups=norm_num_groups,
                         norm_eps=norm_eps,
+                        include_fc=include_fc,
+                        use_combined_linear=use_combined_linear,
+                        use_flash_attention=use_flash_attention,
                     )
                 )

@@ -463,6 +489,10 @@ class AutoencoderKL(nn.Module):
         with_decoder_nonlocal_attn: if True use non-local attention block in the decoder.
         use_checkpoint: if True, use activation checkpoint to save memory.
         use_convtranspose: if True, use ConvTranspose to upsample feature maps in decoder.
+        include_fc: whether to include the final linear layer in the attention block. Default to True.
+        use_combined_linear: whether to use a single linear layer for qkv projection in the attention block, default to False.
+        use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism
+            (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html).
     """

     def __init__(
@@ -480,6 +510,9 @@ class AutoencoderKL(nn.Module):
         with_decoder_nonlocal_attn: bool = True,
         use_checkpoint: bool = False,
         use_convtranspose: bool = False,
+        include_fc: bool = True,
+        use_combined_linear: bool = False,
+        use_flash_attention: bool = False,
     ) -> None:
         super().__init__()

@@ -499,7 +532,7 @@ class AutoencoderKL(nn.Module):
                 "`num_channels`."
             )

-        self.encoder = Encoder(
+        self.encoder: nn.Module = Encoder(
             spatial_dims=spatial_dims,
             in_channels=in_channels,
             channels=channels,
@@ -509,8 +542,11 @@ class AutoencoderKL(nn.Module):
             norm_eps=norm_eps,
             attention_levels=attention_levels,
             with_nonlocal_attn=with_encoder_nonlocal_attn,
+            include_fc=include_fc,
+            use_combined_linear=use_combined_linear,
+            use_flash_attention=use_flash_attention,
         )
-        self.decoder = Decoder(
+        self.decoder: nn.Module = Decoder(
             spatial_dims=spatial_dims,
             channels=channels,
             in_channels=latent_channels,
@@ -521,6 +557,9 @@ class AutoencoderKL(nn.Module):
             attention_levels=attention_levels,
             with_nonlocal_attn=with_decoder_nonlocal_attn,
             use_convtranspose=use_convtranspose,
+            include_fc=include_fc,
+            use_combined_linear=use_combined_linear,
+            use_flash_attention=use_flash_attention,
         )
         self.quant_conv_mu = Convolution(
             spatial_dims=spatial_dims,
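For orientation, here is a hedged construction sketch showing how the three new flags propagate from AutoencoderKL down to its attention blocks. The argument values (channel counts, latent size, input shape) are illustrative, not taken from the diff, and the out_channels/num_res_blocks parameters are assumed from the usual AutoencoderKL signature rather than shown in the hunks above.

# Hedged sketch, not part of the diff: instantiate the updated AutoencoderKL with the new attention options.
import torch
from monai.networks.nets import AutoencoderKL

model = AutoencoderKL(
    spatial_dims=2,
    in_channels=1,
    out_channels=1,
    channels=(32, 64, 64),
    num_res_blocks=(1, 1, 1),
    attention_levels=(False, False, True),
    latent_channels=8,
    norm_num_groups=16,
    include_fc=True,            # keep the final linear layer inside each attention block
    use_combined_linear=False,  # separate to_q/to_k/to_v projections rather than a fused qkv
    use_flash_attention=False,  # set True to use torch scaled_dot_product_attention
)

with torch.no_grad():
    reconstruction, z_mu, z_sigma = model(torch.randn(1, 1, 64, 64))
print(reconstruction.shape)  # expected: torch.Size([1, 1, 64, 64])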
@@ -665,27 +704,18 @@ class AutoencoderKL(nn.Module):
         # copy over all matching keys
         for k in new_state_dict:
             if k in old_state_dict:
-                new_state_dict[k] = old_state_dict[k]
+                new_state_dict[k] = old_state_dict.pop(k)

         # fix the attention blocks
-        attention_blocks = [k.replace(".attn.qkv.weight", "") for k in new_state_dict if "attn.qkv.weight" in k]
+        attention_blocks = [k.replace(".attn.to_q.weight", "") for k in new_state_dict if "attn.to_q.weight" in k]
         for block in attention_blocks:
-            new_state_dict[f"{block}.attn.qkv.weight"] = torch.cat(
-                [
-                    old_state_dict[f"{block}.to_q.weight"],
-                    old_state_dict[f"{block}.to_k.weight"],
-                    old_state_dict[f"{block}.to_v.weight"],
-                ],
-                dim=0,
-            )
-            new_state_dict[f"{block}.attn.qkv.bias"] = torch.cat(
-                [
-                    old_state_dict[f"{block}.to_q.bias"],
-                    old_state_dict[f"{block}.to_k.bias"],
-                    old_state_dict[f"{block}.to_v.bias"],
-                ],
-                dim=0,
-            )
+            new_state_dict[f"{block}.attn.to_q.weight"] = old_state_dict.pop(f"{block}.to_q.weight")
+            new_state_dict[f"{block}.attn.to_k.weight"] = old_state_dict.pop(f"{block}.to_k.weight")
+            new_state_dict[f"{block}.attn.to_v.weight"] = old_state_dict.pop(f"{block}.to_v.weight")
+            new_state_dict[f"{block}.attn.to_q.bias"] = old_state_dict.pop(f"{block}.to_q.bias")
+            new_state_dict[f"{block}.attn.to_k.bias"] = old_state_dict.pop(f"{block}.to_k.bias")
+            new_state_dict[f"{block}.attn.to_v.bias"] = old_state_dict.pop(f"{block}.to_v.bias")
+
             # old version did not have a projection so set these to the identity
             new_state_dict[f"{block}.attn.out_proj.weight"] = torch.eye(
                 new_state_dict[f"{block}.attn.out_proj.weight"].shape[0]
@@ -698,5 +728,8 @@ class AutoencoderKL(nn.Module):
         for k in new_state_dict:
             if "postconv" in k:
                 old_name = k.replace("postconv", "conv")
-                new_state_dict[k] = old_state_dict[old_name]
-        self.load_state_dict(new_state_dict)
+                new_state_dict[k] = old_state_dict.pop(old_name)
+        if verbose:
+            # print all remaining keys in old_state_dict
+            print("remaining keys in old_state_dict:", old_state_dict.keys())
+        self.load_state_dict(new_state_dict, strict=True)
@@ -0,0 +1,92 @@
+# Copyright (c) MONAI Consortium
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from monai.utils import optional_import
+
+build_sam_vit_b, has_sam = optional_import("segment_anything.build_sam", name="build_sam_vit_b")
+
+_all__ = ["CellSamWrapper"]
+
+
+class CellSamWrapper(torch.nn.Module):
+    """
+    CellSamWrapper is thin wrapper around SAM model https://github.com/facebookresearch/segment-anything
+    with an image only decoder, that can be used for segmentation tasks.
+
+
+    Args:
+        auto_resize_inputs: whether to resize inputs before passing to the network.
+            (usually they need be resized, unless they are already at the expected size)
+        network_resize_roi: expected input size for the network.
+            (currently SAM expects 1024x1024)
+        checkpoint: checkpoint file to load the SAM weights from.
+            (this can be downloaded from SAM repo https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth)
+        return_features: whether to return features from SAM encoder
+            (without using decoder/upsampling to the original input size)
+
+    """
+
+    def __init__(
+        self,
+        auto_resize_inputs=True,
+        network_resize_roi=(1024, 1024),
+        checkpoint="sam_vit_b_01ec64.pth",
+        return_features=False,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(*args, **kwargs)
+
+        self.network_resize_roi = network_resize_roi
+        self.auto_resize_inputs = auto_resize_inputs
+        self.return_features = return_features
+
+        if not has_sam:
+            raise ValueError(
+                "SAM is not installed, please run: pip install git+https://github.com/facebookresearch/segment-anything.git"
+            )
+
+        model = build_sam_vit_b(checkpoint=checkpoint)
+
+        model.prompt_encoder = None
+        model.mask_decoder = None
+
+        model.mask_decoder = nn.Sequential(
+            nn.BatchNorm2d(num_features=256),
+            nn.ReLU(inplace=True),
+            nn.ConvTranspose2d(256, 128, kernel_size=3, stride=2, padding=1, output_padding=1, bias=False),
+            nn.BatchNorm2d(num_features=128),
+            nn.ReLU(inplace=True),
+            nn.ConvTranspose2d(128, 3, kernel_size=3, stride=2, padding=1, output_padding=1, bias=True),
+        )
+
+        self.model = model
+
+    def forward(self, x):
+        sh = x.shape[2:]
+
+        if self.auto_resize_inputs:
+            x = F.interpolate(x, size=self.network_resize_roi, mode="bilinear")
+
+        x = self.model.image_encoder(x)
+
+        if not self.return_features:
+            x = self.model.mask_decoder(x)
+            if self.auto_resize_inputs:
+                x = F.interpolate(x, size=sh, mode="bilinear")
+
+        return x
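A hedged usage sketch for the new wrapper follows. It assumes segment-anything is installed and the SAM ViT-B checkpoint has been downloaded to the default path used above; the input sizes are illustrative.

# Hedged usage sketch, not part of the diff. Requires the segment-anything package and the
# sam_vit_b_01ec64.pth checkpoint on disk (see the docstring above for the download link).
import torch
from monai.networks.nets.cell_sam_wrapper import CellSamWrapper

net = CellSamWrapper(
    auto_resize_inputs=True,          # resize inputs to network_resize_roi before the SAM encoder
    network_resize_roi=(1024, 1024),  # SAM's expected input size
    checkpoint="sam_vit_b_01ec64.pth",
    return_features=False,            # run the replacement conv decoder and resize back
).eval()

with torch.no_grad():
    logits = net(torch.randn(1, 3, 256, 256))  # SAM's image encoder expects 3-channel input
print(logits.shape)  # expected: torch.Size([1, 3, 256, 256])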
@@ -143,6 +143,10 @@ class ControlNet(nn.Module):
         upcast_attention: if True, upcast attention operations to full precision.
         conditioning_embedding_in_channels: number of input channels for the conditioning embedding.
         conditioning_embedding_num_channels: number of channels for the blocks in the conditioning embedding.
+        include_fc: whether to include the final linear layer. Default to True.
+        use_combined_linear: whether to use a single linear layer for qkv projection, default to True.
+        use_flash_attention: if True, use Pytorch's inbuilt flash attention for a memory efficient attention mechanism
+            (see https://pytorch.org/docs/2.2/generated/torch.nn.functional.scaled_dot_product_attention.html).
     """

     def __init__(
@@ -163,28 +167,29 @@ class ControlNet(nn.Module):
         upcast_attention: bool = False,
         conditioning_embedding_in_channels: int = 1,
         conditioning_embedding_num_channels: Sequence[int] = (16, 32, 96, 256),
+        include_fc: bool = True,
+        use_combined_linear: bool = False,
+        use_flash_attention: bool = False,
     ) -> None:
         super().__init__()
         if with_conditioning is True and cross_attention_dim is None:
             raise ValueError(
-                "
+                "ControlNet expects dimension of the cross-attention conditioning (cross_attention_dim) "
                 "to be specified when with_conditioning=True."
             )
         if cross_attention_dim is not None and with_conditioning is False:
-            raise ValueError(
-                "DiffusionModelUNet expects with_conditioning=True when specifying the cross_attention_dim."
-            )
+            raise ValueError("ControlNet expects with_conditioning=True when specifying the cross_attention_dim.")

         # All number of channels should be multiple of num_groups
         if any((out_channel % norm_num_groups) != 0 for out_channel in channels):
             raise ValueError(
-                f"
+                f"ControlNet expects all channels to be a multiple of norm_num_groups, but got"
                 f" channels={channels} and norm_num_groups={norm_num_groups}"
             )

         if len(channels) != len(attention_levels):
             raise ValueError(
-                f"
+                f"ControlNet expects channels to have the same length as attention_levels, but got "
                 f"channels={channels} and attention_levels={attention_levels}"
             )

@@ -282,6 +287,9 @@ class ControlNet(nn.Module):
                 transformer_num_layers=transformer_num_layers,
                 cross_attention_dim=cross_attention_dim,
                 upcast_attention=upcast_attention,
+                include_fc=include_fc,
+                use_combined_linear=use_combined_linear,
+                use_flash_attention=use_flash_attention,
             )

             self.down_blocks.append(down_block)
@@ -326,6 +334,9 @@ class ControlNet(nn.Module):
             transformer_num_layers=transformer_num_layers,
             cross_attention_dim=cross_attention_dim,
             upcast_attention=upcast_attention,
+            include_fc=include_fc,
+            use_combined_linear=use_combined_linear,
+            use_flash_attention=use_flash_attention,
         )

         controlnet_block = Convolution(
@@ -441,25 +452,16 @@ class ControlNet(nn.Module):
         # copy over all matching keys
         for k in new_state_dict:
             if k in old_state_dict:
-                new_state_dict[k] = old_state_dict[k]
+                new_state_dict[k] = old_state_dict.pop(k)

         # fix the attention blocks
-        attention_blocks = [k.replace(".attn1.qkv.weight", "") for k in new_state_dict if "attn1.qkv.weight" in k]
+        attention_blocks = [k.replace(".out_proj.weight", "") for k in new_state_dict if "out_proj.weight" in k]
         for block in attention_blocks:
-            new_state_dict[f"{block}.attn1.qkv.weight"] = torch.cat(
-                [
-                    old_state_dict[f"{block}.attn1.to_q.weight"],
-                    old_state_dict[f"{block}.attn1.to_k.weight"],
-                    old_state_dict[f"{block}.attn1.to_v.weight"],
-                ],
-                dim=0,
-            )
-
             # projection
-            new_state_dict[f"{block}.attn1.out_proj.weight"] = old_state_dict[f"{block}.attn1.to_out.0.weight"]
-            new_state_dict[f"{block}.attn1.out_proj.bias"] = old_state_dict[f"{block}.attn1.to_out.0.bias"]
-
-            new_state_dict[f"{block}.attn2.out_proj.weight"] = old_state_dict[f"{block}.attn2.to_out.0.weight"]
-            new_state_dict[f"{block}.attn2.out_proj.bias"] = old_state_dict[f"{block}.attn2.to_out.0.bias"]
+            new_state_dict[f"{block}.out_proj.weight"] = old_state_dict.pop(f"{block}.to_out.0.weight")
+            new_state_dict[f"{block}.out_proj.bias"] = old_state_dict.pop(f"{block}.to_out.0.bias")

+        if verbose:
+            # print all remaining keys in old_state_dict
+            print("remaining keys in old_state_dict:", old_state_dict.keys())
         self.load_state_dict(new_state_dict)
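As a hedged illustration of the renamed validation messages above, constructing a ControlNet with cross_attention_dim set but with_conditioning left False should now raise the ControlNet-specific error shown in the hunk. The keyword values below are illustrative assumptions, not taken from the diff.

# Hedged sketch, not part of the diff: exercise the updated ControlNet validation.
from monai.networks.nets import ControlNet

try:
    ControlNet(
        spatial_dims=2,
        in_channels=1,
        channels=(32, 64),
        attention_levels=(False, True),
        num_res_blocks=(1, 1),
        norm_num_groups=16,
        cross_attention_dim=64,   # set while with_conditioning stays False ...
        with_conditioning=False,
    )
except ValueError as err:
    # "ControlNet expects with_conditioning=True when specifying the cross_attention_dim."
    print(err)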