PyPI - monai-weekly - Versions diffs - 1.5.dev2511__py3-none-any.whl → 1.5.dev2513__py3-none-any.whl - Mend

monai-weekly 1.5.dev2511__py3-none-any.whl → 1.5.dev2513__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

monai/__init__.py +1 -1
monai/_version.py +3 -3
monai/data/utils.py +1 -1
monai/metrics/meandice.py +132 -76
monai/networks/blocks/__init__.py +2 -1
monai/networks/blocks/cablock.py +182 -0
monai/networks/blocks/downsample.py +241 -2
monai/networks/nets/restormer.py +337 -0
monai/networks/utils.py +44 -1
monai/utils/__init__.py +1 -0
monai/utils/enums.py +13 -0
monai/utils/misc.py +1 -1
{monai_weekly-1.5.dev2511.dist-info → monai_weekly-1.5.dev2513.dist-info}/METADATA +3 -2
{monai_weekly-1.5.dev2511.dist-info → monai_weekly-1.5.dev2513.dist-info}/RECORD +22 -17
{monai_weekly-1.5.dev2511.dist-info → monai_weekly-1.5.dev2513.dist-info}/WHEEL +1 -1
tests/metrics/test_compute_meandice.py +3 -3
tests/networks/blocks/test_CABlock.py +150 -0
tests/networks/blocks/test_downsample_block.py +184 -0
tests/networks/nets/test_restormer.py +147 -0
tests/networks/utils/test_pixelunshuffle.py +51 -0
tests/integration/test_downsample_block.py +0 -50
{monai_weekly-1.5.dev2511.dist-info → monai_weekly-1.5.dev2513.dist-info/licenses}/LICENSE +0 -0
{monai_weekly-1.5.dev2511.dist-info → monai_weekly-1.5.dev2513.dist-info}/top_level.txt +0 -0

monai/__init__.py CHANGED Viewed

@@ -136,4 +136,4 @@ except BaseException:
     if MONAIEnvVars.debug():
         raise
-__commit_id__ = "34f379735c5e18e7f809453eb1b3606c225c788b"
+__commit_id__ = "bfcb318b1fbde75bdceb56b7cff632c9fc1c13dd"

monai/_version.py CHANGED Viewed

@@ -8,11 +8,11 @@ import json
 version_json = '''
 {
- "date": "2025-03-16T02:30:38+0000",
+ "date": "2025-03-30T02:32:28+0000",
  "dirty": false,
  "error": null,
- "full-revisionid": "7876647f87c763d854f9546bbc60e12f13af84a6",
- "version": "1.5.dev2511"
+ "full-revisionid": "ef083a32ccc13ee3937a4bd8acc12b9cdc174e18",
+ "version": "1.5.dev2513"
 }
 '''  # END VERSION_JSON

monai/data/utils.py CHANGED Viewed

@@ -1473,7 +1473,7 @@ def convert_tables_to_dicts(
     # parse row indices
     rows: list[int | str] = []
     if row_indices is None:
-        rows = slice(df.shape[0])  # type: ignore
+        rows = df.index.tolist()
     else:
         for i in row_indices:
             if isinstance(i, (tuple, list)):

monai/metrics/meandice.py CHANGED Viewed

@@ -14,7 +14,7 @@ from __future__ import annotations
 import torch
 from monai.metrics.utils import do_metric_reduction
-from monai.utils import MetricReduction
+from monai.utils import MetricReduction, deprecated_arg
 from .metric import CumulativeIterationMetric
@@ -23,35 +23,76 @@ __all__ = ["DiceMetric", "compute_dice", "DiceHelper"]
 class DiceMetric(CumulativeIterationMetric):
     """
-    Compute average Dice score for a set of pairs of prediction-groundtruth segmentations.
+    Computes Dice score for a set of pairs of prediction-groundtruth labels. It supports single-channel label maps
+    or multi-channel images with class segmentations per channel. This allows the computation for both multi-class
+    and multi-label tasks.
-    It supports both multi-classes and multi-labels tasks.
-    Input `y_pred` is compared with ground truth `y`.
-    `y_pred` is expected to have binarized predictions and `y` can be single-channel class indices or in the
-    one-hot format. The `include_background` parameter can be set to ``False`` to exclude
-    the first category (channel index 0) which is by convention assumed to be background. If the non-background
-    segmentations are small compared to the total image size they can get overwhelmed by the signal from the
-    background. `y_preds` and `y` can be a list of channel-first Tensor (CHW[D]) or a batch-first Tensor (BCHW[D]),
-    `y` can also be in the format of `B1HW[D]`.
+    If either prediction ``y_pred`` or ground truth ``y`` have shape BCHW[D], it is expected that these represent one-
+    hot segmentations for C number of classes. If either shape is B1HW[D], it is expected that these are label maps
+    and the number of classes must be specified by the ``num_classes`` parameter. In either case for either inputs,
+    this metric applies no activations and so non-binary values will produce unexpected results if this metric is used
+    for binary overlap measurement (ie. either was expected to be one-hot formatted). Soft labels are thus permitted by
+    this metric. Typically this implies that raw predictions from a network must first be activated and possibly made
+    into label maps, eg. for a multi-class prediction tensor softmax and then argmax should be applied over the channel
+    dimensions to produce a label map.
+    The ``include_background`` parameter can be set to `False` to exclude the first category (channel index 0) which
+    is by convention assumed to be background. If the non-background segmentations are small compared to the total
+    image size they can get overwhelmed by the signal from the background. This assumes the shape of both prediction
+    and ground truth is BCHW[D].
+    The typical execution steps of this metric class follows :py:class:`monai.metrics.metric.Cumulative`.
+    Further information can be found in the official
+    `MONAI Dice Overview <https://github.com/Project-MONAI/tutorials/blob/main/modules/dice_loss_metric_notes.ipynb>`_.
+    Example:
+    .. code-block:: python
+        import torch
+        from monai.metrics import DiceMetric
+        from monai.losses import DiceLoss
+        from monai.networks import one_hot
+        batch_size, n_classes, h, w = 7, 5, 128, 128
+        y_pred = torch.rand(batch_size, n_classes, h, w)  # network predictions
+        y_pred = torch.argmax(y_pred, 1, True)  # convert to label map
+        # ground truth as label map
+        y = torch.randint(0, n_classes, size=(batch_size, 1, h, w))
+        dm = DiceMetric(
+            reduction="mean_batch", return_with_label=True, num_classes=n_classes
+        )
+        raw_scores = dm(y_pred, y)
+        print(dm.aggregate())
+        # now compute the Dice loss which should be the same as 1 - raw_scores
+        dl = DiceLoss(to_onehot_y=True, reduction="none")
+        loss = dl(one_hot(y_pred, n_classes), y).squeeze()
+        print(1.0 - loss)  # same as raw_scores
-    Example of the typical execution steps of this metric class follows :py:class:`monai.metrics.metric.Cumulative`.
     Args:
-        include_background: whether to include Dice computation on the first channel of
-            the predicted output. Defaults to ``True``.
-        reduction: define mode of reduction to the metrics, will only apply reduction on `not-nan` values,
-            available reduction modes: {``"none"``, ``"mean"``, ``"sum"``, ``"mean_batch"``, ``"sum_batch"``,
-            ``"mean_channel"``, ``"sum_channel"``}, default to ``"mean"``. if "none", will not do reduction.
-        get_not_nans: whether to return the `not_nans` count, if True, aggregate() returns (metric, not_nans).
-            Here `not_nans` count the number of not nans for the metric, thus its shape equals to the shape of the metric.
-        ignore_empty: whether to ignore empty ground truth cases during calculation.
-            If `True`, NaN value will be set for empty ground truth cases.
-            If `False`, 1 will be set if the predictions of empty ground truth cases are also empty.
-        num_classes: number of input channels (always including the background). When this is None,
+        include_background: whether to include Dice computation on the first channel/category of the prediction and
+            ground truth. Defaults to ``True``, use ``False`` to exclude the background class.
+        reduction: defines mode of reduction to the metrics, this will only apply reduction on `not-nan` values. The
+            available reduction modes are enumerated by :py:class:`monai.utils.enums.MetricReduction`. If "none" is
+            selected, the metric will not do reduction.
+        get_not_nans: whether to return the `not_nans` count. If True, aggregate() returns `(metric, not_nans)` where
+            `not_nans` counts the number of valid values in the result, and will have the same shape.
+        ignore_empty: whether to ignore empty ground truth cases during calculation. If `True`, the `NaN` value will be
+            set for empty ground truth cases, otherwise 1 will be set if the predictions of empty ground truth cases
+            are also empty.
+        num_classes: number of input channels (always including the background). When this is ``None``,
             ``y_pred.shape[1]`` will be used. This option is useful when both ``y_pred`` and ``y`` are
             single-channel class indices and the number of classes is not automatically inferred from data.
         return_with_label: whether to return the metrics with label, only works when reduction is "mean_batch".
-            If `True`, use "label_{index}" as the key corresponding to C channels; if 'include_background' is True,
+            If `True`, use "label_{index}" as the key corresponding to C channels; if ``include_background`` is True,
             the index begins at "0", otherwise at "1". It can also take a list of label names.
             The outcome will then be returned as a dictionary.
@@ -77,22 +118,21 @@ class DiceMetric(CumulativeIterationMetric):
             include_background=self.include_background,
             reduction=MetricReduction.NONE,
             get_not_nans=False,
-            softmax=False,
+            apply_argmax=False,
             ignore_empty=self.ignore_empty,
             num_classes=self.num_classes,
         )
     def _compute_tensor(self, y_pred: torch.Tensor, y: torch.Tensor) -> torch.Tensor:  # type: ignore[override]
         """
+        Compute the dice value using ``DiceHelper``.
         Args:
-            y_pred: input data to compute, typical segmentation model output.
-                It must be one-hot format and first dim is batch, example shape: [16, 3, 32, 32]. The values
-                should be binarized.
-            y: ground truth to compute mean Dice metric. `y` can be single-channel class indices or
-                in the one-hot format.
+            y_pred: prediction value, see class docstring for format definition.
+            y: ground truth label.
         Raises:
-            ValueError: when `y_pred` has less than three dimensions.
+            ValueError: when `y_pred` has fewer than three dimensions.
         """
         dims = y_pred.ndimension()
         if dims < 3:
@@ -107,10 +147,8 @@ class DiceMetric(CumulativeIterationMetric):
         Execute reduction and aggregation logic for the output of `compute_dice`.
         Args:
-            reduction: define mode of reduction to the metrics, will only apply reduction on `not-nan` values,
-                available reduction modes: {``"none"``, ``"mean"``, ``"sum"``, ``"mean_batch"``, ``"sum_batch"``,
-                ``"mean_channel"``, ``"sum_channel"``}, default to `self.reduction`. if "none", will not do reduction.
+            reduction: defines mode of reduction as enumerated in :py:class:`monai.utils.enums.MetricReduction`.
+                Defaults to ``self.reduction`` when not specified.
         """
         data = self.get_buffer()
         if not isinstance(data, torch.Tensor):
@@ -138,18 +176,20 @@ def compute_dice(
     ignore_empty: bool = True,
     num_classes: int | None = None,
 ) -> torch.Tensor:
-    """Computes Dice score metric for a batch of predictions.
+    """
+    Computes Dice score metric for a batch of predictions. This performs the same computation as
+    :py:class:`monai.metrics.DiceMetric`, which is preferable to use over this function. For input formats, see the
+    documentation for that class.
     Args:
         y_pred: input data to compute, typical segmentation model output.
-            `y_pred` can be single-channel class indices or in the one-hot format.
-        y: ground truth to compute mean dice metric. `y` can be single-channel class indices or in the one-hot format.
-        include_background: whether to include Dice computation on the first channel of
-            the predicted output. Defaults to True.
-        ignore_empty: whether to ignore empty ground truth cases during calculation.
-            If `True`, NaN value will be set for empty ground truth cases.
-            If `False`, 1 will be set if the predictions of empty ground truth cases are also empty.
-        num_classes: number of input channels (always including the background). When this is None,
+        y: ground truth to compute mean dice metric.
+        include_background: whether to include Dice computation on the first channel/category of the prediction and
+            ground truth. Defaults to ``True``, use ``False`` to exclude the background class.
+        ignore_empty: whether to ignore empty ground truth cases during calculation. If `True`, the `NaN` value will be
+            set for empty ground truth cases, otherwise 1 will be set if the predictions of empty ground truth cases
+            are also empty.
+        num_classes: number of input channels (always including the background). When this is ``None``,
             ``y_pred.shape[1]`` will be used. This option is useful when both ``y_pred`` and ``y`` are
             single-channel class indices and the number of classes is not automatically inferred from data.
@@ -161,7 +201,7 @@ def compute_dice(
         include_background=include_background,
         reduction=MetricReduction.NONE,
         get_not_nans=False,
-        softmax=False,
+        apply_argmax=False,
         ignore_empty=ignore_empty,
         num_classes=num_classes,
     )(y_pred=y_pred, y=y)
@@ -169,8 +209,8 @@ def compute_dice(
 class DiceHelper:
     """
-    Compute Dice score between two tensors `y_pred` and `y`.
-    `y_pred` and `y` can be single-channel class indices or in the one-hot format.
+    Compute Dice score between two tensors ``y_pred`` and ``y``. This is used by :py:class:`monai.metrics.DiceMetric`,
+    see the documentation for that class for input formats.
     Example:
@@ -188,49 +228,65 @@ class DiceHelper:
         score, not_nans = DiceHelper(include_background=False, sigmoid=True, softmax=True)(y_pred, y)
         print(score, not_nans)
+    Args:
+        include_background: whether to include Dice computation on the first channel/category of the prediction and
+            ground truth. Defaults to ``True``, use ``False`` to exclude the background class.
+        threshold: if ``True``, ``y_pred`` will be thresholded at a value of 0.5. Defaults to False.
+        apply_argmax: whether ``y_pred`` are softmax activated outputs. If True, `argmax` will be performed to
+            get the discrete prediction. Defaults to the value of ``not threshold``.
+        activate: if this and ``threshold`` are ``True``, sigmoid activation is applied to ``y_pred`` before
+            thresholding. Defaults to False.
+        get_not_nans: whether to return the number of not-nan values.
+        reduction: defines mode of reduction to the metrics, this will only apply reduction on `not-nan` values. The
+            available reduction modes are enumerated by :py:class:`monai.utils.enums.MetricReduction`. If "none" is
+            selected, the metric will not do reduction.
+        ignore_empty: whether to ignore empty ground truth cases during calculation. If `True`, the `NaN` value will be
+            set for empty ground truth cases, otherwise 1 will be set if the predictions of empty ground truth cases
+            are also empty.
+        num_classes: number of input channels (always including the background). When this is ``None``,
+            ``y_pred.shape[1]`` will be used. This option is useful when both ``y_pred`` and ``y`` are
+            single-channel class indices and the number of classes is not automatically inferred from data.
     """
+    @deprecated_arg("softmax", "1.5", "1.7", "Use `apply_argmax` instead.", new_name="apply_argmax")
+    @deprecated_arg("sigmoid", "1.5", "1.7", "Use `threshold` instead.", new_name="threshold")
     def __init__(
         self,
         include_background: bool | None = None,
-        sigmoid: bool = False,
-        softmax: bool | None = None,
+        threshold: bool = False,
+        apply_argmax: bool | None = None,
         activate: bool = False,
         get_not_nans: bool = True,
         reduction: MetricReduction | str = MetricReduction.MEAN_BATCH,
         ignore_empty: bool = True,
         num_classes: int | None = None,
+        sigmoid: bool | None = None,
+        softmax: bool | None = None,
     ) -> None:
-        """
+        # handling deprecated arguments
+        if sigmoid is not None:
+            threshold = sigmoid
+        if softmax is not None:
+            apply_argmax = softmax
-        Args:
-            include_background: whether to include the score on the first channel
-                (default to the value of `sigmoid`, False).
-            sigmoid: whether ``y_pred`` are/will be sigmoid activated outputs. If True, thresholding at 0.5
-                will be performed to get the discrete prediction. Defaults to False.
-            softmax: whether ``y_pred`` are softmax activated outputs. If True, `argmax` will be performed to
-                get the discrete prediction. Defaults to the value of ``not sigmoid``.
-            activate: whether to apply sigmoid to ``y_pred`` if ``sigmoid`` is True. Defaults to False.
-                This option is only valid when ``sigmoid`` is True.
-            get_not_nans: whether to return the number of not-nan values.
-            reduction: define mode of reduction to the metrics
-            ignore_empty: if `True`, NaN value will be set for empty ground truth cases.
-                If `False`, 1 will be set if the Union of ``y_pred`` and ``y`` is empty.
-            num_classes: number of input channels (always including the background). When this is None,
-                ``y_pred.shape[1]`` will be used. This option is useful when both ``y_pred`` and ``y`` are
-                single-channel class indices and the number of classes is not automatically inferred from data.
-        """
-        self.sigmoid = sigmoid
+        self.threshold = threshold
         self.reduction = reduction
         self.get_not_nans = get_not_nans
-        self.include_background = sigmoid if include_background is None else include_background
-        self.softmax = not sigmoid if softmax is None else softmax
+        self.include_background = threshold if include_background is None else include_background
+        self.apply_argmax = not threshold if apply_argmax is None else apply_argmax
         self.activate = activate
         self.ignore_empty = ignore_empty
         self.num_classes = num_classes
     def compute_channel(self, y_pred: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
-        """"""
+        """
+        Compute the dice metric for binary inputs which have only spatial dimensions. This method is called separately
+        for each batch item and for each channel of those items.
+        Args:
+            y_pred: input predictions with shape HW[D].
+            y: ground truth with shape HW[D].
+        """
         y_o = torch.sum(y)
         if y_o > 0:
             return (2.0 * torch.sum(torch.masked_select(y, y_pred))) / (y_o + torch.sum(y_pred))
@@ -243,25 +299,25 @@ class DiceHelper:
     def __call__(self, y_pred: torch.Tensor, y: torch.Tensor) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         """
+        Compute the metric for the given prediction and ground truth.
         Args:
             y_pred: input predictions with shape (batch_size, num_classes or 1, spatial_dims...).
                 the number of channels is inferred from ``y_pred.shape[1]`` when ``num_classes is None``.
             y: ground truth with shape (batch_size, num_classes or 1, spatial_dims...).
         """
-        _softmax, _sigmoid = self.softmax, self.sigmoid
+        _apply_argmax, _threshold = self.apply_argmax, self.threshold
         if self.num_classes is None:
             n_pred_ch = y_pred.shape[1]  # y_pred is in one-hot format or multi-channel scores
         else:
             n_pred_ch = self.num_classes
             if y_pred.shape[1] == 1 and self.num_classes > 1:  # y_pred is single-channel class indices
-                _softmax = _sigmoid = False
+                _apply_argmax = _threshold = False
-        if _softmax:
-            if n_pred_ch > 1:
-                y_pred = torch.argmax(y_pred, dim=1, keepdim=True)
+        if _apply_argmax and n_pred_ch > 1:
+            y_pred = torch.argmax(y_pred, dim=1, keepdim=True)
-        elif _sigmoid:
+        elif _threshold:
             if self.activate:
                 y_pred = torch.sigmoid(y_pred)
             y_pred = y_pred > 0.5

monai/networks/blocks/__init__.py CHANGED Viewed

@@ -15,12 +15,13 @@ from .acti_norm import ADN
 from .activation import GEGLU, MemoryEfficientSwish, Mish, Swish
 from .aspp import SimpleASPP
 from .backbone_fpn_utils import BackboneWithFPN
+from .cablock import CABlock, FeedForward
 from .convolutions import Convolution, ResidualUnit
 from .crf import CRF
 from .crossattention import CrossAttentionBlock
 from .denseblock import ConvDenseBlock, DenseBlock
 from .dints_block import ActiConvNormBlock, FactorizedIncreaseBlock, FactorizedReduceBlock, P3DActiConvNormBlock
-from .downsample import MaxAvgPool
+from .downsample import DownSample, Downsample, MaxAvgPool, SubpixelDownsample, SubpixelDownSample, Subpixeldownsample
 from .dynunet_block import UnetBasicBlock, UnetOutBlock, UnetResBlock, UnetUpBlock, get_output_padding, get_padding
 from .encoder import BaseEncoder
 from .fcn import FCN, GCN, MCFCN, Refine

monai/networks/blocks/cablock.py ADDED Viewed

@@ -0,0 +1,182 @@
+# Copyright (c) MONAI Consortium
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+from typing import cast
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from monai.networks.blocks.convolutions import Convolution
+from monai.utils import optional_import
+rearrange, _ = optional_import("einops", name="rearrange")
+__all__ = ["FeedForward", "CABlock"]
+class FeedForward(nn.Module):
+    """Gated-DConv Feed-Forward Network (GDFN) that controls feature flow using gating mechanism.
+    Uses depth-wise convolutions for local context mixing and GELU-activated gating for refined feature selection.
+    The input is first expanded by a 1x1 convolution to ``2 * int(dim * ffn_expansion_factor)`` channels, mixed by a
+    3x3 depth-wise convolution, and split along the channel axis (dim 1) into two equal halves. One half is passed
+    through GELU and multiplied element-wise with the other half (the gate), and a final 1x1 convolution projects
+    the gated result back to ``dim`` channels.
+    Args:
+        spatial_dims: Number of spatial dimensions (2D or 3D)
+        dim: Number of input channels; this is also the number of output channels
+        ffn_expansion_factor: Factor to expand hidden features dimension; the hidden width is
+            ``int(dim * ffn_expansion_factor)``, i.e. the product is truncated towards zero
+        bias: Whether to use bias in convolution layers
+    """
+    def __init__(self, spatial_dims: int, dim: int, ffn_expansion_factor: float, bias: bool):
+        super().__init__()
+        # width of each of the two gating branches
+        hidden_features = int(dim * ffn_expansion_factor)
+        # 1x1 convolution producing both branches at once, hence twice the hidden width
+        self.project_in = Convolution(
+            spatial_dims=spatial_dims,
+            in_channels=dim,
+            out_channels=hidden_features * 2,
+            kernel_size=1,
+            bias=bias,
+            conv_only=True,
+        )
+        # depth-wise 3x3 convolution: ``groups`` equals the channel count so every channel is filtered on its own
+        self.dwconv = Convolution(
+            spatial_dims=spatial_dims,
+            in_channels=hidden_features * 2,
+            out_channels=hidden_features * 2,
+            kernel_size=3,
+            strides=1,
+            padding=1,
+            groups=hidden_features * 2,
+            bias=bias,
+            conv_only=True,
+        )
+        # 1x1 convolution mapping the gated branch (a single hidden width) back to ``dim`` channels
+        self.project_out = Convolution(
+            spatial_dims=spatial_dims,
+            in_channels=hidden_features,
+            out_channels=dim,
+            kernel_size=1,
+            bias=bias,
+            conv_only=True,
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply the gated feed-forward transform to ``x``.
+        Args:
+            x: input feature map; channels are expected on dim 1 (presumably ``(B, dim, *spatial)``).
+        Returns:
+            Tensor produced by ``project_out``, which has ``dim`` output channels.
+        """
+        x = self.project_in(x)
+        # split the depth-wise mixed features into the activated branch (x1) and the gate (x2)
+        x1, x2 = self.dwconv(x).chunk(2, dim=1)
+        return cast(torch.Tensor, self.project_out(F.gelu(x1) * x2))
+class CABlock(nn.Module):
+    """Multi-DConv Head Transposed Self-Attention (MDTA): Differs from standard self-attention
+    by operating on feature channels instead of spatial dimensions. Incorporates depth-wise
+    convolutions for local mixing before attention, achieving linear complexity vs quadratic
+    in vanilla attention. Based on SW Zamir, et al., 2022 <https://arxiv.org/abs/2111.09881>
+    Queries and keys are L2-normalised along the flattened spatial axis and the resulting
+    channel-by-channel similarity is scaled by a learnable per-head ``temperature`` before the softmax.
+    The flash and the regular attention paths compute the same quantity.
+    Args:
+        spatial_dims: Number of spatial dimensions (2D or 3D)
+        dim: Number of input channels; must be divisible by ``num_heads``
+        num_heads: Number of attention heads
+        bias: Whether to use bias in convolution layers
+        flash_attention: Whether to use flash attention optimization. Defaults to False.
+    Raises:
+        ValueError: If flash attention is not available in current PyTorch version
+        ValueError: If spatial_dims is not 2 or 3
+    """
+    def __init__(self, spatial_dims: int, dim: int, num_heads: int, bias: bool, flash_attention: bool = False):
+        super().__init__()
+        if flash_attention and not hasattr(F, "scaled_dot_product_attention"):
+            raise ValueError("Flash attention not available")
+        # forward() only defines reshaping patterns for 2D and 3D inputs, so reject everything else up front
+        # instead of failing later inside ``rearrange``
+        if spatial_dims not in (2, 3):
+            raise ValueError(f"Only 2D and 3D inputs are supported. Got spatial_dims={spatial_dims}")
+        self.spatial_dims = spatial_dims
+        self.num_heads = num_heads
+        # one learnable scaling factor per head, broadcast over the (channel, channel) attention map
+        self.temperature = nn.Parameter(torch.ones(num_heads, 1, 1))
+        self.flash_attention = flash_attention
+        # 1x1 convolution producing queries, keys and values in a single pass
+        self.qkv = Convolution(
+            spatial_dims=spatial_dims, in_channels=dim, out_channels=dim * 3, kernel_size=1, bias=bias, conv_only=True
+        )
+        # depth-wise 3x3 convolution (groups == channels) for local context mixing of q, k and v
+        self.qkv_dwconv = Convolution(
+            spatial_dims=spatial_dims,
+            in_channels=dim * 3,
+            out_channels=dim * 3,
+            kernel_size=3,
+            strides=1,
+            padding=1,
+            groups=dim * 3,
+            bias=bias,
+            conv_only=True,
+        )
+        self.project_out = Convolution(
+            spatial_dims=spatial_dims, in_channels=dim, out_channels=dim, kernel_size=1, bias=bias, conv_only=True
+        )
+        self._attention_fn = self._get_attention_fn()
+    def _get_attention_fn(self):
+        """Return the attention implementation selected by ``flash_attention``."""
+        if self.flash_attention:
+            return self._flash_attention
+        return self._normal_attention
+    def _flash_attention(self, q, k, v):
+        """Flash attention implementation using scaled dot-product attention.
+        The per-head temperature is folded into the queries and ``scale`` is pinned to 1, so the logits are
+        ``temperature * (q @ k^T)`` exactly as in ``_normal_attention``. This keeps ``temperature`` in the
+        autograd graph (converting it to a Python float would detach it) and preserves per-head values.
+        """
+        return F.scaled_dot_product_attention(q * self.temperature, k, v, scale=1.0, dropout_p=0.0, is_causal=False)
+    def _normal_attention(self, q, k, v):
+        """Channel attention computed with explicit matrix multiplications."""
+        attn = (q @ k.transpose(-2, -1)) * self.temperature
+        attn = attn.softmax(dim=-1)
+        return attn @ v
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Forward pass for MDTA attention.
+        1. Apply depth-wise convolutions to Q, K, V
+        2. Reshape Q, K, V for multi-head attention
+        3. Compute attention matrix using flash or normal attention
+        4. Reshape and project out attention output"""
+        # remember the spatial extent so the flattened axis can be unfolded again after attention
+        spatial_shape = x.shape[2:]
+        # Project and mix
+        qkv = self.qkv_dwconv(self.qkv(x))
+        q, k, v = qkv.chunk(3, dim=1)
+        # Select the reshaping patterns matching the input dimensionality
+        if self.spatial_dims == 2:
+            axis_names = ["h", "w"]
+            qkv_to_multihead = "b (head c) h w -> b head c (h w)"
+            multihead_to_qkv = "b head c (h w) -> b (head c) h w"
+        else:  # dims == 3
+            axis_names = ["d", "h", "w"]
+            qkv_to_multihead = "b (head c) d h w -> b head c (d h w)"
+            multihead_to_qkv = "b head c (d h w) -> b (head c) d h w"
+        # Split channels into heads and flatten the spatial axes
+        q = rearrange(q, qkv_to_multihead, head=self.num_heads)
+        k = rearrange(k, qkv_to_multihead, head=self.num_heads)
+        v = rearrange(v, qkv_to_multihead, head=self.num_heads)
+        # L2-normalise along the flattened spatial axis so that q @ k^T is a cosine similarity between channels
+        q = F.normalize(q, dim=-1)
+        k = F.normalize(k, dim=-1)
+        out = self._attention_fn(q, k, v)
+        # Reconstruct the feature map and project it out
+        out = rearrange(out, multihead_to_qkv, head=self.num_heads, **dict(zip(axis_names, spatial_shape)))
+        return cast(torch.Tensor, self.project_out(out))

monai-weekly 1.5.dev2511__py3-none-any.whl → 1.5.dev2513__py3-none-any.whl

monai-weekly 1.5.dev2511__py3-none-any.whl → 1.5.dev2513__py3-none-any.whl