transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1021)
  1. transformers/__init__.py +4 -11
  2. transformers/activations.py +2 -2
  3. transformers/backbone_utils.py +326 -0
  4. transformers/cache_utils.py +11 -2
  5. transformers/cli/serve.py +11 -8
  6. transformers/configuration_utils.py +1 -69
  7. transformers/conversion_mapping.py +146 -26
  8. transformers/convert_slow_tokenizer.py +6 -4
  9. transformers/core_model_loading.py +207 -118
  10. transformers/dependency_versions_check.py +0 -1
  11. transformers/dependency_versions_table.py +7 -8
  12. transformers/file_utils.py +0 -2
  13. transformers/generation/candidate_generator.py +1 -2
  14. transformers/generation/continuous_batching/cache.py +40 -38
  15. transformers/generation/continuous_batching/cache_manager.py +3 -16
  16. transformers/generation/continuous_batching/continuous_api.py +94 -406
  17. transformers/generation/continuous_batching/input_ouputs.py +464 -0
  18. transformers/generation/continuous_batching/requests.py +54 -17
  19. transformers/generation/continuous_batching/scheduler.py +77 -95
  20. transformers/generation/logits_process.py +10 -5
  21. transformers/generation/stopping_criteria.py +1 -2
  22. transformers/generation/utils.py +75 -95
  23. transformers/image_processing_utils.py +0 -3
  24. transformers/image_processing_utils_fast.py +17 -18
  25. transformers/image_transforms.py +44 -13
  26. transformers/image_utils.py +0 -5
  27. transformers/initialization.py +57 -0
  28. transformers/integrations/__init__.py +10 -24
  29. transformers/integrations/accelerate.py +47 -11
  30. transformers/integrations/deepspeed.py +145 -3
  31. transformers/integrations/executorch.py +2 -6
  32. transformers/integrations/finegrained_fp8.py +142 -7
  33. transformers/integrations/flash_attention.py +2 -7
  34. transformers/integrations/hub_kernels.py +18 -7
  35. transformers/integrations/moe.py +226 -106
  36. transformers/integrations/mxfp4.py +47 -34
  37. transformers/integrations/peft.py +488 -176
  38. transformers/integrations/tensor_parallel.py +641 -581
  39. transformers/masking_utils.py +153 -9
  40. transformers/modeling_flash_attention_utils.py +1 -2
  41. transformers/modeling_utils.py +359 -358
  42. transformers/models/__init__.py +6 -0
  43. transformers/models/afmoe/configuration_afmoe.py +14 -4
  44. transformers/models/afmoe/modeling_afmoe.py +8 -8
  45. transformers/models/afmoe/modular_afmoe.py +7 -7
  46. transformers/models/aimv2/configuration_aimv2.py +2 -7
  47. transformers/models/aimv2/modeling_aimv2.py +26 -24
  48. transformers/models/aimv2/modular_aimv2.py +8 -12
  49. transformers/models/albert/configuration_albert.py +8 -1
  50. transformers/models/albert/modeling_albert.py +3 -3
  51. transformers/models/align/configuration_align.py +8 -5
  52. transformers/models/align/modeling_align.py +22 -24
  53. transformers/models/altclip/configuration_altclip.py +4 -6
  54. transformers/models/altclip/modeling_altclip.py +30 -26
  55. transformers/models/apertus/configuration_apertus.py +5 -7
  56. transformers/models/apertus/modeling_apertus.py +4 -4
  57. transformers/models/apertus/modular_apertus.py +8 -10
  58. transformers/models/arcee/configuration_arcee.py +5 -7
  59. transformers/models/arcee/modeling_arcee.py +4 -4
  60. transformers/models/aria/configuration_aria.py +11 -21
  61. transformers/models/aria/modeling_aria.py +39 -36
  62. transformers/models/aria/modular_aria.py +33 -39
  63. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
  64. transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
  65. transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
  66. transformers/models/auto/auto_factory.py +8 -6
  67. transformers/models/auto/configuration_auto.py +22 -0
  68. transformers/models/auto/image_processing_auto.py +17 -13
  69. transformers/models/auto/modeling_auto.py +15 -0
  70. transformers/models/auto/processing_auto.py +9 -18
  71. transformers/models/auto/tokenization_auto.py +17 -15
  72. transformers/models/autoformer/modeling_autoformer.py +2 -1
  73. transformers/models/aya_vision/configuration_aya_vision.py +4 -0
  74. transformers/models/aya_vision/modeling_aya_vision.py +29 -62
  75. transformers/models/aya_vision/modular_aya_vision.py +20 -45
  76. transformers/models/bamba/configuration_bamba.py +17 -7
  77. transformers/models/bamba/modeling_bamba.py +23 -55
  78. transformers/models/bamba/modular_bamba.py +19 -54
  79. transformers/models/bark/configuration_bark.py +2 -1
  80. transformers/models/bark/modeling_bark.py +24 -10
  81. transformers/models/bart/configuration_bart.py +9 -4
  82. transformers/models/bart/modeling_bart.py +9 -12
  83. transformers/models/beit/configuration_beit.py +2 -4
  84. transformers/models/beit/image_processing_beit_fast.py +3 -3
  85. transformers/models/beit/modeling_beit.py +14 -9
  86. transformers/models/bert/configuration_bert.py +12 -1
  87. transformers/models/bert/modeling_bert.py +6 -30
  88. transformers/models/bert_generation/configuration_bert_generation.py +17 -1
  89. transformers/models/bert_generation/modeling_bert_generation.py +6 -6
  90. transformers/models/big_bird/configuration_big_bird.py +12 -8
  91. transformers/models/big_bird/modeling_big_bird.py +0 -15
  92. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
  93. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
  94. transformers/models/biogpt/configuration_biogpt.py +8 -1
  95. transformers/models/biogpt/modeling_biogpt.py +4 -8
  96. transformers/models/biogpt/modular_biogpt.py +1 -5
  97. transformers/models/bit/configuration_bit.py +2 -4
  98. transformers/models/bit/modeling_bit.py +6 -5
  99. transformers/models/bitnet/configuration_bitnet.py +5 -7
  100. transformers/models/bitnet/modeling_bitnet.py +3 -4
  101. transformers/models/bitnet/modular_bitnet.py +3 -4
  102. transformers/models/blenderbot/configuration_blenderbot.py +8 -4
  103. transformers/models/blenderbot/modeling_blenderbot.py +4 -4
  104. transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
  105. transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
  106. transformers/models/blip/configuration_blip.py +9 -9
  107. transformers/models/blip/modeling_blip.py +55 -37
  108. transformers/models/blip_2/configuration_blip_2.py +2 -1
  109. transformers/models/blip_2/modeling_blip_2.py +81 -56
  110. transformers/models/bloom/configuration_bloom.py +5 -1
  111. transformers/models/bloom/modeling_bloom.py +2 -1
  112. transformers/models/blt/configuration_blt.py +23 -12
  113. transformers/models/blt/modeling_blt.py +20 -14
  114. transformers/models/blt/modular_blt.py +70 -10
  115. transformers/models/bridgetower/configuration_bridgetower.py +7 -1
  116. transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
  117. transformers/models/bridgetower/modeling_bridgetower.py +29 -15
  118. transformers/models/bros/configuration_bros.py +24 -17
  119. transformers/models/camembert/configuration_camembert.py +8 -1
  120. transformers/models/camembert/modeling_camembert.py +6 -6
  121. transformers/models/canine/configuration_canine.py +4 -1
  122. transformers/models/chameleon/configuration_chameleon.py +5 -7
  123. transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
  124. transformers/models/chameleon/modeling_chameleon.py +82 -36
  125. transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
  126. transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
  127. transformers/models/clap/configuration_clap.py +4 -8
  128. transformers/models/clap/modeling_clap.py +21 -22
  129. transformers/models/clip/configuration_clip.py +4 -1
  130. transformers/models/clip/image_processing_clip_fast.py +9 -0
  131. transformers/models/clip/modeling_clip.py +25 -22
  132. transformers/models/clipseg/configuration_clipseg.py +4 -1
  133. transformers/models/clipseg/modeling_clipseg.py +27 -25
  134. transformers/models/clipseg/processing_clipseg.py +11 -3
  135. transformers/models/clvp/configuration_clvp.py +14 -2
  136. transformers/models/clvp/modeling_clvp.py +19 -30
  137. transformers/models/codegen/configuration_codegen.py +4 -3
  138. transformers/models/codegen/modeling_codegen.py +2 -1
  139. transformers/models/cohere/configuration_cohere.py +5 -7
  140. transformers/models/cohere/modeling_cohere.py +4 -4
  141. transformers/models/cohere/modular_cohere.py +3 -3
  142. transformers/models/cohere2/configuration_cohere2.py +6 -8
  143. transformers/models/cohere2/modeling_cohere2.py +4 -4
  144. transformers/models/cohere2/modular_cohere2.py +9 -11
  145. transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
  146. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
  147. transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
  148. transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
  149. transformers/models/colqwen2/modeling_colqwen2.py +7 -6
  150. transformers/models/colqwen2/modular_colqwen2.py +7 -6
  151. transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
  152. transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
  153. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
  154. transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
  155. transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
  156. transformers/models/convbert/configuration_convbert.py +11 -7
  157. transformers/models/convnext/configuration_convnext.py +2 -4
  158. transformers/models/convnext/image_processing_convnext_fast.py +2 -2
  159. transformers/models/convnext/modeling_convnext.py +7 -6
  160. transformers/models/convnextv2/configuration_convnextv2.py +2 -4
  161. transformers/models/convnextv2/modeling_convnextv2.py +7 -6
  162. transformers/models/cpmant/configuration_cpmant.py +4 -0
  163. transformers/models/csm/configuration_csm.py +9 -15
  164. transformers/models/csm/modeling_csm.py +3 -3
  165. transformers/models/ctrl/configuration_ctrl.py +16 -0
  166. transformers/models/ctrl/modeling_ctrl.py +13 -25
  167. transformers/models/cwm/configuration_cwm.py +5 -7
  168. transformers/models/cwm/modeling_cwm.py +4 -4
  169. transformers/models/d_fine/configuration_d_fine.py +10 -56
  170. transformers/models/d_fine/modeling_d_fine.py +728 -868
  171. transformers/models/d_fine/modular_d_fine.py +335 -412
  172. transformers/models/dab_detr/configuration_dab_detr.py +22 -48
  173. transformers/models/dab_detr/modeling_dab_detr.py +11 -7
  174. transformers/models/dac/modeling_dac.py +1 -1
  175. transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
  176. transformers/models/data2vec/configuration_data2vec_text.py +11 -2
  177. transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
  178. transformers/models/data2vec/modeling_data2vec_text.py +6 -6
  179. transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
  180. transformers/models/dbrx/configuration_dbrx.py +11 -3
  181. transformers/models/dbrx/modeling_dbrx.py +6 -6
  182. transformers/models/dbrx/modular_dbrx.py +6 -6
  183. transformers/models/deberta/configuration_deberta.py +6 -0
  184. transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
  185. transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
  186. transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
  187. transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
  188. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
  189. transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
  190. transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
  191. transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
  192. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
  193. transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
  194. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
  195. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
  196. transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
  197. transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
  198. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
  199. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
  200. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
  201. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
  202. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
  203. transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
  204. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
  205. transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
  206. transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
  207. transformers/models/deit/modeling_deit.py +11 -7
  208. transformers/models/depth_anything/configuration_depth_anything.py +12 -42
  209. transformers/models/depth_anything/modeling_depth_anything.py +5 -3
  210. transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
  211. transformers/models/depth_pro/modeling_depth_pro.py +8 -4
  212. transformers/models/detr/configuration_detr.py +18 -49
  213. transformers/models/detr/image_processing_detr_fast.py +11 -11
  214. transformers/models/detr/modeling_detr.py +695 -734
  215. transformers/models/dia/configuration_dia.py +4 -7
  216. transformers/models/dia/generation_dia.py +8 -17
  217. transformers/models/dia/modeling_dia.py +7 -7
  218. transformers/models/dia/modular_dia.py +4 -4
  219. transformers/models/diffllama/configuration_diffllama.py +5 -7
  220. transformers/models/diffllama/modeling_diffllama.py +3 -8
  221. transformers/models/diffllama/modular_diffllama.py +2 -7
  222. transformers/models/dinat/configuration_dinat.py +2 -4
  223. transformers/models/dinat/modeling_dinat.py +7 -6
  224. transformers/models/dinov2/configuration_dinov2.py +2 -4
  225. transformers/models/dinov2/modeling_dinov2.py +9 -8
  226. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
  227. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
  228. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
  229. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
  230. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
  231. transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
  232. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
  233. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
  234. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
  235. transformers/models/distilbert/configuration_distilbert.py +8 -1
  236. transformers/models/distilbert/modeling_distilbert.py +3 -3
  237. transformers/models/doge/configuration_doge.py +17 -7
  238. transformers/models/doge/modeling_doge.py +4 -4
  239. transformers/models/doge/modular_doge.py +20 -10
  240. transformers/models/donut/image_processing_donut_fast.py +4 -4
  241. transformers/models/dots1/configuration_dots1.py +16 -7
  242. transformers/models/dots1/modeling_dots1.py +4 -4
  243. transformers/models/dpr/configuration_dpr.py +19 -1
  244. transformers/models/dpt/configuration_dpt.py +23 -65
  245. transformers/models/dpt/image_processing_dpt_fast.py +5 -5
  246. transformers/models/dpt/modeling_dpt.py +19 -15
  247. transformers/models/dpt/modular_dpt.py +4 -4
  248. transformers/models/edgetam/configuration_edgetam.py +1 -1
  249. transformers/models/edgetam/modeling_edgetam.py +53 -53
  250. transformers/models/edgetam/modular_edgetam.py +5 -7
  251. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
  252. transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
  253. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
  254. transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
  255. transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
  256. transformers/models/electra/configuration_electra.py +13 -2
  257. transformers/models/electra/modeling_electra.py +6 -6
  258. transformers/models/emu3/configuration_emu3.py +12 -10
  259. transformers/models/emu3/modeling_emu3.py +84 -47
  260. transformers/models/emu3/modular_emu3.py +77 -39
  261. transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
  262. transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
  263. transformers/models/eomt/configuration_eomt.py +12 -13
  264. transformers/models/eomt/image_processing_eomt_fast.py +3 -3
  265. transformers/models/eomt/modeling_eomt.py +3 -3
  266. transformers/models/eomt/modular_eomt.py +17 -17
  267. transformers/models/eomt_dinov3/__init__.py +28 -0
  268. transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
  269. transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
  270. transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
  271. transformers/models/ernie/configuration_ernie.py +24 -2
  272. transformers/models/ernie/modeling_ernie.py +6 -30
  273. transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
  274. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  275. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
  276. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
  277. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
  278. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
  279. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
  280. transformers/models/esm/configuration_esm.py +9 -11
  281. transformers/models/esm/modeling_esm.py +3 -3
  282. transformers/models/esm/modeling_esmfold.py +1 -6
  283. transformers/models/esm/openfold_utils/protein.py +2 -3
  284. transformers/models/evolla/configuration_evolla.py +21 -8
  285. transformers/models/evolla/modeling_evolla.py +11 -7
  286. transformers/models/evolla/modular_evolla.py +5 -1
  287. transformers/models/exaone4/configuration_exaone4.py +8 -5
  288. transformers/models/exaone4/modeling_exaone4.py +4 -4
  289. transformers/models/exaone4/modular_exaone4.py +11 -8
  290. transformers/models/exaone_moe/__init__.py +27 -0
  291. transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
  292. transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
  293. transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
  294. transformers/models/falcon/configuration_falcon.py +9 -1
  295. transformers/models/falcon/modeling_falcon.py +3 -8
  296. transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
  297. transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
  298. transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
  299. transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
  300. transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
  301. transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
  302. transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
  303. transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
  304. transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
  305. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
  306. transformers/models/flaubert/configuration_flaubert.py +10 -4
  307. transformers/models/flaubert/modeling_flaubert.py +1 -1
  308. transformers/models/flava/configuration_flava.py +4 -3
  309. transformers/models/flava/image_processing_flava_fast.py +4 -4
  310. transformers/models/flava/modeling_flava.py +36 -28
  311. transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
  312. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
  313. transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
  314. transformers/models/florence2/configuration_florence2.py +4 -0
  315. transformers/models/florence2/modeling_florence2.py +57 -32
  316. transformers/models/florence2/modular_florence2.py +48 -26
  317. transformers/models/fnet/configuration_fnet.py +6 -1
  318. transformers/models/focalnet/configuration_focalnet.py +2 -4
  319. transformers/models/focalnet/modeling_focalnet.py +10 -7
  320. transformers/models/fsmt/configuration_fsmt.py +12 -16
  321. transformers/models/funnel/configuration_funnel.py +8 -0
  322. transformers/models/fuyu/configuration_fuyu.py +5 -8
  323. transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
  324. transformers/models/fuyu/modeling_fuyu.py +24 -23
  325. transformers/models/gemma/configuration_gemma.py +5 -7
  326. transformers/models/gemma/modeling_gemma.py +4 -4
  327. transformers/models/gemma/modular_gemma.py +5 -7
  328. transformers/models/gemma2/configuration_gemma2.py +5 -7
  329. transformers/models/gemma2/modeling_gemma2.py +4 -4
  330. transformers/models/gemma2/modular_gemma2.py +8 -10
  331. transformers/models/gemma3/configuration_gemma3.py +28 -22
  332. transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
  333. transformers/models/gemma3/modeling_gemma3.py +37 -33
  334. transformers/models/gemma3/modular_gemma3.py +46 -42
  335. transformers/models/gemma3n/configuration_gemma3n.py +35 -22
  336. transformers/models/gemma3n/modeling_gemma3n.py +86 -58
  337. transformers/models/gemma3n/modular_gemma3n.py +112 -75
  338. transformers/models/git/configuration_git.py +5 -7
  339. transformers/models/git/modeling_git.py +31 -41
  340. transformers/models/glm/configuration_glm.py +7 -9
  341. transformers/models/glm/modeling_glm.py +4 -4
  342. transformers/models/glm4/configuration_glm4.py +7 -9
  343. transformers/models/glm4/modeling_glm4.py +4 -4
  344. transformers/models/glm46v/configuration_glm46v.py +4 -0
  345. transformers/models/glm46v/image_processing_glm46v.py +5 -2
  346. transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
  347. transformers/models/glm46v/modeling_glm46v.py +91 -46
  348. transformers/models/glm46v/modular_glm46v.py +4 -0
  349. transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
  350. transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
  351. transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
  352. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
  353. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
  354. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
  355. transformers/models/glm4v/configuration_glm4v.py +12 -8
  356. transformers/models/glm4v/image_processing_glm4v.py +5 -2
  357. transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
  358. transformers/models/glm4v/modeling_glm4v.py +120 -63
  359. transformers/models/glm4v/modular_glm4v.py +82 -50
  360. transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
  361. transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
  362. transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
  363. transformers/models/glm_image/configuration_glm_image.py +26 -20
  364. transformers/models/glm_image/image_processing_glm_image.py +1 -1
  365. transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
  366. transformers/models/glm_image/modeling_glm_image.py +337 -236
  367. transformers/models/glm_image/modular_glm_image.py +415 -255
  368. transformers/models/glm_image/processing_glm_image.py +65 -17
  369. transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
  370. transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
  371. transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
  372. transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
  373. transformers/models/glmasr/modeling_glmasr.py +34 -28
  374. transformers/models/glmasr/modular_glmasr.py +23 -11
  375. transformers/models/glpn/image_processing_glpn_fast.py +3 -3
  376. transformers/models/glpn/modeling_glpn.py +4 -2
  377. transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
  378. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
  379. transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
  380. transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
  381. transformers/models/gpt2/configuration_gpt2.py +13 -1
  382. transformers/models/gpt2/modeling_gpt2.py +5 -5
  383. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
  384. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
  385. transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
  386. transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
  387. transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
  388. transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
  389. transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
  390. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
  391. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
  392. transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
  393. transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
  394. transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
  395. transformers/models/gptj/configuration_gptj.py +4 -4
  396. transformers/models/gptj/modeling_gptj.py +3 -7
  397. transformers/models/granite/configuration_granite.py +5 -7
  398. transformers/models/granite/modeling_granite.py +4 -4
  399. transformers/models/granite_speech/modeling_granite_speech.py +63 -37
  400. transformers/models/granitemoe/configuration_granitemoe.py +5 -7
  401. transformers/models/granitemoe/modeling_granitemoe.py +4 -4
  402. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
  403. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
  404. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
  405. transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
  406. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
  407. transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
  408. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
  409. transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
  410. transformers/models/groupvit/configuration_groupvit.py +4 -1
  411. transformers/models/groupvit/modeling_groupvit.py +29 -22
  412. transformers/models/helium/configuration_helium.py +5 -7
  413. transformers/models/helium/modeling_helium.py +4 -4
  414. transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
  415. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
  416. transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
  417. transformers/models/hiera/configuration_hiera.py +2 -4
  418. transformers/models/hiera/modeling_hiera.py +11 -8
  419. transformers/models/hubert/configuration_hubert.py +4 -1
  420. transformers/models/hubert/modeling_hubert.py +7 -4
  421. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
  422. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
  423. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
  424. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
  425. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
  426. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
  427. transformers/models/ibert/configuration_ibert.py +4 -1
  428. transformers/models/idefics/configuration_idefics.py +5 -7
  429. transformers/models/idefics/modeling_idefics.py +3 -4
  430. transformers/models/idefics/vision.py +5 -4
  431. transformers/models/idefics2/configuration_idefics2.py +1 -2
  432. transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
  433. transformers/models/idefics2/modeling_idefics2.py +72 -50
  434. transformers/models/idefics3/configuration_idefics3.py +1 -3
  435. transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
  436. transformers/models/idefics3/modeling_idefics3.py +63 -40
  437. transformers/models/ijepa/modeling_ijepa.py +3 -3
  438. transformers/models/imagegpt/configuration_imagegpt.py +9 -1
  439. transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
  440. transformers/models/imagegpt/modeling_imagegpt.py +8 -4
  441. transformers/models/informer/modeling_informer.py +3 -3
  442. transformers/models/instructblip/configuration_instructblip.py +2 -1
  443. transformers/models/instructblip/modeling_instructblip.py +65 -39
  444. transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
  445. transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
  446. transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
  447. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
  448. transformers/models/internvl/configuration_internvl.py +5 -0
  449. transformers/models/internvl/modeling_internvl.py +35 -55
  450. transformers/models/internvl/modular_internvl.py +26 -38
  451. transformers/models/internvl/video_processing_internvl.py +2 -2
  452. transformers/models/jais2/configuration_jais2.py +5 -7
  453. transformers/models/jais2/modeling_jais2.py +4 -4
  454. transformers/models/jamba/configuration_jamba.py +5 -7
  455. transformers/models/jamba/modeling_jamba.py +4 -4
  456. transformers/models/jamba/modular_jamba.py +3 -3
  457. transformers/models/janus/image_processing_janus.py +2 -2
  458. transformers/models/janus/image_processing_janus_fast.py +8 -8
  459. transformers/models/janus/modeling_janus.py +63 -146
  460. transformers/models/janus/modular_janus.py +62 -20
  461. transformers/models/jetmoe/configuration_jetmoe.py +6 -4
  462. transformers/models/jetmoe/modeling_jetmoe.py +3 -3
  463. transformers/models/jetmoe/modular_jetmoe.py +3 -3
  464. transformers/models/kosmos2/configuration_kosmos2.py +10 -8
  465. transformers/models/kosmos2/modeling_kosmos2.py +56 -34
  466. transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
  467. transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
  468. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
  469. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
  470. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
  471. transformers/models/lasr/configuration_lasr.py +2 -4
  472. transformers/models/lasr/modeling_lasr.py +3 -3
  473. transformers/models/lasr/modular_lasr.py +3 -3
  474. transformers/models/layoutlm/configuration_layoutlm.py +14 -1
  475. transformers/models/layoutlm/modeling_layoutlm.py +3 -3
  476. transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
  477. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
  478. transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
  479. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
  480. transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
  481. transformers/models/led/configuration_led.py +7 -8
  482. transformers/models/levit/image_processing_levit_fast.py +4 -4
  483. transformers/models/lfm2/configuration_lfm2.py +5 -7
  484. transformers/models/lfm2/modeling_lfm2.py +4 -4
  485. transformers/models/lfm2/modular_lfm2.py +3 -3
  486. transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
  487. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
  488. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  489. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
  490. transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
  491. transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
  492. transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
  493. transformers/models/lightglue/modeling_lightglue.py +3 -3
  494. transformers/models/lightglue/modular_lightglue.py +3 -3
  495. transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
  496. transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
  497. transformers/models/lilt/configuration_lilt.py +6 -1
  498. transformers/models/llama/configuration_llama.py +5 -7
  499. transformers/models/llama/modeling_llama.py +4 -4
  500. transformers/models/llama4/configuration_llama4.py +67 -47
  501. transformers/models/llama4/image_processing_llama4_fast.py +3 -3
  502. transformers/models/llama4/modeling_llama4.py +46 -44
  503. transformers/models/llava/configuration_llava.py +10 -0
  504. transformers/models/llava/image_processing_llava_fast.py +3 -3
  505. transformers/models/llava/modeling_llava.py +38 -65
  506. transformers/models/llava_next/configuration_llava_next.py +2 -1
  507. transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
  508. transformers/models/llava_next/modeling_llava_next.py +61 -60
  509. transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
  510. transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
  511. transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
  512. transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
  513. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
  514. transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
  515. transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
  516. transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
  517. transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
  518. transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
  519. transformers/models/longformer/configuration_longformer.py +4 -1
  520. transformers/models/longt5/configuration_longt5.py +9 -6
  521. transformers/models/longt5/modeling_longt5.py +2 -1
  522. transformers/models/luke/configuration_luke.py +8 -1
  523. transformers/models/lw_detr/configuration_lw_detr.py +19 -31
  524. transformers/models/lw_detr/modeling_lw_detr.py +43 -44
  525. transformers/models/lw_detr/modular_lw_detr.py +36 -38
  526. transformers/models/lxmert/configuration_lxmert.py +16 -0
  527. transformers/models/m2m_100/configuration_m2m_100.py +7 -8
  528. transformers/models/m2m_100/modeling_m2m_100.py +3 -3
  529. transformers/models/mamba/configuration_mamba.py +5 -2
  530. transformers/models/mamba/modeling_mamba.py +18 -26
  531. transformers/models/mamba2/configuration_mamba2.py +5 -7
  532. transformers/models/mamba2/modeling_mamba2.py +22 -33
  533. transformers/models/marian/configuration_marian.py +10 -4
  534. transformers/models/marian/modeling_marian.py +4 -4
  535. transformers/models/markuplm/configuration_markuplm.py +4 -6
  536. transformers/models/markuplm/modeling_markuplm.py +3 -3
  537. transformers/models/mask2former/configuration_mask2former.py +12 -47
  538. transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
  539. transformers/models/mask2former/modeling_mask2former.py +18 -12
  540. transformers/models/maskformer/configuration_maskformer.py +14 -45
  541. transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
  542. transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
  543. transformers/models/maskformer/modeling_maskformer.py +15 -9
  544. transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
  545. transformers/models/mbart/configuration_mbart.py +9 -4
  546. transformers/models/mbart/modeling_mbart.py +9 -6
  547. transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
  548. transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
  549. transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
  550. transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
  551. transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
  552. transformers/models/mgp_str/modeling_mgp_str.py +4 -2
  553. transformers/models/mimi/configuration_mimi.py +4 -0
  554. transformers/models/mimi/modeling_mimi.py +40 -36
  555. transformers/models/minimax/configuration_minimax.py +8 -11
  556. transformers/models/minimax/modeling_minimax.py +5 -5
  557. transformers/models/minimax/modular_minimax.py +9 -12
  558. transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
  559. transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
  560. transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
  561. transformers/models/ministral/configuration_ministral.py +5 -7
  562. transformers/models/ministral/modeling_ministral.py +4 -4
  563. transformers/models/ministral/modular_ministral.py +5 -8
  564. transformers/models/ministral3/configuration_ministral3.py +4 -4
  565. transformers/models/ministral3/modeling_ministral3.py +4 -4
  566. transformers/models/ministral3/modular_ministral3.py +3 -3
  567. transformers/models/mistral/configuration_mistral.py +5 -7
  568. transformers/models/mistral/modeling_mistral.py +4 -4
  569. transformers/models/mistral/modular_mistral.py +3 -3
  570. transformers/models/mistral3/configuration_mistral3.py +4 -0
  571. transformers/models/mistral3/modeling_mistral3.py +36 -40
  572. transformers/models/mistral3/modular_mistral3.py +31 -32
  573. transformers/models/mixtral/configuration_mixtral.py +8 -11
  574. transformers/models/mixtral/modeling_mixtral.py +4 -4
  575. transformers/models/mlcd/modeling_mlcd.py +7 -5
  576. transformers/models/mlcd/modular_mlcd.py +7 -5
  577. transformers/models/mllama/configuration_mllama.py +5 -7
  578. transformers/models/mllama/image_processing_mllama_fast.py +6 -5
  579. transformers/models/mllama/modeling_mllama.py +19 -19
  580. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
  581. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
  582. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
  583. transformers/models/mobilebert/configuration_mobilebert.py +4 -1
  584. transformers/models/mobilebert/modeling_mobilebert.py +3 -3
  585. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
  586. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
  587. transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
  588. transformers/models/mobilevit/modeling_mobilevit.py +4 -2
  589. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
  590. transformers/models/modernbert/configuration_modernbert.py +46 -21
  591. transformers/models/modernbert/modeling_modernbert.py +146 -899
  592. transformers/models/modernbert/modular_modernbert.py +185 -908
  593. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
  594. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
  595. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
  596. transformers/models/moonshine/configuration_moonshine.py +12 -7
  597. transformers/models/moonshine/modeling_moonshine.py +7 -7
  598. transformers/models/moonshine/modular_moonshine.py +19 -13
  599. transformers/models/moshi/configuration_moshi.py +28 -2
  600. transformers/models/moshi/modeling_moshi.py +4 -9
  601. transformers/models/mpnet/configuration_mpnet.py +6 -1
  602. transformers/models/mpt/configuration_mpt.py +16 -0
  603. transformers/models/mra/configuration_mra.py +8 -1
  604. transformers/models/mt5/configuration_mt5.py +9 -5
  605. transformers/models/mt5/modeling_mt5.py +5 -8
  606. transformers/models/musicgen/configuration_musicgen.py +12 -7
  607. transformers/models/musicgen/modeling_musicgen.py +6 -5
  608. transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
  609. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
  610. transformers/models/mvp/configuration_mvp.py +8 -4
  611. transformers/models/mvp/modeling_mvp.py +6 -4
  612. transformers/models/nanochat/configuration_nanochat.py +5 -7
  613. transformers/models/nanochat/modeling_nanochat.py +4 -4
  614. transformers/models/nanochat/modular_nanochat.py +4 -4
  615. transformers/models/nemotron/configuration_nemotron.py +5 -7
  616. transformers/models/nemotron/modeling_nemotron.py +4 -14
  617. transformers/models/nllb/tokenization_nllb.py +7 -5
  618. transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
  619. transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
  620. transformers/models/nougat/image_processing_nougat_fast.py +8 -8
  621. transformers/models/nystromformer/configuration_nystromformer.py +8 -1
  622. transformers/models/olmo/configuration_olmo.py +5 -7
  623. transformers/models/olmo/modeling_olmo.py +4 -4
  624. transformers/models/olmo/modular_olmo.py +3 -3
  625. transformers/models/olmo2/configuration_olmo2.py +9 -11
  626. transformers/models/olmo2/modeling_olmo2.py +4 -4
  627. transformers/models/olmo2/modular_olmo2.py +7 -7
  628. transformers/models/olmo3/configuration_olmo3.py +10 -11
  629. transformers/models/olmo3/modeling_olmo3.py +4 -4
  630. transformers/models/olmo3/modular_olmo3.py +13 -14
  631. transformers/models/olmoe/configuration_olmoe.py +5 -7
  632. transformers/models/olmoe/modeling_olmoe.py +4 -4
  633. transformers/models/olmoe/modular_olmoe.py +3 -3
  634. transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
  635. transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
  636. transformers/models/oneformer/configuration_oneformer.py +9 -46
  637. transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
  638. transformers/models/oneformer/modeling_oneformer.py +14 -9
  639. transformers/models/openai/configuration_openai.py +16 -0
  640. transformers/models/opt/configuration_opt.py +6 -6
  641. transformers/models/opt/modeling_opt.py +5 -5
  642. transformers/models/ovis2/configuration_ovis2.py +4 -0
  643. transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
  644. transformers/models/ovis2/modeling_ovis2.py +58 -99
  645. transformers/models/ovis2/modular_ovis2.py +52 -13
  646. transformers/models/owlv2/configuration_owlv2.py +4 -1
  647. transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
  648. transformers/models/owlv2/modeling_owlv2.py +40 -27
  649. transformers/models/owlv2/modular_owlv2.py +5 -5
  650. transformers/models/owlvit/configuration_owlvit.py +4 -1
  651. transformers/models/owlvit/modeling_owlvit.py +40 -27
  652. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
  653. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
  654. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
  655. transformers/models/paligemma/configuration_paligemma.py +4 -0
  656. transformers/models/paligemma/modeling_paligemma.py +30 -26
  657. transformers/models/parakeet/configuration_parakeet.py +2 -4
  658. transformers/models/parakeet/modeling_parakeet.py +3 -3
  659. transformers/models/parakeet/modular_parakeet.py +3 -3
  660. transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
  661. transformers/models/patchtst/modeling_patchtst.py +3 -3
  662. transformers/models/pe_audio/modeling_pe_audio.py +4 -4
  663. transformers/models/pe_audio/modular_pe_audio.py +1 -1
  664. transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
  665. transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
  666. transformers/models/pe_video/modeling_pe_video.py +36 -24
  667. transformers/models/pe_video/modular_pe_video.py +36 -23
  668. transformers/models/pegasus/configuration_pegasus.py +8 -5
  669. transformers/models/pegasus/modeling_pegasus.py +4 -4
  670. transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
  671. transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
  672. transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
  673. transformers/models/perceiver/modeling_perceiver.py +17 -9
  674. transformers/models/perception_lm/modeling_perception_lm.py +26 -27
  675. transformers/models/perception_lm/modular_perception_lm.py +27 -25
  676. transformers/models/persimmon/configuration_persimmon.py +5 -7
  677. transformers/models/persimmon/modeling_persimmon.py +5 -5
  678. transformers/models/phi/configuration_phi.py +8 -6
  679. transformers/models/phi/modeling_phi.py +4 -4
  680. transformers/models/phi/modular_phi.py +3 -3
  681. transformers/models/phi3/configuration_phi3.py +9 -11
  682. transformers/models/phi3/modeling_phi3.py +4 -4
  683. transformers/models/phi3/modular_phi3.py +3 -3
  684. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
  685. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
  686. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
  687. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
  688. transformers/models/phimoe/configuration_phimoe.py +5 -7
  689. transformers/models/phimoe/modeling_phimoe.py +15 -39
  690. transformers/models/phimoe/modular_phimoe.py +12 -7
  691. transformers/models/pix2struct/configuration_pix2struct.py +12 -9
  692. transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
  693. transformers/models/pix2struct/modeling_pix2struct.py +14 -7
  694. transformers/models/pixio/configuration_pixio.py +2 -4
  695. transformers/models/pixio/modeling_pixio.py +9 -8
  696. transformers/models/pixio/modular_pixio.py +4 -2
  697. transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
  698. transformers/models/pixtral/modeling_pixtral.py +9 -12
  699. transformers/models/plbart/configuration_plbart.py +8 -5
  700. transformers/models/plbart/modeling_plbart.py +9 -7
  701. transformers/models/plbart/modular_plbart.py +1 -1
  702. transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
  703. transformers/models/pop2piano/configuration_pop2piano.py +7 -6
  704. transformers/models/pop2piano/modeling_pop2piano.py +2 -1
  705. transformers/models/pp_doclayout_v3/__init__.py +30 -0
  706. transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
  707. transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
  708. transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
  709. transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
  710. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
  711. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
  712. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
  713. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
  714. transformers/models/prophetnet/configuration_prophetnet.py +11 -10
  715. transformers/models/prophetnet/modeling_prophetnet.py +12 -23
  716. transformers/models/pvt/image_processing_pvt.py +7 -7
  717. transformers/models/pvt/image_processing_pvt_fast.py +1 -1
  718. transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
  719. transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
  720. transformers/models/qwen2/configuration_qwen2.py +14 -4
  721. transformers/models/qwen2/modeling_qwen2.py +4 -4
  722. transformers/models/qwen2/modular_qwen2.py +3 -3
  723. transformers/models/qwen2/tokenization_qwen2.py +0 -4
  724. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
  725. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
  726. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
  727. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
  728. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
  729. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
  730. transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
  731. transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
  732. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  733. transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
  734. transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
  735. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
  736. transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
  737. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
  738. transformers/models/qwen3/configuration_qwen3.py +15 -5
  739. transformers/models/qwen3/modeling_qwen3.py +4 -4
  740. transformers/models/qwen3/modular_qwen3.py +3 -3
  741. transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
  742. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  743. transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
  744. transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
  745. transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
  746. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
  747. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
  748. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
  749. transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
  750. transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
  751. transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
  752. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
  753. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
  754. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
  755. transformers/models/rag/configuration_rag.py +6 -6
  756. transformers/models/rag/modeling_rag.py +3 -3
  757. transformers/models/rag/retrieval_rag.py +1 -1
  758. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
  759. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
  760. transformers/models/reformer/configuration_reformer.py +7 -7
  761. transformers/models/rembert/configuration_rembert.py +8 -1
  762. transformers/models/rembert/modeling_rembert.py +0 -22
  763. transformers/models/resnet/configuration_resnet.py +2 -4
  764. transformers/models/resnet/modeling_resnet.py +6 -5
  765. transformers/models/roberta/configuration_roberta.py +11 -2
  766. transformers/models/roberta/modeling_roberta.py +6 -6
  767. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
  768. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
  769. transformers/models/roc_bert/configuration_roc_bert.py +8 -1
  770. transformers/models/roc_bert/modeling_roc_bert.py +6 -41
  771. transformers/models/roformer/configuration_roformer.py +13 -2
  772. transformers/models/roformer/modeling_roformer.py +0 -14
  773. transformers/models/rt_detr/configuration_rt_detr.py +8 -49
  774. transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
  775. transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
  776. transformers/models/rt_detr/modeling_rt_detr.py +578 -737
  777. transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
  778. transformers/models/rt_detr/modular_rt_detr.py +1508 -6
  779. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
  780. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
  781. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
  782. transformers/models/rwkv/configuration_rwkv.py +2 -3
  783. transformers/models/rwkv/modeling_rwkv.py +0 -23
  784. transformers/models/sam/configuration_sam.py +2 -0
  785. transformers/models/sam/image_processing_sam_fast.py +4 -4
  786. transformers/models/sam/modeling_sam.py +13 -8
  787. transformers/models/sam/processing_sam.py +3 -3
  788. transformers/models/sam2/configuration_sam2.py +1 -1
  789. transformers/models/sam2/modeling_sam2.py +56 -52
  790. transformers/models/sam2/modular_sam2.py +47 -55
  791. transformers/models/sam2_video/modeling_sam2_video.py +50 -51
  792. transformers/models/sam2_video/modular_sam2_video.py +12 -10
  793. transformers/models/sam3/modeling_sam3.py +43 -47
  794. transformers/models/sam3/processing_sam3.py +8 -4
  795. transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
  796. transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
  797. transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
  798. transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
  799. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
  800. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
  801. transformers/models/sam3_video/modeling_sam3_video.py +27 -14
  802. transformers/models/sam_hq/configuration_sam_hq.py +2 -0
  803. transformers/models/sam_hq/modeling_sam_hq.py +13 -9
  804. transformers/models/sam_hq/modular_sam_hq.py +6 -6
  805. transformers/models/sam_hq/processing_sam_hq.py +7 -6
  806. transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
  807. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
  808. transformers/models/seed_oss/configuration_seed_oss.py +7 -9
  809. transformers/models/seed_oss/modeling_seed_oss.py +4 -4
  810. transformers/models/seed_oss/modular_seed_oss.py +3 -3
  811. transformers/models/segformer/image_processing_segformer_fast.py +4 -4
  812. transformers/models/segformer/modeling_segformer.py +4 -2
  813. transformers/models/segformer/modular_segformer.py +3 -3
  814. transformers/models/seggpt/modeling_seggpt.py +20 -8
  815. transformers/models/sew/configuration_sew.py +4 -1
  816. transformers/models/sew/modeling_sew.py +9 -5
  817. transformers/models/sew/modular_sew.py +2 -1
  818. transformers/models/sew_d/configuration_sew_d.py +4 -1
  819. transformers/models/sew_d/modeling_sew_d.py +4 -1
  820. transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
  821. transformers/models/siglip/configuration_siglip.py +4 -1
  822. transformers/models/siglip/modeling_siglip.py +27 -71
  823. transformers/models/siglip2/__init__.py +1 -0
  824. transformers/models/siglip2/configuration_siglip2.py +4 -2
  825. transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
  826. transformers/models/siglip2/modeling_siglip2.py +37 -78
  827. transformers/models/siglip2/modular_siglip2.py +74 -25
  828. transformers/models/siglip2/tokenization_siglip2.py +95 -0
  829. transformers/models/smollm3/configuration_smollm3.py +6 -6
  830. transformers/models/smollm3/modeling_smollm3.py +4 -4
  831. transformers/models/smollm3/modular_smollm3.py +9 -9
  832. transformers/models/smolvlm/configuration_smolvlm.py +1 -3
  833. transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
  834. transformers/models/smolvlm/modeling_smolvlm.py +75 -46
  835. transformers/models/smolvlm/modular_smolvlm.py +36 -23
  836. transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
  837. transformers/models/solar_open/__init__.py +27 -0
  838. transformers/models/solar_open/configuration_solar_open.py +184 -0
  839. transformers/models/solar_open/modeling_solar_open.py +642 -0
  840. transformers/models/solar_open/modular_solar_open.py +224 -0
  841. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
  842. transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
  843. transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
  844. transformers/models/speecht5/configuration_speecht5.py +7 -8
  845. transformers/models/splinter/configuration_splinter.py +6 -6
  846. transformers/models/splinter/modeling_splinter.py +8 -3
  847. transformers/models/squeezebert/configuration_squeezebert.py +14 -1
  848. transformers/models/stablelm/configuration_stablelm.py +8 -6
  849. transformers/models/stablelm/modeling_stablelm.py +5 -5
  850. transformers/models/starcoder2/configuration_starcoder2.py +11 -5
  851. transformers/models/starcoder2/modeling_starcoder2.py +5 -5
  852. transformers/models/starcoder2/modular_starcoder2.py +4 -4
  853. transformers/models/superglue/configuration_superglue.py +4 -0
  854. transformers/models/superglue/image_processing_superglue_fast.py +4 -3
  855. transformers/models/superglue/modeling_superglue.py +9 -4
  856. transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
  857. transformers/models/superpoint/modeling_superpoint.py +4 -2
  858. transformers/models/swin/configuration_swin.py +2 -4
  859. transformers/models/swin/modeling_swin.py +11 -8
  860. transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
  861. transformers/models/swin2sr/modeling_swin2sr.py +4 -2
  862. transformers/models/swinv2/configuration_swinv2.py +2 -4
  863. transformers/models/swinv2/modeling_swinv2.py +10 -7
  864. transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
  865. transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
  866. transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
  867. transformers/models/t5/configuration_t5.py +9 -8
  868. transformers/models/t5/modeling_t5.py +5 -8
  869. transformers/models/t5gemma/configuration_t5gemma.py +10 -25
  870. transformers/models/t5gemma/modeling_t5gemma.py +9 -9
  871. transformers/models/t5gemma/modular_t5gemma.py +11 -24
  872. transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
  873. transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
  874. transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
  875. transformers/models/table_transformer/configuration_table_transformer.py +18 -49
  876. transformers/models/table_transformer/modeling_table_transformer.py +27 -53
  877. transformers/models/tapas/configuration_tapas.py +12 -1
  878. transformers/models/tapas/modeling_tapas.py +1 -1
  879. transformers/models/tapas/tokenization_tapas.py +1 -0
  880. transformers/models/textnet/configuration_textnet.py +4 -6
  881. transformers/models/textnet/image_processing_textnet_fast.py +3 -3
  882. transformers/models/textnet/modeling_textnet.py +15 -14
  883. transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
  884. transformers/models/timesfm/modeling_timesfm.py +5 -6
  885. transformers/models/timesfm/modular_timesfm.py +5 -6
  886. transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
  887. transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
  888. transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
  889. transformers/models/trocr/configuration_trocr.py +11 -7
  890. transformers/models/trocr/modeling_trocr.py +4 -2
  891. transformers/models/tvp/configuration_tvp.py +10 -35
  892. transformers/models/tvp/image_processing_tvp_fast.py +6 -5
  893. transformers/models/tvp/modeling_tvp.py +1 -1
  894. transformers/models/udop/configuration_udop.py +16 -7
  895. transformers/models/udop/modeling_udop.py +10 -6
  896. transformers/models/umt5/configuration_umt5.py +8 -6
  897. transformers/models/umt5/modeling_umt5.py +7 -3
  898. transformers/models/unispeech/configuration_unispeech.py +4 -1
  899. transformers/models/unispeech/modeling_unispeech.py +7 -4
  900. transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
  901. transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
  902. transformers/models/upernet/configuration_upernet.py +8 -35
  903. transformers/models/upernet/modeling_upernet.py +1 -1
  904. transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
  905. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  906. transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
  907. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
  908. transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
  909. transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
  910. transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
  911. transformers/models/video_llava/configuration_video_llava.py +4 -0
  912. transformers/models/video_llava/modeling_video_llava.py +87 -89
  913. transformers/models/videomae/modeling_videomae.py +4 -5
  914. transformers/models/vilt/configuration_vilt.py +4 -1
  915. transformers/models/vilt/image_processing_vilt_fast.py +6 -6
  916. transformers/models/vilt/modeling_vilt.py +27 -12
  917. transformers/models/vipllava/configuration_vipllava.py +4 -0
  918. transformers/models/vipllava/modeling_vipllava.py +57 -31
  919. transformers/models/vipllava/modular_vipllava.py +50 -24
  920. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
  921. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
  922. transformers/models/visual_bert/configuration_visual_bert.py +6 -1
  923. transformers/models/vit/configuration_vit.py +2 -2
  924. transformers/models/vit/modeling_vit.py +7 -5
  925. transformers/models/vit_mae/modeling_vit_mae.py +11 -7
  926. transformers/models/vit_msn/modeling_vit_msn.py +11 -7
  927. transformers/models/vitdet/configuration_vitdet.py +2 -4
  928. transformers/models/vitdet/modeling_vitdet.py +2 -3
  929. transformers/models/vitmatte/configuration_vitmatte.py +6 -35
  930. transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
  931. transformers/models/vitmatte/modeling_vitmatte.py +1 -1
  932. transformers/models/vitpose/configuration_vitpose.py +6 -43
  933. transformers/models/vitpose/modeling_vitpose.py +5 -3
  934. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
  935. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
  936. transformers/models/vits/configuration_vits.py +4 -0
  937. transformers/models/vits/modeling_vits.py +9 -7
  938. transformers/models/vivit/modeling_vivit.py +4 -4
  939. transformers/models/vjepa2/modeling_vjepa2.py +9 -9
  940. transformers/models/voxtral/configuration_voxtral.py +0 -1
  941. transformers/models/voxtral/modeling_voxtral.py +25 -24
  942. transformers/models/voxtral/modular_voxtral.py +26 -20
  943. transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
  944. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
  945. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
  946. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
  947. transformers/models/wavlm/configuration_wavlm.py +4 -1
  948. transformers/models/wavlm/modeling_wavlm.py +4 -1
  949. transformers/models/whisper/configuration_whisper.py +6 -4
  950. transformers/models/whisper/generation_whisper.py +0 -1
  951. transformers/models/whisper/modeling_whisper.py +3 -3
  952. transformers/models/x_clip/configuration_x_clip.py +4 -1
  953. transformers/models/x_clip/modeling_x_clip.py +26 -27
  954. transformers/models/xglm/configuration_xglm.py +9 -7
  955. transformers/models/xlm/configuration_xlm.py +10 -7
  956. transformers/models/xlm/modeling_xlm.py +1 -1
  957. transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
  958. transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
  959. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
  960. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
  961. transformers/models/xlnet/configuration_xlnet.py +3 -1
  962. transformers/models/xlstm/configuration_xlstm.py +5 -7
  963. transformers/models/xlstm/modeling_xlstm.py +0 -32
  964. transformers/models/xmod/configuration_xmod.py +11 -2
  965. transformers/models/xmod/modeling_xmod.py +13 -16
  966. transformers/models/yolos/image_processing_yolos_fast.py +25 -28
  967. transformers/models/yolos/modeling_yolos.py +7 -7
  968. transformers/models/yolos/modular_yolos.py +16 -16
  969. transformers/models/yoso/configuration_yoso.py +8 -1
  970. transformers/models/youtu/__init__.py +27 -0
  971. transformers/models/youtu/configuration_youtu.py +194 -0
  972. transformers/models/youtu/modeling_youtu.py +619 -0
  973. transformers/models/youtu/modular_youtu.py +254 -0
  974. transformers/models/zamba/configuration_zamba.py +5 -7
  975. transformers/models/zamba/modeling_zamba.py +25 -56
  976. transformers/models/zamba2/configuration_zamba2.py +8 -13
  977. transformers/models/zamba2/modeling_zamba2.py +53 -78
  978. transformers/models/zamba2/modular_zamba2.py +36 -29
  979. transformers/models/zoedepth/configuration_zoedepth.py +17 -40
  980. transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
  981. transformers/models/zoedepth/modeling_zoedepth.py +5 -3
  982. transformers/pipelines/__init__.py +1 -61
  983. transformers/pipelines/any_to_any.py +1 -1
  984. transformers/pipelines/automatic_speech_recognition.py +0 -2
  985. transformers/pipelines/base.py +1 -1
  986. transformers/pipelines/image_text_to_text.py +1 -1
  987. transformers/pipelines/text_to_audio.py +5 -1
  988. transformers/processing_utils.py +35 -44
  989. transformers/pytorch_utils.py +2 -26
  990. transformers/quantizers/quantizer_compressed_tensors.py +7 -5
  991. transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
  992. transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
  993. transformers/quantizers/quantizer_mxfp4.py +1 -1
  994. transformers/quantizers/quantizer_torchao.py +0 -16
  995. transformers/safetensors_conversion.py +11 -4
  996. transformers/testing_utils.py +3 -28
  997. transformers/tokenization_mistral_common.py +9 -0
  998. transformers/tokenization_python.py +6 -4
  999. transformers/tokenization_utils_base.py +119 -219
  1000. transformers/tokenization_utils_tokenizers.py +31 -2
  1001. transformers/trainer.py +25 -33
  1002. transformers/trainer_seq2seq.py +1 -1
  1003. transformers/training_args.py +411 -417
  1004. transformers/utils/__init__.py +1 -4
  1005. transformers/utils/auto_docstring.py +15 -18
  1006. transformers/utils/backbone_utils.py +13 -373
  1007. transformers/utils/doc.py +4 -36
  1008. transformers/utils/generic.py +69 -33
  1009. transformers/utils/import_utils.py +72 -75
  1010. transformers/utils/loading_report.py +133 -105
  1011. transformers/utils/quantization_config.py +0 -21
  1012. transformers/video_processing_utils.py +5 -5
  1013. transformers/video_utils.py +3 -1
  1014. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
  1015. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
  1016. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
  1017. transformers/pipelines/deprecated/text2text_generation.py +0 -408
  1018. transformers/pipelines/image_to_text.py +0 -189
  1019. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
  1020. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
  1021. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -12,25 +12,26 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  import math
15
- from typing import Any
16
15
 
17
16
  import torch
17
+ import torch.nn as nn
18
18
  import torch.nn.functional as F
19
- from torch import nn
20
19
 
21
20
  from ... import initialization as init
22
21
  from ...activations import ACT2CLS
22
+ from ...backbone_utils import consolidate_backbone_kwargs_to_config
23
23
  from ...configuration_utils import PreTrainedConfig
24
24
  from ...image_transforms import corners_to_center_format
25
- from ...utils import is_torchdynamo_compiling, logging
26
- from ...utils.backbone_utils import verify_backbone_config_arguments
27
- from ..auto import CONFIG_MAPPING, AutoConfig
25
+ from ...processing_utils import Unpack
26
+ from ...utils import TransformersKwargs, logging, torch_compilable_check
27
+ from ..auto import AutoConfig
28
28
  from ..rt_detr.modeling_rt_detr import (
29
+ RTDetrAIFILayer,
29
30
  RTDetrConvNormLayer,
30
31
  RTDetrDecoder,
31
32
  RTDetrDecoderLayer,
32
33
  RTDetrDecoderOutput,
33
- RTDetrEncoder,
34
+ RTDetrEncoderLayer,
34
35
  RTDetrForObjectDetection,
35
36
  RTDetrFrozenBatchNorm2d,
36
37
  RTDetrHybridEncoder,
@@ -68,20 +69,8 @@ class DFineConfig(PreTrainedConfig):
68
69
  The epsilon used by the batch normalization layers.
69
70
  backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `HGNetV2Config()`):
70
71
  The configuration of the backbone model.
71
- backbone (`str`, *optional*):
72
- Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
73
- will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
74
- is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
75
- use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
76
- Whether to use pretrained weights for the backbone.
77
- use_timm_backbone (`bool`, *optional*, defaults to `False`):
78
- Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
79
- library.
80
72
  freeze_backbone_batch_norms (`bool`, *optional*, defaults to `True`):
81
73
  Whether to freeze the batch normalization layers in the backbone.
82
- backbone_kwargs (`dict`, *optional*):
83
- Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
84
- e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
85
74
  encoder_hidden_dim (`int`, *optional*, defaults to 256):
86
75
  Dimension of the layers in hybrid encoder.
87
76
  encoder_in_channels (`list`, *optional*, defaults to `[512, 1024, 2048]`):
@@ -210,6 +199,8 @@ class DFineConfig(PreTrainedConfig):
210
199
  The method to use for the decoder: `"default"` or `"discrete"`.
211
200
  up (`float`, *optional*, defaults to 0.5):
212
201
  Controls the upper bounds of the Weighting Function.
202
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`):
203
+ Whether to tie weight embeddings
213
204
  """
214
205
 
215
206
  model_type = "d_fine"
@@ -228,11 +219,7 @@ class DFineConfig(PreTrainedConfig):
228
219
  batch_norm_eps=1e-5,
229
220
  # backbone
230
221
  backbone_config=None,
231
- backbone=None,
232
- use_pretrained_backbone=False,
233
- use_timm_backbone=False,
234
222
  freeze_backbone_batch_norms=True,
235
- backbone_kwargs=None,
236
223
  # encoder HybridEncoder
237
224
  encoder_hidden_dim=256,
238
225
  encoder_in_channels=[512, 1024, 2048],
@@ -294,52 +281,23 @@ class DFineConfig(PreTrainedConfig):
294
281
  decoder_offset_scale=0.5,
295
282
  decoder_method="default",
296
283
  up=0.5,
284
+ tie_word_embeddings=True,
297
285
  **kwargs,
298
286
  ):
299
287
  self.initializer_range = initializer_range
300
288
  self.initializer_bias_prior_prob = initializer_bias_prior_prob
301
289
  self.layer_norm_eps = layer_norm_eps
302
290
  self.batch_norm_eps = batch_norm_eps
303
- # backbone
304
- if backbone_config is None and backbone is None:
305
- logger.info(
306
- "`backbone_config` and `backbone` are `None`. Initializing the config with the default `HGNet-V2` backbone."
307
- )
308
- backbone_model_type = "hgnet_v2"
309
- config_class = CONFIG_MAPPING[backbone_model_type]
310
- # this will map it to HGNetV2Config
311
- # and we would need to create HGNetV2Backbone
312
- backbone_config = config_class(
313
- num_channels=3,
314
- embedding_size=64,
315
- hidden_sizes=[256, 512, 1024, 2048],
316
- depths=[3, 4, 6, 3],
317
- layer_type="bottleneck",
318
- hidden_act="relu",
319
- downsample_in_first_stage=False,
320
- downsample_in_bottleneck=False,
321
- out_features=None,
322
- out_indices=[2, 3, 4],
323
- )
324
- elif isinstance(backbone_config, dict):
325
- backbone_model_type = backbone_config.pop("model_type")
326
- config_class = CONFIG_MAPPING[backbone_model_type]
327
- backbone_config = config_class.from_dict(backbone_config)
328
-
329
- verify_backbone_config_arguments(
330
- use_timm_backbone=use_timm_backbone,
331
- use_pretrained_backbone=use_pretrained_backbone,
332
- backbone=backbone,
291
+
292
+ backbone_config, kwargs = consolidate_backbone_kwargs_to_config(
333
293
  backbone_config=backbone_config,
334
- backbone_kwargs=backbone_kwargs,
294
+ default_config_type="hgnet_v2",
295
+ default_config_kwargs={"out_indices": [2, 3, 4]},
296
+ **kwargs,
335
297
  )
336
298
 
337
299
  self.backbone_config = backbone_config
338
- self.backbone = backbone
339
- self.use_pretrained_backbone = use_pretrained_backbone
340
- self.use_timm_backbone = use_timm_backbone
341
300
  self.freeze_backbone_batch_norms = freeze_backbone_batch_norms
342
- self.backbone_kwargs = backbone_kwargs
343
301
  # encoder
344
302
  self.encoder_hidden_dim = encoder_hidden_dim
345
303
  self.encoder_in_channels = encoder_in_channels
@@ -401,6 +359,7 @@ class DFineConfig(PreTrainedConfig):
401
359
  self.lqe_hidden_dim = lqe_hidden_dim
402
360
  self.lqe_layers = lqe_layers
403
361
  self.up = up
362
+ self.tie_word_embeddings = tie_word_embeddings
404
363
 
405
364
  if isinstance(self.decoder_n_points, list):
406
365
  if len(self.decoder_n_points) != self.num_feature_levels:
@@ -417,6 +376,93 @@ class DFineConfig(PreTrainedConfig):
417
376
  super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
418
377
 
419
378
 
379
+ class DFineDecoderOutput(RTDetrDecoderOutput):
380
+ pass
381
+
382
+
383
+ def weighting_function(max_num_bins: int, up: torch.Tensor, reg_scale: int) -> torch.Tensor:
384
+ """
385
+ Generates the non-uniform Weighting Function W(n) for bounding box regression.
386
+
387
+ Args:
388
+ max_num_bins (int): Max number of the discrete bins.
389
+ up (Tensor): Controls upper bounds of the sequence,
390
+ where maximum offset is ±up * H / W.
391
+ reg_scale (float): Controls the curvature of the Weighting Function.
392
+ Larger values result in flatter weights near the central axis W(max_num_bins/2)=0
393
+ and steeper weights at both ends.
394
+ Returns:
395
+ Tensor: Sequence of Weighting Function.
396
+ """
397
+ upper_bound1 = abs(up[0]) * abs(reg_scale)
398
+ upper_bound2 = abs(up[0]) * abs(reg_scale) * 2
399
+ step = (upper_bound1 + 1) ** (2 / (max_num_bins - 2))
400
+ left_values = [-((step) ** i) + 1 for i in range(max_num_bins // 2 - 1, 0, -1)]
401
+ right_values = [(step) ** i - 1 for i in range(1, max_num_bins // 2)]
402
+ values = [-upper_bound2] + left_values + [torch.zeros_like(up[0][None])] + right_values + [upper_bound2]
403
+ values = torch.cat(values, 0)
404
+ return values
405
+
406
+
407
+ def distance2bbox(points, distance: torch.Tensor, reg_scale: float) -> torch.Tensor:
408
+ """
409
+ Decodes edge-distances into bounding box coordinates.
410
+
411
+ Args:
412
+ points (`torch.Tensor`):
413
+ (batch_size, num_boxes, 4) or (num_boxes, 4) format, representing [x_center, y_center, width, height]
414
+ distance (`torch.Tensor`):
415
+ (batch_size, num_boxes, 4) or (num_boxes, 4), representing distances from the point to the left, top, right, and bottom boundaries.
416
+ reg_scale (`float`):
417
+ Controls the curvature of the Weighting Function.
418
+ Returns:
419
+ `torch.Tensor`: Bounding boxes in (batch_size, num_boxes, 4) or (num_boxes, 4) format, representing [x_center, y_center, width, height]
420
+ """
421
+ reg_scale = abs(reg_scale)
422
+ top_left_x = points[..., 0] - (0.5 * reg_scale + distance[..., 0]) * (points[..., 2] / reg_scale)
423
+ top_left_y = points[..., 1] - (0.5 * reg_scale + distance[..., 1]) * (points[..., 3] / reg_scale)
424
+ bottom_right_x = points[..., 0] + (0.5 * reg_scale + distance[..., 2]) * (points[..., 2] / reg_scale)
425
+ bottom_right_y = points[..., 1] + (0.5 * reg_scale + distance[..., 3]) * (points[..., 3] / reg_scale)
426
+
427
+ bboxes = torch.stack([top_left_x, top_left_y, bottom_right_x, bottom_right_y], -1)
428
+
429
+ return corners_to_center_format(bboxes)
430
+
431
+
432
+ class DFineMLP(nn.Module):
433
+ def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, act: str = "relu"):
434
+ super().__init__()
435
+ self.num_layers = num_layers
436
+ hidden_dims = [hidden_dim] * (num_layers - 1)
437
+ input_dims = [input_dim] + hidden_dims
438
+ output_dims = hidden_dims + [output_dim]
439
+ self.layers = nn.ModuleList(nn.Linear(in_dim, out_dim) for in_dim, out_dim in zip(input_dims, output_dims))
440
+ self.act = ACT2CLS[act]()
441
+
442
+ def forward(self, stat_features: torch.Tensor) -> torch.Tensor:
443
+ for i, layer in enumerate(self.layers):
444
+ stat_features = self.act(layer(stat_features)) if i < self.num_layers - 1 else layer(stat_features)
445
+ return stat_features
446
+
447
+
448
+ class DFineGate(nn.Module):
449
+ def __init__(self, d_model: int):
450
+ super().__init__()
451
+ self.gate = nn.Linear(2 * d_model, 2 * d_model)
452
+ self.norm = nn.LayerNorm(d_model)
453
+
454
+ def forward(self, second_residual: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor:
455
+ gate_input = torch.cat([second_residual, hidden_states], dim=-1)
456
+ gates = torch.sigmoid(self.gate(gate_input))
457
+ gate1, gate2 = gates.chunk(2, dim=-1)
458
+ hidden_states = self.norm(gate1 * second_residual + gate2 * hidden_states)
459
+ return hidden_states
460
+
461
+
462
+ class DFineFrozenBatchNorm2d(RTDetrFrozenBatchNorm2d):
463
+ pass
464
+
465
+
420
466
  class DFineMultiscaleDeformableAttention(nn.Module):
421
467
  def __init__(self, config: DFineConfig):
422
468
  """
@@ -454,14 +500,15 @@ class DFineMultiscaleDeformableAttention(nn.Module):
454
500
  encoder_hidden_states=None,
455
501
  spatial_shapes=None,
456
502
  spatial_shapes_list=None,
503
+ **kwargs: Unpack[TransformersKwargs],
457
504
  ) -> tuple[torch.Tensor, torch.Tensor]:
458
505
  batch_size, num_queries, _ = hidden_states.shape
459
506
  batch_size, sequence_length, _ = encoder_hidden_states.shape
460
507
 
461
- if not is_torchdynamo_compiling() and (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length:
462
- raise ValueError(
463
- "Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
464
- )
508
+ torch_compilable_check(
509
+ (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == sequence_length,
510
+ "Make sure to align the spatial shapes with the sequence length of the encoder hidden states",
511
+ )
465
512
 
466
513
  # Reshape for multi-head attention
467
514
  value = encoder_hidden_states.reshape(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads)
@@ -508,18 +555,171 @@ class DFineMultiscaleDeformableAttention(nn.Module):
508
555
  return output, attention_weights
509
556
 
510
557
 
511
- class DFineGate(nn.Module):
512
- def __init__(self, d_model: int):
558
+ class DFineConvNormLayer(RTDetrConvNormLayer):
559
+ def __init__(
560
+ self,
561
+ config: DFineConfig,
562
+ in_channels: int,
563
+ out_channels: int,
564
+ kernel_size: int,
565
+ stride: int,
566
+ groups: int = 1,
567
+ padding: int | None = None,
568
+ activation: str | None = None,
569
+ ):
570
+ super().__init__(config, in_channels, out_channels, kernel_size, stride, padding=None, activation=activation)
571
+ self.conv = nn.Conv2d(
572
+ in_channels,
573
+ out_channels,
574
+ kernel_size,
575
+ stride,
576
+ groups=groups,
577
+ padding=(kernel_size - 1) // 2 if padding is None else padding,
578
+ bias=False,
579
+ )
580
+
581
+
582
+ class DFineRepVggBlock(RTDetrRepVggBlock):
583
+ def __init__(self, config: DFineConfig, in_channels: int, out_channels: int):
584
+ super().__init__(config)
585
+ hidden_channels = in_channels
586
+ self.conv1 = DFineConvNormLayer(config, hidden_channels, out_channels, 3, 1, padding=1)
587
+ self.conv2 = DFineConvNormLayer(config, hidden_channels, out_channels, 1, 1, padding=0)
588
+
589
+
590
+ class DFineCSPRepLayer(nn.Module):
591
+ """
592
+ Cross Stage Partial (CSP) network layer with RepVGG blocks.
593
+ """
594
+
595
+ def __init__(
596
+ self, config: DFineConfig, in_channels: int, out_channels: int, num_blocks: int, expansion: float = 1.0
597
+ ):
513
598
  super().__init__()
514
- self.gate = nn.Linear(2 * d_model, 2 * d_model)
515
- self.norm = nn.LayerNorm(d_model)
599
+ activation = config.activation_function
516
600
 
517
- def forward(self, second_residual: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor:
518
- gate_input = torch.cat([second_residual, hidden_states], dim=-1)
519
- gates = torch.sigmoid(self.gate(gate_input))
520
- gate1, gate2 = gates.chunk(2, dim=-1)
521
- hidden_states = self.norm(gate1 * second_residual + gate2 * hidden_states)
522
- return hidden_states
601
+ hidden_channels = int(out_channels * expansion)
602
+ self.conv1 = DFineConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=activation)
603
+ self.conv2 = DFineConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=activation)
604
+ self.bottlenecks = nn.ModuleList(
605
+ [DFineRepVggBlock(config, hidden_channels, hidden_channels) for _ in range(num_blocks)]
606
+ )
607
+ if hidden_channels != out_channels:
608
+ self.conv3 = DFineConvNormLayer(config, hidden_channels, out_channels, 1, 1, activation=activation)
609
+ else:
610
+ self.conv3 = nn.Identity()
611
+
612
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
613
+ hidden_state_1 = self.conv1(hidden_state)
614
+ for bottleneck in self.bottlenecks:
615
+ hidden_state_1 = bottleneck(hidden_state_1)
616
+ hidden_state_2 = self.conv2(hidden_state)
617
+ hidden_state_3 = self.conv3(hidden_state_1 + hidden_state_2)
618
+ return hidden_state_3
619
+
620
+
621
+ class DFineRepNCSPELAN4(nn.Module):
622
+ def __init__(self, config: DFineConfig, act: str = "silu", numb_blocks: int = 3):
623
+ super().__init__()
624
+ conv1_dim = config.encoder_hidden_dim * 2
625
+ conv2_dim = config.encoder_hidden_dim
626
+ conv3_dim = config.encoder_hidden_dim * 2
627
+ conv4_dim = round(config.hidden_expansion * config.encoder_hidden_dim // 2)
628
+ self.conv_dim = conv3_dim // 2
629
+ self.conv1 = DFineConvNormLayer(config, conv1_dim, conv3_dim, 1, 1, activation=act)
630
+ self.csp_rep1 = DFineCSPRepLayer(config, conv3_dim // 2, conv4_dim, num_blocks=numb_blocks)
631
+ self.conv2 = DFineConvNormLayer(config, conv4_dim, conv4_dim, 3, 1, activation=act)
632
+ self.csp_rep2 = DFineCSPRepLayer(config, conv4_dim, conv4_dim, num_blocks=numb_blocks)
633
+ self.conv3 = DFineConvNormLayer(config, conv4_dim, conv4_dim, 3, 1, activation=act)
634
+ self.conv4 = DFineConvNormLayer(config, conv3_dim + (2 * conv4_dim), conv2_dim, 1, 1, activation=act)
635
+
636
+ def forward(self, input_features: torch.Tensor) -> torch.Tensor:
637
+ # Split initial features into two branches after first convolution
638
+ split_features = list(self.conv1(input_features).split((self.conv_dim, self.conv_dim), 1))
639
+
640
+ # Process branches sequentially
641
+ branch1 = self.csp_rep1(split_features[-1])
642
+ branch1 = self.conv2(branch1)
643
+ branch2 = self.csp_rep2(branch1)
644
+ branch2 = self.conv3(branch2)
645
+
646
+ split_features.extend([branch1, branch2])
647
+ merged_features = torch.cat(split_features, 1)
648
+ merged_features = self.conv4(merged_features)
649
+ return merged_features
650
+
651
+
652
+ class DFineSCDown(nn.Module):
653
+ def __init__(self, config: DFineConfig, kernel_size: int, stride: int):
654
+ super().__init__()
655
+ self.conv1 = DFineConvNormLayer(config, config.encoder_hidden_dim, config.encoder_hidden_dim, 1, 1)
656
+ self.conv2 = DFineConvNormLayer(
657
+ config,
658
+ config.encoder_hidden_dim,
659
+ config.encoder_hidden_dim,
660
+ kernel_size,
661
+ stride,
662
+ config.encoder_hidden_dim,
663
+ )
664
+
665
+ def forward(self, input_features: torch.Tensor) -> torch.Tensor:
666
+ input_features = self.conv1(input_features)
667
+ input_features = self.conv2(input_features)
668
+ return input_features
669
+
670
+
671
+ class DFineEncoderLayer(RTDetrEncoderLayer):
672
+ def __init__(self, config: DFineConfig):
673
+ super().__init__(config)
674
+ self.mlp = DFineMLP(
675
+ self.hidden_size, config.encoder_ffn_dim, self.hidden_size, 2, config.encoder_activation_function
676
+ )
677
+
678
+
679
+ class DFineAIFILayer(RTDetrAIFILayer):
680
+ pass
681
+
682
+
683
+ class DFineIntegral(nn.Module):
684
+ """
685
+ A static layer that calculates integral results from a distribution.
686
+
687
+ This layer computes the target location using the formula: `sum{Pr(n) * W(n)}`,
688
+ where Pr(n) is the softmax probability vector representing the discrete
689
+ distribution, and W(n) is the non-uniform Weighting Function.
690
+
691
+ Args:
692
+ max_num_bins (int): Max number of the discrete bins. Default is 32.
693
+ It can be adjusted based on the dataset or task requirements.
694
+ """
695
+
696
+ def __init__(self, config: DFineConfig):
697
+ super().__init__()
698
+ self.max_num_bins = config.max_num_bins
699
+
700
+ def forward(self, pred_corners: torch.Tensor, project: torch.Tensor) -> torch.Tensor:
701
+ batch_size, num_queries, _ = pred_corners.shape
702
+ pred_corners = F.softmax(pred_corners.reshape(-1, self.max_num_bins + 1), dim=1)
703
+ pred_corners = F.linear(pred_corners, project.to(pred_corners.device)).reshape(-1, 4)
704
+ pred_corners = pred_corners.reshape(batch_size, num_queries, -1)
705
+ return pred_corners
706
+
707
+
708
+ class DFineLQE(nn.Module):
709
+ def __init__(self, config: DFineConfig):
710
+ super().__init__()
711
+ self.top_prob_values = config.top_prob_values
712
+ self.max_num_bins = config.max_num_bins
713
+ self.reg_conf = DFineMLP(4 * (self.top_prob_values + 1), config.lqe_hidden_dim, 1, config.lqe_layers)
714
+
715
+ def forward(self, scores: torch.Tensor, pred_corners: torch.Tensor) -> torch.Tensor:
716
+ batch_size, length, _ = pred_corners.size()
717
+ prob = F.softmax(pred_corners.reshape(batch_size, length, 4, self.max_num_bins + 1), dim=-1)
718
+ prob_topk, _ = prob.topk(self.top_prob_values, dim=-1)
719
+ stat = torch.cat([prob_topk, prob_topk.mean(dim=-1, keepdim=True)], dim=-1)
720
+ quality_score = self.reg_conf(stat.reshape(batch_size, length, -1))
721
+ scores = scores + quality_score
722
+ return scores
523
723
 
524
724
 
525
725
  class DFineDecoderLayer(RTDetrDecoderLayer):
@@ -530,6 +730,9 @@ class DFineDecoderLayer(RTDetrDecoderLayer):
530
730
  self.encoder_attn = DFineMultiscaleDeformableAttention(config=config)
531
731
  # gate
532
732
  self.gateway = DFineGate(config.d_model)
733
+ self.mlp = DFineMLP(
734
+ self.hidden_size, config.decoder_ffn_dim, self.hidden_size, 2, config.decoder_activation_function
735
+ )
533
736
 
534
737
  del self.encoder_attn_layer_norm
535
738
 
@@ -542,49 +745,47 @@ class DFineDecoderLayer(RTDetrDecoderLayer):
542
745
  spatial_shapes_list=None,
543
746
  encoder_hidden_states: torch.Tensor | None = None,
544
747
  encoder_attention_mask: torch.Tensor | None = None,
545
- output_attentions: bool | None = False,
546
- ) -> tuple[torch.Tensor, Any, Any]:
748
+ **kwargs: Unpack[TransformersKwargs],
749
+ ) -> torch.Tensor:
750
+ residual = hidden_states
751
+
547
752
  # Self Attention
548
- hidden_states_2, self_attn_weights = self.self_attn(
753
+ hidden_states, _ = self.self_attn(
549
754
  hidden_states=hidden_states,
550
755
  attention_mask=encoder_attention_mask,
551
756
  position_embeddings=position_embeddings,
552
- output_attentions=output_attentions,
757
+ **kwargs,
553
758
  )
554
759
 
555
- hidden_states_2 = nn.functional.dropout(hidden_states_2, p=self.dropout, training=self.training)
556
- hidden_states = hidden_states + hidden_states_2
760
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
761
+ hidden_states = residual + hidden_states
557
762
  hidden_states = self.self_attn_layer_norm(hidden_states)
763
+
558
764
  residual = hidden_states
559
765
 
560
766
  # Cross-Attention
561
- cross_attn_weights = None
562
767
  hidden_states = hidden_states if position_embeddings is None else hidden_states + position_embeddings
563
- hidden_states_2, cross_attn_weights = self.encoder_attn(
768
+ hidden_states, _ = self.encoder_attn(
564
769
  hidden_states=hidden_states,
565
770
  encoder_hidden_states=encoder_hidden_states,
566
771
  reference_points=reference_points,
567
772
  spatial_shapes=spatial_shapes,
568
773
  spatial_shapes_list=spatial_shapes_list,
569
774
  )
570
-
571
- hidden_states_2 = nn.functional.dropout(hidden_states_2, p=self.dropout, training=self.training)
572
- hidden_states = self.gateway(residual, hidden_states_2)
775
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
776
+ hidden_states = self.gateway(residual, hidden_states)
573
777
 
574
778
  # Fully Connected
575
- hidden_states_2 = self.activation_fn(self.fc1(hidden_states))
576
- hidden_states_2 = nn.functional.dropout(hidden_states_2, p=self.activation_dropout, training=self.training)
577
- hidden_states_2 = self.fc2(hidden_states_2)
578
- hidden_states_2 = nn.functional.dropout(hidden_states_2, p=self.dropout, training=self.training)
579
- hidden_states = hidden_states + hidden_states_2
779
+ residual = hidden_states
780
+ hidden_states = self.mlp(hidden_states)
781
+ hidden_states = residual + hidden_states
580
782
  hidden_states = self.final_layer_norm(hidden_states.clamp(min=-65504, max=65504))
581
783
 
582
- outputs = (hidden_states,)
784
+ return hidden_states
583
785
 
584
- if output_attentions:
585
- outputs += (self_attn_weights, cross_attn_weights)
586
786
 
587
- return outputs
787
+ class DFineMLPPredictionHead(RTDetrMLPPredictionHead):
788
+ pass
588
789
 
589
790
 
590
791
  class DFinePreTrainedModel(RTDetrPreTrainedModel):
@@ -664,33 +865,42 @@ class DFinePreTrainedModel(RTDetrPreTrainedModel):
664
865
  init.xavier_uniform_(module.denoising_class_embed.weight)
665
866
 
666
867
 
667
- class DFineIntegral(nn.Module):
668
- """
669
- A static layer that calculates integral results from a distribution.
670
-
671
- This layer computes the target location using the formula: `sum{Pr(n) * W(n)}`,
672
- where Pr(n) is the softmax probability vector representing the discrete
673
- distribution, and W(n) is the non-uniform Weighting Function.
674
-
675
- Args:
676
- max_num_bins (int): Max number of the discrete bins. Default is 32.
677
- It can be adjusted based on the dataset or task requirements.
678
- """
679
-
868
+ class DFineHybridEncoder(RTDetrHybridEncoder):
680
869
  def __init__(self, config: DFineConfig):
681
- super().__init__()
682
- self.max_num_bins = config.max_num_bins
870
+ DFinePreTrainedModel.__init__(config)
871
+ self.config = config
872
+ self.in_channels = config.encoder_in_channels
873
+ self.num_fpn_stages = len(self.in_channels) - 1
874
+ self.feat_strides = config.feat_strides
875
+ self.encoder_hidden_dim = config.encoder_hidden_dim
876
+ self.encode_proj_layers = config.encode_proj_layers
877
+ self.positional_encoding_temperature = config.positional_encoding_temperature
878
+ self.eval_size = config.eval_size
879
+ self.out_channels = [self.encoder_hidden_dim for _ in self.in_channels]
880
+ self.out_strides = self.feat_strides
683
881
 
684
- def forward(self, pred_corners: torch.Tensor, project: torch.Tensor) -> torch.Tensor:
685
- batch_size, num_queries, _ = pred_corners.shape
686
- pred_corners = F.softmax(pred_corners.reshape(-1, self.max_num_bins + 1), dim=1)
687
- pred_corners = F.linear(pred_corners, project.to(pred_corners.device)).reshape(-1, 4)
688
- pred_corners = pred_corners.reshape(batch_size, num_queries, -1)
689
- return pred_corners
882
+ # AIFI (Attention-based Intra-scale Feature Interaction) layers
883
+ self.aifi = nn.ModuleList([DFineAIFILayer(config) for _ in range(len(self.encode_proj_layers))])
690
884
 
885
+ # top-down fpn
886
+ self.lateral_convs = nn.ModuleList()
887
+ self.fpn_blocks = nn.ModuleList()
888
+ for _ in range(len(self.in_channels) - 1, 0, -1):
889
+ lateral_layer = DFineConvNormLayer(config, self.encoder_hidden_dim, self.encoder_hidden_dim, 1, 1)
890
+ self.lateral_convs.append(lateral_layer)
891
+ num_blocks = round(3 * config.depth_mult)
892
+ fpn_layer = DFineRepNCSPELAN4(config, numb_blocks=num_blocks)
893
+ self.fpn_blocks.append(fpn_layer)
894
+
895
+ # bottom-up pan
896
+ self.downsample_convs = nn.ModuleList()
897
+ self.pan_blocks = nn.ModuleList()
898
+ for _ in range(len(self.in_channels) - 1):
899
+ self.downsample_convs.append(DFineSCDown(config, 3, 2))
900
+ num_blocks = round(3 * config.depth_mult)
901
+ self.pan_blocks.append(DFineRepNCSPELAN4(config, numb_blocks=num_blocks))
691
902
 
692
- class DFineDecoderOutput(RTDetrDecoderOutput):
693
- pass
903
+ self.post_init()
694
904
 
695
905
 
696
906
  class DFineDecoder(RTDetrDecoder):
@@ -727,26 +937,14 @@ class DFineDecoder(RTDetrDecoder):
727
937
  spatial_shapes,
728
938
  level_start_index=None,
729
939
  spatial_shapes_list=None,
730
- output_hidden_states=None,
731
940
  encoder_attention_mask=None,
732
941
  memory_mask=None,
733
- output_attentions=None,
734
- return_dict=None,
735
- **kwargs,
942
+ **kwargs: Unpack[TransformersKwargs],
736
943
  ) -> DFineDecoderOutput:
737
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
738
- output_hidden_states = (
739
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
740
- )
741
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
742
-
743
944
  if inputs_embeds is not None:
744
945
  hidden_states = inputs_embeds
745
946
 
746
947
  # decoder layers
747
- all_hidden_states = () if output_hidden_states else None
748
- all_self_attns = () if output_attentions else None
749
- all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
750
948
  intermediate = ()
751
949
  intermediate_reference_points = ()
752
950
  intermediate_logits = ()
@@ -762,25 +960,22 @@ class DFineDecoder(RTDetrDecoder):
762
960
  ref_points_input = ref_points_detach.unsqueeze(2)
763
961
  query_pos_embed = self.query_pos_head(ref_points_detach).clamp(min=-10, max=10)
764
962
 
765
- if output_hidden_states:
766
- all_hidden_states += (hidden_states,)
767
-
768
- output = decoder_layer(
769
- hidden_states=hidden_states,
963
+ hidden_states = decoder_layer(
964
+ hidden_states,
770
965
  position_embeddings=query_pos_embed,
771
966
  reference_points=ref_points_input,
772
967
  spatial_shapes=spatial_shapes,
773
968
  spatial_shapes_list=spatial_shapes_list,
774
969
  encoder_hidden_states=encoder_hidden_states,
775
970
  encoder_attention_mask=encoder_attention_mask,
776
- output_attentions=output_attentions,
971
+ **kwargs,
777
972
  )
778
973
 
779
- hidden_states = output[0]
780
-
781
974
  if i == 0:
782
975
  # Initial bounding box predictions with inverse sigmoid refinement
783
- new_reference_points = F.sigmoid(self.pre_bbox_head(output[0]) + inverse_sigmoid(ref_points_detach))
976
+ new_reference_points = F.sigmoid(
977
+ self.pre_bbox_head(hidden_states) + inverse_sigmoid(ref_points_detach)
978
+ )
784
979
  ref_points_initial = new_reference_points.detach()
785
980
 
786
981
  # Refine bounding box corners using FDR, integrating previous layer's corrections
@@ -809,12 +1004,6 @@ class DFineDecoder(RTDetrDecoder):
809
1004
  initial_reference_points += (ref_points_initial,)
810
1005
  intermediate_predicted_corners += (pred_corners,)
811
1006
 
812
- if output_attentions:
813
- all_self_attns += (output[1],)
814
-
815
- if encoder_hidden_states is not None:
816
- all_cross_attentions += (output[2],)
817
-
818
1007
  # Keep batch_size as first dimension
819
1008
  intermediate = torch.stack(intermediate)
820
1009
  if self.class_embed is not None and self.bbox_embed is not None:
@@ -823,27 +1012,6 @@ class DFineDecoder(RTDetrDecoder):
823
1012
  initial_reference_points = torch.stack(initial_reference_points, dim=1)
824
1013
  intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1)
825
1014
 
826
- # add hidden states from the last decoder layer
827
- if output_hidden_states:
828
- all_hidden_states += (hidden_states,)
829
-
830
- if not return_dict:
831
- return tuple(
832
- v
833
- for v in [
834
- hidden_states,
835
- intermediate,
836
- intermediate_logits,
837
- intermediate_reference_points,
838
- intermediate_predicted_corners,
839
- initial_reference_points,
840
- all_hidden_states,
841
- all_self_attns,
842
- all_cross_attentions,
843
- ]
844
- if v is not None
845
- )
846
-
847
1015
  return DFineDecoderOutput(
848
1016
  last_hidden_state=hidden_states,
849
1017
  intermediate_hidden_states=intermediate,
@@ -851,16 +1019,9 @@ class DFineDecoder(RTDetrDecoder):
851
1019
  intermediate_reference_points=intermediate_reference_points,
852
1020
  intermediate_predicted_corners=intermediate_predicted_corners,
853
1021
  initial_reference_points=initial_reference_points,
854
- hidden_states=all_hidden_states,
855
- attentions=all_self_attns,
856
- cross_attentions=all_cross_attentions,
857
1022
  )
858
1023
 
859
1024
 
860
- class DFineFrozenBatchNorm2d(RTDetrFrozenBatchNorm2d):
861
- pass
862
-
863
-
864
1025
  class DFineModel(RTDetrModel):
865
1026
  def __init__(self, config: DFineConfig):
866
1027
  super().__init__(config)
@@ -892,10 +1053,10 @@ class DFineForObjectDetection(RTDetrForObjectDetection):
892
1053
  # We can't initialize the model on meta device as some weights are modified during the initialization
893
1054
  _no_split_modules = None
894
1055
  _tied_weights_keys = {
895
- r"bbox_embed.(?![0])\d+": "bbox_embed.0",
896
- r"class_embed.(?![0])\d+": "class_embed.0",
897
- "model.decoder.class_embed": "class_embed",
898
- "model.decoder.bbox_embed": "bbox_embed",
1056
+ r"bbox_embed.(?![0])\d+": r"bbox_embed.0",
1057
+ r"class_embed.(?![0])\d+": r"^class_embed.0",
1058
+ "class_embed": "model.decoder.class_embed",
1059
+ "bbox_embed": "model.decoder.bbox_embed",
899
1060
  }
900
1061
 
901
1062
  def __init__(self, config: DFineConfig):
@@ -972,244 +1133,6 @@ class DFineForObjectDetection(RTDetrForObjectDetection):
972
1133
  super().forward(**super_kwargs)
973
1134
 
974
1135
 
975
- def weighting_function(max_num_bins: int, up: torch.Tensor, reg_scale: int) -> torch.Tensor:
976
- """
977
- Generates the non-uniform Weighting Function W(n) for bounding box regression.
978
-
979
- Args:
980
- max_num_bins (int): Max number of the discrete bins.
981
- up (Tensor): Controls upper bounds of the sequence,
982
- where maximum offset is ±up * H / W.
983
- reg_scale (float): Controls the curvature of the Weighting Function.
984
- Larger values result in flatter weights near the central axis W(max_num_bins/2)=0
985
- and steeper weights at both ends.
986
- Returns:
987
- Tensor: Sequence of Weighting Function.
988
- """
989
- upper_bound1 = abs(up[0]) * abs(reg_scale)
990
- upper_bound2 = abs(up[0]) * abs(reg_scale) * 2
991
- step = (upper_bound1 + 1) ** (2 / (max_num_bins - 2))
992
- left_values = [-((step) ** i) + 1 for i in range(max_num_bins // 2 - 1, 0, -1)]
993
- right_values = [(step) ** i - 1 for i in range(1, max_num_bins // 2)]
994
- values = [-upper_bound2] + left_values + [torch.zeros_like(up[0][None])] + right_values + [upper_bound2]
995
- values = torch.cat(values, 0)
996
- return values
997
-
998
-
999
- class DFineMLPPredictionHead(RTDetrMLPPredictionHead):
1000
- pass
1001
-
1002
-
1003
- def distance2bbox(points, distance: torch.Tensor, reg_scale: float) -> torch.Tensor:
1004
- """
1005
- Decodes edge-distances into bounding box coordinates.
1006
-
1007
- Args:
1008
- points (`torch.Tensor`):
1009
- (batch_size, num_boxes, 4) or (num_boxes, 4) format, representing [x_center, y_center, width, height]
1010
- distance (`torch.Tensor`):
1011
- (batch_size, num_boxes, 4) or (num_boxes, 4), representing distances from the point to the left, top, right, and bottom boundaries.
1012
- reg_scale (`float`):
1013
- Controls the curvature of the Weighting Function.
1014
- Returns:
1015
- `torch.Tensor`: Bounding boxes in (batch_size, num_boxes, 4) or (num_boxes, 4) format, representing [x_center, y_center, width, height]
1016
- """
1017
- reg_scale = abs(reg_scale)
1018
- top_left_x = points[..., 0] - (0.5 * reg_scale + distance[..., 0]) * (points[..., 2] / reg_scale)
1019
- top_left_y = points[..., 1] - (0.5 * reg_scale + distance[..., 1]) * (points[..., 3] / reg_scale)
1020
- bottom_right_x = points[..., 0] + (0.5 * reg_scale + distance[..., 2]) * (points[..., 2] / reg_scale)
1021
- bottom_right_y = points[..., 1] + (0.5 * reg_scale + distance[..., 3]) * (points[..., 3] / reg_scale)
1022
-
1023
- bboxes = torch.stack([top_left_x, top_left_y, bottom_right_x, bottom_right_y], -1)
1024
-
1025
- return corners_to_center_format(bboxes)
1026
-
1027
-
1028
- class DFineMLP(nn.Module):
1029
- def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, act: str = "relu"):
1030
- super().__init__()
1031
- self.num_layers = num_layers
1032
- hidden_dims = [hidden_dim] * (num_layers - 1)
1033
- input_dims = [input_dim] + hidden_dims
1034
- output_dims = hidden_dims + [output_dim]
1035
- self.layers = nn.ModuleList(nn.Linear(in_dim, out_dim) for in_dim, out_dim in zip(input_dims, output_dims))
1036
- self.act = ACT2CLS[act]()
1037
-
1038
- def forward(self, stat_features: torch.Tensor) -> torch.Tensor:
1039
- for i, layer in enumerate(self.layers):
1040
- stat_features = self.act(layer(stat_features)) if i < self.num_layers - 1 else layer(stat_features)
1041
- return stat_features
1042
-
1043
-
1044
- class DFineLQE(nn.Module):
1045
- def __init__(self, config: DFineConfig):
1046
- super().__init__()
1047
- self.top_prob_values = config.top_prob_values
1048
- self.max_num_bins = config.max_num_bins
1049
- self.reg_conf = DFineMLP(4 * (self.top_prob_values + 1), config.lqe_hidden_dim, 1, config.lqe_layers)
1050
-
1051
- def forward(self, scores: torch.Tensor, pred_corners: torch.Tensor) -> torch.Tensor:
1052
- batch_size, length, _ = pred_corners.size()
1053
- prob = F.softmax(pred_corners.reshape(batch_size, length, 4, self.max_num_bins + 1), dim=-1)
1054
- prob_topk, _ = prob.topk(self.top_prob_values, dim=-1)
1055
- stat = torch.cat([prob_topk, prob_topk.mean(dim=-1, keepdim=True)], dim=-1)
1056
- quality_score = self.reg_conf(stat.reshape(batch_size, length, -1))
1057
- scores = scores + quality_score
1058
- return scores
1059
-
1060
-
1061
- class DFineConvNormLayer(RTDetrConvNormLayer):
1062
- def __init__(
1063
- self,
1064
- config: DFineConfig,
1065
- in_channels: int,
1066
- out_channels: int,
1067
- kernel_size: int,
1068
- stride: int,
1069
- groups: int = 1,
1070
- padding: int | None = None,
1071
- activation: str | None = None,
1072
- ):
1073
- super().__init__(config, in_channels, out_channels, kernel_size, stride, padding=None, activation=activation)
1074
- self.conv = nn.Conv2d(
1075
- in_channels,
1076
- out_channels,
1077
- kernel_size,
1078
- stride,
1079
- groups=groups,
1080
- padding=(kernel_size - 1) // 2 if padding is None else padding,
1081
- bias=False,
1082
- )
1083
-
1084
-
1085
- class DFineRepVggBlock(RTDetrRepVggBlock):
1086
- def __init__(self, config: DFineConfig, in_channels: int, out_channels: int):
1087
- super().__init__(config)
1088
- hidden_channels = in_channels
1089
- self.conv1 = DFineConvNormLayer(config, hidden_channels, out_channels, 3, 1, padding=1)
1090
- self.conv2 = DFineConvNormLayer(config, hidden_channels, out_channels, 1, 1, padding=0)
1091
-
1092
-
1093
- class DFineCSPRepLayer(nn.Module):
1094
- """
1095
- Cross Stage Partial (CSP) network layer with RepVGG blocks.
1096
- """
1097
-
1098
- def __init__(
1099
- self, config: DFineConfig, in_channels: int, out_channels: int, num_blocks: int, expansion: float = 1.0
1100
- ):
1101
- super().__init__()
1102
- activation = config.activation_function
1103
-
1104
- hidden_channels = int(out_channels * expansion)
1105
- self.conv1 = DFineConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=activation)
1106
- self.conv2 = DFineConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=activation)
1107
- self.bottlenecks = nn.ModuleList(
1108
- [DFineRepVggBlock(config, hidden_channels, hidden_channels) for _ in range(num_blocks)]
1109
- )
1110
- if hidden_channels != out_channels:
1111
- self.conv3 = DFineConvNormLayer(config, hidden_channels, out_channels, 1, 1, activation=activation)
1112
- else:
1113
- self.conv3 = nn.Identity()
1114
-
1115
- def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
1116
- hidden_state_1 = self.conv1(hidden_state)
1117
- for bottleneck in self.bottlenecks:
1118
- hidden_state_1 = bottleneck(hidden_state_1)
1119
- hidden_state_2 = self.conv2(hidden_state)
1120
- hidden_state_3 = self.conv3(hidden_state_1 + hidden_state_2)
1121
- return hidden_state_3
1122
-
1123
-
1124
- class DFineRepNCSPELAN4(nn.Module):
1125
- def __init__(self, config: DFineConfig, act: str = "silu", numb_blocks: int = 3):
1126
- super().__init__()
1127
- conv1_dim = config.encoder_hidden_dim * 2
1128
- conv2_dim = config.encoder_hidden_dim
1129
- conv3_dim = config.encoder_hidden_dim * 2
1130
- conv4_dim = round(config.hidden_expansion * config.encoder_hidden_dim // 2)
1131
- self.conv_dim = conv3_dim // 2
1132
- self.conv1 = DFineConvNormLayer(config, conv1_dim, conv3_dim, 1, 1, activation=act)
1133
- self.csp_rep1 = DFineCSPRepLayer(config, conv3_dim // 2, conv4_dim, num_blocks=numb_blocks)
1134
- self.conv2 = DFineConvNormLayer(config, conv4_dim, conv4_dim, 3, 1, activation=act)
1135
- self.csp_rep2 = DFineCSPRepLayer(config, conv4_dim, conv4_dim, num_blocks=numb_blocks)
1136
- self.conv3 = DFineConvNormLayer(config, conv4_dim, conv4_dim, 3, 1, activation=act)
1137
- self.conv4 = DFineConvNormLayer(config, conv3_dim + (2 * conv4_dim), conv2_dim, 1, 1, activation=act)
1138
-
1139
- def forward(self, input_features: torch.Tensor) -> torch.Tensor:
1140
- # Split initial features into two branches after first convolution
1141
- split_features = list(self.conv1(input_features).split((self.conv_dim, self.conv_dim), 1))
1142
-
1143
- # Process branches sequentially
1144
- branch1 = self.csp_rep1(split_features[-1])
1145
- branch1 = self.conv2(branch1)
1146
- branch2 = self.csp_rep2(branch1)
1147
- branch2 = self.conv3(branch2)
1148
-
1149
- split_features.extend([branch1, branch2])
1150
- merged_features = torch.cat(split_features, 1)
1151
- merged_features = self.conv4(merged_features)
1152
- return merged_features
1153
-
1154
-
1155
- class DFineSCDown(nn.Module):
1156
- def __init__(self, config: DFineConfig, kernel_size: int, stride: int):
1157
- super().__init__()
1158
- self.conv1 = DFineConvNormLayer(config, config.encoder_hidden_dim, config.encoder_hidden_dim, 1, 1)
1159
- self.conv2 = DFineConvNormLayer(
1160
- config,
1161
- config.encoder_hidden_dim,
1162
- config.encoder_hidden_dim,
1163
- kernel_size,
1164
- stride,
1165
- config.encoder_hidden_dim,
1166
- )
1167
-
1168
- def forward(self, input_features: torch.Tensor) -> torch.Tensor:
1169
- input_features = self.conv1(input_features)
1170
- input_features = self.conv2(input_features)
1171
- return input_features
1172
-
1173
-
1174
- class DFineEncoder(RTDetrEncoder):
1175
- pass
1176
-
1177
-
1178
- class DFineHybridEncoder(RTDetrHybridEncoder):
1179
- def __init__(self, config: DFineConfig):
1180
- nn.Module.__init__(self)
1181
- self.config = config
1182
- self.in_channels = config.encoder_in_channels
1183
- self.num_fpn_stages = len(self.in_channels) - 1
1184
- self.feat_strides = config.feat_strides
1185
- self.encoder_hidden_dim = config.encoder_hidden_dim
1186
- self.encode_proj_layers = config.encode_proj_layers
1187
- self.positional_encoding_temperature = config.positional_encoding_temperature
1188
- self.eval_size = config.eval_size
1189
- self.out_channels = [self.encoder_hidden_dim for _ in self.in_channels]
1190
- self.out_strides = self.feat_strides
1191
-
1192
- # encoder transformer
1193
- self.encoder = nn.ModuleList([DFineEncoder(config) for _ in range(len(self.encode_proj_layers))])
1194
- # top-down fpn
1195
- self.lateral_convs = nn.ModuleList()
1196
- self.fpn_blocks = nn.ModuleList()
1197
- for _ in range(len(self.in_channels) - 1, 0, -1):
1198
- lateral_layer = DFineConvNormLayer(config, self.encoder_hidden_dim, self.encoder_hidden_dim, 1, 1)
1199
- self.lateral_convs.append(lateral_layer)
1200
- num_blocks = round(3 * config.depth_mult)
1201
- fpn_layer = DFineRepNCSPELAN4(config, numb_blocks=num_blocks)
1202
- self.fpn_blocks.append(fpn_layer)
1203
-
1204
- # bottom-up pan
1205
- self.downsample_convs = nn.ModuleList()
1206
- self.pan_blocks = nn.ModuleList()
1207
- for _ in range(len(self.in_channels) - 1):
1208
- self.downsample_convs.append(DFineSCDown(config, 3, 2))
1209
- num_blocks = round(3 * config.depth_mult)
1210
- self.pan_blocks.append(DFineRepNCSPELAN4(config, numb_blocks=num_blocks))
1211
-
1212
-
1213
1136
  __all__ = [
1214
1137
  "DFineConfig",
1215
1138
  "DFineModel",