transformers-5.0.0rc3-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl
This diff shows the changes between these publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
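As a quick sanity check after upgrading to the newer wheel (a minimal sketch, not part of the diff itself), the installed version can be confirmed from Python:

```python
# Assumes the 5.1.0 wheel has been installed, e.g. via: pip install transformers==5.1.0
import transformers

# Prints the installed transformers version; expected to show "5.1.0" after the upgrade.
print(transformers.__version__)
```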
- transformers/__init__.py +4 -11
- transformers/activations.py +2 -2
- transformers/backbone_utils.py +326 -0
- transformers/cache_utils.py +11 -2
- transformers/cli/serve.py +11 -8
- transformers/configuration_utils.py +1 -69
- transformers/conversion_mapping.py +146 -26
- transformers/convert_slow_tokenizer.py +6 -4
- transformers/core_model_loading.py +207 -118
- transformers/dependency_versions_check.py +0 -1
- transformers/dependency_versions_table.py +7 -8
- transformers/file_utils.py +0 -2
- transformers/generation/candidate_generator.py +1 -2
- transformers/generation/continuous_batching/cache.py +40 -38
- transformers/generation/continuous_batching/cache_manager.py +3 -16
- transformers/generation/continuous_batching/continuous_api.py +94 -406
- transformers/generation/continuous_batching/input_ouputs.py +464 -0
- transformers/generation/continuous_batching/requests.py +54 -17
- transformers/generation/continuous_batching/scheduler.py +77 -95
- transformers/generation/logits_process.py +10 -5
- transformers/generation/stopping_criteria.py +1 -2
- transformers/generation/utils.py +75 -95
- transformers/image_processing_utils.py +0 -3
- transformers/image_processing_utils_fast.py +17 -18
- transformers/image_transforms.py +44 -13
- transformers/image_utils.py +0 -5
- transformers/initialization.py +57 -0
- transformers/integrations/__init__.py +10 -24
- transformers/integrations/accelerate.py +47 -11
- transformers/integrations/deepspeed.py +145 -3
- transformers/integrations/executorch.py +2 -6
- transformers/integrations/finegrained_fp8.py +142 -7
- transformers/integrations/flash_attention.py +2 -7
- transformers/integrations/hub_kernels.py +18 -7
- transformers/integrations/moe.py +226 -106
- transformers/integrations/mxfp4.py +47 -34
- transformers/integrations/peft.py +488 -176
- transformers/integrations/tensor_parallel.py +641 -581
- transformers/masking_utils.py +153 -9
- transformers/modeling_flash_attention_utils.py +1 -2
- transformers/modeling_utils.py +359 -358
- transformers/models/__init__.py +6 -0
- transformers/models/afmoe/configuration_afmoe.py +14 -4
- transformers/models/afmoe/modeling_afmoe.py +8 -8
- transformers/models/afmoe/modular_afmoe.py +7 -7
- transformers/models/aimv2/configuration_aimv2.py +2 -7
- transformers/models/aimv2/modeling_aimv2.py +26 -24
- transformers/models/aimv2/modular_aimv2.py +8 -12
- transformers/models/albert/configuration_albert.py +8 -1
- transformers/models/albert/modeling_albert.py +3 -3
- transformers/models/align/configuration_align.py +8 -5
- transformers/models/align/modeling_align.py +22 -24
- transformers/models/altclip/configuration_altclip.py +4 -6
- transformers/models/altclip/modeling_altclip.py +30 -26
- transformers/models/apertus/configuration_apertus.py +5 -7
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/apertus/modular_apertus.py +8 -10
- transformers/models/arcee/configuration_arcee.py +5 -7
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/configuration_aria.py +11 -21
- transformers/models/aria/modeling_aria.py +39 -36
- transformers/models/aria/modular_aria.py +33 -39
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
- transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
- transformers/models/auto/auto_factory.py +8 -6
- transformers/models/auto/configuration_auto.py +22 -0
- transformers/models/auto/image_processing_auto.py +17 -13
- transformers/models/auto/modeling_auto.py +15 -0
- transformers/models/auto/processing_auto.py +9 -18
- transformers/models/auto/tokenization_auto.py +17 -15
- transformers/models/autoformer/modeling_autoformer.py +2 -1
- transformers/models/aya_vision/configuration_aya_vision.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +29 -62
- transformers/models/aya_vision/modular_aya_vision.py +20 -45
- transformers/models/bamba/configuration_bamba.py +17 -7
- transformers/models/bamba/modeling_bamba.py +23 -55
- transformers/models/bamba/modular_bamba.py +19 -54
- transformers/models/bark/configuration_bark.py +2 -1
- transformers/models/bark/modeling_bark.py +24 -10
- transformers/models/bart/configuration_bart.py +9 -4
- transformers/models/bart/modeling_bart.py +9 -12
- transformers/models/beit/configuration_beit.py +2 -4
- transformers/models/beit/image_processing_beit_fast.py +3 -3
- transformers/models/beit/modeling_beit.py +14 -9
- transformers/models/bert/configuration_bert.py +12 -1
- transformers/models/bert/modeling_bert.py +6 -30
- transformers/models/bert_generation/configuration_bert_generation.py +17 -1
- transformers/models/bert_generation/modeling_bert_generation.py +6 -6
- transformers/models/big_bird/configuration_big_bird.py +12 -8
- transformers/models/big_bird/modeling_big_bird.py +0 -15
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
- transformers/models/biogpt/configuration_biogpt.py +8 -1
- transformers/models/biogpt/modeling_biogpt.py +4 -8
- transformers/models/biogpt/modular_biogpt.py +1 -5
- transformers/models/bit/configuration_bit.py +2 -4
- transformers/models/bit/modeling_bit.py +6 -5
- transformers/models/bitnet/configuration_bitnet.py +5 -7
- transformers/models/bitnet/modeling_bitnet.py +3 -4
- transformers/models/bitnet/modular_bitnet.py +3 -4
- transformers/models/blenderbot/configuration_blenderbot.py +8 -4
- transformers/models/blenderbot/modeling_blenderbot.py +4 -4
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
- transformers/models/blip/configuration_blip.py +9 -9
- transformers/models/blip/modeling_blip.py +55 -37
- transformers/models/blip_2/configuration_blip_2.py +2 -1
- transformers/models/blip_2/modeling_blip_2.py +81 -56
- transformers/models/bloom/configuration_bloom.py +5 -1
- transformers/models/bloom/modeling_bloom.py +2 -1
- transformers/models/blt/configuration_blt.py +23 -12
- transformers/models/blt/modeling_blt.py +20 -14
- transformers/models/blt/modular_blt.py +70 -10
- transformers/models/bridgetower/configuration_bridgetower.py +7 -1
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
- transformers/models/bridgetower/modeling_bridgetower.py +29 -15
- transformers/models/bros/configuration_bros.py +24 -17
- transformers/models/camembert/configuration_camembert.py +8 -1
- transformers/models/camembert/modeling_camembert.py +6 -6
- transformers/models/canine/configuration_canine.py +4 -1
- transformers/models/chameleon/configuration_chameleon.py +5 -7
- transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
- transformers/models/chameleon/modeling_chameleon.py +82 -36
- transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
- transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
- transformers/models/clap/configuration_clap.py +4 -8
- transformers/models/clap/modeling_clap.py +21 -22
- transformers/models/clip/configuration_clip.py +4 -1
- transformers/models/clip/image_processing_clip_fast.py +9 -0
- transformers/models/clip/modeling_clip.py +25 -22
- transformers/models/clipseg/configuration_clipseg.py +4 -1
- transformers/models/clipseg/modeling_clipseg.py +27 -25
- transformers/models/clipseg/processing_clipseg.py +11 -3
- transformers/models/clvp/configuration_clvp.py +14 -2
- transformers/models/clvp/modeling_clvp.py +19 -30
- transformers/models/codegen/configuration_codegen.py +4 -3
- transformers/models/codegen/modeling_codegen.py +2 -1
- transformers/models/cohere/configuration_cohere.py +5 -7
- transformers/models/cohere/modeling_cohere.py +4 -4
- transformers/models/cohere/modular_cohere.py +3 -3
- transformers/models/cohere2/configuration_cohere2.py +6 -8
- transformers/models/cohere2/modeling_cohere2.py +4 -4
- transformers/models/cohere2/modular_cohere2.py +9 -11
- transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
- transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
- transformers/models/colqwen2/modeling_colqwen2.py +7 -6
- transformers/models/colqwen2/modular_colqwen2.py +7 -6
- transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
- transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
- transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
- transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
- transformers/models/convbert/configuration_convbert.py +11 -7
- transformers/models/convnext/configuration_convnext.py +2 -4
- transformers/models/convnext/image_processing_convnext_fast.py +2 -2
- transformers/models/convnext/modeling_convnext.py +7 -6
- transformers/models/convnextv2/configuration_convnextv2.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +7 -6
- transformers/models/cpmant/configuration_cpmant.py +4 -0
- transformers/models/csm/configuration_csm.py +9 -15
- transformers/models/csm/modeling_csm.py +3 -3
- transformers/models/ctrl/configuration_ctrl.py +16 -0
- transformers/models/ctrl/modeling_ctrl.py +13 -25
- transformers/models/cwm/configuration_cwm.py +5 -7
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/configuration_d_fine.py +10 -56
- transformers/models/d_fine/modeling_d_fine.py +728 -868
- transformers/models/d_fine/modular_d_fine.py +335 -412
- transformers/models/dab_detr/configuration_dab_detr.py +22 -48
- transformers/models/dab_detr/modeling_dab_detr.py +11 -7
- transformers/models/dac/modeling_dac.py +1 -1
- transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
- transformers/models/data2vec/configuration_data2vec_text.py +11 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
- transformers/models/data2vec/modeling_data2vec_text.py +6 -6
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
- transformers/models/dbrx/configuration_dbrx.py +11 -3
- transformers/models/dbrx/modeling_dbrx.py +6 -6
- transformers/models/dbrx/modular_dbrx.py +6 -6
- transformers/models/deberta/configuration_deberta.py +6 -0
- transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
- transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
- transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
- transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
- transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
- transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
- transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
- transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
- transformers/models/deit/modeling_deit.py +11 -7
- transformers/models/depth_anything/configuration_depth_anything.py +12 -42
- transformers/models/depth_anything/modeling_depth_anything.py +5 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
- transformers/models/depth_pro/modeling_depth_pro.py +8 -4
- transformers/models/detr/configuration_detr.py +18 -49
- transformers/models/detr/image_processing_detr_fast.py +11 -11
- transformers/models/detr/modeling_detr.py +695 -734
- transformers/models/dia/configuration_dia.py +4 -7
- transformers/models/dia/generation_dia.py +8 -17
- transformers/models/dia/modeling_dia.py +7 -7
- transformers/models/dia/modular_dia.py +4 -4
- transformers/models/diffllama/configuration_diffllama.py +5 -7
- transformers/models/diffllama/modeling_diffllama.py +3 -8
- transformers/models/diffllama/modular_diffllama.py +2 -7
- transformers/models/dinat/configuration_dinat.py +2 -4
- transformers/models/dinat/modeling_dinat.py +7 -6
- transformers/models/dinov2/configuration_dinov2.py +2 -4
- transformers/models/dinov2/modeling_dinov2.py +9 -8
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
- transformers/models/distilbert/configuration_distilbert.py +8 -1
- transformers/models/distilbert/modeling_distilbert.py +3 -3
- transformers/models/doge/configuration_doge.py +17 -7
- transformers/models/doge/modeling_doge.py +4 -4
- transformers/models/doge/modular_doge.py +20 -10
- transformers/models/donut/image_processing_donut_fast.py +4 -4
- transformers/models/dots1/configuration_dots1.py +16 -7
- transformers/models/dots1/modeling_dots1.py +4 -4
- transformers/models/dpr/configuration_dpr.py +19 -1
- transformers/models/dpt/configuration_dpt.py +23 -65
- transformers/models/dpt/image_processing_dpt_fast.py +5 -5
- transformers/models/dpt/modeling_dpt.py +19 -15
- transformers/models/dpt/modular_dpt.py +4 -4
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +53 -53
- transformers/models/edgetam/modular_edgetam.py +5 -7
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
- transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
- transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
- transformers/models/electra/configuration_electra.py +13 -2
- transformers/models/electra/modeling_electra.py +6 -6
- transformers/models/emu3/configuration_emu3.py +12 -10
- transformers/models/emu3/modeling_emu3.py +84 -47
- transformers/models/emu3/modular_emu3.py +77 -39
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
- transformers/models/eomt/configuration_eomt.py +12 -13
- transformers/models/eomt/image_processing_eomt_fast.py +3 -3
- transformers/models/eomt/modeling_eomt.py +3 -3
- transformers/models/eomt/modular_eomt.py +17 -17
- transformers/models/eomt_dinov3/__init__.py +28 -0
- transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
- transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
- transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
- transformers/models/ernie/configuration_ernie.py +24 -2
- transformers/models/ernie/modeling_ernie.py +6 -30
- transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
- transformers/models/esm/configuration_esm.py +9 -11
- transformers/models/esm/modeling_esm.py +3 -3
- transformers/models/esm/modeling_esmfold.py +1 -6
- transformers/models/esm/openfold_utils/protein.py +2 -3
- transformers/models/evolla/configuration_evolla.py +21 -8
- transformers/models/evolla/modeling_evolla.py +11 -7
- transformers/models/evolla/modular_evolla.py +5 -1
- transformers/models/exaone4/configuration_exaone4.py +8 -5
- transformers/models/exaone4/modeling_exaone4.py +4 -4
- transformers/models/exaone4/modular_exaone4.py +11 -8
- transformers/models/exaone_moe/__init__.py +27 -0
- transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
- transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
- transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
- transformers/models/falcon/configuration_falcon.py +9 -1
- transformers/models/falcon/modeling_falcon.py +3 -8
- transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
- transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
- transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
- transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
- transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
- transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
- transformers/models/flaubert/configuration_flaubert.py +10 -4
- transformers/models/flaubert/modeling_flaubert.py +1 -1
- transformers/models/flava/configuration_flava.py +4 -3
- transformers/models/flava/image_processing_flava_fast.py +4 -4
- transformers/models/flava/modeling_flava.py +36 -28
- transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
- transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
- transformers/models/florence2/configuration_florence2.py +4 -0
- transformers/models/florence2/modeling_florence2.py +57 -32
- transformers/models/florence2/modular_florence2.py +48 -26
- transformers/models/fnet/configuration_fnet.py +6 -1
- transformers/models/focalnet/configuration_focalnet.py +2 -4
- transformers/models/focalnet/modeling_focalnet.py +10 -7
- transformers/models/fsmt/configuration_fsmt.py +12 -16
- transformers/models/funnel/configuration_funnel.py +8 -0
- transformers/models/fuyu/configuration_fuyu.py +5 -8
- transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
- transformers/models/fuyu/modeling_fuyu.py +24 -23
- transformers/models/gemma/configuration_gemma.py +5 -7
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/modular_gemma.py +5 -7
- transformers/models/gemma2/configuration_gemma2.py +5 -7
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +8 -10
- transformers/models/gemma3/configuration_gemma3.py +28 -22
- transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
- transformers/models/gemma3/modeling_gemma3.py +37 -33
- transformers/models/gemma3/modular_gemma3.py +46 -42
- transformers/models/gemma3n/configuration_gemma3n.py +35 -22
- transformers/models/gemma3n/modeling_gemma3n.py +86 -58
- transformers/models/gemma3n/modular_gemma3n.py +112 -75
- transformers/models/git/configuration_git.py +5 -7
- transformers/models/git/modeling_git.py +31 -41
- transformers/models/glm/configuration_glm.py +7 -9
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/configuration_glm4.py +7 -9
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm46v/configuration_glm46v.py +4 -0
- transformers/models/glm46v/image_processing_glm46v.py +5 -2
- transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
- transformers/models/glm46v/modeling_glm46v.py +91 -46
- transformers/models/glm46v/modular_glm46v.py +4 -0
- transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
- transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
- transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
- transformers/models/glm4v/configuration_glm4v.py +12 -8
- transformers/models/glm4v/image_processing_glm4v.py +5 -2
- transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
- transformers/models/glm4v/modeling_glm4v.py +120 -63
- transformers/models/glm4v/modular_glm4v.py +82 -50
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
- transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
- transformers/models/glm_image/configuration_glm_image.py +26 -20
- transformers/models/glm_image/image_processing_glm_image.py +1 -1
- transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
- transformers/models/glm_image/modeling_glm_image.py +337 -236
- transformers/models/glm_image/modular_glm_image.py +415 -255
- transformers/models/glm_image/processing_glm_image.py +65 -17
- transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
- transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
- transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
- transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
- transformers/models/glmasr/modeling_glmasr.py +34 -28
- transformers/models/glmasr/modular_glmasr.py +23 -11
- transformers/models/glpn/image_processing_glpn_fast.py +3 -3
- transformers/models/glpn/modeling_glpn.py +4 -2
- transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
- transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
- transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
- transformers/models/gpt2/configuration_gpt2.py +13 -1
- transformers/models/gpt2/modeling_gpt2.py +5 -5
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
- transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
- transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
- transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
- transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
- transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
- transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
- transformers/models/gptj/configuration_gptj.py +4 -4
- transformers/models/gptj/modeling_gptj.py +3 -7
- transformers/models/granite/configuration_granite.py +5 -7
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granite_speech/modeling_granite_speech.py +63 -37
- transformers/models/granitemoe/configuration_granitemoe.py +5 -7
- transformers/models/granitemoe/modeling_granitemoe.py +4 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
- transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
- transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
- transformers/models/groupvit/configuration_groupvit.py +4 -1
- transformers/models/groupvit/modeling_groupvit.py +29 -22
- transformers/models/helium/configuration_helium.py +5 -7
- transformers/models/helium/modeling_helium.py +4 -4
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
- transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
- transformers/models/hiera/configuration_hiera.py +2 -4
- transformers/models/hiera/modeling_hiera.py +11 -8
- transformers/models/hubert/configuration_hubert.py +4 -1
- transformers/models/hubert/modeling_hubert.py +7 -4
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
- transformers/models/ibert/configuration_ibert.py +4 -1
- transformers/models/idefics/configuration_idefics.py +5 -7
- transformers/models/idefics/modeling_idefics.py +3 -4
- transformers/models/idefics/vision.py +5 -4
- transformers/models/idefics2/configuration_idefics2.py +1 -2
- transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
- transformers/models/idefics2/modeling_idefics2.py +72 -50
- transformers/models/idefics3/configuration_idefics3.py +1 -3
- transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
- transformers/models/idefics3/modeling_idefics3.py +63 -40
- transformers/models/ijepa/modeling_ijepa.py +3 -3
- transformers/models/imagegpt/configuration_imagegpt.py +9 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
- transformers/models/imagegpt/modeling_imagegpt.py +8 -4
- transformers/models/informer/modeling_informer.py +3 -3
- transformers/models/instructblip/configuration_instructblip.py +2 -1
- transformers/models/instructblip/modeling_instructblip.py +65 -39
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
- transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
- transformers/models/internvl/configuration_internvl.py +5 -0
- transformers/models/internvl/modeling_internvl.py +35 -55
- transformers/models/internvl/modular_internvl.py +26 -38
- transformers/models/internvl/video_processing_internvl.py +2 -2
- transformers/models/jais2/configuration_jais2.py +5 -7
- transformers/models/jais2/modeling_jais2.py +4 -4
- transformers/models/jamba/configuration_jamba.py +5 -7
- transformers/models/jamba/modeling_jamba.py +4 -4
- transformers/models/jamba/modular_jamba.py +3 -3
- transformers/models/janus/image_processing_janus.py +2 -2
- transformers/models/janus/image_processing_janus_fast.py +8 -8
- transformers/models/janus/modeling_janus.py +63 -146
- transformers/models/janus/modular_janus.py +62 -20
- transformers/models/jetmoe/configuration_jetmoe.py +6 -4
- transformers/models/jetmoe/modeling_jetmoe.py +3 -3
- transformers/models/jetmoe/modular_jetmoe.py +3 -3
- transformers/models/kosmos2/configuration_kosmos2.py +10 -8
- transformers/models/kosmos2/modeling_kosmos2.py +56 -34
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
- transformers/models/lasr/configuration_lasr.py +2 -4
- transformers/models/lasr/modeling_lasr.py +3 -3
- transformers/models/lasr/modular_lasr.py +3 -3
- transformers/models/layoutlm/configuration_layoutlm.py +14 -1
- transformers/models/layoutlm/modeling_layoutlm.py +3 -3
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
- transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
- transformers/models/led/configuration_led.py +7 -8
- transformers/models/levit/image_processing_levit_fast.py +4 -4
- transformers/models/lfm2/configuration_lfm2.py +5 -7
- transformers/models/lfm2/modeling_lfm2.py +4 -4
- transformers/models/lfm2/modular_lfm2.py +3 -3
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
- transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
- transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
- transformers/models/lightglue/modeling_lightglue.py +3 -3
- transformers/models/lightglue/modular_lightglue.py +3 -3
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
- transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
- transformers/models/lilt/configuration_lilt.py +6 -1
- transformers/models/llama/configuration_llama.py +5 -7
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama4/configuration_llama4.py +67 -47
- transformers/models/llama4/image_processing_llama4_fast.py +3 -3
- transformers/models/llama4/modeling_llama4.py +46 -44
- transformers/models/llava/configuration_llava.py +10 -0
- transformers/models/llava/image_processing_llava_fast.py +3 -3
- transformers/models/llava/modeling_llava.py +38 -65
- transformers/models/llava_next/configuration_llava_next.py +2 -1
- transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
- transformers/models/llava_next/modeling_llava_next.py +61 -60
- transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
- transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
- transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
- transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
- transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
- transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
- transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
- transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
- transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
- transformers/models/longformer/configuration_longformer.py +4 -1
- transformers/models/longt5/configuration_longt5.py +9 -6
- transformers/models/longt5/modeling_longt5.py +2 -1
- transformers/models/luke/configuration_luke.py +8 -1
- transformers/models/lw_detr/configuration_lw_detr.py +19 -31
- transformers/models/lw_detr/modeling_lw_detr.py +43 -44
- transformers/models/lw_detr/modular_lw_detr.py +36 -38
- transformers/models/lxmert/configuration_lxmert.py +16 -0
- transformers/models/m2m_100/configuration_m2m_100.py +7 -8
- transformers/models/m2m_100/modeling_m2m_100.py +3 -3
- transformers/models/mamba/configuration_mamba.py +5 -2
- transformers/models/mamba/modeling_mamba.py +18 -26
- transformers/models/mamba2/configuration_mamba2.py +5 -7
- transformers/models/mamba2/modeling_mamba2.py +22 -33
- transformers/models/marian/configuration_marian.py +10 -4
- transformers/models/marian/modeling_marian.py +4 -4
- transformers/models/markuplm/configuration_markuplm.py +4 -6
- transformers/models/markuplm/modeling_markuplm.py +3 -3
- transformers/models/mask2former/configuration_mask2former.py +12 -47
- transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
- transformers/models/mask2former/modeling_mask2former.py +18 -12
- transformers/models/maskformer/configuration_maskformer.py +14 -45
- transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
- transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
- transformers/models/maskformer/modeling_maskformer.py +15 -9
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
- transformers/models/mbart/configuration_mbart.py +9 -4
- transformers/models/mbart/modeling_mbart.py +9 -6
- transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
- transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
- transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
- transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
- transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
- transformers/models/mgp_str/modeling_mgp_str.py +4 -2
- transformers/models/mimi/configuration_mimi.py +4 -0
- transformers/models/mimi/modeling_mimi.py +40 -36
- transformers/models/minimax/configuration_minimax.py +8 -11
- transformers/models/minimax/modeling_minimax.py +5 -5
- transformers/models/minimax/modular_minimax.py +9 -12
- transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
- transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
- transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
- transformers/models/ministral/configuration_ministral.py +5 -7
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral/modular_ministral.py +5 -8
- transformers/models/ministral3/configuration_ministral3.py +4 -4
- transformers/models/ministral3/modeling_ministral3.py +4 -4
- transformers/models/ministral3/modular_ministral3.py +3 -3
- transformers/models/mistral/configuration_mistral.py +5 -7
- transformers/models/mistral/modeling_mistral.py +4 -4
- transformers/models/mistral/modular_mistral.py +3 -3
- transformers/models/mistral3/configuration_mistral3.py +4 -0
- transformers/models/mistral3/modeling_mistral3.py +36 -40
- transformers/models/mistral3/modular_mistral3.py +31 -32
- transformers/models/mixtral/configuration_mixtral.py +8 -11
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mlcd/modeling_mlcd.py +7 -5
- transformers/models/mlcd/modular_mlcd.py +7 -5
- transformers/models/mllama/configuration_mllama.py +5 -7
- transformers/models/mllama/image_processing_mllama_fast.py +6 -5
- transformers/models/mllama/modeling_mllama.py +19 -19
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
- transformers/models/mobilebert/configuration_mobilebert.py +4 -1
- transformers/models/mobilebert/modeling_mobilebert.py +3 -3
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
- transformers/models/mobilevit/modeling_mobilevit.py +4 -2
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
- transformers/models/modernbert/configuration_modernbert.py +46 -21
- transformers/models/modernbert/modeling_modernbert.py +146 -899
- transformers/models/modernbert/modular_modernbert.py +185 -908
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
- transformers/models/moonshine/configuration_moonshine.py +12 -7
- transformers/models/moonshine/modeling_moonshine.py +7 -7
- transformers/models/moonshine/modular_moonshine.py +19 -13
- transformers/models/moshi/configuration_moshi.py +28 -2
- transformers/models/moshi/modeling_moshi.py +4 -9
- transformers/models/mpnet/configuration_mpnet.py +6 -1
- transformers/models/mpt/configuration_mpt.py +16 -0
- transformers/models/mra/configuration_mra.py +8 -1
- transformers/models/mt5/configuration_mt5.py +9 -5
- transformers/models/mt5/modeling_mt5.py +5 -8
- transformers/models/musicgen/configuration_musicgen.py +12 -7
- transformers/models/musicgen/modeling_musicgen.py +6 -5
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
- transformers/models/mvp/configuration_mvp.py +8 -4
- transformers/models/mvp/modeling_mvp.py +6 -4
- transformers/models/nanochat/configuration_nanochat.py +5 -7
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nanochat/modular_nanochat.py +4 -4
- transformers/models/nemotron/configuration_nemotron.py +5 -7
- transformers/models/nemotron/modeling_nemotron.py +4 -14
- transformers/models/nllb/tokenization_nllb.py +7 -5
- transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
- transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
- transformers/models/nougat/image_processing_nougat_fast.py +8 -8
- transformers/models/nystromformer/configuration_nystromformer.py +8 -1
- transformers/models/olmo/configuration_olmo.py +5 -7
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +3 -3
- transformers/models/olmo2/configuration_olmo2.py +9 -11
- transformers/models/olmo2/modeling_olmo2.py +4 -4
- transformers/models/olmo2/modular_olmo2.py +7 -7
- transformers/models/olmo3/configuration_olmo3.py +10 -11
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmo3/modular_olmo3.py +13 -14
- transformers/models/olmoe/configuration_olmoe.py +5 -7
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/olmoe/modular_olmoe.py +3 -3
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
- transformers/models/oneformer/configuration_oneformer.py +9 -46
- transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
- transformers/models/oneformer/modeling_oneformer.py +14 -9
- transformers/models/openai/configuration_openai.py +16 -0
- transformers/models/opt/configuration_opt.py +6 -6
- transformers/models/opt/modeling_opt.py +5 -5
- transformers/models/ovis2/configuration_ovis2.py +4 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
- transformers/models/ovis2/modeling_ovis2.py +58 -99
- transformers/models/ovis2/modular_ovis2.py +52 -13
- transformers/models/owlv2/configuration_owlv2.py +4 -1
- transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
- transformers/models/owlv2/modeling_owlv2.py +40 -27
- transformers/models/owlv2/modular_owlv2.py +5 -5
- transformers/models/owlvit/configuration_owlvit.py +4 -1
- transformers/models/owlvit/modeling_owlvit.py +40 -27
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
- transformers/models/paligemma/configuration_paligemma.py +4 -0
- transformers/models/paligemma/modeling_paligemma.py +30 -26
- transformers/models/parakeet/configuration_parakeet.py +2 -4
- transformers/models/parakeet/modeling_parakeet.py +3 -3
- transformers/models/parakeet/modular_parakeet.py +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
- transformers/models/patchtst/modeling_patchtst.py +3 -3
- transformers/models/pe_audio/modeling_pe_audio.py +4 -4
- transformers/models/pe_audio/modular_pe_audio.py +1 -1
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
- transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
- transformers/models/pe_video/modeling_pe_video.py +36 -24
- transformers/models/pe_video/modular_pe_video.py +36 -23
- transformers/models/pegasus/configuration_pegasus.py +8 -5
- transformers/models/pegasus/modeling_pegasus.py +4 -4
- transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
- transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
- transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
- transformers/models/perceiver/modeling_perceiver.py +17 -9
- transformers/models/perception_lm/modeling_perception_lm.py +26 -27
- transformers/models/perception_lm/modular_perception_lm.py +27 -25
- transformers/models/persimmon/configuration_persimmon.py +5 -7
- transformers/models/persimmon/modeling_persimmon.py +5 -5
- transformers/models/phi/configuration_phi.py +8 -6
- transformers/models/phi/modeling_phi.py +4 -4
- transformers/models/phi/modular_phi.py +3 -3
- transformers/models/phi3/configuration_phi3.py +9 -11
- transformers/models/phi3/modeling_phi3.py +4 -4
- transformers/models/phi3/modular_phi3.py +3 -3
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
- transformers/models/phimoe/configuration_phimoe.py +5 -7
- transformers/models/phimoe/modeling_phimoe.py +15 -39
- transformers/models/phimoe/modular_phimoe.py +12 -7
- transformers/models/pix2struct/configuration_pix2struct.py +12 -9
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
- transformers/models/pix2struct/modeling_pix2struct.py +14 -7
- transformers/models/pixio/configuration_pixio.py +2 -4
- transformers/models/pixio/modeling_pixio.py +9 -8
- transformers/models/pixio/modular_pixio.py +4 -2
- transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
- transformers/models/pixtral/modeling_pixtral.py +9 -12
- transformers/models/plbart/configuration_plbart.py +8 -5
- transformers/models/plbart/modeling_plbart.py +9 -7
- transformers/models/plbart/modular_plbart.py +1 -1
- transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
- transformers/models/pop2piano/configuration_pop2piano.py +7 -6
- transformers/models/pop2piano/modeling_pop2piano.py +2 -1
- transformers/models/pp_doclayout_v3/__init__.py +30 -0
- transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
- transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
- transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
- transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
- transformers/models/prophetnet/configuration_prophetnet.py +11 -10
- transformers/models/prophetnet/modeling_prophetnet.py +12 -23
- transformers/models/pvt/image_processing_pvt.py +7 -7
- transformers/models/pvt/image_processing_pvt_fast.py +1 -1
- transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
- transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
- transformers/models/qwen2/configuration_qwen2.py +14 -4
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/modular_qwen2.py +3 -3
- transformers/models/qwen2/tokenization_qwen2.py +0 -4
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
- transformers/models/qwen3/configuration_qwen3.py +15 -5
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3/modular_qwen3.py +3 -3
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
- transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
- transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
- transformers/models/rag/configuration_rag.py +6 -6
- transformers/models/rag/modeling_rag.py +3 -3
- transformers/models/rag/retrieval_rag.py +1 -1
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
- transformers/models/reformer/configuration_reformer.py +7 -7
- transformers/models/rembert/configuration_rembert.py +8 -1
- transformers/models/rembert/modeling_rembert.py +0 -22
- transformers/models/resnet/configuration_resnet.py +2 -4
- transformers/models/resnet/modeling_resnet.py +6 -5
- transformers/models/roberta/configuration_roberta.py +11 -2
- transformers/models/roberta/modeling_roberta.py +6 -6
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
- transformers/models/roc_bert/configuration_roc_bert.py +8 -1
- transformers/models/roc_bert/modeling_roc_bert.py +6 -41
- transformers/models/roformer/configuration_roformer.py +13 -2
- transformers/models/roformer/modeling_roformer.py +0 -14
- transformers/models/rt_detr/configuration_rt_detr.py +8 -49
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
- transformers/models/rt_detr/modeling_rt_detr.py +578 -737
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
- transformers/models/rt_detr/modular_rt_detr.py +1508 -6
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
- transformers/models/rwkv/configuration_rwkv.py +2 -3
- transformers/models/rwkv/modeling_rwkv.py +0 -23
- transformers/models/sam/configuration_sam.py +2 -0
- transformers/models/sam/image_processing_sam_fast.py +4 -4
- transformers/models/sam/modeling_sam.py +13 -8
- transformers/models/sam/processing_sam.py +3 -3
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +56 -52
- transformers/models/sam2/modular_sam2.py +47 -55
- transformers/models/sam2_video/modeling_sam2_video.py +50 -51
- transformers/models/sam2_video/modular_sam2_video.py +12 -10
- transformers/models/sam3/modeling_sam3.py +43 -47
- transformers/models/sam3/processing_sam3.py +8 -4
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
- transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
- transformers/models/sam3_video/modeling_sam3_video.py +27 -14
- transformers/models/sam_hq/configuration_sam_hq.py +2 -0
- transformers/models/sam_hq/modeling_sam_hq.py +13 -9
- transformers/models/sam_hq/modular_sam_hq.py +6 -6
- transformers/models/sam_hq/processing_sam_hq.py +7 -6
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
- transformers/models/seed_oss/configuration_seed_oss.py +7 -9
- transformers/models/seed_oss/modeling_seed_oss.py +4 -4
- transformers/models/seed_oss/modular_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +4 -4
- transformers/models/segformer/modeling_segformer.py +4 -2
- transformers/models/segformer/modular_segformer.py +3 -3
- transformers/models/seggpt/modeling_seggpt.py +20 -8
- transformers/models/sew/configuration_sew.py +4 -1
- transformers/models/sew/modeling_sew.py +9 -5
- transformers/models/sew/modular_sew.py +2 -1
- transformers/models/sew_d/configuration_sew_d.py +4 -1
- transformers/models/sew_d/modeling_sew_d.py +4 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
- transformers/models/siglip/configuration_siglip.py +4 -1
- transformers/models/siglip/modeling_siglip.py +27 -71
- transformers/models/siglip2/__init__.py +1 -0
- transformers/models/siglip2/configuration_siglip2.py +4 -2
- transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
- transformers/models/siglip2/modeling_siglip2.py +37 -78
- transformers/models/siglip2/modular_siglip2.py +74 -25
- transformers/models/siglip2/tokenization_siglip2.py +95 -0
- transformers/models/smollm3/configuration_smollm3.py +6 -6
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smollm3/modular_smollm3.py +9 -9
- transformers/models/smolvlm/configuration_smolvlm.py +1 -3
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
- transformers/models/smolvlm/modeling_smolvlm.py +75 -46
- transformers/models/smolvlm/modular_smolvlm.py +36 -23
- transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
- transformers/models/solar_open/__init__.py +27 -0
- transformers/models/solar_open/configuration_solar_open.py +184 -0
- transformers/models/solar_open/modeling_solar_open.py +642 -0
- transformers/models/solar_open/modular_solar_open.py +224 -0
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
- transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
- transformers/models/speecht5/configuration_speecht5.py +7 -8
- transformers/models/splinter/configuration_splinter.py +6 -6
- transformers/models/splinter/modeling_splinter.py +8 -3
- transformers/models/squeezebert/configuration_squeezebert.py +14 -1
- transformers/models/stablelm/configuration_stablelm.py +8 -6
- transformers/models/stablelm/modeling_stablelm.py +5 -5
- transformers/models/starcoder2/configuration_starcoder2.py +11 -5
- transformers/models/starcoder2/modeling_starcoder2.py +5 -5
- transformers/models/starcoder2/modular_starcoder2.py +4 -4
- transformers/models/superglue/configuration_superglue.py +4 -0
- transformers/models/superglue/image_processing_superglue_fast.py +4 -3
- transformers/models/superglue/modeling_superglue.py +9 -4
- transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
- transformers/models/superpoint/modeling_superpoint.py +4 -2
- transformers/models/swin/configuration_swin.py +2 -4
- transformers/models/swin/modeling_swin.py +11 -8
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
- transformers/models/swin2sr/modeling_swin2sr.py +4 -2
- transformers/models/swinv2/configuration_swinv2.py +2 -4
- transformers/models/swinv2/modeling_swinv2.py +10 -7
- transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
- transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
- transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
- transformers/models/t5/configuration_t5.py +9 -8
- transformers/models/t5/modeling_t5.py +5 -8
- transformers/models/t5gemma/configuration_t5gemma.py +10 -25
- transformers/models/t5gemma/modeling_t5gemma.py +9 -9
- transformers/models/t5gemma/modular_t5gemma.py +11 -24
- transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
- transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
- transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
- transformers/models/table_transformer/configuration_table_transformer.py +18 -49
- transformers/models/table_transformer/modeling_table_transformer.py +27 -53
- transformers/models/tapas/configuration_tapas.py +12 -1
- transformers/models/tapas/modeling_tapas.py +1 -1
- transformers/models/tapas/tokenization_tapas.py +1 -0
- transformers/models/textnet/configuration_textnet.py +4 -6
- transformers/models/textnet/image_processing_textnet_fast.py +3 -3
- transformers/models/textnet/modeling_textnet.py +15 -14
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
- transformers/models/timesfm/modeling_timesfm.py +5 -6
- transformers/models/timesfm/modular_timesfm.py +5 -6
- transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
- transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
- transformers/models/trocr/configuration_trocr.py +11 -7
- transformers/models/trocr/modeling_trocr.py +4 -2
- transformers/models/tvp/configuration_tvp.py +10 -35
- transformers/models/tvp/image_processing_tvp_fast.py +6 -5
- transformers/models/tvp/modeling_tvp.py +1 -1
- transformers/models/udop/configuration_udop.py +16 -7
- transformers/models/udop/modeling_udop.py +10 -6
- transformers/models/umt5/configuration_umt5.py +8 -6
- transformers/models/umt5/modeling_umt5.py +7 -3
- transformers/models/unispeech/configuration_unispeech.py +4 -1
- transformers/models/unispeech/modeling_unispeech.py +7 -4
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
- transformers/models/upernet/configuration_upernet.py +8 -35
- transformers/models/upernet/modeling_upernet.py +1 -1
- transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
- transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
- transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
- transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
- transformers/models/video_llava/configuration_video_llava.py +4 -0
- transformers/models/video_llava/modeling_video_llava.py +87 -89
- transformers/models/videomae/modeling_videomae.py +4 -5
- transformers/models/vilt/configuration_vilt.py +4 -1
- transformers/models/vilt/image_processing_vilt_fast.py +6 -6
- transformers/models/vilt/modeling_vilt.py +27 -12
- transformers/models/vipllava/configuration_vipllava.py +4 -0
- transformers/models/vipllava/modeling_vipllava.py +57 -31
- transformers/models/vipllava/modular_vipllava.py +50 -24
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
- transformers/models/visual_bert/configuration_visual_bert.py +6 -1
- transformers/models/vit/configuration_vit.py +2 -2
- transformers/models/vit/modeling_vit.py +7 -5
- transformers/models/vit_mae/modeling_vit_mae.py +11 -7
- transformers/models/vit_msn/modeling_vit_msn.py +11 -7
- transformers/models/vitdet/configuration_vitdet.py +2 -4
- transformers/models/vitdet/modeling_vitdet.py +2 -3
- transformers/models/vitmatte/configuration_vitmatte.py +6 -35
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
- transformers/models/vitmatte/modeling_vitmatte.py +1 -1
- transformers/models/vitpose/configuration_vitpose.py +6 -43
- transformers/models/vitpose/modeling_vitpose.py +5 -3
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
- transformers/models/vits/configuration_vits.py +4 -0
- transformers/models/vits/modeling_vits.py +9 -7
- transformers/models/vivit/modeling_vivit.py +4 -4
- transformers/models/vjepa2/modeling_vjepa2.py +9 -9
- transformers/models/voxtral/configuration_voxtral.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +25 -24
- transformers/models/voxtral/modular_voxtral.py +26 -20
- transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
- transformers/models/wavlm/configuration_wavlm.py +4 -1
- transformers/models/wavlm/modeling_wavlm.py +4 -1
- transformers/models/whisper/configuration_whisper.py +6 -4
- transformers/models/whisper/generation_whisper.py +0 -1
- transformers/models/whisper/modeling_whisper.py +3 -3
- transformers/models/x_clip/configuration_x_clip.py +4 -1
- transformers/models/x_clip/modeling_x_clip.py +26 -27
- transformers/models/xglm/configuration_xglm.py +9 -7
- transformers/models/xlm/configuration_xlm.py +10 -7
- transformers/models/xlm/modeling_xlm.py +1 -1
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
- transformers/models/xlnet/configuration_xlnet.py +3 -1
- transformers/models/xlstm/configuration_xlstm.py +5 -7
- transformers/models/xlstm/modeling_xlstm.py +0 -32
- transformers/models/xmod/configuration_xmod.py +11 -2
- transformers/models/xmod/modeling_xmod.py +13 -16
- transformers/models/yolos/image_processing_yolos_fast.py +25 -28
- transformers/models/yolos/modeling_yolos.py +7 -7
- transformers/models/yolos/modular_yolos.py +16 -16
- transformers/models/yoso/configuration_yoso.py +8 -1
- transformers/models/youtu/__init__.py +27 -0
- transformers/models/youtu/configuration_youtu.py +194 -0
- transformers/models/youtu/modeling_youtu.py +619 -0
- transformers/models/youtu/modular_youtu.py +254 -0
- transformers/models/zamba/configuration_zamba.py +5 -7
- transformers/models/zamba/modeling_zamba.py +25 -56
- transformers/models/zamba2/configuration_zamba2.py +8 -13
- transformers/models/zamba2/modeling_zamba2.py +53 -78
- transformers/models/zamba2/modular_zamba2.py +36 -29
- transformers/models/zoedepth/configuration_zoedepth.py +17 -40
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
- transformers/models/zoedepth/modeling_zoedepth.py +5 -3
- transformers/pipelines/__init__.py +1 -61
- transformers/pipelines/any_to_any.py +1 -1
- transformers/pipelines/automatic_speech_recognition.py +0 -2
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/image_text_to_text.py +1 -1
- transformers/pipelines/text_to_audio.py +5 -1
- transformers/processing_utils.py +35 -44
- transformers/pytorch_utils.py +2 -26
- transformers/quantizers/quantizer_compressed_tensors.py +7 -5
- transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
- transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
- transformers/quantizers/quantizer_mxfp4.py +1 -1
- transformers/quantizers/quantizer_torchao.py +0 -16
- transformers/safetensors_conversion.py +11 -4
- transformers/testing_utils.py +3 -28
- transformers/tokenization_mistral_common.py +9 -0
- transformers/tokenization_python.py +6 -4
- transformers/tokenization_utils_base.py +119 -219
- transformers/tokenization_utils_tokenizers.py +31 -2
- transformers/trainer.py +25 -33
- transformers/trainer_seq2seq.py +1 -1
- transformers/training_args.py +411 -417
- transformers/utils/__init__.py +1 -4
- transformers/utils/auto_docstring.py +15 -18
- transformers/utils/backbone_utils.py +13 -373
- transformers/utils/doc.py +4 -36
- transformers/utils/generic.py +69 -33
- transformers/utils/import_utils.py +72 -75
- transformers/utils/loading_report.py +133 -105
- transformers/utils/quantization_config.py +0 -21
- transformers/video_processing_utils.py +5 -5
- transformers/video_utils.py +3 -1
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
- transformers/pipelines/deprecated/text2text_generation.py +0 -408
- transformers/pipelines/image_to_text.py +0 -189
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0

@@ -0,0 +1,1376 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/eomt_dinov3/modular_eomt_dinov3.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_eomt_dinov3.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2026 the HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from collections.abc import Callable
from dataclasses import dataclass
from typing import Optional

import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor, nn

from ... import initialization as init
from ...activations import ACT2FN
from ...file_utils import ModelOutput, is_scipy_available, requires_backends
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...pytorch_utils import compile_compatible_method_lru_cache
from ...utils import TransformersKwargs, auto_docstring, is_accelerate_available
from ...utils.generic import check_model_inputs, maybe_autocast
from .configuration_eomt_dinov3 import EomtDinov3Config


if is_scipy_available():
    from scipy.optimize import linear_sum_assignment

if is_accelerate_available():
    from accelerate import PartialState
    from accelerate.utils import reduce

def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    scaling: float | None = None,
    dropout: float = 0.0,
    **kwargs: Unpack[TransformersKwargs],
):
    if scaling is None:
        scaling = query.size(-1) ** -0.5

    # Take the dot product between "query" and "key" to get the raw attention scores.
    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling

    if attention_mask is not None:
        attention_mask = attention_mask[:, :, :, : key.shape[-2]]
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


def apply_rotary_pos_emb(
    q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, **kwargs
) -> tuple[torch.Tensor, torch.Tensor]:
    """Applies Rotary Position Embedding to the query and key tensors, but only to the patch tokens,
    ignoring the prefix tokens (cls token and register tokens).

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.

    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """

    num_tokens = q.shape[-2]
    num_patches = sin.shape[-2]
    num_prefix_tokens = num_tokens - num_patches  # cls token + register tokens

    q_prefix_tokens, q_patches = q.split((num_prefix_tokens, num_patches), dim=-2)
    k_prefix_tokens, k_patches = k.split((num_prefix_tokens, num_patches), dim=-2)

    # apply rope only to patch tokens
    q_patches = (q_patches * cos) + (rotate_half(q_patches) * sin)
    k_patches = (k_patches * cos) + (rotate_half(k_patches) * sin)

    q = torch.cat((q_prefix_tokens, q_patches), dim=-2)
    k = torch.cat((k_prefix_tokens, k_patches), dim=-2)

    return q, k

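# [Annotation added in this diff report, not part of the generated file] With the rotate_half
# convention above, each query/key vector is split into halves (x1, x2) along the head dimension and
# rotated as (x1 * cos - x2 * sin, x2 * cos + x1 * sin). apply_rotary_pos_emb applies this rotation
# only to patch tokens; the CLS and register tokens pass through without positional rotation.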
class EomtDinov3Attention(nn.Module):
    """
    Multi-headed attention compatible with ALL_ATTENTION_FUNCTIONS.
    """

    def __init__(self, config: EomtDinov3Config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        self.is_causal = False

        self.scaling = self.head_dim**-0.5
        self.is_causal = False

        self.dropout = config.attention_dropout
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.key_bias)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.value_bias)

        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.query_bias)
        self.o_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.proj_bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Input shape: Batch x Time x Channel"""

        batch_size, patches, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
            self.config._attn_implementation, eager_attention_forward
        )

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(batch_size, patches, -1).contiguous()
        attn_output = self.o_proj(attn_output)

        return attn_output, attn_weights

class EomtDinov3Embeddings(nn.Module):
    """
    Construct the CLS token, mask token, position and patch embeddings.
    """

    def __init__(self, config: EomtDinov3Config):
        super().__init__()
        self.config = config
        self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.register_tokens = nn.Parameter(torch.empty(1, config.num_register_tokens, config.hidden_size))
        self.patch_embeddings = nn.Conv2d(
            config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size
        )
        self.num_prefix_tokens = 1 + config.num_register_tokens

    def forward(self, pixel_values: torch.Tensor, bool_masked_pos: torch.Tensor | None = None) -> torch.Tensor:
        batch_size = pixel_values.shape[0]
        target_dtype = self.patch_embeddings.weight.dtype

        # (batch_size, num_channels, height, width) -> (batch_size, num_patches, hidden_size)
        patch_embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
        patch_embeddings = patch_embeddings.flatten(2).transpose(1, 2)

        if bool_masked_pos is not None:
            mask_token = self.mask_token.to(patch_embeddings.dtype)
            patch_embeddings = torch.where(bool_masked_pos.unsqueeze(-1), mask_token, patch_embeddings)

        # Add CLS and register tokens
        cls_token = self.cls_token.expand(batch_size, -1, -1)
        register_tokens = self.register_tokens.expand(batch_size, -1, -1)
        embeddings = torch.cat([cls_token, register_tokens, patch_embeddings], dim=1)

        return embeddings

def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class EomtDinov3DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: float | None = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return f"p={self.drop_prob}"


class EomtDinov3MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        return self.down_proj(self.act_fn(self.up_proj(x)))


class EomtDinov3GatedMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj

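# [Annotation added in this diff report, not part of the generated file] drop_path above implements
# stochastic depth: at training time each sample's residual branch is zeroed with probability
# drop_prob, and the surviving samples are scaled by 1 / keep_prob so the expected branch output is
# unchanged; at evaluation time it is the identity.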
class EomtDinov3Layer(GradientCheckpointingLayer):
    """This corresponds to the Block class in the original implementation."""

    def __init__(self, config: EomtDinov3Config):
        super().__init__()

        self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.attention = EomtDinov3Attention(config)
        self.layer_scale1 = EomtDinov3LayerScale(config)
        self.drop_path = EomtDinov3DropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()

        self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        if config.use_gated_mlp:
            self.mlp = EomtDinov3GatedMLP(config)
        else:
            self.mlp = EomtDinov3MLP(config)
        self.layer_scale2 = EomtDinov3LayerScale(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
    ) -> torch.Tensor:
        # Attention with residual connection
        residual = hidden_states
        hidden_states = self.norm1(hidden_states)
        hidden_states, _ = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            position_embeddings=position_embeddings,
        )
        hidden_states = self.layer_scale1(hidden_states)
        hidden_states = self.drop_path(hidden_states) + residual

        # MLP with residual connection
        residual = hidden_states
        hidden_states = self.norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = self.layer_scale2(hidden_states)
        hidden_states = self.drop_path(hidden_states) + residual

        return hidden_states


class EomtDinov3LayerScale(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size))

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        return hidden_state * self.lambda1

@compile_compatible_method_lru_cache(maxsize=32)
def get_patches_center_coordinates(
    num_patches_h: int, num_patches_w: int, dtype: torch.dtype, device: torch.device
) -> torch.Tensor:
    """
    Computes the 2D coordinates of the centers of image patches, normalized to the range [-1, +1].
    The center of each patch is exactly halfway between its top-left and bottom-right corners.

    Args:
        num_patches_h (int): Number of patches along the vertical (height) axis.
        num_patches_w (int): Number of patches along the horizontal (width) axis.
        dtype (torch.dtype): The desired data type of the returned tensor.

    Returns:
        torch.Tensor: A tensor of shape (height * width, 2), where each row contains the (y, x)
            coordinates of a patch center, normalized to [-1, +1].
    """
    coords_h = torch.arange(0.5, num_patches_h, dtype=dtype, device=device)
    coords_w = torch.arange(0.5, num_patches_w, dtype=dtype, device=device)
    coords_h = coords_h / num_patches_h
    coords_w = coords_w / num_patches_w
    # (height, width, 2) -> (height * width, 2)
    coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing="ij"), dim=-1)
    coords = coords.flatten(0, 1)
    # Shift range [0, 1] to [-1, +1]
    coords = 2.0 * coords - 1.0
    return coords


def augment_patches_center_coordinates(
    coords: torch.Tensor,
    shift: float | None = None,
    jitter: float | None = None,
    rescale: float | None = None,
) -> torch.Tensor:
    # Shift coords by adding a uniform value in [-shift, shift]
    if shift is not None:
        shift_hw = torch.empty((1, 2), device=coords.device, dtype=coords.dtype)
        shift_hw = shift_hw.uniform_(-shift, shift)
        coords = coords + shift_hw

    # Jitter coords by multiplying the range [-1, 1] by a log-uniform value in [1/jitter, jitter]
    if jitter is not None:
        jitter_range = np.log(jitter)
        jitter_hw = torch.empty((1, 2), device=coords.device, dtype=coords.dtype)
        jitter_hw = jitter_hw.uniform_(-jitter_range, jitter_range).exp()
        coords = coords * jitter_hw

    # Rescale coords by multiplying the range [-1, 1] by a log-uniform value in [1/rescale, rescale]
    if rescale is not None:
        rescale_range = np.log(rescale)
        rescale_hw = torch.empty(1, device=coords.device, dtype=coords.dtype)
        rescale_hw = rescale_hw.uniform_(-rescale_range, rescale_range).exp()
        coords = coords * rescale_hw

    return coords

class EomtDinov3RotaryEmbedding(nn.Module):
    inv_freq: Tensor

    def __init__(self, config: EomtDinov3Config, device=None):
        super().__init__()
        self.config = config

        self.rope_type = self.config.rope_parameters["rope_type"]
        rope_init_fn: Callable = self.compute_default_rope_parameters
        if self.rope_type != "default":
            raise ValueError("`EomtDinov3` only supports `default` RoPE! Please check your `rope_type`")
        inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

    def forward(self, pixel_values: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        _, _, height, width = pixel_values.shape
        num_patches_h = height // self.config.patch_size
        num_patches_w = width // self.config.patch_size

        device = pixel_values.device
        device_type = device.type if isinstance(device.type, str) and device.type != "mps" else "cpu"

        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
            # Although we could precompute static patch_coords from image_size and patch_size in the config,
            # the model was trained with random_scale, so it can process images of varying sizes.
            # Therefore, it's better to compute patch_coords dynamically (with lru_cache).
            patch_coords = get_patches_center_coordinates(
                num_patches_h, num_patches_w, dtype=torch.float32, device=device
            )
            if self.training:
                patch_coords = augment_patches_center_coordinates(
                    patch_coords,
                    shift=self.config.pos_embed_shift,
                    jitter=self.config.pos_embed_jitter,
                    rescale=self.config.pos_embed_rescale,
                )

            # (height * width, 2, head_dim / 4) -> (height * width, head_dim / 2) -> (height * width, head_dim)
            angles = 2 * math.pi * patch_coords[:, :, None] * self.inv_freq[None, None, :]
            angles = angles.flatten(1, 2)
            angles = angles.tile(2)

            cos = torch.cos(angles)
            sin = torch.sin(angles)

        dtype = pixel_values.dtype
        return cos.to(dtype=dtype), sin.to(dtype=dtype)

    @staticmethod
    def compute_default_rope_parameters(
        config: EomtDinov3Config | None = None,
        device: Optional["torch.device"] = None,
        seq_len: int | None = None,
    ) -> torch.Tensor:
        """
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        """
        base = config.rope_parameters["rope_theta"]
        head_dim = config.hidden_size // config.num_attention_heads

        attention_factor = 1.0  # Unused in this type of RoPE

        # Compute the inverse frequencies
        inv_freq = 1 / base ** torch.arange(0, 1, 4 / head_dim, dtype=torch.float32, device=device)
        return inv_freq, attention_factor

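# [Annotation added in this diff report, not part of the generated file] compute_default_rope_parameters
# above returns head_dim // 4 inverse frequencies. In EomtDinov3RotaryEmbedding.forward, the angles taken
# over the (y, x) patch-center coordinates therefore flatten to head_dim // 2 values and are tiled to
# head_dim, which matches the rotate_half split used by apply_rotary_pos_emb.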
# Adapted from https://github.com/facebookresearch/detectron2/blob/main/projects/PointRend/point_rend/point_features.py
def sample_point(
    input_features: torch.Tensor, point_coordinates: torch.Tensor, add_dim=False, **kwargs
) -> torch.Tensor:
    """
    A wrapper around `torch.nn.functional.grid_sample` to support 3D point_coordinates tensors.

    Args:
        input_features (`torch.Tensor` of shape (batch_size, channels, height, width)):
            A tensor that contains features map on a height * width grid
        point_coordinates (`torch.Tensor` of shape (batch_size, num_points, 2) or (batch_size, grid_height, grid_width,:
        2)):
            A tensor that contains [0, 1] * [0, 1] normalized point coordinates
        add_dim (`bool`):
            boolean value to keep track of added dimension

    Returns:
        point_features (`torch.Tensor` of shape (batch_size, channels, num_points) or (batch_size, channels,
        height_grid, width_grid):
            A tensor that contains features for points in `point_coordinates`.
    """
    if point_coordinates.dim() == 3:
        add_dim = True
        point_coordinates = point_coordinates.unsqueeze(2)

    # use nn.function.grid_sample to get features for points in `point_coordinates` via bilinear interpolation
    point_features = torch.nn.functional.grid_sample(input_features, 2.0 * point_coordinates - 1.0, **kwargs)
    if add_dim:
        point_features = point_features.squeeze(3)

    return point_features


def pair_wise_dice_loss(inputs: Tensor, labels: Tensor) -> Tensor:
    """
    A pair wise version of the dice loss, see `dice_loss` for usage.

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).

    Returns:
        `torch.Tensor`: The computed loss between each pairs.
    """
    inputs = inputs.sigmoid().flatten(1)
    numerator = 2 * torch.matmul(inputs, labels.T)
    # using broadcasting to get a [num_queries, NUM_CLASSES] matrix
    denominator = inputs.sum(-1)[:, None] + labels.sum(-1)[None, :]
    loss = 1 - (numerator + 1) / (denominator + 1)
    return loss


def pair_wise_sigmoid_cross_entropy_loss(inputs: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    r"""
    A pair wise version of the cross entropy loss, see `sigmoid_cross_entropy_loss` for usage.

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).

    Returns:
        loss (`torch.Tensor`): The computed loss between each pairs.
    """

    height_and_width = inputs.shape[1]

    criterion = nn.BCEWithLogitsLoss(reduction="none")
    cross_entropy_loss_pos = criterion(inputs, torch.ones_like(inputs))
    cross_entropy_loss_neg = criterion(inputs, torch.zeros_like(inputs))

    loss_pos = torch.matmul(cross_entropy_loss_pos / height_and_width, labels.T)
    loss_neg = torch.matmul(cross_entropy_loss_neg / height_and_width, (1 - labels).T)
    loss = loss_pos + loss_neg
    return loss

# Adapted from https://github.com/facebookresearch/EomtDinov3/blob/main/eomt_dinov3/modeling/matcher.py
class EomtDinov3HungarianMatcher(nn.Module):
    """This class computes an assignment between the labels and the predictions of the network.

    For efficiency reasons, the labels don't include the no_object. Because of this, in general, there are more
    predictions than labels. In this case, we do a 1-to-1 matching of the best predictions, while the others are
    un-matched (and thus treated as non-objects).
    """

    def __init__(
        self, cost_class: float = 1.0, cost_mask: float = 1.0, cost_dice: float = 1.0, num_points: int = 12544
    ):
        """Creates the matcher

        Params:
            cost_class (`float`, *optional*, defaults to 1.0):
                Relative weight of the classification error in the matching cost.
            cost_mask (`float`, *optional*, defaults to 1.0):
                This is the relative weight of the focal loss of the binary mask in the matching cost.
            cost_dice (`float`, *optional*, defaults to 1.0):
                This is the relative weight of the dice loss of the binary mask in the matching cost.
            num_points (`int`, *optional*, defaults to 12544):
                No. of points to sample on which the mask loss will be calculated. The same set of K points are
                uniformly sampled for all prediction and ground truth masks to construct the cost matrix for bipartite
                matching.
        """
        super().__init__()
        if cost_class == 0 and cost_mask == 0 and cost_dice == 0:
            raise ValueError("All costs can't be 0")

        self.num_points = num_points
        self.cost_class = cost_class
        self.cost_mask = cost_mask
        self.cost_dice = cost_dice

    @torch.no_grad()
    def forward(
        self,
        masks_queries_logits: torch.Tensor,
        class_queries_logits: torch.Tensor,
        mask_labels: torch.Tensor,
        class_labels: torch.Tensor,
    ) -> list[tuple[Tensor]]:
        """
        Params:
            masks_queries_logits (`torch.Tensor`):
                A tensor of dim `batch_size, num_queries, num_labels` with the classification logits.
            class_queries_logits (`torch.Tensor`):
                A tensor of dim `batch_size, num_queries, height, width` with the predicted masks.
            class_labels (`torch.Tensor`):
                A tensor of dim `num_target_boxes` (where num_target_boxes is the number of ground-truth objects in the
                target) containing the class labels.
            mask_labels (`torch.Tensor`):
                A tensor of dim `num_target_boxes, height, width` containing the target masks.

        Returns:
            matched_indices (`list[tuple[Tensor]]`): A list of size batch_size, containing tuples of (index_i, index_j)
            where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected labels (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes).
        """
        indices: list[tuple[np.array]] = []

        # iterate through batch size
        batch_size = masks_queries_logits.shape[0]
        for i in range(batch_size):
            pred_probs = class_queries_logits[i].softmax(-1)
            pred_mask = masks_queries_logits[i]

            # Compute the classification cost. Contrary to the loss, we don't use the NLL, but approximate it in 1 - proba[target class]. The 1 is a constant that doesn't change the matching, it can be omitted.
            cost_class = -pred_probs[:, class_labels[i]]
            target_mask = mask_labels[i].to(pred_mask)
            target_mask = target_mask[:, None]
            pred_mask = pred_mask[:, None]

            # Sample ground truth and predicted masks
            point_coordinates = torch.rand(1, self.num_points, 2, device=pred_mask.device)

            target_coordinates = point_coordinates.repeat(target_mask.shape[0], 1, 1)
            target_mask = sample_point(target_mask, target_coordinates, align_corners=False).squeeze(1)

            pred_coordinates = point_coordinates.repeat(pred_mask.shape[0], 1, 1)
            pred_mask = sample_point(pred_mask, pred_coordinates, align_corners=False).squeeze(1)

            # compute the cross entropy loss between each mask pairs -> shape (num_queries, num_labels)
            cost_mask = pair_wise_sigmoid_cross_entropy_loss(pred_mask, target_mask)
            # Compute the dice loss between each mask pairs -> shape (num_queries, num_labels)
            cost_dice = pair_wise_dice_loss(pred_mask, target_mask)
            # final cost matrix
            cost_matrix = self.cost_mask * cost_mask + self.cost_class * cost_class + self.cost_dice * cost_dice
            # eliminate infinite values in cost_matrix to avoid the error ``ValueError: cost matrix is infeasible``
            cost_matrix = torch.minimum(cost_matrix, torch.tensor(1e10))
            cost_matrix = torch.maximum(cost_matrix, torch.tensor(-1e10))
            cost_matrix = torch.nan_to_num(cost_matrix, 0)
            # do the assignment using the hungarian algorithm in scipy
            assigned_indices: tuple[np.array] = linear_sum_assignment(cost_matrix.cpu())
            indices.append(assigned_indices)

        # It could be stacked in one tensor
        matched_indices = [
            (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices
        ]
        return matched_indices

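# [Annotation added in this diff report, not part of the generated file] For each image, the matcher above
# builds a (num_queries x num_targets) cost matrix from the class cost plus point-sampled sigmoid
# cross-entropy and dice costs (the same random points are used for predictions and targets), clamps it to
# +/-1e10 so scipy's linear_sum_assignment stays feasible, and returns the resulting one-to-one assignment.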
def dice_loss(inputs: Tensor, labels: Tensor, num_masks: int) -> Tensor:
    r"""
    Compute the DICE loss, similar to generalized IOU for masks as follows:

    $$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x \cap y }{x \cup y + 1}} $$

    In practice, since `labels` is a binary mask, (only 0s and 1s), dice can be computed as follow

    $$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x * y }{x + y + 1}} $$

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).
        num_masks (`int`):
            The number of masks present in the current batch, used for normalization.

    Returns:
        `torch.Tensor`: The computed loss.
    """
    probs = inputs.sigmoid().flatten(1)
    numerator = 2 * (probs * labels).sum(-1)
    denominator = probs.sum(-1) + labels.sum(-1)
    loss = 1 - (numerator + 1) / (denominator + 1)
    loss = loss.sum() / num_masks
    return loss


def sigmoid_cross_entropy_loss(inputs: torch.Tensor, labels: torch.Tensor, num_masks: int) -> torch.Tensor:
    r"""
    Args:
        inputs (`torch.Tensor`):
            A float tensor of arbitrary shape.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).

    Returns:
        loss (`torch.Tensor`): The computed loss.
    """
    criterion = nn.BCEWithLogitsLoss(reduction="none")
    cross_entropy_loss = criterion(inputs, labels)

    loss = cross_entropy_loss.mean(1).sum() / num_masks
    return loss

# Adapted from https://github.com/facebookresearch/EomtDinov3/blob/main/eomt_dinov3/modeling/criterion.py
class EomtDinov3Loss(nn.Module):
    def __init__(self, config: EomtDinov3Config, weight_dict: dict[str, float]):
        """
        The EomtDinov3 Loss. The loss is computed very similar to DETR. The process happens in two steps: 1) we
        compute hungarian assignment between ground truth masks and the outputs of the model 2) we supervise each pair
        of matched ground-truth / prediction (supervise class and mask)

        Args:
            config (`EomtDinov3Config`):
                The configuration for EomtDinov3 model also containing loss calculation specific parameters.
            weight_dict (`dict[str, float]`):
                A dictionary of weights to be applied to the different losses.
        """
        super().__init__()
        requires_backends(self, ["scipy"])
        self.num_labels = config.num_labels
        self.weight_dict = weight_dict

        # Weight to apply to the null class
        self.eos_coef = config.no_object_weight
        empty_weight = torch.ones(self.num_labels + 1)
        empty_weight[-1] = self.eos_coef
        self.register_buffer("empty_weight", empty_weight)

        # pointwise mask loss parameters
        self.num_points = config.train_num_points
        self.oversample_ratio = config.oversample_ratio
        self.importance_sample_ratio = config.importance_sample_ratio

        self.matcher = EomtDinov3HungarianMatcher(
            cost_class=config.class_weight,
            cost_dice=config.dice_weight,
            cost_mask=config.mask_weight,
            num_points=self.num_points,
        )

    def _max_by_axis(self, sizes: list[list[int]]) -> list[int]:
        maxes = sizes[0]
        for sublist in sizes[1:]:
            for index, item in enumerate(sublist):
                maxes[index] = max(maxes[index], item)
        return maxes

    # Adapted from nested_tensor_from_tensor_list() in original implementation
    def _pad_images_to_max_in_batch(self, tensors: list[Tensor]) -> tuple[Tensor, Tensor]:
        # get the maximum size in the batch
        max_size = self._max_by_axis([list(tensor.shape) for tensor in tensors])
        # compute final size
        batch_shape = [len(tensors)] + max_size
        batch_size, _, height, width = batch_shape
        dtype = tensors[0].dtype
        device = tensors[0].device
        padded_tensors = torch.zeros(batch_shape, dtype=dtype, device=device)
        padding_masks = torch.ones((batch_size, height, width), dtype=torch.bool, device=device)
        # pad the tensors to the size of the biggest one
        for tensor, padded_tensor, padding_mask in zip(tensors, padded_tensors, padding_masks):
            padded_tensor[: tensor.shape[0], : tensor.shape[1], : tensor.shape[2]].copy_(tensor)
            padding_mask[: tensor.shape[1], : tensor.shape[2]] = False

        return padded_tensors, padding_masks

    def loss_labels(
        self, class_queries_logits: Tensor, class_labels: list[Tensor], indices: tuple[np.array]
    ) -> dict[str, Tensor]:
        """Compute the losses related to the labels using cross entropy.

        Args:
            class_queries_logits (`torch.Tensor`):
                A tensor of shape `batch_size, num_queries, num_labels`
            class_labels (`list[torch.Tensor]`):
                List of class labels of shape `(labels)`.
            indices (`tuple[np.array])`:
                The indices computed by the Hungarian matcher.

        Returns:
            `dict[str, Tensor]`: A dict of `torch.Tensor` containing the following key:
            - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
        """
        pred_logits = class_queries_logits
        batch_size, num_queries, _ = pred_logits.shape
        criterion = nn.CrossEntropyLoss(weight=self.empty_weight)
        idx = self._get_predictions_permutation_indices(indices)  # shape of (batch_size, num_queries)
        target_classes_o = torch.cat(
            [target[j] for target, (_, j) in zip(class_labels, indices)]
        )  # shape of (batch_size, num_queries)
        target_classes = torch.full(
            (batch_size, num_queries), fill_value=self.num_labels, dtype=torch.int64, device=pred_logits.device
        )
        target_classes[idx] = target_classes_o
        # Permute target_classes (batch_size, num_queries, num_labels) -> (batch_size, num_labels, num_queries)
        pred_logits_transposed = pred_logits.transpose(1, 2)
        loss_ce = criterion(pred_logits_transposed, target_classes)
        losses = {"loss_cross_entropy": loss_ce}
        return losses

    def loss_masks(
        self,
        masks_queries_logits: torch.Tensor,
        mask_labels: list[torch.Tensor],
        indices: tuple[np.array],
        num_masks: int,
    ) -> dict[str, torch.Tensor]:
        """Compute the losses related to the masks using sigmoid_cross_entropy_loss and dice loss.

        Args:
            masks_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, height, width)`.
            mask_labels (`torch.Tensor`):
                List of mask labels of shape `(labels, height, width)`.
            indices (`tuple[np.array])`:
                The indices computed by the Hungarian matcher.
            num_masks (`int)`:
                The number of masks, used for normalization.

        Returns:
            losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing two keys:
            - **loss_mask** -- The loss computed using sigmoid cross entropy loss on the predicted and ground truth.
              masks.
            - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth,
              masks.
        """
        src_idx = self._get_predictions_permutation_indices(indices)
        tgt_idx = self._get_targets_permutation_indices(indices)
        # shape (batch_size * num_queries, height, width)
        pred_masks = masks_queries_logits[src_idx]
        # shape (batch_size, num_queries, height, width)
        # pad all and stack the targets to the num_labels dimension
        target_masks, _ = self._pad_images_to_max_in_batch(mask_labels)
        target_masks = target_masks[tgt_idx]

        # No need to upsample predictions as we are using normalized coordinates
        pred_masks = pred_masks[:, None]
        target_masks = target_masks[:, None]

        # Sample point coordinates
        with torch.no_grad():
            point_coordinates = self.sample_points_using_uncertainty(
                pred_masks,
                lambda logits: self.calculate_uncertainty(logits),
                self.num_points,
                self.oversample_ratio,
                self.importance_sample_ratio,
            )

            point_labels = sample_point(target_masks, point_coordinates, align_corners=False).squeeze(1)

        point_logits = sample_point(pred_masks, point_coordinates, align_corners=False).squeeze(1)

        losses = {
            "loss_mask": sigmoid_cross_entropy_loss(point_logits, point_labels, num_masks),
            "loss_dice": dice_loss(point_logits, point_labels, num_masks),
        }

        del pred_masks
        del target_masks
        return losses

    def _get_predictions_permutation_indices(self, indices):
        # Permute predictions following indices
        batch_indices = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
        predictions_indices = torch.cat([src for (src, _) in indices])
        return batch_indices, predictions_indices

    def _get_targets_permutation_indices(self, indices):
        # Permute labels following indices
        batch_indices = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
        target_indices = torch.cat([tgt for (_, tgt) in indices])
        return batch_indices, target_indices

    def calculate_uncertainty(self, logits: torch.Tensor) -> torch.Tensor:
        """
        In EomtDinov3 paper, uncertainty is estimated as L1 distance between 0.0 and the logit prediction in 'logits'
        for the foreground class in `classes`.

        Args:
            logits (`torch.Tensor`):
                A tensor of shape (R, 1, ...) for class-specific or class-agnostic, where R is the total number of predicted masks in all images and C is:
                the number of foreground classes. The values are logits.

        Returns:
            scores (`torch.Tensor`): A tensor of shape (R, 1, ...) that contains uncertainty scores with the most
            uncertain locations having the highest uncertainty score.
        """
        uncertainty_scores = -(torch.abs(logits))
        return uncertainty_scores

    def sample_points_using_uncertainty(
        self,
        logits: torch.Tensor,
        uncertainty_function,
        num_points: int,
        oversample_ratio: int,
        importance_sample_ratio: float,
    ) -> torch.Tensor:
        """
        This function is meant for sampling points in [0, 1] * [0, 1] coordinate space based on their uncertainty. The
        uncertainty is calculated for each point using the passed `uncertainty function` that takes points logit
        prediction as input.

        Args:
            logits (`float`):
                Logit predictions for P points.
            uncertainty_function:
                A function that takes logit predictions for P points and returns their uncertainties.
            num_points (`int`):
                The number of points P to sample.
            oversample_ratio (`int`):
                Oversampling parameter.
            importance_sample_ratio (`float`):
                Ratio of points that are sampled via importance sampling.

        Returns:
            point_coordinates (`torch.Tensor`):
                Coordinates for P sampled points.
        """

        num_boxes = logits.shape[0]
        num_points_sampled = int(num_points * oversample_ratio)

        # Get random point coordinates
        point_coordinates = torch.rand(num_boxes, num_points_sampled, 2, device=logits.device)
        # Get sampled prediction value for the point coordinates
        point_logits = sample_point(logits, point_coordinates, align_corners=False)
        # Calculate the uncertainties based on the sampled prediction values of the points
        point_uncertainties = uncertainty_function(point_logits)

        num_uncertain_points = int(importance_sample_ratio * num_points)
        num_random_points = num_points - num_uncertain_points

        idx = torch.topk(point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1]
        shift = num_points_sampled * torch.arange(num_boxes, dtype=torch.long, device=logits.device)
        idx += shift[:, None]
        point_coordinates = point_coordinates.view(-1, 2)[idx.view(-1), :].view(num_boxes, num_uncertain_points, 2)

        if num_random_points > 0:
            point_coordinates = torch.cat(
                [point_coordinates, torch.rand(num_boxes, num_random_points, 2, device=logits.device)],
                dim=1,
            )
        return point_coordinates

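    # [Annotation added in this diff report, not part of the generated file] sample_points_using_uncertainty
    # above follows the PointRend-style scheme: it draws num_points * oversample_ratio random points, keeps
    # the importance_sample_ratio fraction whose logits are closest to zero (highest uncertainty), and fills
    # the remainder with fresh uniform points, so the mask losses are evaluated on a budget of num_points.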
    def forward(
        self,
        masks_queries_logits: torch.Tensor,
        class_queries_logits: torch.Tensor,
        mask_labels: list[torch.Tensor],
        class_labels: list[torch.Tensor],
        auxiliary_predictions: dict[str, torch.Tensor] | None = None,
    ) -> dict[str, torch.Tensor]:
        """
        This performs the loss computation.

        Args:
            masks_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, height, width)`.
            class_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, num_labels)`.
            mask_labels (`torch.Tensor`):
                List of mask labels of shape `(labels, height, width)`.
            class_labels (`list[torch.Tensor]`):
                List of class labels of shape `(labels)`.
            auxiliary_predictions (`dict[str, torch.Tensor]`, *optional*):
                if `use_auxiliary_loss` was set to `true` in [`EomtDinov3Config`], then it contains the logits from
                the inner layers of the EomtDinov3MaskedAttentionDecoder.

        Returns:
            losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing three keys:
            - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
            - **loss_mask** -- The loss computed using sigmoid cross_entropy loss on the predicted and ground truth
              masks.
            - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth
              masks.
            if `use_auxiliary_loss` was set to `true` in [`EomtDinov3Config`], the dictionary contains additional
            losses for each auxiliary predictions.
        """

        # retrieve the matching between the outputs of the last layer and the labels
        indices = self.matcher(masks_queries_logits, class_queries_logits, mask_labels, class_labels)
        # compute the average number of target masks for normalization purposes
        num_masks = self.get_num_masks(class_labels, device=class_labels[0].device)
        # get all the losses
        losses: dict[str, Tensor] = {
            **self.loss_masks(masks_queries_logits, mask_labels, indices, num_masks),
            **self.loss_labels(class_queries_logits, class_labels, indices),
        }
        # in case of auxiliary losses, we repeat this process with the output of each intermediate layer.
        if auxiliary_predictions is not None:
            for idx, aux_outputs in enumerate(auxiliary_predictions):
                masks_queries_logits = aux_outputs["masks_queries_logits"]
                class_queries_logits = aux_outputs["class_queries_logits"]
                loss_dict = self.forward(masks_queries_logits, class_queries_logits, mask_labels, class_labels)
                loss_dict = {f"{key}_{idx}": value for key, value in loss_dict.items()}
                losses.update(loss_dict)

        return losses

    def get_num_masks(self, class_labels: torch.Tensor, device: torch.device) -> torch.Tensor:
        """
        Computes the average number of target masks across the batch, for normalization purposes.
        """
        num_masks = sum(len(classes) for classes in class_labels)
        num_masks = torch.as_tensor(num_masks, dtype=torch.float, device=device)
        world_size = 1
        if is_accelerate_available():
            if PartialState._shared_state != {}:
                num_masks = reduce(num_masks)
                world_size = PartialState().num_processes

        num_masks = torch.clamp(num_masks / world_size, min=1)
        return num_masks

@dataclass
|
|
1025
|
+
@auto_docstring(
|
|
1026
|
+
custom_intro="""
|
|
1027
|
+
Class for outputs of [`EomtDinov3ForUniversalSegmentationOutput`].
|
|
1028
|
+
|
|
1029
|
+
This output can be directly passed to [`~EomtDinov3ImageProcessor.post_process_semantic_segmentation`] or
|
|
1030
|
+
[`~EomtDinov3ImageProcessor.post_process_instance_segmentation`] or
|
|
1031
|
+
[`~EomtDinov3ImageProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please, see
|
|
1032
|
+
[`~EomtDinov3ImageProcessor] for details regarding usage.
|
|
1033
|
+
"""
|
|
1034
|
+
)
|
|
1035
|
+
class EomtDinov3ForUniversalSegmentationOutput(ModelOutput):
|
|
1036
|
+
r"""
|
|
1037
|
+
loss (`torch.Tensor`, *optional*):
|
|
1038
|
+
The computed loss, returned when labels are present.
|
|
1039
|
+
class_queries_logits (`torch.FloatTensor`):
|
|
1040
|
+
A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each
|
|
1041
|
+
query. Note the `+ 1` is needed because we incorporate the null class.
|
|
1042
|
+
masks_queries_logits (`torch.FloatTensor`):
|
|
1043
|
+
A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
|
|
1044
|
+
query.
|
|
1045
|
+
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
|
1046
|
+
Last hidden states (final feature map) of the last layer.
|
|
1047
|
+
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
|
1048
|
+
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
|
|
1049
|
+
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states all layers of the model.
|
|
1050
|
+
attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
|
1051
|
+
Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
|
1052
|
+
sequence_length)`. Self and Cross Attentions weights from transformer decoder.
|
|
1053
|
+
patch_offsets (`list[torch.Tensor]`, *optional*):
|
|
1054
|
+
list of tuples indicating the image index and start and end positions of patches for semantic segmentation.
|
|
1055
|
+
"""

    loss: torch.FloatTensor | None = None
    class_queries_logits: torch.FloatTensor | None = None
    masks_queries_logits: torch.FloatTensor | None = None
    last_hidden_state: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor] | None = None
    attentions: tuple[torch.FloatTensor] | None = None
    patch_offsets: list[torch.Tensor] | None = None
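    # Usage sketch (illustrative only; the checkpoint id and image are placeholders, and the
    # post-processing call is assumed to follow the usual transformers signature):
    #     processor = EomtDinov3ImageProcessor.from_pretrained("<checkpoint>")
    #     model = EomtDinov3ForUniversalSegmentation.from_pretrained("<checkpoint>")
    #     inputs = processor(images=image, return_tensors="pt")
    #     outputs = model(**inputs)
    #     semantic_map = processor.post_process_semantic_segmentation(
    #         outputs, target_sizes=[(image.height, image.width)]
    #     )[0]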


@auto_docstring
class EomtDinov3PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config: EomtDinov3Config
    base_model_prefix = "eomt_dinov3"
    main_input_name = "pixel_values"
    input_modalities = ("image",)
    supports_gradient_checkpointing = False
    _no_split_modules = ["EomtDinov3Layer"]
    _supports_sdpa = True
    _can_record_outputs = {
        "hidden_states": EomtDinov3Layer,
        "attentions": EomtDinov3Attention,
    }
    config_class = EomtDinov3Config

    @torch.no_grad()
    def _init_weights(self, module: nn.Module) -> None:
        super()._init_weights(module)
        std = self.config.initializer_range
        if isinstance(module, EomtDinov3LayerScale):
            if hasattr(module, "lambda1"):
                init.constant_(module.lambda1, self.config.layerscale_value)
        elif isinstance(module, EomtDinov3Embeddings):
            init.trunc_normal_(module.cls_token, mean=0.0, std=std)
            init.zeros_(module.register_tokens)
        elif isinstance(module, EomtDinov3Loss):
            empty_weight = torch.ones(module.num_labels + 1)
            empty_weight[-1] = module.eos_coef
            init.copy_(module.empty_weight, empty_weight)
        elif isinstance(module, EomtDinov3ForUniversalSegmentation):
            init.ones_(module.attn_mask_probs)
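        # Worked example (illustrative, not part of the model): for `EomtDinov3Loss` with
        # num_labels=3 and eos_coef=0.1, the copied class-weight vector is
        #     torch.ones(3 + 1); empty_weight[-1] = 0.1  ->  tensor([1.0, 1.0, 1.0, 0.1])
        # i.e. the extra "no object" class is down-weighted relative to the real classes.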


class EomtDinov3LayerNorm2d(nn.LayerNorm):
    def __init__(self, num_channels, eps=1e-6, affine=True):
        super().__init__(num_channels, eps=eps, elementwise_affine=affine)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        hidden_state = hidden_state.permute(0, 2, 3, 1)
        hidden_state = F.layer_norm(hidden_state, self.normalized_shape, self.weight, self.bias, self.eps)
        hidden_state = hidden_state.permute(0, 3, 1, 2)
        return hidden_state
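    # Shape sketch (illustrative only): the two permutes make `F.layer_norm` act over the channel
    # dimension of an NCHW feature map, e.g.
    #     EomtDinov3LayerNorm2d(8)(torch.randn(2, 8, 4, 4)).shape == (2, 8, 4, 4)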


class EomtDinov3ScaleLayer(nn.Module):
    def __init__(self, config: EomtDinov3Config):
        super().__init__()
        hidden_size = config.hidden_size
        self.conv1 = nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2)
        self.activation = ACT2FN[config.hidden_act]
        self.conv2 = nn.Conv2d(
            hidden_size,
            hidden_size,
            kernel_size=3,
            padding=1,
            groups=hidden_size,
            bias=False,
        )

        self.layernorm2d = EomtDinov3LayerNorm2d(hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.conv1(hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.conv2(hidden_states)
        hidden_states = self.layernorm2d(hidden_states)
        return hidden_states


class EomtDinov3ScaleBlock(nn.Module):
    def __init__(self, config: EomtDinov3Config):
        super().__init__()
        self.num_blocks = config.num_upscale_blocks
        self.block = nn.ModuleList([EomtDinov3ScaleLayer(config) for _ in range(self.num_blocks)])

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        for block in self.block:
            hidden_states = block(hidden_states)
        return hidden_states
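    # Illustrative note: each `EomtDinov3ScaleLayer` doubles the spatial resolution via its
    # stride-2 transposed convolution, so stacking `num_upscale_blocks` layers turns a
    # (batch, hidden, H, W) map into (batch, hidden, H * 2**num_upscale_blocks, W * 2**num_upscale_blocks),
    # e.g. a 4x upscale for two blocks.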


class EomtDinov3MaskHead(nn.Module):
    def __init__(self, config: EomtDinov3Config):
        super().__init__()

        hidden_size = config.hidden_size
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.activation = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.activation(self.fc1(hidden_states))
        hidden_states = self.activation(self.fc2(hidden_states))
        hidden_states = self.fc3(hidden_states)
        return hidden_states


@auto_docstring(
    custom_intro="""
    The EoMT-DINOv3 model with head on top for instance/semantic/panoptic segmentation.
    """,
)
class EomtDinov3ForUniversalSegmentation(EomtDinov3PreTrainedModel):
    main_input_name = "pixel_values"

    def __init__(self, config: EomtDinov3Config):
        super().__init__(config)
        self.config = config
        self.num_hidden_layers = config.num_hidden_layers
        self.embeddings = EomtDinov3Embeddings(config)
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.query = nn.Embedding(config.num_queries, config.hidden_size)
        self.layers = nn.ModuleList([EomtDinov3Layer(config) for _ in range(config.num_hidden_layers)])

        self.upscale_block = EomtDinov3ScaleBlock(config)
        self.mask_head = EomtDinov3MaskHead(config)

        self.class_predictor = nn.Linear(config.hidden_size, config.num_labels + 1)

        self.grid_size = (config.image_size // config.patch_size, config.image_size // config.patch_size)
        self.weight_dict: dict[str, float] = {
            "loss_cross_entropy": config.class_weight,
            "loss_mask": config.mask_weight,
            "loss_dice": config.dice_weight,
        }

        self.criterion = EomtDinov3Loss(config=config, weight_dict=self.weight_dict)

        self.register_buffer("attn_mask_probs", torch.ones(config.num_blocks))

        self.num_prefix_tokens = 1 + config.num_register_tokens
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.embeddings.register_parameter("mask_token", None)

        self.rope_embeddings = EomtDinov3RotaryEmbedding(config)

        self.post_init()
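        # Note (editorial, based on the code below): `attn_mask_probs` is a buffer of ones with one
        # probability per masked-attention block; `_disable_attention_mask` uses it to randomly let
        # some query tokens attend to all encoder tokens when the probability drops below 1.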

    def get_loss_dict(
        self,
        masks_queries_logits: Tensor,
        class_queries_logits: Tensor,
        mask_labels: Tensor,
        class_labels: Tensor,
        auxiliary_predictions: dict[str, Tensor],
    ) -> dict[str, Tensor]:
        loss_dict: dict[str, Tensor] = self.criterion(
            masks_queries_logits=masks_queries_logits,
            class_queries_logits=class_queries_logits,
            mask_labels=mask_labels,
            class_labels=class_labels,
            auxiliary_predictions=auxiliary_predictions,
        )

        # weight each loss by `self.weight_dict[<LOSS_NAME>]` including auxiliary losses
        for key, weight in self.weight_dict.items():
            for loss_key, loss in loss_dict.items():
                if key in loss_key:
                    loss *= weight

        return loss_dict

    def get_loss(self, loss_dict: dict[str, Tensor]) -> Tensor:
        return sum(loss_dict.values())
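    # Illustrative sketch (not part of the model): because the weighting above matches on
    # `key in loss_key`, auxiliary entries such as "loss_mask_0" are scaled by the same weight as
    # "loss_mask" (the scaling happens in place via `loss *= weight`), and `get_loss` then sums
    # every weighted entry:
    #     loss_dict = {"loss_mask": m, "loss_dice": d, "loss_cross_entropy": c, ...}
    #     total = sum(loss_dict.values())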

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        pixel_values: Tensor,
        mask_labels: list[Tensor] | None = None,
        class_labels: list[Tensor] | None = None,
        patch_offsets: list[Tensor] | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> EomtDinov3ForUniversalSegmentationOutput:
        r"""
        mask_labels (`list[torch.Tensor]`, *optional*):
            List of mask labels of shape `(num_labels, height, width)` to be fed to a model.
        class_labels (`list[torch.LongTensor]`, *optional*):
            List of target class labels of shape `(num_labels,)` to be fed to a model. They identify the
            labels of `mask_labels`, e.g. the label of `mask_labels[i][j]` is `class_labels[i][j]`.
        patch_offsets (`list[torch.Tensor]`, *optional*):
            List of tuples indicating the image index and the start and end positions of patches for semantic
            segmentation.
        """
        masks_queries_logits_per_layer, class_queries_logits_per_layer = (), ()

        hidden_states = self.dropout(self.embeddings(pixel_values))
        position_embeddings = self.rope_embeddings(pixel_values.to(hidden_states.dtype))
        attention_mask = None

        for idx, layer_module in enumerate(self.layers):
            if idx == self.num_hidden_layers - self.config.num_blocks:
                query = self.query.weight[None, :, :].expand(hidden_states.shape[0], -1, -1).to(hidden_states.device)
                hidden_states = torch.cat((query, hidden_states), dim=1)

            if idx >= self.num_hidden_layers - self.config.num_blocks and (
                self.training or self.attn_mask_probs[idx - self.num_hidden_layers + self.config.num_blocks] > 0
            ):
                norm_hidden_states = self.layernorm(hidden_states)
                masks_queries_logits, class_queries_logits = self.predict(norm_hidden_states)

                masks_queries_logits_per_layer += (masks_queries_logits,)
                class_queries_logits_per_layer += (class_queries_logits,)

                attention_mask = torch.ones(
                    hidden_states.shape[0],
                    hidden_states.shape[1],
                    hidden_states.shape[1],
                    device=hidden_states.device,
                    dtype=torch.bool,
                )

                interpolated_logits = F.interpolate(masks_queries_logits, size=self.grid_size, mode="bilinear")
                interpolated_logits = interpolated_logits.view(
                    interpolated_logits.size(0), interpolated_logits.size(1), -1
                )

                num_query_tokens = self.config.num_queries
                encoder_start_tokens = num_query_tokens + self.num_prefix_tokens

                # Set attention mask for queries to focus on encoder tokens based on interpolated logits
                attention_mask[:, :num_query_tokens, encoder_start_tokens:] = interpolated_logits > 0

                # Disable attention mask for random query tokens.
                attention_mask = self._disable_attention_mask(
                    attention_mask,
                    prob=self.attn_mask_probs[idx - self.num_hidden_layers + self.config.num_blocks],
                    num_query_tokens=num_query_tokens,
                    encoder_start_tokens=encoder_start_tokens,
                    device=attention_mask.device,
                )

                # Expand attention mask to 4d mask.
                attention_mask = attention_mask[:, None, ...].expand(-1, self.config.num_attention_heads, -1, -1)
                dtype_min = torch.finfo(hidden_states.dtype).min
                attention_mask = attention_mask.to(hidden_states.dtype).masked_fill(~attention_mask, dtype_min)

            hidden_states = layer_module(
                hidden_states,
                attention_mask=attention_mask,
                position_embeddings=position_embeddings,
            )

        sequence_output = self.layernorm(hidden_states)

        masks_queries_logits, class_queries_logits = self.predict(sequence_output)
        masks_queries_logits_per_layer += (masks_queries_logits,)
        class_queries_logits_per_layer += (class_queries_logits,)

        loss = None
        if mask_labels is not None and class_labels is not None:
            loss = 0.0
            for masks_queries_logits, class_queries_logits in zip(
                masks_queries_logits_per_layer, class_queries_logits_per_layer
            ):
                loss_dict = self.get_loss_dict(
                    masks_queries_logits=masks_queries_logits,
                    class_queries_logits=class_queries_logits,
                    mask_labels=mask_labels,
                    class_labels=class_labels,
                    auxiliary_predictions=None,
                )
                loss += self.get_loss(loss_dict)

        return EomtDinov3ForUniversalSegmentationOutput(
            loss=loss,
            masks_queries_logits=masks_queries_logits,
            class_queries_logits=class_queries_logits,
            last_hidden_state=sequence_output,
            patch_offsets=patch_offsets,
        )
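    # Usage sketch (illustrative only; the checkpoint id is a placeholder): a training-style call
    # that supplies labels so the summed per-layer loss is returned.
    #     model = EomtDinov3ForUniversalSegmentation.from_pretrained("<checkpoint>")
    #     outputs = model(pixel_values=pixel_values, mask_labels=mask_labels, class_labels=class_labels)
    #     outputs.loss.backward()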

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    def predict(self, logits: torch.Tensor):
        query_tokens = logits[:, : self.config.num_queries, :]
        class_logits = self.class_predictor(query_tokens)

        prefix_tokens = logits[:, self.config.num_queries + self.embeddings.num_prefix_tokens :, :]
        prefix_tokens = prefix_tokens.transpose(1, 2)

        prefix_tokens = prefix_tokens.reshape(prefix_tokens.shape[0], -1, *self.grid_size)

        query_tokens = self.mask_head(query_tokens)
        prefix_tokens = self.upscale_block(prefix_tokens)

        mask_logits = torch.einsum("bqc, bchw -> bqhw", query_tokens, prefix_tokens)

        return mask_logits, class_logits
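    # Shape sketch (illustrative only): with `logits` of shape (batch, seq_len, hidden), the split
    # gives query_tokens (batch, num_queries, hidden) and prefix_tokens reshaped to
    # (batch, hidden, grid_h, grid_w); after upscaling, the einsum "bqc, bchw -> bqhw" yields
    # mask_logits of shape (batch, num_queries, grid_h * 2**num_upscale_blocks, grid_w * 2**num_upscale_blocks).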

    @staticmethod
    def _disable_attention_mask(attn_mask, prob, num_query_tokens, encoder_start_tokens, device):
        if prob < 1:
            # Generate random queries to disable based on the probs
            random_queries = torch.rand(attn_mask.shape[0], num_query_tokens, device=device) > prob

            # Disable attention to the query tokens, considering the prefix tokens
            attn_mask[:, :num_query_tokens, encoder_start_tokens:][random_queries] = 1

        return attn_mask


__all__ = ["EomtDinov3PreTrainedModel", "EomtDinov3ForUniversalSegmentation"]