PyPI - optimum-rbln - Versions diffs - 0.9.3rc0__py3-none-any.whl → 0.9.5a4__py3-none-any.whl - Mend

optimum-rbln 0.9.3rc0__py3-none-any.whl → 0.9.5a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (157)

optimum/rbln/transformers/models/gemma3/modeling_gemma3.py CHANGED Viewed

@@ -99,9 +99,7 @@ class RBLNGemma3ForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMix
         return True
     @classmethod
-    def get_pytorch_model(cls, *args, **kwargs):
-        model = super().get_pytorch_model(*args, **kwargs)
+    def _reconstruct_model_if_needed(cls, model: "PreTrainedModel"):
         with no_init_weights():
             model_cls_name = model.model.language_model.__class__.__name__
             causal_model_cls_name = model_cls_name.replace("TextModel", "ForCausalLM")
@@ -135,7 +133,7 @@ class RBLNGemma3ForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMix
         return self.language_model.get_input_embeddings()
     @classmethod
-    def wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNModelConfig):
+    def _wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNModelConfig):
         return model.multi_modal_projector
     @classmethod
@@ -301,28 +299,60 @@ class RBLNGemma3ForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMix
         generate_idx: Optional[torch.Tensor] = None,
         padded_cache_lengths: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
         **lm_kwargs: Dict[str, Any],
     ) -> Union[Tuple, RBLNDecoderOnlyOutput]:
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.rbln_config.language_model.output_hidden_states
+        )
+        if output_hidden_states != self.rbln_config.language_model.output_hidden_states:
+            raise ValueError(
+                f"Variable output_hidden_states {output_hidden_states} is not equal to rbln_config.language_model.output_hidden_states {self.rbln_config.language_model.output_hidden_states} "
+                f"Please compile again with the correct argument."
+            )
         # prefill
         if cache_position is None:
             logits = []
             inputs_embeds = self._preprocess_prefill(input_ids, inputs_embeds, pixel_values)
             batch_size = inputs_embeds.shape[0]
+            all_hidden_states = (
+                tuple(
+                    torch.zeros(
+                        batch_size,
+                        inputs_embeds.shape[1],
+                        self.config.text_config.hidden_size,
+                        dtype=self.rbln_config.dtype,
+                    )
+                    for _ in range(self.config.text_config.num_hidden_layers + 1)
+                )
+                if self.rbln_config.language_model.output_hidden_states
+                else None
+            )
             for b_idx in range(batch_size):
                 cache_position = torch.arange(0, generate_idx[b_idx].item(), dtype=torch.int32).unsqueeze(0)
                 token_type_id = token_type_ids[b_idx : b_idx + 1, attention_mask[b_idx].bool()]
                 cache_position = self.get_padded_cache_position(cache_position, token_type_id)
-                output = self.language_model.prefill_decoder(
+                outputs = self.language_model.prefill_decoder(
                     inputs_embeds=inputs_embeds[b_idx : b_idx + 1],
                     attention_mask=attention_mask[b_idx],
                     cache_position=cache_position,
                     batch_idx=b_idx,
                     token_type_ids=token_type_ids[b_idx : b_idx + 1],  # do not pass token_type_id
                 )
-                padded_cache_lengths[b_idx] += output.padded_cache_lengths
-                logits.append(output.logits)
+                padded_cache_lengths[b_idx] += outputs.padded_cache_lengths
+                logits.append(outputs.logits)
+                if self.rbln_config.language_model.output_hidden_states:
+                    for l_idx in range(self.config.text_config.num_hidden_layers + 1):
+                        mask_indices = torch.nonzero(attention_mask[b_idx], as_tuple=True)[0]
+                        all_hidden_states[l_idx][b_idx].index_copy_(
+                            dim=0, index=mask_indices, source=outputs.hidden_states[l_idx][0]
+                        )
             logits = torch.cat(logits, dim=0)
         # decoder
@@ -336,15 +366,20 @@ class RBLNGemma3ForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMix
                     f"Please run your model with one of these batch sizes or add support for batch size {batch_size}."
                 )
-            logits = self.language_model.decoders[batch_size](
+            outputs = self.language_model.decoders[batch_size](
                 input_ids=input_ids,
                 inputs_embeds=inputs_embeds,
                 cache_position=cache_position,
                 position_ids=position_ids if self.rbln_config.language_model.use_position_ids else None,
-            ).logits
+            )
+            logits = outputs.logits
+            all_hidden_states = outputs.hidden_states
         return RBLNDecoderOnlyOutput(
-            logits=logits, generate_idx=generate_idx, padded_cache_lengths=padded_cache_lengths
+            logits=logits,
+            generate_idx=generate_idx,
+            padded_cache_lengths=padded_cache_lengths,
+            hidden_states=all_hidden_states,
         )
@@ -405,26 +440,6 @@ class RBLNGemma3ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
             )
         return embed_tokens
-    @classmethod
-    def _update_sliding_window_config(cls, model_config: PretrainedConfig, rbln_config: RBLNGemma3ForCausalLMConfig):
-        sliding_window = getattr(model_config, "sliding_window", None)
-        sliding_window_pattern = getattr(model_config, "sliding_window_pattern", None)
-        if sliding_window_pattern is None:
-            if hasattr(model_config, "layer_types"):
-                first_full_attention_index = model_config.layer_types.index("full_attention")
-                sliding_window_pattern = first_full_attention_index + 1
-            else:
-                raise ValueError("Cannot determine sliding_window_pattern from model_config")
-        if sliding_window_pattern <= model_config.num_hidden_layers:
-            rbln_config.cache_impl = "hybrid"
-            rbln_config.sliding_window = sliding_window
-            rbln_config.sliding_window_layers = [
-                i for i in range(model_config.num_hidden_layers) if (i + 1) % sliding_window_pattern > 0
-            ]
-        return rbln_config
     @classmethod
     def _update_submodule_config(
         cls,
@@ -482,7 +497,7 @@ class RBLNGemma3ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
     @classmethod
     @torch.inference_mode()
     def get_compiled_model(cls, model: "PreTrainedModel", rbln_config: RBLNGemma3ForCausalLMConfig):
-        wrapped_model = cls.wrap_model_if_needed(model, rbln_config)
+        wrapped_model = cls._wrap_model_if_needed(model, rbln_config)
         rbln_compile_configs = rbln_config.compile_cfgs
         prefill_compile_config = rbln_compile_configs[0]

optimum/rbln/transformers/models/gpt2/gpt2_architecture.py CHANGED Viewed

@@ -20,8 +20,6 @@ import torch.nn as nn
 from ..decoderonly.decoderonly_architecture import (
     DecoderOnlyAttention,
-    DecoderOnlyLayer,
-    DecoderOnlyModel,
     DecoderOnlyWrapper,
 )
@@ -34,12 +32,6 @@ class GPT2Wrapper(DecoderOnlyWrapper):
     def get_rbln_attn_class(self):
         return GPT2Attention
-    def get_rbln_layer_class(self):
-        return GPT2Layer
-    def get_rbln_model_class(self):
-        return GPT2Model
     def get_attn_layer(self, layer: nn.Module):
         return layer.attn
@@ -50,30 +42,12 @@ class GPT2Wrapper(DecoderOnlyWrapper):
         return model.transformer.h if self.is_causal_lm else model.h
-class GPT2Model(DecoderOnlyModel):
-    def get_last_layernorm(self) -> nn.LayerNorm:
-        return self._original_mod.ln_f
-    def get_embedding(self) -> nn.Embedding:
-        return self._original_mod.wte
-    def get_pos_embedding(self) -> nn.Embedding:
-        return self._original_mod.wpe
-class GPT2Layer(DecoderOnlyLayer):
-    def get_pre_attention_layernorm(self) -> nn.LayerNorm:
-        return self._original_mod.ln_1
-    def get_post_attention_layernorm(self) -> nn.LayerNorm:
-        return self._original_mod.ln_2
 class GPT2Attention(DecoderOnlyAttention):
-    def __post_init__(self):
-        self.c_attn = self._original_mod.c_attn
-        self.o_proj = self._original_mod.c_proj
-        self.split_size = self._original_mod.split_size
+    def __post_init__(self, self_attn):
+        self.c_attn = self_attn.c_attn
+        self.o_proj = self_attn.c_proj
+        self.split_size = self_attn.split_size
+        self.num_key_value_heads = self_attn.num_heads
     def projection(self, hidden_states, lora_int_id) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         if lora_int_id is not None:
@@ -82,12 +56,12 @@ class GPT2Attention(DecoderOnlyAttention):
         query_states, key_states, value_states = self.c_attn(hidden_states).split(self.split_size, dim=2)
         return query_states, key_states, value_states
-    def get_attn_scale(self):
+    def get_attn_scale(self, self_attn):
         scale = 1.0
-        if self._original_mod.scale_attn_weights:
+        if self_attn.scale_attn_weights:
             scale /= math.sqrt(self.head_dim)
-        if self._original_mod.scale_attn_by_inverse_layer_idx:
+        if self_attn.scale_attn_by_inverse_layer_idx:
             scale /= 1 + self.layer_idx
         return scale

optimum/rbln/transformers/models/gpt_oss/__init__.py ADDED Viewed

@@ -0,0 +1,16 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .configuration_gpt_oss import RBLNGptOssForCausalLMConfig
+from .modeling_gpt_oss import RBLNGptOssForCausalLM

optimum/rbln/transformers/models/gpt_oss/configuration_gpt_oss.py ADDED Viewed

@@ -0,0 +1,41 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ..decoderonly.configuration_decoderonly import RBLNDecoderOnlyModelForCausalLMConfig
+class RBLNGptOssForCausalLMConfig(RBLNDecoderOnlyModelForCausalLMConfig):
+    """
+    Configuration class for RBLN GPT-OSS models.
+    This class is an alias of RBLNDecoderOnlyModelForCausalLMConfig.
+    Example usage:
+    ```python
+    from optimum.rbln import RBLNGptOssForCausalLM, RBLNGptOssForCausalLMConfig
+    # Create a configuration object
+    config = RBLNGptOssForCausalLMConfig(
+        batch_size=1,
+        tensor_parallel_size=4
+    )
+    # Use the configuration with from_pretrained
+    model = RBLNGptOssForCausalLM.from_pretrained(
+        "openai/gpt-oss-20b",
+        export=True,
+        rbln_config=config
+    )
+    ```
+    """

optimum/rbln/transformers/models/gpt_oss/gpt_oss_architecture.py ADDED Viewed

@@ -0,0 +1,122 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional
+import torch
+import torch.nn.functional as F
+from torch import nn
+from ..decoderonly.configuration_decoderonly import RBLNLoRAConfig
+from ..decoderonly.decoderonly_architecture import (
+    DecoderOnlyAttention,
+    DecoderOnlyLayer,
+    DecoderOnlyWrapper,
+)
+class RBLNGptOssWrapper(DecoderOnlyWrapper):
+    def get_rbln_layer_class(self):
+        return RBLNGptOssLayer
+class RBLNGptOssLayer(DecoderOnlyLayer):
+    def __init__(self, layer, self_attn: DecoderOnlyAttention, lora_config: Optional[RBLNLoRAConfig] = None):
+        super().__init__(layer, self_attn, lora_config)
+        self.mlp = RBLNGptOssMLP(layer.mlp)
+    def get_mlp(self) -> nn.Module:
+        return self.mlp
+class RBLNGptOssTopKRouter(nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.weight = model.weight
+        self.bias = model.bias
+    def forward(self, hidden_states):
+        return F.linear(hidden_states, self.weight, self.bias)  # (seq_len, num_experts)
+class RBLNGptOssExperts(nn.Module):
+    def __init__(self, model, top_k: Optional[int] = None):
+        super().__init__()
+        self.intermediate_size = model.intermediate_size
+        self.num_experts = model.num_experts
+        self.hidden_size = model.hidden_size
+        self.register_buffer(
+            "gate_proj_blocks",
+            model.gate_up_proj_blocks.data[:, ::2, :, :].reshape(self.num_experts, self.intermediate_size, -1),
+        )
+        self.register_buffer("gate_proj_scales", model.gate_up_proj_scales.data[:, ::2, :])
+        self.register_buffer(
+            "gate_proj_bias",
+            model.gate_up_proj_bias.data[:, ::2].reshape(self.num_experts, self.intermediate_size),
+        )
+        self.register_buffer(
+            "up_proj_blocks",
+            model.gate_up_proj_blocks.data[:, 1::2, :, :].reshape(self.num_experts, self.intermediate_size, -1),
+        )
+        self.register_buffer("up_proj_scales", model.gate_up_proj_scales.data[:, 1::2, :])
+        self.register_buffer(
+            "up_proj_bias", model.gate_up_proj_bias.data[:, 1::2].reshape(self.num_experts, self.intermediate_size)
+        )
+        self.register_buffer(
+            "down_proj_blocks", model.down_proj_blocks.data.reshape(self.num_experts, self.hidden_size, -1)
+        )
+        self.register_buffer("down_proj_scales", model.down_proj_scales.data)
+        self.register_buffer("down_proj_bias", model.down_proj_bias.data)
+        self.alpha = model.alpha  # 1.702
+        self.limit = model.limit  # 7.0
+        self.top_k = top_k
+    def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor) -> torch.Tensor:
+        return torch.ops.rbln_custom_ops.custom_moe_glu_mxfp4(
+            hidden_states,
+            self.gate_proj_blocks,
+            self.gate_proj_scales,
+            self.gate_proj_bias,
+            self.up_proj_blocks,
+            self.up_proj_scales,
+            self.up_proj_bias,
+            self.down_proj_blocks,
+            self.down_proj_scales,
+            self.down_proj_bias,
+            router_logits,
+            torch.tensor(self.alpha, dtype=hidden_states.dtype),
+            torch.tensor(self.limit, dtype=hidden_states.dtype),
+            k=self.top_k,
+            post_norm=True,
+        )
+class RBLNGptOssMLP(nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.router = RBLNGptOssTopKRouter(model.router)
+        self.experts = RBLNGptOssExperts(model.experts, top_k=model.router.top_k)
+    def forward(self, hidden_states):
+        batch_size, sequence_length, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        router_logits = self.router(hidden_states)
+        routed_out = self.experts(hidden_states, router_logits=router_logits)
+        routed_out = routed_out.reshape(batch_size, sequence_length, hidden_dim)
+        return routed_out

optimum/rbln/transformers/models/gpt_oss/modeling_gpt_oss.py ADDED Viewed

@@ -0,0 +1,165 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING, Optional, Union
+import torch
+from safetensors.torch import load_file
+from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig
+from transformers.integrations.mxfp4 import Mxfp4GptOssExperts
+from transformers.modeling_utils import PreTrainedModel, no_init_weights
+from ....utils.logging import get_logger
+from ...models.decoderonly import (
+    RBLNDecoderOnlyModelConfig,
+    RBLNDecoderOnlyModelForCausalLM,
+    RBLNDecoderOnlyModelForCausalLMConfig,
+)
+from ...utils.rbln_quantization import load_weight_files
+from .gpt_oss_architecture import RBLNGptOssWrapper
+if TYPE_CHECKING:
+    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PreTrainedModel
+logger = get_logger(__name__)
+class RBLNGptOssForCausalLM(RBLNDecoderOnlyModelForCausalLM):
+    """
+    The GPT-OSS Model transformer with a language modeling head (linear layer) on top.
+    This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the library implements for all its models.
+    A class to convert and run pre-trained transformers based GPT-OSSForCausalLM model on RBLN devices.
+    It implements the methods to convert a pre-trained transformers GPT-OSSForCausalLM model into a RBLN transformer model by:
+    - transferring the checkpoint weights of the original into an optimized RBLN graph,
+    - compiling the resulting graph using the RBLN compiler.
+    **Configuration:**
+    This model uses [`RBLNGptOssForCausalLMConfig`] for configuration. When calling methods like `from_pretrained` or `from_model`,
+    the `rbln_config` parameter should be an instance of [`RBLNGptOssForCausalLMConfig`] or a dictionary conforming to its structure.
+    See the [`RBLNGptOssForCausalLMConfig`] class for all available configuration options.
+    Examples:
+        ```python
+        from optimum.rbln import RBLNGptOssForCausalLM
+        # Simple usage using rbln_* arguments
+        # `max_seq_len` is automatically inferred from the model config
+        model = RBLNGptOssForCausalLM.from_pretrained(
+            "openai/gpt-oss-20b",
+            export=True,
+            rbln_batch_size=1,
+            rbln_tensor_parallel_size=4,
+        )
+        # Using a config dictionary
+        rbln_config = {
+            "batch_size": 1,
+            "tensor_parallel_size": 4,
+        }
+        model = RBLNGptOssForCausalLM.from_pretrained(
+            "openai/gpt-oss-20b",
+            export=True,
+            rbln_config=rbln_config
+        )
+        # Using a RBLNGptOssForCausalLMConfig instance (recommended for type checking)
+        from optimum.rbln import RBLNGptOssForCausalLMConfig
+        config = RBLNGptOssForCausalLMConfig(
+            batch_size=1,
+            tensor_parallel_size=4
+        )
+        model = RBLNGptOssForCausalLM.from_pretrained(
+            "openai/gpt-oss-20b",
+            export=True,
+            rbln_config=config
+        )
+        ```
+    """
+    _decoder_wrapper_cls = RBLNGptOssWrapper
+    @staticmethod
+    def _get_dtype(dtype: Union[str, torch.dtype] = None, torch_dtype: Union[str, torch.dtype] = None):
+        # For BC on torch_dtype argument
+        if torch_dtype is not None:
+            logger.warning_once("`torch_dtype` is deprecated! Use `dtype` instead!")
+            # If both kwargs are provided, use `dtype`
+            dtype = dtype if dtype is not None else torch_dtype
+        # As mxfp4_quantizer's default dtype
+        if dtype is None or dtype == "auto":
+            dtype = torch.bfloat16
+        return dtype
+    @classmethod
+    def get_pytorch_model(
+        cls,
+        model_id: str,
+        *args,
+        rbln_config: Optional[RBLNDecoderOnlyModelConfig] = None,
+        dtype: Union[str, torch.dtype] = None,
+        torch_dtype: Union[str, torch.dtype] = None,
+        config: Optional[PretrainedConfig] = None,
+        **kwargs,
+    ) -> PreTrainedModel:
+        safetensor_files = load_weight_files(model_id, exception_keywords=["original"])
+        state_dict = {k: v for f in safetensor_files for k, v in load_file(f).items()}
+        if config is None:
+            config, kwargs = AutoConfig.from_pretrained(model_id, return_unused_kwargs=True)
+        dtype = cls._get_dtype(dtype, torch_dtype)
+        with no_init_weights():
+            model = AutoModelForCausalLM.from_config(config, dtype=dtype, **kwargs)
+        _replace_with_mxfp4_linear(model, config)
+        model.load_state_dict(state_dict, strict=False)
+        return model
+    @classmethod
+    def _update_rbln_config(
+        cls,
+        preprocessors: Optional[Union["AutoFeatureExtractor", "AutoProcessor", "AutoTokenizer"]] = None,
+        model: Optional["PreTrainedModel"] = None,
+        model_config: Optional["PretrainedConfig"] = None,
+        rbln_config: Optional[RBLNDecoderOnlyModelForCausalLMConfig] = None,
+    ) -> RBLNDecoderOnlyModelForCausalLMConfig:
+        rbln_config = super()._update_rbln_config(preprocessors, model, model_config, rbln_config)
+        if rbln_config.use_attention_mask:
+            raise ValueError(
+                "use_attention_mask is not supported for GPT-OSS because custom attention does not support attention sink for masked attention"
+            )
+        return rbln_config
+def _replace_with_mxfp4_linear(
+    model,
+    config,
+):
+    for name, module in model.named_children():
+        if module.__class__.__name__ == "GptOssExperts":
+            model._modules[name] = Mxfp4GptOssExperts(config)
+        if len(list(module.children())) > 0:
+            _replace_with_mxfp4_linear(module, config)

optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py CHANGED Viewed

@@ -50,11 +50,14 @@ class RBLNGroundingDinoForObjectDetectionConfig(RBLNImageModelConfig):
         Raises:
             ValueError: If batch_size is not a positive integer.
         """
-        super().__init__(**kwargs)
-        self.encoder = encoder
-        self.decoder = decoder
-        self.text_backbone = text_backbone
-        self.backbone = backbone
+        super().__init__(batch_size=batch_size, **kwargs)
+        self.encoder = self.initialize_submodule_config(submodule_config=encoder, batch_size=self.batch_size)
+        self.decoder = self.initialize_submodule_config(submodule_config=decoder, batch_size=self.batch_size)
+        self.text_backbone = self.initialize_submodule_config(
+            submodule_config=text_backbone, batch_size=self.batch_size
+        )
+        self.backbone = self.initialize_submodule_config(submodule_config=backbone, batch_size=self.batch_size)
         self.output_attentions = output_attentions if output_attentions is not None else False
         self.output_hidden_states = output_hidden_states if output_hidden_states is not None else False

optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py CHANGED Viewed

@@ -150,7 +150,7 @@ class _GroundingDinoEncoder(torch.nn.Module):
         all_attn_fused_vision = () if output_attentions else None
         all_attn_enhanced_text = () if output_attentions else None
         all_attn_deformable = () if output_attentions else None
-        for i, encoder_layer in enumerate(self.layers):
+        for _, encoder_layer in enumerate(self.layers):
             if output_hidden_states:
                 encoder_vision_states += (vision_features,)
                 encoder_text_states += (text_features,)
@@ -509,10 +509,12 @@ class _GroundingDinoBiMultiHeadAttention(torch.nn.Module):
         # mask vision for language
         if vision_attention_mask is not None:
-            # RBLN FIX: bool tensor to float tensor
-            mask = vision_attention_mask * torch.finfo(torch.float16).min
-            text_attn_weights = text_attn_weights.transpose(1, 2) + mask
-            text_attn_weights = text_attn_weights.transpose(1, 2)
+            # RBLN FIX: bool tensor to float tensor, broadcast across heads and src_len
+            mask = vision_attention_mask
+            if mask.dim() == 3:
+                mask = mask[..., 0]
+            mask = mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1)
+            text_attn_weights = text_attn_weights + mask * torch.finfo(text_attn_weights.dtype).min
         text_attn_weights = text_attn_weights.softmax(dim=-1)

optimum-rbln 0.9.3rc0__py3-none-any.whl → 0.9.5a4__py3-none-any.whl

optimum-rbln 0.9.3rc0__py3-none-any.whl → 0.9.5a4__py3-none-any.whl