optimum-rbln 0.8.1a0__py3-none-any.whl → 0.8.1a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130)
  1. optimum/rbln/__init__.py +2 -0
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +53 -33
  4. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +9 -2
  5. optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +4 -2
  6. optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +9 -2
  7. optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +4 -2
  8. optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +9 -2
  9. optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +9 -2
  10. optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +33 -9
  11. optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +30 -12
  12. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +22 -6
  13. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +16 -6
  14. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +16 -6
  15. optimum/rbln/diffusers/modeling_diffusers.py +16 -26
  16. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +11 -0
  17. optimum/rbln/diffusers/models/autoencoders/vae.py +1 -8
  18. optimum/rbln/diffusers/models/autoencoders/vq_model.py +11 -0
  19. optimum/rbln/diffusers/models/controlnet.py +13 -7
  20. optimum/rbln/diffusers/models/transformers/prior_transformer.py +10 -0
  21. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +2 -0
  22. optimum/rbln/diffusers/models/unets/unet_2d_condition.py +7 -0
  23. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +1 -4
  24. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +7 -0
  25. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +7 -0
  26. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +7 -0
  27. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +7 -0
  28. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +7 -0
  29. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +48 -27
  30. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +7 -0
  31. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpaint.py +7 -0
  32. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -0
  33. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +7 -0
  34. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +7 -0
  35. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +7 -0
  36. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +7 -0
  37. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +7 -0
  38. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +7 -0
  39. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +7 -0
  40. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +7 -0
  41. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +7 -0
  42. optimum/rbln/modeling.py +33 -35
  43. optimum/rbln/modeling_base.py +45 -107
  44. optimum/rbln/transformers/__init__.py +39 -47
  45. optimum/rbln/transformers/configuration_generic.py +16 -13
  46. optimum/rbln/transformers/modeling_generic.py +18 -19
  47. optimum/rbln/transformers/modeling_rope_utils.py +5 -2
  48. optimum/rbln/transformers/models/__init__.py +46 -4
  49. optimum/rbln/transformers/models/audio_spectrogram_transformer/__init__.py +17 -0
  50. optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +21 -0
  51. optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +28 -0
  52. optimum/rbln/transformers/models/auto/auto_factory.py +35 -12
  53. optimum/rbln/transformers/models/bart/bart_architecture.py +14 -1
  54. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +35 -4
  55. optimum/rbln/transformers/models/clip/configuration_clip.py +3 -3
  56. optimum/rbln/transformers/models/clip/modeling_clip.py +11 -12
  57. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +111 -14
  58. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +102 -35
  59. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +229 -175
  60. optimum/rbln/transformers/models/distilbert/__init__.py +19 -0
  61. optimum/rbln/transformers/models/distilbert/configuration_distilbert.py +19 -0
  62. optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +19 -0
  63. optimum/rbln/transformers/models/exaone/configuration_exaone.py +24 -1
  64. optimum/rbln/transformers/models/exaone/exaone_architecture.py +5 -1
  65. optimum/rbln/transformers/models/exaone/modeling_exaone.py +66 -5
  66. optimum/rbln/transformers/models/gemma/configuration_gemma.py +24 -1
  67. optimum/rbln/transformers/models/gemma/gemma_architecture.py +5 -1
  68. optimum/rbln/transformers/models/gemma/modeling_gemma.py +49 -0
  69. optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +3 -3
  70. optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +18 -250
  71. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +106 -236
  72. optimum/rbln/transformers/models/gpt2/configuration_gpt2.py +4 -1
  73. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +6 -1
  74. optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +12 -2
  75. optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +41 -4
  76. optimum/rbln/transformers/models/llama/configuration_llama.py +24 -1
  77. optimum/rbln/transformers/models/llama/modeling_llama.py +49 -0
  78. optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +2 -2
  79. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +32 -4
  80. optimum/rbln/transformers/models/midm/configuration_midm.py +24 -1
  81. optimum/rbln/transformers/models/midm/midm_architecture.py +6 -1
  82. optimum/rbln/transformers/models/midm/modeling_midm.py +66 -5
  83. optimum/rbln/transformers/models/mistral/configuration_mistral.py +24 -1
  84. optimum/rbln/transformers/models/mistral/modeling_mistral.py +62 -4
  85. optimum/rbln/transformers/models/opt/configuration_opt.py +4 -1
  86. optimum/rbln/transformers/models/opt/modeling_opt.py +10 -0
  87. optimum/rbln/transformers/models/opt/opt_architecture.py +7 -1
  88. optimum/rbln/transformers/models/phi/configuration_phi.py +24 -1
  89. optimum/rbln/transformers/models/phi/modeling_phi.py +49 -0
  90. optimum/rbln/transformers/models/phi/phi_architecture.py +1 -1
  91. optimum/rbln/transformers/models/qwen2/configuration_qwen2.py +24 -1
  92. optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +67 -4
  93. optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +15 -3
  94. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +58 -27
  95. optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +47 -2
  96. optimum/rbln/transformers/models/resnet/__init__.py +23 -0
  97. optimum/rbln/transformers/models/resnet/configuration_resnet.py +20 -0
  98. optimum/rbln/transformers/models/resnet/modeling_resnet.py +22 -0
  99. optimum/rbln/transformers/models/roberta/__init__.py +24 -0
  100. optimum/rbln/transformers/{configuration_alias.py → models/roberta/configuration_roberta.py} +4 -30
  101. optimum/rbln/transformers/{modeling_alias.py → models/roberta/modeling_roberta.py} +2 -32
  102. optimum/rbln/transformers/models/seq2seq/__init__.py +1 -1
  103. optimum/rbln/transformers/models/seq2seq/{configuration_seq2seq2.py → configuration_seq2seq.py} +2 -2
  104. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +1 -1
  105. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +41 -3
  106. optimum/rbln/transformers/models/siglip/configuration_siglip.py +3 -0
  107. optimum/rbln/transformers/models/siglip/modeling_siglip.py +62 -21
  108. optimum/rbln/transformers/models/t5/modeling_t5.py +46 -4
  109. optimum/rbln/transformers/models/t5/t5_architecture.py +5 -1
  110. optimum/rbln/transformers/models/{time_series_transformers → time_series_transformer}/__init__.py +1 -1
  111. optimum/rbln/transformers/models/{time_series_transformers → time_series_transformer}/configuration_time_series_transformer.py +2 -2
  112. optimum/rbln/transformers/models/{time_series_transformers/modeling_time_series_transformers.py → time_series_transformer/modeling_time_series_transformer.py} +14 -9
  113. optimum/rbln/transformers/models/vit/__init__.py +19 -0
  114. optimum/rbln/transformers/models/vit/configuration_vit.py +19 -0
  115. optimum/rbln/transformers/models/vit/modeling_vit.py +19 -0
  116. optimum/rbln/transformers/models/wav2vec2/__init__.py +1 -1
  117. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +1 -1
  118. optimum/rbln/transformers/models/whisper/configuration_whisper.py +3 -1
  119. optimum/rbln/transformers/models/whisper/modeling_whisper.py +35 -15
  120. optimum/rbln/transformers/models/xlm_roberta/__init__.py +16 -2
  121. optimum/rbln/transformers/models/xlm_roberta/configuration_xlm_roberta.py +15 -2
  122. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +12 -3
  123. optimum/rbln/utils/model_utils.py +20 -0
  124. optimum/rbln/utils/submodule.py +6 -8
  125. {optimum_rbln-0.8.1a0.dist-info → optimum_rbln-0.8.1a2.dist-info}/METADATA +2 -2
  126. {optimum_rbln-0.8.1a0.dist-info → optimum_rbln-0.8.1a2.dist-info}/RECORD +130 -117
  127. /optimum/rbln/transformers/models/{time_series_transformers → time_series_transformer}/time_series_transformers_architecture.py +0 -0
  128. /optimum/rbln/transformers/models/wav2vec2/{configuration_wav2vec.py → configuration_wav2vec2.py} +0 -0
  129. {optimum_rbln-0.8.1a0.dist-info → optimum_rbln-0.8.1a2.dist-info}/WHEEL +0 -0
  130. {optimum_rbln-0.8.1a0.dist-info → optimum_rbln-0.8.1a2.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/qwen2/modeling_qwen2.py

@@ -12,8 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from transformers import PretrainedConfig
+
 from ....utils import logging
-from ...models.decoderonly import RBLNDecoderOnlyModelForCausalLM
+from ...models.decoderonly import RBLNDecoderOnlyModelForCausalLM, RBLNDecoderOnlyModelForCausalLMConfig
 from .qwen2_architecture import QWEN2Wrapper
 
 
@@ -22,13 +24,74 @@ logger = logging.get_logger(__name__)
 
 class RBLNQwen2ForCausalLM(RBLNDecoderOnlyModelForCausalLM):
     """
-    The Llama Model transformer with a language modeling head (linear layer) on top.
+    The Qwen2 Model transformer with a language modeling head (linear layer) on top.
     This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the library implements for all its models.
 
-    A class to convert and run pre-trained transformers based LlamaForCausalLM model on RBLN devices.
-    It implements the methods to convert a pre-trained transformers LlamaForCausalLM model into a RBLN transformer model by:
+    A class to convert and run pre-trained transformers based Qwen2ForCausalLM model on RBLN devices.
+    It implements the methods to convert a pre-trained transformers Qwen2ForCausalLM model into a RBLN transformer model by:
     - transferring the checkpoint weights of the original into an optimized RBLN graph,
     - compiling the resulting graph using the RBLN compiler.
+
+    **Configuration:**
+    This model uses [`RBLNQwen2ForCausalLMConfig`] for configuration. When calling methods like `from_pretrained` or `from_model`,
+    the `rbln_config` parameter should be an instance of [`RBLNQwen2ForCausalLMConfig`] or a dictionary conforming to its structure.
+
+    See the [`RBLNQwen2ForCausalLMConfig`] class for all available configuration options.
+
+    Examples:
+        ```python
+        from optimum.rbln import RBLNQwen2ForCausalLM
+
+        # Simple usage using rbln_* arguments
+        # `max_seq_len` is automatically inferred from the model config
+        model = RBLNQwen2ForCausalLM.from_pretrained(
+            "Qwen/Qwen2-7B-Instruct",
+            export=True,
+            rbln_batch_size=1,
+            rbln_tensor_parallel_size=4,
+        )
+
+
+        # Using a config dictionary
+        rbln_config = {
+            "batch_size": 1,
+            "max_seq_len": 4096,
+            "tensor_parallel_size": 4,
+        }
+        model = RBLNQwen2ForCausalLM.from_pretrained(
+            "Qwen/Qwen2-7B-Instruct",
+            export=True,
+            rbln_config=rbln_config
+        )
+
+
+        # Using a RBLNQwen2ForCausalLMConfig instance (recommended for type checking)
+        from optimum.rbln import RBLNQwen2ForCausalLMConfig
+
+        config = RBLNQwen2ForCausalLMConfig(
+            batch_size=1,
+            max_seq_len=4096,
+            tensor_parallel_size=4
+        )
+        model = RBLNQwen2ForCausalLM.from_pretrained(
+            "Qwen/Qwen2-7B-Instruct",
+            export=True,
+            rbln_config=config
+        )
+        ```
     """
 
     _decoder_wrapper_cls = QWEN2Wrapper
+
+    @classmethod
+    def _update_sliding_window_config(
+        cls, model_config: PretrainedConfig, rbln_config: RBLNDecoderOnlyModelForCausalLMConfig
+    ):
+        # https://github.com/huggingface/transformers/issues/35896
+        # There seems to be a bug in transformers(v4.52.4). Therefore, similar to when attn_implementation is eager,
+        # we set all layers to use sliding window in this version. This should be updated once the bug is fixed.
+
+        rbln_config.cache_impl = "sliding_window"
+        rbln_config.sliding_window = model_config.sliding_window
+        rbln_config.sliding_window_layers = list(range(model_config.num_hidden_layers))
+        return rbln_config
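The new `_update_sliding_window_config` hook routes every Qwen2 layer through the sliding-window cache path as a workaround for the linked transformers issue. A minimal sketch of what the hook computes, assuming a stock `Qwen2Config` and a `SimpleNamespace` stand-in for the RBLN config object (the real one is `RBLNDecoderOnlyModelForCausalLMConfig`):

```python
from types import SimpleNamespace

from transformers import Qwen2Config

model_config = Qwen2Config()     # defaults include sliding_window=4096
rbln_config = SimpleNamespace()  # stand-in for RBLNDecoderOnlyModelForCausalLMConfig

# The same three assignments the hook performs:
rbln_config.cache_impl = "sliding_window"
rbln_config.sliding_window = model_config.sliding_window
rbln_config.sliding_window_layers = list(range(model_config.num_hidden_layers))

# Every layer index is marked sliding-window, mirroring the eager-attention behavior.
print(rbln_config.sliding_window, len(rbln_config.sliding_window_layers))
```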
optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 from ....configuration_utils import RBLNModelConfig
 from ..decoderonly.configuration_decoderonly import RBLNDecoderOnlyModelForCausalLMConfig
@@ -25,7 +25,7 @@ class RBLNQwen2_5_VLForConditionalGenerationConfig(RBLNDecoderOnlyModelForCausal
         self,
         visual: Optional[RBLNModelConfig] = None,
         use_inputs_embeds: bool = True,
-        **kwargs,
+        **kwargs: Dict[str, Any],
     ):
         super().__init__(use_inputs_embeds=use_inputs_embeds, **kwargs)
         if not self.use_inputs_embeds:
@@ -37,7 +37,7 @@ class RBLNQwen2_5_VLForConditionalGenerationConfig(RBLNDecoderOnlyModelForCausal
 
 
 class RBLNQwen2_5_VisionTransformerPretrainedModelConfig(RBLNModelConfig):
-    def __init__(self, max_seq_lens: Union[int, List[int]] = None, **kwargs):
+    def __init__(self, max_seq_lens: Union[int, List[int]] = None, **kwargs: Dict[str, Any]):
         """
         Args:
             max_seq_lens (Optional[Union[int, List[int]]]): Maximum sequence lengths for Vision
@@ -54,6 +54,18 @@ class RBLNQwen2_5_VisionTransformerPretrainedModelConfig(RBLNModelConfig):
 
         Raises:
             ValueError: If batch_size is not a positive integer.
+
+        Max Seq Lens:
+            Since `Qwen2_5_VLForConditionalGeneration` performs inference on a per-image or per-frame basis,
+            `max_seq_lens` should be set based on the maximum expected resolution of the input images or video frames,
+            according to the following guidelines:
+
+            1. **Minimum Value**: `max_seq_lens` must be greater than or equal to the number of patches generated from the input image.
+               For example, a 224x224 image with a patch size of 14 results in (224 / 14) * (224 / 14) = 256 patches.
+               Therefore, `max_seq_lens` must be at least 256.
+            2. **Alignment Requirement**: `max_seq_lens` must be a multiple of `(window_size / patch_size)^2` due to the requirements
+               of the window-based attention mechanism. For instance, if `window_size` is 112 and `patch_size` is 14, then
+               `(112 / 14)^2 = 64`, meaning valid values for `max_seq_lens` include 64, 128, 192, 256, etc.
         """
         super().__init__(**kwargs)
 
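The two rules in the new docstring can be checked mechanically. A small sketch that computes the smallest valid `max_seq_lens` for a given image resolution; the helper `min_max_seq_len` is illustrative, not part of `optimum.rbln`:

```python
import math

def min_max_seq_len(height: int, width: int, patch_size: int = 14, window_size: int = 112) -> int:
    """Smallest valid max_seq_lens for one image, per the two rules above."""
    num_patches = (height // patch_size) * (width // patch_size)  # rule 1: at least this many
    alignment = (window_size // patch_size) ** 2                  # rule 2: multiple of this
    return math.ceil(num_patches / alignment) * alignment

print(min_max_seq_len(224, 224))  # 256 patches, already a multiple of 64 -> 256
print(min_max_seq_len(280, 280))  # 400 patches, rounded up to the next multiple of 64 -> 448
```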
optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py

@@ -28,6 +28,7 @@ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
     Qwen2_5_VisionPatchEmbed,
     Qwen2_5_VisionRotaryEmbedding,
     Qwen2_5_VisionTransformerPretrainedModel,
+    Qwen2_5_VLModel,
     Qwen2_5_VLRotaryEmbedding,
 )
 
@@ -37,6 +38,7 @@ from ....utils.logging import get_logger
 from ..decoderonly.modeling_decoderonly import RBLNDecoderOnlyModelForCausalLM, RBLNDecoderOnlyOutput
 from .configuration_qwen2_5_vl import (
     RBLNQwen2_5_VisionTransformerPretrainedModelConfig,
+    RBLNQwen2_5_VLForConditionalGenerationConfig,
 )
 from .qwen2_5_vl_architecture import Qwen2_5_VisionTransformerWrapper, Qwen2_5_VL_LanguageModelWrapper
 
@@ -338,6 +340,40 @@ class RBLNQwen2_5_VisionTransformerPretrainedModel(RBLNModel):
 
 
 class RBLNQwen2_5_VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
+    """
+    RBLNQwen2_5_VLForConditionalGeneration is a multi-modal model that integrates vision and language processing capabilities,
+    optimized for RBLN NPUs. It is designed for conditional generation tasks that involve both image and text inputs.
+
+    This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the library implements for all its models.
+
+    Important Note:
+        This model includes a Large Language Model (LLM). For optimal performance, it is highly recommended to use
+        tensor parallelism for the language model. This can be achieved by using the `rbln_config` parameter in the
+        `from_pretrained` method. Refer to the `from_pretrained` documentation and the RBLNQwen2_5_VLForConditionalGenerationConfig class for details.
+
+    Examples:
+        ```python
+        from optimum.rbln import RBLNQwen2_5_VLForConditionalGeneration
+
+        model = RBLNQwen2_5_VLForConditionalGeneration.from_pretrained(
+            "Qwen/Qwen2.5-VL-7B-Instruct",
+            export=True,
+            rbln_config={
+                "visual": {
+                    "max_seq_lens": 6400,
+                    "device": 0,
+                },
+                "tensor_parallel_size": 8,
+                "kvcache_partition_len": 16_384,
+                "max_seq_len": 114_688,
+                "device": [0, 1, 2, 3, 4, 5, 6, 7],
+            },
+        )
+
+        model.save_pretrained("compiled-qwen2.5-vl-7b-instruct")
+        ```
+    """
+
     auto_model_class = AutoModelForVision2Seq
     _rbln_submodules = [
         {"name": "visual"},
@@ -355,6 +391,14 @@ class RBLNQwen2_5_VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
     def can_generate(self):
         return True
 
+    @classmethod
+    def get_pytorch_model(cls, *args, **kwargs):
+        model = super().get_pytorch_model(*args, **kwargs)
+        model.model.lm_head = model.lm_head
+        model.lm_head = None
+        del model.lm_head
+        return model
+
     @classmethod
     def update_kwargs(cls, kwargs):
         kwargs.update(
@@ -369,33 +413,19 @@ class RBLNQwen2_5_VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
         cls,
         batch_size: int,
         query_length: int,
-        use_inputs_embeds: bool,
-        use_attention_mask: bool,
-        use_position_ids: bool,
-        max_seq_len: int,
-        kvcache_block_size: int,
-        kvcache_num_blocks: int,
-        num_key_value_heads: int,
-        num_hidden_layers: int,
-        hidden_size: int,
-        head_dim: int,
+        rbln_config: RBLNQwen2_5_VLForConditionalGenerationConfig,
+        model_config: PretrainedConfig,
     ):
-        input_info = super().get_input_info(
-            batch_size,
-            query_length,
-            use_inputs_embeds,
-            use_attention_mask,
-            use_position_ids,
-            max_seq_len,
-            kvcache_block_size,
-            kvcache_num_blocks,
-            num_key_value_heads,
-            num_hidden_layers,
-            hidden_size,
-            head_dim,
-        )
+        input_info = super().get_input_info(batch_size, query_length, rbln_config, model_config)
         pos_idx = 3
-        input_info.insert(pos_idx, ("position_emb", [2, batch_size, 1, query_length, head_dim], "float32"))
+        input_info.insert(
+            pos_idx,
+            (
+                "position_emb",
+                [2, batch_size, 1, query_length, model_config.hidden_size // model_config.num_attention_heads],
+                "float32",
+            ),
+        )
 
         return input_info
 
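With this refactor, `head_dim` is derived inline as `model_config.hidden_size // model_config.num_attention_heads` instead of being threaded through the signature. A quick sketch of the resulting `position_emb` shape, using illustrative values in the range of Qwen2.5-VL-7B's text config (assumed, not read from the diff):

```python
# Illustrative values (assumptions, roughly matching Qwen2.5-VL-7B's text backbone):
hidden_size = 3584
num_attention_heads = 28
head_dim = hidden_size // num_attention_heads  # 128

batch_size, query_length = 1, 4096
# Shape of the extra "position_emb" input inserted at index 3 of input_info:
position_emb_shape = [2, batch_size, 1, query_length, head_dim]
print(position_emb_shape)  # [2, 1, 1, 4096, 128]
```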
@@ -510,7 +540,8 @@ class RBLNQwen2_5_VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
             vision_tokens = input_id[0][vision_start_indices + 1]
             image_nums = (vision_tokens == image_token_id).sum()
             video_nums = (vision_tokens == video_token_id).sum()
-            position_ids, rope_deltas = self.get_rope_index(
+            position_ids, rope_deltas = Qwen2_5_VLModel.get_rope_index(
+                self,
                 input_id,
                 image_grid_thw[image_idx : image_idx + image_nums] if image_grid_thw is not None else None,
                 video_grid_thw[video_idx : video_idx + video_nums] if video_grid_thw is not None else None,
@@ -595,7 +626,7 @@ class RBLNQwen2_5_VLForConditionalGeneration(RBLNDecoderOnlyModelForCausalLM):
                 )
                 logits.append(output.logits)
             logits = torch.cat(logits, dim=0)
-        # Decoder
+            # Decoder
         else:
             inputs_embeds, position_embed = self._preprocess_decoder(input_ids, cache_position)
             output = self.decoder(
optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py

@@ -3,8 +3,14 @@ from typing import Tuple
 
 import torch
 import torch.nn as nn
+from transformers import PreTrainedModel
 
 from ..decoderonly.decoderonly_architecture import (
+    DecoderOnlyAttention,
+    DecoderOnlyFlashAttention,
+    DecoderOnlyForCausalLM,
+    DecoderOnlyLayer,
+    DecoderOnlyModel,
     DecoderOnlyWrapper,
     apply_rotary_pos_emb,
 )
@@ -162,7 +168,8 @@ class Qwen2_5_VL_LanguageModelWrapper(DecoderOnlyWrapper):
         input_ids = None if self.use_inputs_embeds else args.pop(0)
         inputs_embeds = args.pop(0) if self.use_inputs_embeds else None
         cache_position = args.pop(0)
-        block_tables = args.pop(0)
+        global_block_tables = args.pop(0)
+        local_block_tables = None
         position_embeds = args.pop(0)
         query_position = args.pop(0) if self.phase == "prefill" else None
         position_ids = None
@@ -188,10 +195,48 @@ class Qwen2_5_VL_LanguageModelWrapper(DecoderOnlyWrapper):
             input_ids,
             inputs_embeds,
             cache_position,
-            block_tables,
+            global_block_tables,
+            local_block_tables,
             query_position,
             attention_mask,
             position_ids,
             past_key_values,
             position_embeds,
         )
+
+    def convert_to_rbln_causal_lm(self, causal_lm: PreTrainedModel, max_seq_len: int):
+        new_layers = []
+
+        for layer in causal_lm.model.language_model.layers:
+            if self.attn_impl == "eager":
+                new_self_attn = DecoderOnlyAttention(
+                    layer.self_attn,
+                    self.use_attention_mask,
+                    self.use_position_ids,
+                    kvcache_block_size=self.kvcache_block_size,
+                )
+            elif self.attn_impl == "flash_attn":
+                new_self_attn = DecoderOnlyFlashAttention(
+                    layer.self_attn,
+                    kvcache_partition_len=self.kvcache_partition_len,
+                    kvcache_block_size=self.kvcache_block_size,
+                    use_attention_mask=self.use_attention_mask,
+                    use_position_ids=self.use_position_ids,
+                )
+            else:
+                raise NotImplementedError(f"Unknown attn : {self.attn_impl}")
+
+            new_layer = DecoderOnlyLayer(layer, new_self_attn)
+            new_layers.append(new_layer)
+
+        new_model = DecoderOnlyModel(
+            causal_lm.model.language_model,
+            new_layers,
+            partition_len=self.kvcache_partition_len,
+            max_seq_len=max_seq_len,
+            kvcache_block_size=self.kvcache_block_size,
+            use_learned_pos_emb=self.use_learned_pos_emb,
+            sliding_window_layers=self.sliding_window_layers,
+        )
+        new_causal_lm = DecoderOnlyForCausalLM(causal_lm.model, new_model)
+        return new_causal_lm
optimum/rbln/transformers/models/resnet/__init__.py (new file)

@@ -0,0 +1,23 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from .configuration_resnet import RBLNResNetForImageClassificationConfig
+from .modeling_resnet import RBLNResNetForImageClassification
+
+
+__all__ = [
+    "RBLNResNetForImageClassificationConfig",
+    "RBLNResNetForImageClassification",
+]
optimum/rbln/transformers/models/resnet/configuration_resnet.py (new file)

@@ -0,0 +1,20 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ...configuration_generic import RBLNModelForImageClassificationConfig
+
+
+class RBLNResNetForImageClassificationConfig(RBLNModelForImageClassificationConfig):
+    ""
optimum/rbln/transformers/models/resnet/modeling_resnet.py (new file)

@@ -0,0 +1,22 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ...modeling_generic import RBLNModelForImageClassification
+
+
+class RBLNResNetForImageClassification(RBLNModelForImageClassification):
+    """
+    ResNet model for image classification tasks on RBLN NPU.
+    """
optimum/rbln/transformers/models/roberta/__init__.py (new file)

@@ -0,0 +1,24 @@
+# Copyright 2025 Rebellions Inc. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .configuration_roberta import RBLNRobertaForMaskedLMConfig, RBLNRobertaForSequenceClassificationConfig
+from .modeling_roberta import RBLNRobertaForMaskedLM, RBLNRobertaForSequenceClassification
+
+
+__all__ = [
+    "RBLNRobertaForMaskedLMConfig",
+    "RBLNRobertaForSequenceClassificationConfig",
+    "RBLNRobertaForMaskedLM",
+    "RBLNRobertaForSequenceClassification",
+]
optimum/rbln/transformers/configuration_alias.py → optimum/rbln/transformers/models/roberta/configuration_roberta.py (renamed)

@@ -12,38 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .configuration_generic import (
-    RBLNModelForAudioClassificationConfig,
-    RBLNModelForImageClassificationConfig,
-    RBLNModelForMaskedLMConfig,
-    RBLNModelForQuestionAnsweringConfig,
-    RBLNModelForSequenceClassificationConfig,
-)
-
-
-class RBLNASTForAudioClassificationConfig(RBLNModelForAudioClassificationConfig):
-    pass
-
-
-class RBLNDistilBertForQuestionAnsweringConfig(RBLNModelForQuestionAnsweringConfig):
-    pass
-
-
-class RBLNResNetForImageClassificationConfig(RBLNModelForImageClassificationConfig):
-    pass
-
-
-class RBLNXLMRobertaForSequenceClassificationConfig(RBLNModelForSequenceClassificationConfig):
-    pass
-
-
-class RBLNRobertaForSequenceClassificationConfig(RBLNModelForSequenceClassificationConfig):
-    pass
+from ...configuration_generic import RBLNModelForMaskedLMConfig, RBLNModelForSequenceClassificationConfig
 
 
 class RBLNRobertaForMaskedLMConfig(RBLNModelForMaskedLMConfig):
-    pass
+    ""
 
 
-class RBLNViTForImageClassificationConfig(RBLNModelForImageClassificationConfig):
-    pass
+class RBLNRobertaForSequenceClassificationConfig(RBLNModelForSequenceClassificationConfig):
+    ""
optimum/rbln/transformers/modeling_alias.py → optimum/rbln/transformers/models/roberta/modeling_roberta.py (renamed)

@@ -12,42 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from ..utils.logging import get_logger
-from .modeling_generic import (
-    RBLNModelForAudioClassification,
-    RBLNModelForImageClassification,
-    RBLNModelForMaskedLM,
-    RBLNModelForQuestionAnswering,
-    RBLNModelForSequenceClassification,
-)
+from ...modeling_generic import RBLNModelForMaskedLM, RBLNModelForSequenceClassification
 
 
-logger = get_logger()
-
-
-class RBLNASTForAudioClassification(RBLNModelForAudioClassification):
-    pass
-
-
-class RBLNDistilBertForQuestionAnswering(RBLNModelForQuestionAnswering):
-    rbln_model_input_names = ["input_ids", "attention_mask"]
-
-
-class RBLNResNetForImageClassification(RBLNModelForImageClassification):
-    pass
-
-
-class RBLNXLMRobertaForSequenceClassification(RBLNModelForSequenceClassification):
+class RBLNRobertaForMaskedLM(RBLNModelForMaskedLM):
     rbln_model_input_names = ["input_ids", "attention_mask"]
 
 
 class RBLNRobertaForSequenceClassification(RBLNModelForSequenceClassification):
     rbln_model_input_names = ["input_ids", "attention_mask"]
-
-
-class RBLNRobertaForMaskedLM(RBLNModelForMaskedLM):
-    rbln_model_input_names = ["input_ids", "attention_mask"]
-
-
-class RBLNViTForImageClassification(RBLNModelForImageClassification):
-    pass
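After this move the RoBERTa classes live in their own subpackage rather than in the catch-all `modeling_alias.py`. The import paths, assuming the top-level re-exports implied by the `__init__.py` changes in the file list:

```python
# Top-level import (assumption: still re-exported from optimum.rbln):
from optimum.rbln import RBLNRobertaForMaskedLM, RBLNRobertaForSequenceClassification

# New canonical module location after the move:
from optimum.rbln.transformers.models.roberta import RBLNRobertaForMaskedLM
```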
optimum/rbln/transformers/models/seq2seq/__init__.py

@@ -12,5 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .configuration_seq2seq2 import RBLNModelForSeq2SeqLMConfig
+from .configuration_seq2seq import RBLNModelForSeq2SeqLMConfig
 from .modeling_seq2seq import RBLNModelForSeq2SeqLM
optimum/rbln/transformers/models/seq2seq/configuration_seq2seq2.py → configuration_seq2seq.py (renamed)

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional
+from typing import Any, Dict, Optional
 
 import rebel
 
@@ -31,7 +31,7 @@ class RBLNModelForSeq2SeqLMConfig(RBLNModelConfig):
         dec_max_seq_len: Optional[int] = None,
         use_attention_mask: Optional[bool] = None,
         pad_token_id: Optional[int] = None,
-        **kwargs,
+        **kwargs: Dict[str, Any],
    ):
         """
         Args:
optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py

@@ -26,7 +26,7 @@ from ....configuration_utils import RBLNCompileConfig
 from ....modeling import RBLNModel
 from ....utils.logging import get_logger
 from ....utils.runtime_utils import RBLNPytorchRuntime
-from .configuration_seq2seq2 import RBLNModelForSeq2SeqLMConfig
+from .configuration_seq2seq import RBLNModelForSeq2SeqLMConfig
 
 
 logger = get_logger(__name__)
optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py

@@ -148,7 +148,8 @@ class Seq2SeqDecoderWrapper(nn.Module):
         new_layers = []
         for layer in model.get_decoder().layers:
             self_attn = Seq2SeqSelfAttention(layer.self_attn)
-            new_layers.append(Seq2SeqDecoderLayer(layer, self_attn))
+            cross_attn = Seq2SeqCrossAttention(layer.encoder_attn)
+            new_layers.append(Seq2SeqDecoderLayer(layer, self_attn, cross_attn))
 
         decoder_model = Seq2SeqDecoder(model.get_decoder(), new_layers)
         new_model = Seq2SeqForConditionalGeneration(model, decoder_model)
@@ -341,10 +342,11 @@ class Seq2SeqDecoderLayer(torch.nn.Module):
         self_attn (Seq2SeqSelfAttention): Modified self-attention layer optimized for RBLN
     """
 
-    def __init__(self, decoder_layer, self_attn):
+    def __init__(self, decoder_layer, self_attn, cross_attn):
         super().__init__()
         self._original_mod = decoder_layer
         self.self_attn = self_attn
+        self.cross_attn = cross_attn
         self.__post_init__()
 
     def __post_init__(self, **kwargs):
@@ -402,7 +404,8 @@ class Seq2SeqDecoderLayer(torch.nn.Module):
         # Cross-Attention Block
         residual = hidden_states
         hidden_states = self.pre_cross_attn_layer_norm(hidden_states)
-        cross_attn_output = self.encoder_attn(
+
+        cross_attn_output = self.cross_attn(
             hidden_states=hidden_states,
             past_key_value=cross_past_key_value,
             attention_mask=encoder_attention_mask,
@@ -487,3 +490,38 @@ class Seq2SeqSelfAttention(nn.Module):
         attn_output = self.out_proj(attn_output)
 
         return attn_output
+
+
+class Seq2SeqCrossAttention(nn.Module):
+    def __init__(self, attn, **kwargs):
+        super().__init__()
+        self._original_mod = attn
+        self.__post_init__(**kwargs)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: torch.Tensor = None,
+        past_key_value: Optional[object] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        bsz, tgt_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states).view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
+
+        is_cross_attention = key_value_states is not None
+        if is_cross_attention:
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=attention_mask,
+        )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, tgt_len, self.embed_dim)
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, None, past_key_value
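The design point of `Seq2SeqCrossAttention` is that the encoder-side key/value projections are computed once and passed in as `past_key_value`, so each decode step only projects queries. A self-contained sketch of that pattern in plain PyTorch (illustrative sizes, not the package's API):

```python
import torch
import torch.nn.functional as F

bsz, tgt_len, src_len, num_heads, head_dim = 1, 1, 24, 8, 64
embed_dim = num_heads * head_dim

# K/V are projected once from the encoder output and cached ...
cross_past_key_value = (
    torch.randn(bsz, num_heads, src_len, head_dim),  # key_states
    torch.randn(bsz, num_heads, src_len, head_dim),  # value_states
)

# ... so each decoding step only projects queries from the decoder hidden states.
hidden_states = torch.randn(bsz, tgt_len, embed_dim)
q_proj = torch.nn.Linear(embed_dim, embed_dim)
query_states = q_proj(hidden_states).view(bsz, -1, num_heads, head_dim).transpose(1, 2)

attn_output = F.scaled_dot_product_attention(query_states, *cross_past_key_value)
print(attn_output.shape)  # torch.Size([1, 8, 1, 64])
```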
optimum/rbln/transformers/models/siglip/configuration_siglip.py

@@ -24,6 +24,7 @@ class RBLNSiglipVisionModelConfig(RBLNModelConfig):
         image_size: Optional[int] = None,
         interpolate_pos_encoding: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
         **kwargs,
     ):
         """
@@ -33,6 +34,7 @@ class RBLNSiglipVisionModelConfig(RBLNModelConfig):
                 a tuple/list (height, width), or a dictionary with 'height' and 'width' keys.
             interpolate_pos_encoding (Optional[bool]): Whether to interpolate the position encoding.
             output_hidden_states: (Optional[bool]): Whether to return hidden states.
+            output_attentions: (Optional[bool]): Whether to return attentions.
             **kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
         Raises:
@@ -46,6 +48,7 @@ class RBLNSiglipVisionModelConfig(RBLNModelConfig):
         self.image_size = image_size
         self.interpolate_pos_encoding = interpolate_pos_encoding or False
         self.output_hidden_states = output_hidden_states
+        self.output_attentions = output_attentions
 
     @property
     def image_width(self):
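Enabling the new flag from user code might look like the following sketch (assuming `RBLNSiglipVisionModelConfig` is importable from `optimum.rbln`, matching the export pattern used elsewhere in this diff):

```python
from optimum.rbln import RBLNSiglipVisionModelConfig

config = RBLNSiglipVisionModelConfig(
    image_size=384,
    output_hidden_states=True,
    output_attentions=True,  # new in 0.8.1a2
)
```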