optimum_rbln-0.1.9-py3-none-any.whl → optimum_rbln-0.1.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. optimum/rbln/__init__.py +37 -2
  2. optimum/rbln/__version__.py +1 -1
  3. optimum/rbln/diffusers/models/autoencoder_kl.py +36 -29
  4. optimum/rbln/diffusers/models/controlnet.py +56 -40
  5. optimum/rbln/diffusers/models/unet_2d_condition.py +40 -28
  6. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +22 -15
  7. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +22 -15
  8. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +23 -17
  9. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +24 -18
  10. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +22 -11
  11. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +22 -11
  12. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +24 -14
  13. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +24 -14
  14. optimum/rbln/modeling_alias.py +3 -3
  15. optimum/rbln/modeling_base.py +471 -231
  16. optimum/rbln/modeling_config.py +152 -77
  17. optimum/rbln/modeling_seq2seq.py +166 -77
  18. optimum/rbln/transformers/__init__.py +35 -1
  19. optimum/rbln/transformers/models/__init__.py +20 -1
  20. optimum/rbln/transformers/models/auto/__init__.py +14 -0
  21. optimum/rbln/transformers/models/auto/auto_factory.py +84 -0
  22. optimum/rbln/transformers/models/auto/modeling_auto.py +94 -0
  23. optimum/rbln/transformers/models/bart/__init__.py +1 -0
  24. optimum/rbln/transformers/models/bart/bart_architecture.py +189 -50
  25. optimum/rbln/transformers/models/bart/modeling_bart.py +106 -0
  26. optimum/rbln/transformers/models/bert/__init__.py +24 -0
  27. optimum/rbln/transformers/models/bert/modeling_bert.py +102 -0
  28. optimum/rbln/transformers/models/clip/__init__.py +1 -1
  29. optimum/rbln/transformers/models/clip/modeling_clip.py +127 -25
  30. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +28 -4
  31. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +302 -115
  32. optimum/rbln/transformers/models/dpt/modeling_dpt.py +21 -7
  33. optimum/rbln/transformers/models/gemma/modeling_gemma.py +1 -1
  34. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +4 -1
  35. optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +1 -1
  36. optimum/rbln/transformers/models/llama/modeling_llama.py +1 -1
  37. optimum/rbln/transformers/models/llava_next/__init__.py +24 -0
  38. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +666 -0
  39. optimum/rbln/transformers/models/midm/midm_architecture.py +5 -1
  40. optimum/rbln/transformers/models/midm/modeling_midm.py +1 -1
  41. optimum/rbln/transformers/models/mistral/modeling_mistral.py +1 -1
  42. optimum/rbln/transformers/models/phi/__init__.py +24 -0
  43. optimum/rbln/transformers/models/phi/modeling_phi.py +69 -0
  44. optimum/rbln/transformers/models/phi/phi_architecture.py +406 -0
  45. optimum/rbln/transformers/models/t5/t5_architecture.py +92 -31
  46. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +17 -11
  47. optimum/rbln/transformers/models/whisper/generation_whisper.py +68 -0
  48. optimum/rbln/transformers/models/whisper/modeling_whisper.py +141 -105
  49. optimum/rbln/transformers/models/whisper/whisper_architecture.py +44 -17
  50. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +17 -14
  51. optimum/rbln/transformers/utils/rbln_quantization.py +48 -60
  52. optimum/rbln/utils/import_utils.py +36 -1
  53. optimum/rbln/utils/logging.py +82 -0
  54. optimum/rbln/utils/runtime_utils.py +33 -0
  55. optimum/rbln/utils/timer_utils.py +19 -0
  56. {optimum_rbln-0.1.9.dist-info → optimum_rbln-0.1.11.dist-info}/METADATA +8 -7
  57. optimum_rbln-0.1.11.dist-info/RECORD +93 -0
  58. {optimum_rbln-0.1.9.dist-info → optimum_rbln-0.1.11.dist-info}/WHEEL +1 -1
  59. optimum_rbln-0.1.11.dist-info/entry_points.txt +4 -0
  60. optimum_rbln-0.1.9.dist-info/RECORD +0 -78
  61. {optimum_rbln-0.1.9.dist-info → optimum_rbln-0.1.11.dist-info}/licenses/LICENSE +0 -0

optimum/rbln/transformers/__init__.py
@@ -30,17 +30,34 @@ _import_structure = {
     "cache_utils": ["RebelDynamicCache"],
     "generation": ["BatchTextIteratorStreamer"],
     "models": [
+        "RBLNAutoModel",
+        "RBLNAutoModelForAudioClassification",
+        "RBLNAutoModelForCausalLM",
+        "RBLNAutoModelForCTC",
+        "RBLNAutoModelForDepthEstimation",
+        "RBLNAutoModelForImageClassification",
+        "RBLNAutoModelForMaskedLM",
+        "RBLNAutoModelForQuestionAnswering",
+        "RBLNAutoModelForSeq2SeqLM",
+        "RBLNAutoModelForSequenceClassification",
+        "RBLNAutoModelForSpeechSeq2Seq",
+        "RBLNAutoModelForVision2Seq",
+        "RBLNBartModel",
+        "RBLNBertModel",
         "RBLNCLIPTextModel",
         "RBLNCLIPTextModelWithProjection",
+        "RBLNCLIPVisionModel",
         "RBLNDPTForDepthEstimation",
         "RBLNGemmaForCausalLM",
         "RBLNGPT2LMHeadModel",
         "RBLNWav2Vec2ForCTC",
         "RBLNWhisperForConditionalGeneration",
         "RBLNLlamaForCausalLM",
+        "RBLNPhiForCausalLM",
+        "RBLNLlavaNextForConditionalGeneration",
         "RBLNMidmLMHeadModel",
-        "RBLNMistralForCausalLM",
         "RBLNXLMRobertaModel",
+        "RBLNMistralForCausalLM",
     ],
 }
 
@@ -48,14 +65,31 @@ if TYPE_CHECKING:
     from .cache_utils import RebelDynamicCache
     from .generation import BatchTextIteratorStreamer
     from .models import (
+        RBLNAutoModel,
+        RBLNAutoModelForAudioClassification,
+        RBLNAutoModelForCausalLM,
+        RBLNAutoModelForCTC,
+        RBLNAutoModelForDepthEstimation,
+        RBLNAutoModelForImageClassification,
+        RBLNAutoModelForMaskedLM,
+        RBLNAutoModelForQuestionAnswering,
+        RBLNAutoModelForSeq2SeqLM,
+        RBLNAutoModelForSequenceClassification,
+        RBLNAutoModelForSpeechSeq2Seq,
+        RBLNAutoModelForVision2Seq,
+        RBLNBartModel,
+        RBLNBertModel,
         RBLNCLIPTextModel,
         RBLNCLIPTextModelWithProjection,
+        RBLNCLIPVisionModel,
         RBLNDPTForDepthEstimation,
         RBLNGemmaForCausalLM,
         RBLNGPT2LMHeadModel,
         RBLNLlamaForCausalLM,
+        RBLNLlavaNextForConditionalGeneration,
         RBLNMidmLMHeadModel,
         RBLNMistralForCausalLM,
+        RBLNPhiForCausalLM,
         RBLNWav2Vec2ForCTC,
         RBLNWhisperForConditionalGeneration,
         RBLNXLMRobertaModel,
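
Every name added to _import_structure above is mirrored in the TYPE_CHECKING block because this __init__.py follows the transformers lazy-module convention: the dictionary drives runtime attribute resolution while the guarded imports exist only for type checkers, so the two lists must stay in sync. A quick import check under that assumption (requires the 0.1.11 wheel to be installed; the three names are taken from the hunk above):

import optimum.rbln.transformers as rbln_transformers

# Lazy resolution: each attribute access triggers the real submodule import.
for name in ("RBLNAutoModelForCausalLM", "RBLNBertModel", "RBLNPhiForCausalLM"):
    assert hasattr(rbln_transformers, name), name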

optimum/rbln/transformers/models/__init__.py
@@ -21,13 +21,32 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
 
-from .clip import RBLNCLIPTextModel, RBLNCLIPTextModelWithProjection
+
+from .auto import (
+    RBLNAutoModel,
+    RBLNAutoModelForAudioClassification,
+    RBLNAutoModelForCausalLM,
+    RBLNAutoModelForCTC,
+    RBLNAutoModelForDepthEstimation,
+    RBLNAutoModelForImageClassification,
+    RBLNAutoModelForMaskedLM,
+    RBLNAutoModelForQuestionAnswering,
+    RBLNAutoModelForSeq2SeqLM,
+    RBLNAutoModelForSequenceClassification,
+    RBLNAutoModelForSpeechSeq2Seq,
+    RBLNAutoModelForVision2Seq,
+)
+from .bart import RBLNBartModel
+from .bert import RBLNBertModel
+from .clip import RBLNCLIPTextModel, RBLNCLIPTextModelWithProjection, RBLNCLIPVisionModel
 from .dpt import RBLNDPTForDepthEstimation
 from .gemma import RBLNGemmaForCausalLM
 from .gpt2 import RBLNGPT2LMHeadModel
 from .llama import RBLNLlamaForCausalLM
+from .llava_next import RBLNLlavaNextForConditionalGeneration
 from .midm import RBLNMidmLMHeadModel
 from .mistral import RBLNMistralForCausalLM
+from .phi import RBLNPhiForCausalLM
 from .wav2vec2 import RBLNWav2Vec2ForCTC
 from .whisper import RBLNWhisperForConditionalGeneration
 from .xlm_roberta import RBLNXLMRobertaModel

optimum/rbln/transformers/models/auto/__init__.py (new file)
@@ -0,0 +1,14 @@
+from .modeling_auto import (
+    RBLNAutoModel,
+    RBLNAutoModelForAudioClassification,
+    RBLNAutoModelForCausalLM,
+    RBLNAutoModelForCTC,
+    RBLNAutoModelForDepthEstimation,
+    RBLNAutoModelForImageClassification,
+    RBLNAutoModelForMaskedLM,
+    RBLNAutoModelForQuestionAnswering,
+    RBLNAutoModelForSeq2SeqLM,
+    RBLNAutoModelForSequenceClassification,
+    RBLNAutoModelForSpeechSeq2Seq,
+    RBLNAutoModelForVision2Seq,
+)

optimum/rbln/transformers/models/auto/auto_factory.py (new file)
@@ -0,0 +1,84 @@
+# Copyright 2024 Rebellions Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Portions of this software are licensed under the Apache License,
+# Version 2.0. See the NOTICE file distributed with this work for
+# additional information regarding copyright ownership.
+
+# All other portions of this software, including proprietary code,
+# are the intellectual property of Rebellions Inc. and may not be
+# copied, modified, or distributed without prior written permission
+# from Rebellions Inc.
+
+import importlib
+
+from transformers import AutoConfig
+
+
+class _BaseAutoModelClass:
+    # Base class for auto models.
+    _model_mapping = None
+
+    def __init__(self, *args, **kwargs):
+        raise EnvironmentError(
+            f"{self.__class__.__name__} is designed to be instantiated "
+            f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or "
+            f"`{self.__class__.__name__}.from_config(config)` methods."
+        )
+
+    @classmethod
+    def get_rbln_cls(
+        cls,
+        model_id,
+        *args,
+        **kwargs,
+    ):
+        # kwargs.update({"return_unused_kwargs": True})
+        config = AutoConfig.from_pretrained(model_id, return_unused_kwargs=True, **kwargs)[0]
+
+        if len(config.architectures) > 1:
+            raise ValueError(
+                f"Model with ID '{model_id}' has multiple architectures defined in the configuration: "
+                f"{config.architectures}. `_BaseAutoModelClass` requires exactly one architecture."
+            )
+
+        architecture_name = config.architectures[0]
+        if architecture_name not in cls._model_mapping.values():
+            raise ValueError(
+                f"The 'RBLN{architecture_name}' architecture is not supported by `{cls.__name__}.from_pretrained()`. "
+                "Please use the appropriate class's `from_pretrained()` method to load this model."
+            )
+
+        rbln_class_name = "RBLN" + architecture_name
+        module = importlib.import_module("optimum.rbln")
+
+        try:
+            rbln_cls = getattr(module, rbln_class_name)
+        except AttributeError as e:
+            raise AttributeError(
+                f"Class '{rbln_class_name}' not found in 'optimum.rbln' module for model ID '{model_id}'. "
+                "Ensure that the class name is correctly mapped and available in the 'optimum.rbln' module."
+            ) from e
+
+        return rbln_cls
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_id,
+        *args,
+        **kwargs,
+    ):
+        rbln_cls = cls.get_rbln_cls(model_id, *args, **kwargs)
+        return rbln_cls.from_pretrained(model_id, *args, **kwargs)
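
The dispatch in get_rbln_cls is purely name based: it reads config.architectures from the checkpoint, requires exactly one entry, prefixes it with "RBLN", and looks the result up on the optimum.rbln package; from_pretrained then forwards all arguments to the resolved class. A minimal standalone sketch of that resolution step (the architecture names in the comments are illustrative, not taken from this release):

import importlib

from transformers import AutoConfig


def resolve_rbln_class(model_id: str):
    # Mirrors _BaseAutoModelClass.get_rbln_cls: single architecture -> "RBLN" prefix -> getattr.
    config = AutoConfig.from_pretrained(model_id, return_unused_kwargs=True)[0]
    architecture_name = config.architectures[0]    # e.g. "LlamaForCausalLM"
    rbln_class_name = "RBLN" + architecture_name   # e.g. "RBLNLlamaForCausalLM"
    return getattr(importlib.import_module("optimum.rbln"), rbln_class_name)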

optimum/rbln/transformers/models/auto/modeling_auto.py (new file)
@@ -0,0 +1,94 @@
+# Copyright 2024 Rebellions Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Portions of this software are licensed under the Apache License,
+# Version 2.0. See the NOTICE file distributed with this work for
+# additional information regarding copyright ownership.
+
+# All other portions of this software, including proprietary code,
+# are the intellectual property of Rebellions Inc. and may not be
+# copied, modified, or distributed without prior written permission
+# from Rebellions Inc.
+
+from transformers.models.auto.modeling_auto import (
+    MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES,
+    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
+    MODEL_FOR_CTC_MAPPING_NAMES,
+    MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES,
+    MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES,
+    MODEL_FOR_MASKED_LM_MAPPING_NAMES,
+    MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES,
+    MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
+    MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES,
+    MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES,
+    MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES,
+    MODEL_MAPPING_NAMES,
+)
+
+from .auto_factory import _BaseAutoModelClass
+
+
+MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.update(
+    {
+        "midm": "MidmLMHeadModel",
+    }
+)
+
+
+class RBLNAutoModel(_BaseAutoModelClass):
+    _model_mapping = MODEL_MAPPING_NAMES
+
+
+class RBLNAutoModelForCTC(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_CTC_MAPPING_NAMES
+
+
+class RBLNAutoModelForCausalLM(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+
+
+class RBLNAutoModelForSeq2SeqLM(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
+
+
+class RBLNAutoModelForSpeechSeq2Seq(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES
+
+
+class RBLNAutoModelForDepthEstimation(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES
+
+
+class RBLNAutoModelForSequenceClassification(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
+
+
+class RBLNAutoModelForVision2Seq(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
+
+
+class RBLNAutoModelForMaskedLM(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_MASKED_LM_MAPPING_NAMES
+
+
+class RBLNAutoModelForAudioClassification(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES
+
+
+class RBLNAutoModelForImageClassification(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
+
+
+class RBLNAutoModelForQuestionAnswering(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
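
Each task class simply points at the corresponding transformers name mapping, and the update() call above registers the out-of-tree Midm architecture in the causal-LM mapping so it dispatches like a stock architecture. A small sanity check of that wiring, assuming the auto classes are also re-exported from the package root (the top-level __init__.py changes in this release suggest they are):

from optimum.rbln import RBLNAutoModelForCausalLM
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES

# Importing the RBLN auto classes patches "midm" into the shared mapping, so a
# Midm checkpoint resolves to RBLNMidmLMHeadModel through the auto class.
assert MODEL_FOR_CAUSAL_LM_MAPPING_NAMES["midm"] == "MidmLMHeadModel"
assert RBLNAutoModelForCausalLM._model_mapping is MODEL_FOR_CAUSAL_LM_MAPPING_NAMES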

optimum/rbln/transformers/models/bart/__init__.py
@@ -22,3 +22,4 @@
 # from Rebellions Inc.
 
 from .bart_architecture import BartDecoderWrapper, BartEncoderWrapper
+from .modeling_bart import RBLNBartModel

optimum/rbln/transformers/models/bart/bart_architecture.py
@@ -54,6 +54,7 @@ class _BartAttention(BartAttention):
         past_key_value: Tuple[torch.Tensor],
         attention_mask: torch.Tensor,
         cache_position: torch.Tensor,
+        batch_index: torch.Tensor,
         key_value_states: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, Tuple[torch.Tensor]]:
         bsz, tgt_len, _ = hidden_states.size()
@@ -72,28 +73,83 @@
         else:
             key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
             value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
-            key_states = past_key_value[0].slice_scatter(
-                key_states, dim=2, start=cache_position, end=cache_position + 1
-            )
-            value_states = past_key_value[1].slice_scatter(
-                value_states, dim=2, start=cache_position, end=cache_position + 1
-            )
 
-        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
-        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
-        key_states = key_states.reshape(*proj_shape)
-        value_states = value_states.reshape(*proj_shape)
-
-        src_len = key_states.size(1)
-        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
-        attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
-        attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
-
-        attn_output = torch.bmm(attn_weights, value_states)
-        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
-        attn_output = attn_output.transpose(1, 2)
-        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+        if cache_position.dim() > 0:
+            proj_shape = (bsz, self.num_heads, -1, self.head_dim)
+            query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+            key_states = key_states.reshape(*proj_shape)
+            value_states = value_states.reshape(*proj_shape)
+
+            all_key_states = []
+            all_value_states = []
+            all_attn_output = []
+            for b in range(bsz):
+                batch_query_states = query_states[b].unsqueeze(0).unsqueeze(2)
+                batch_attention_mask = attention_mask[b].unsqueeze(0).unsqueeze(2)
+                batch_key_states = key_states[b].unsqueeze(0).unsqueeze(2)
+                batch_value_states = value_states[b].unsqueeze(0).unsqueeze(2)
+                if not is_cross_attention:
+                    batch_key_states = (
+                        past_key_value[0][b]
+                        .unsqueeze(0)
+                        .unsqueeze(2)
+                        .slice_scatter(
+                            batch_key_states, dim=-2, start=cache_position[b][0], end=cache_position[b][0] + 1
+                        )
+                    )
+                    batch_value_states = (
+                        past_key_value[1][b]
+                        .unsqueeze(0)
+                        .unsqueeze(2)
+                        .slice_scatter(
+                            batch_value_states, dim=-2, start=cache_position[b][0], end=cache_position[b][0] + 1
+                        )
+                    )
+                attn_weights = torch.matmul(batch_query_states, batch_key_states.transpose(3, 4))
+                attn_weights = attn_weights + batch_attention_mask
+                attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+                attn_output = torch.matmul(attn_weights, batch_value_states)
+                attn_output = attn_output.view(1, self.num_heads, tgt_len, self.head_dim)
+                attn_output = attn_output.transpose(1, 2)
+                attn_output = attn_output.reshape(1, tgt_len, self.embed_dim)
+                all_key_states.append(batch_key_states)
+                all_value_states.append(batch_value_states)
+                all_attn_output.append(attn_output)
+            key_states = torch.cat(all_key_states, dim=0).squeeze(2)
+            value_states = torch.cat(all_value_states, dim=0).squeeze(2)
+            attn_output = torch.cat(all_attn_output, dim=0)
+
+        else:
+            if batch_index is None or batch_index == -1:
+                batch_index = 0
+
+            if not is_cross_attention:
+                key_states = past_key_value[0].slice_scatter(
+                    key_states, dim=2, start=cache_position, end=cache_position + 1
+                )
+                value_states = past_key_value[1].slice_scatter(
+                    value_states, dim=2, start=cache_position, end=cache_position + 1
+                )
+
+            proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+            query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+            key_states = key_states.reshape(*proj_shape)
+            value_states = value_states.reshape(*proj_shape)
+
+            src_len = key_states.size(1)
+            attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+            attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+            attn_output = torch.bmm(attn_weights, value_states)
+            attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+            attn_output = attn_output.transpose(1, 2)
+            key_states = key_states.unsqueeze(0)
+            value_states = value_states.unsqueeze(0)
+            attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
         attn_output = self.out_proj(attn_output)
 
         present_key_value = (key_states, value_states)
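
In both branches the self-attention KV cache is updated in place with slice_scatter instead of torch.cat, so the cache tensor keeps a fixed, pre-allocated shape (presumably what the RBLN compiler requires); the new prefill branch applies the same write per batch entry at that entry's own cache position. A standalone illustration of the write pattern, with made-up shapes:

import torch

batch, heads, max_len, head_dim = 1, 2, 8, 4
past_key = torch.zeros(batch, heads, max_len, head_dim)  # pre-allocated cache slot
new_key = torch.randn(batch, heads, 1, head_dim)          # key for the current decode step
cache_position = 3

# Overwrite exactly one time step; the cache shape never changes.
past_key = past_key.slice_scatter(new_key, dim=2, start=cache_position, end=cache_position + 1)
assert past_key.shape == (batch, heads, max_len, head_dim)
assert torch.equal(past_key[:, :, cache_position], new_key[:, :, 0])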
@@ -108,6 +164,7 @@ class _BartSdpaAttention(BartSdpaAttention):
         past_key_value: Tuple[torch.Tensor],
         attention_mask: torch.Tensor,
         cache_position: torch.Tensor,
+        batch_index: torch.Tensor,
         key_value_states: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, Tuple[torch.Tensor]]:
         bsz, tgt_len, _ = hidden_states.size()
@@ -126,23 +183,70 @@
         else:
             key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
             value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
-            key_states = past_key_value[0].slice_scatter(
-                key_states, dim=2, start=cache_position, end=cache_position + 1
-            )
-            value_states = past_key_value[1].slice_scatter(
-                value_states, dim=2, start=cache_position, end=cache_position + 1
-            )
 
         query_states = self._shape(query_states, tgt_len, bsz)
 
-        attn_output = torch.nn.functional.scaled_dot_product_attention(
-            query_states,
-            key_states,
-            value_states,
-            attn_mask=attention_mask,
-        )
-        attn_output = attn_output.transpose(1, 2)
-        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+        if (batch_index is None or batch_index == -1) and bsz > 1:
+            all_key_states = []
+            all_value_states = []
+            all_attn_output = []
+
+            for b in range(bsz):
+                batch_query_states = query_states[b].unsqueeze(0)
+                batch_attention_mask = attention_mask[b].unsqueeze(0)
+                batch_key_states = key_states[b].unsqueeze(0)
+                batch_value_states = value_states[b].unsqueeze(0)
+
+                if not is_cross_attention:
+                    batch_key_states = (
+                        past_key_value[0][b]
+                        .unsqueeze(0)
+                        .slice_scatter(
+                            batch_key_states, dim=-2, start=cache_position[b][0], end=cache_position[b][0] + 1
+                        )
+                    )
+                    batch_value_states = (
+                        past_key_value[1][b]
+                        .unsqueeze(0)
+                        .slice_scatter(
+                            batch_value_states, dim=-2, start=cache_position[b][0], end=cache_position[b][0] + 1
+                        )
+                    )
+
+                attn_output = torch.nn.functional.scaled_dot_product_attention(
+                    batch_query_states, batch_key_states, batch_value_states, attn_mask=batch_attention_mask
+                )
+                attn_output = attn_output.transpose(1, 2)
+                attn_output = attn_output.reshape(1, tgt_len, self.embed_dim)
+                all_key_states.append(batch_key_states)
+                all_value_states.append(batch_value_states)
+                all_attn_output.append(attn_output)
+
+            key_states = torch.cat(all_key_states, dim=0)
+            value_states = torch.cat(all_value_states, dim=0)
+            attn_output = torch.cat(all_attn_output, dim=0)
+
+        else:
+            if batch_index is None or batch_index == -1:
+                batch_index = 0
+
+            if not is_cross_attention:
+                key_states = past_key_value[0].slice_scatter(
+                    key_states, dim=2, start=cache_position, end=cache_position + 1
+                )
+                value_states = past_key_value[1].slice_scatter(
+                    value_states, dim=2, start=cache_position, end=cache_position + 1
+                )
+
+            attn_output = torch.nn.functional.scaled_dot_product_attention(
+                query_states,
+                key_states,
+                value_states,
+                attn_mask=attention_mask,
+            )
+            attn_output = attn_output.transpose(1, 2)
+            attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
         attn_output = self.out_proj(attn_output)
 
         present_key_value = (key_states, value_states)
@@ -162,6 +266,7 @@ class _BartDecoderLayer(BartDecoderLayer):
         encoder_hidden_states: torch.Tensor,
         past_key_value: Tuple[torch.Tensor],
         cache_position: torch.Tensor,
+        batch_ids: torch.Tensor,
         attn_impl: str = "eager",
     ) -> Tuple[torch.Tensor, Tuple[torch.Tensor]]:
         # Self Attention Block
@@ -174,6 +279,7 @@
             past_key_value=self_attn_past_key_value,
             attention_mask=attention_mask,
             cache_position=cache_position,
+            batch_index=batch_ids,
         )
         hidden_states = residual + hidden_states
         hidden_states = self.self_attn_layer_norm(hidden_states)
@@ -189,6 +295,7 @@
             past_key_value=cross_attn_past_key_value,
             attention_mask=encoder_attention_mask,
             cache_position=cache_position,
+            batch_index=batch_ids,
         )
         hidden_states = residual + hidden_states
         hidden_states = self.encoder_attn_layer_norm(hidden_states)
@@ -213,14 +320,31 @@ class _BartDecoder(BartDecoder):
         encoder_hidden_states: torch.Tensor,
         past_key_values: torch.Tensor,
         cache_position: torch.Tensor,
+        batch_ids: torch.Tensor,
         attn_impl: str = "eager",
     ):
         # embedding
-        positions_idx = cache_position + self.embed_positions.offset
-        positions = self.embed_positions.weight[positions_idx]
+        # thkim fix : transformers == 4.44.2 compile
+        if hasattr(self, "embed_scale"):
+            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
+        else:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if cache_position.dim() == 0:
+            positions_idx = cache_position + self.embed_positions.offset
+            positions = self.embed_positions.weight[positions_idx]
+            hidden_states = inputs_embeds + positions
+        else:
+            hidden_all = []
+            for i in range(input_ids.shape[0]):
+                # cache position [N,1]
+                positions_idx = cache_position[i]
+                position_weight = self.embed_positions.weight[2:]
+                position = position_weight[positions_idx]
+                tmp_hidden = position + inputs_embeds[i]
+                hidden_all.append(tmp_hidden)
+            hidden_states = torch.stack(hidden_all, dim=0)
 
-        inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
-        hidden_states = inputs_embeds + positions
         hidden_states = self.layernorm_embedding(hidden_states)
 
         # prepare attn_mask
@@ -230,14 +354,14 @@
                 attention_mask, input_shape, inputs_embeds, cache_position
             )
             encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
-                encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+                encoder_attention_mask, torch.float32, tgt_len=input_shape[-1]
            )
         else:
             attention_mask = _prepare_4d_causal_attention_mask(
                 attention_mask, input_shape, inputs_embeds, cache_position
             )
             encoder_attention_mask = _prepare_4d_attention_mask(
-                encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+                encoder_attention_mask, torch.float32, tgt_len=input_shape[-1]
             )
 
         # iterate decoder_layer
@@ -252,6 +376,7 @@
                 encoder_attention_mask=encoder_attention_mask,
                 past_key_value=past_key_value,
                 cache_position=cache_position,
+                batch_ids=batch_ids,
                 attn_impl=attn_impl,
             )
             hidden_states = layer_outputs[0]
@@ -277,9 +402,14 @@ class BartDecoderWrapper(torch.nn.Module):
         attention_mask: torch.Tensor,
         encoder_attention_mask: torch.Tensor,
         cache_position: torch.Tensor,
+        batch_position: torch.Tensor,
         self_kv_cache: torch.Tensor,
         cross_kv_cache: torch.Tensor,
     ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor]]:
+        if input_ids.shape[1] == 1:
+            rbln_batch_position = None
+        else:
+            rbln_batch_position = batch_position
         # prepare past_key_values
         kv_cache = ()
         for i in range(0, self.num_layers * 2, 2):
@@ -291,7 +421,6 @@
                     cross_kv_cache[i + 1],
                 ),
             )
-
         # decode
         decoder_outputs = _BartDecoder.forward(
             self.decoder,
@@ -302,6 +431,7 @@
             past_key_values=kv_cache,
             encoder_hidden_states=torch.tensor([1]),
             attn_impl=self.config._attn_implementation,
+            batch_ids=rbln_batch_position,
         )
         sequence_output = decoder_outputs[0]
         lm_logits = self.lm_head(sequence_output)
@@ -314,7 +444,7 @@
             self_kv_cache.append(past_key_values[i][1])
         self_kv_cache = torch.stack(self_kv_cache, dim=0)
 
-        return lm_logits, self_kv_cache
+        return lm_logits, self_kv_cache, batch_position
 
 
 class BartEncoderWrapper(torch.nn.Module):
@@ -330,7 +460,13 @@ class BartEncoderWrapper(torch.nn.Module):
         self.num_heads = self.config.decoder_attention_heads
         self.d_kv = self.config.d_model // self.num_heads
 
-    def forward(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor) -> Tuple[torch.Tensor]:
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: torch.LongTensor,
+        cross_key_value: torch.Tensor = None,
+        batch_idx: torch.Tensor = None,
+    ) -> Tuple[torch.Tensor]:
         encoder_batch_size = input_ids.shape[0]
         decoder_batch_size = encoder_batch_size  # TODO(taehoon) fix to enable beam-search
 
@@ -348,7 +484,7 @@ class BartEncoderWrapper(torch.nn.Module):
             layer_pkv = (pkv_self_attn_key, pkv_self_attn_value, pkv_cross_attn_key, pkv_cross_attn_value)
             dummy_past_key_value.append(layer_pkv)
 
-        decoder_attention_mask = torch.zeros(decoder_batch_size, self.decoder_max_length, dtype=torch.int64)
+        decoder_attention_mask = torch.zeros(decoder_batch_size, self.decoder_max_length, dtype=torch.float32)
         decoder_attention_mask[:, :1] = 1
 
         decoder_outputs = _BartDecoder.forward(
@@ -359,14 +495,17 @@
             cache_position=torch.tensor(0, dtype=torch.int32),
             encoder_hidden_states=last_hidden_states,
             past_key_values=dummy_past_key_value,
+            batch_ids=torch.tensor(0, dtype=torch.int32),
             attn_impl=self.config._attn_implementation,
         )
         first_past_kv = decoder_outputs[1]
 
-        # 3. return cross_key_values to recurrence port. fyi (enc_ir.outputs[0] -> dec_ir.inputs[5])
         encoder_kv = []
-        for layer_out in first_past_kv:  # for layer
-            encoder_kv.append(torch.stack(layer_out[2:], dim=0))
-        encoder_kv = torch.stack(encoder_kv, dim=0)
+        for i in range(self.model.config.decoder_layers):
+            encoder_kv.append(first_past_kv[i][2].unsqueeze(0))
+            encoder_kv.append(first_past_kv[i][3].unsqueeze(0))
+        encoder_kv = torch.cat(encoder_kv, dim=0)
+
+        cross_key_value = cross_key_value.slice_scatter(encoder_kv, dim=1, start=batch_idx, end=batch_idx + 1)
 
-        return encoder_kv
+        return cross_key_value
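
With this change the encoder wrapper no longer returns a freshly stacked cross-attention KV tensor; it scatters the KV of the single sequence it just encoded into a caller-provided, batch-sized buffer at batch_idx, so each request in a batch can run its encoder pass independently. A toy version of that batch-slot write (all sizes illustrative):

import torch

kv_pairs, batch, heads, enc_len, head_dim = 4, 3, 2, 6, 8  # kv_pairs = 2 * decoder layers
cross_key_value = torch.zeros(kv_pairs, batch, heads, enc_len, head_dim)  # shared buffer
encoder_kv = torch.randn(kv_pairs, 1, heads, enc_len, head_dim)           # one sequence's KV
batch_idx = 1

# Write this sequence's cross KV into its slot of the shared buffer.
cross_key_value = cross_key_value.slice_scatter(encoder_kv, dim=1, start=batch_idx, end=batch_idx + 1)
assert torch.equal(cross_key_value[:, batch_idx], encoder_kv[:, 0])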