optimum-rbln 0.7.4a9__py3-none-any.whl → 0.7.5a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. optimum/rbln/__init__.py +21 -0
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +11 -7
  4. optimum/rbln/diffusers/models/controlnet.py +1 -1
  5. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +1 -1
  6. optimum/rbln/diffusers/models/unets/unet_2d_condition.py +1 -1
  7. optimum/rbln/modeling.py +7 -5
  8. optimum/rbln/ops/__init__.py +1 -0
  9. optimum/rbln/ops/attn.py +10 -0
  10. optimum/rbln/ops/flash_attn.py +8 -0
  11. optimum/rbln/ops/sliding_window_attn.py +111 -0
  12. optimum/rbln/transformers/__init__.py +22 -3
  13. optimum/rbln/transformers/models/__init__.py +23 -0
  14. optimum/rbln/transformers/models/blip_2/__init__.py +20 -0
  15. optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +93 -0
  16. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +298 -0
  17. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +42 -6
  18. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +81 -77
  19. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +251 -135
  20. optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +11 -7
  21. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +4 -4
  22. optimum/rbln/transformers/models/opt/__init__.py +16 -0
  23. optimum/rbln/transformers/models/opt/configuration_opt.py +19 -0
  24. optimum/rbln/transformers/models/opt/modeling_opt.py +78 -0
  25. optimum/rbln/transformers/models/opt/opt_architecture.py +74 -0
  26. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +16 -10
  27. optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +35 -52
  28. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +2 -0
  29. optimum/rbln/transformers/models/siglip/__init__.py +20 -0
  30. optimum/rbln/transformers/models/siglip/configuration_siglip.py +66 -0
  31. optimum/rbln/transformers/models/siglip/modeling_siglip.py +146 -0
  32. optimum/rbln/transformers/models/whisper/whisper_architecture.py +1 -0
  33. optimum/rbln/transformers/utils/rbln_quantization.py +121 -72
  34. optimum/rbln/utils/import_utils.py +23 -6
  35. optimum/rbln/utils/submodule.py +13 -1
  36. {optimum_rbln-0.7.4a9.dist-info → optimum_rbln-0.7.5a1.dist-info}/METADATA +1 -1
  37. {optimum_rbln-0.7.4a9.dist-info → optimum_rbln-0.7.5a1.dist-info}/RECORD +39 -28
  38. {optimum_rbln-0.7.4a9.dist-info → optimum_rbln-0.7.5a1.dist-info}/WHEEL +0 -0
  39. {optimum_rbln-0.7.4a9.dist-info → optimum_rbln-0.7.5a1.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/blip_2/modeling_blip_2.py (new file)
@@ -0,0 +1,298 @@
+ # Copyright 2025 Rebellions Inc. All rights reserved.
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at:
+
+ #     http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import inspect
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, Callable, Optional, Tuple, Union
+
+ import torch
+ from transformers import (
+     AutoModelForVisualQuestionAnswering,
+     Blip2ForConditionalGeneration,
+     Blip2QFormerModel,
+     Blip2VisionModel,
+     PretrainedConfig,
+     PreTrainedModel,
+ )
+ from transformers.modeling_outputs import BaseModelOutputWithPooling, BaseModelOutputWithPoolingAndCrossAttentions
+ from transformers.utils import logging
+
+ from ....configuration_utils import RBLNCompileConfig, RBLNModelConfig
+ from ....modeling import RBLNModel
+
+
+ logger = logging.get_logger(__name__)
+
+ if TYPE_CHECKING:
+     from transformers import (
+         AutoFeatureExtractor,
+         AutoProcessor,
+         AutoTokenizer,
+     )
+
+
+ class RBLNBlip2VisionModel(RBLNModel):
+     def get_input_embeddings(self):
+         return self.embeddings
+
+     @classmethod
+     def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
+         class Blip2VisionModelWrapper(torch.nn.Module):
+             def __init__(self, model: "Blip2VisionModel") -> None:
+                 super().__init__()
+                 self.model = model
+
+             def forward(self, *args, **kwargs):
+                 kwargs.pop("return_dict", None)
+                 return self.model(*args, **kwargs, return_dict=False)
+
+         return Blip2VisionModelWrapper(model).eval()
+
+     @classmethod
+     def _update_rbln_config(
+         cls,
+         preprocessors: Optional[Union["AutoFeatureExtractor", "AutoProcessor", "AutoTokenizer"]],
+         model: Optional["PreTrainedModel"] = None,
+         model_config: Optional["PretrainedConfig"] = None,
+         rbln_config: Optional[RBLNModelConfig] = None,
+     ) -> RBLNModelConfig:
+         input_info = [
+             (
+                 "pixel_values",
+                 [
+                     rbln_config.batch_size,
+                     model_config.num_channels,
+                     model_config.image_size,
+                     model_config.image_size,
+                 ],
+                 "float32",
+             ),
+         ]
+
+         rbln_compile_config = RBLNCompileConfig(input_info=input_info)
+         rbln_config.set_compile_cfgs([rbln_compile_config])
+         return rbln_config
+
+     def forward(
+         self,
+         pixel_values: Optional[torch.FloatTensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         interpolate_pos_encoding: bool = False,
+     ) -> Union[Tuple, BaseModelOutputWithPooling]:
+         output = super().forward(pixel_values, return_dict=return_dict)
+         return output
+
+     def _prepare_output(self, output, return_dict):
+         """
+         Prepare model output based on return_dict flag.
+         This method can be overridden by subclasses to provide task-specific output handling.
+         """
+         if not return_dict:
+             return (output,) if not isinstance(output, (tuple, list)) else output
+         else:
+             return BaseModelOutputWithPooling(
+                 last_hidden_state=output[0],
+                 pooler_output=output[1],
+             )
+
+
+ class RBLNBlip2QFormerModel(RBLNModel):
+     def get_input_embeddings(self):
+         return self.embeddings.word_embeddings
+
+     @classmethod
+     def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
+         class Blip2QFormerModelWrapper(torch.nn.Module):
+             def __init__(self, model: "Blip2QFormerModel"):
+                 super().__init__()
+                 self.model = model
+
+             def forward(
+                 self,
+                 query_embeds: torch.FloatTensor,
+                 encoder_hidden_states: Optional[torch.FloatTensor] = None,
+                 encoder_attention_mask: Optional[torch.FloatTensor] = None,
+             ) -> torch.Tensor:
+                 qformer_out = self.model(
+                     query_embeds=query_embeds,
+                     encoder_hidden_states=encoder_hidden_states,
+                     encoder_attention_mask=encoder_attention_mask,
+                     return_dict=False,
+                 )
+                 return qformer_out
+
+         return Blip2QFormerModelWrapper(model).eval()
+
+     @classmethod
+     def _update_submodule_config(cls, model: "PreTrainedModel", rbln_config: "RBLNModelConfig") -> "RBLNModelConfig":
+         if rbln_config.num_query_tokens is None:
+             rbln_config.num_query_tokens = model.config.num_query_tokens
+
+         if rbln_config.image_text_hidden_size is None:
+             rbln_config.image_text_hidden_size = model.config.image_text_hidden_size
+
+         return rbln_config
+
+     @classmethod
+     def _update_rbln_config(
+         cls,
+         preprocessors: Optional[Union["AutoFeatureExtractor", "AutoProcessor", "AutoTokenizer"]],
+         model: Optional["PreTrainedModel"] = None,
+         model_config: Optional["PretrainedConfig"] = None,
+         rbln_config: Optional[RBLNModelConfig] = None,
+     ) -> RBLNModelConfig:
+         input_info = [
+             (
+                 "query_embeds",
+                 [
+                     rbln_config.batch_size,
+                     rbln_config.num_query_tokens,
+                     model_config.hidden_size,
+                 ],
+                 "float32",
+             ),
+             (
+                 "encoder_hidden_states",
+                 [
+                     rbln_config.batch_size,
+                     # image_text_hidden_size + cls token
+                     rbln_config.image_text_hidden_size + 1,
+                     model_config.encoder_hidden_size,
+                 ],
+                 "float32",
+             ),
+             (
+                 "encoder_attention_mask",
+                 # image_text_hidden_size + cls token
+                 [rbln_config.batch_size, rbln_config.image_text_hidden_size + 1],
+                 "int64",
+             ),
+         ]
+
+         rbln_compile_config = RBLNCompileConfig(input_info=input_info)
+         rbln_config.set_compile_cfgs([rbln_compile_config])
+         return rbln_config
+
+     def forward(
+         self,
+         query_embeds: torch.FloatTensor,
+         query_length: Optional[int] = None,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         head_mask: Optional[torch.FloatTensor] = None,
+         encoder_hidden_states: Optional[torch.FloatTensor] = None,
+         encoder_attention_mask: Optional[torch.FloatTensor] = None,
+         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+         output = super().forward(query_embeds, encoder_hidden_states, encoder_attention_mask, return_dict=return_dict)
+         return output
+
+     def _prepare_output(self, output, return_dict):
+         """
+         Prepare model output based on return_dict flag.
+         This method can be overridden by subclasses to provide task-specific output handling.
+         """
+         if not return_dict:
+             return (output,) if not isinstance(output, (tuple, list)) else output
+         else:
+             return BaseModelOutputWithPoolingAndCrossAttentions(
+                 last_hidden_state=output[0],
+                 pooler_output=output[1],
+             )
+
+
+ class RBLNBlip2ForConditionalGeneration(RBLNModel):
+     auto_model_class = AutoModelForVisualQuestionAnswering
+     _rbln_submodules = [{"name": "vision_model"}, {"name": "qformer"}, {"name": "language_model"}]
+
+     def __getattr__(self, __name: str) -> Any:
+         def redirect(func):
+             return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
+
+         val = getattr(Blip2ForConditionalGeneration, __name)
+
+         if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
+             return redirect(val)
+         return val
+
+     def can_generate(self):
+         return True
+
+     @classmethod
+     def save_torch_artifacts(
+         cls,
+         model: "Blip2ForConditionalGeneration",
+         save_dir_path: Path,
+         subfolder: str,
+         rbln_config: RBLNModelConfig,
+     ):
+         """
+         If you are unavoidably running on a CPU rather than an RBLN device,
+         store the torch tensor, weight, etc. in this function.
+         """
+         save_dict = {}
+         save_dict["query_tokens"] = model.query_tokens
+         torch.save(save_dict, save_dir_path / subfolder / "query_tokens.pth")
+
+     def __post_init__(self, **kwargs):
+         self.vision_model = self.rbln_submodules[0]
+         self.language_model = self.rbln_submodules[2]
+         self.qformer = self.rbln_submodules[1]
+         self.language_projection = self.model[0]
+
+         artifacts = torch.load(self.model_save_dir / self.subfolder / "query_tokens.pth", weights_only=False)
+         self.query_tokens = artifacts["query_tokens"]
+
+     def get_attn_impl(self) -> str:
+         return self.rbln_config.language_model.attn_impl
+
+     def get_kvcache_num_blocks(self) -> int:
+         return self.rbln_config.language_model.kvcache_num_blocks
+
+     def get_input_embeddings(self):
+         return self.language_model.get_input_embeddings()
+
+     @classmethod
+     def wrap_model_if_needed(cls, model, rbln_config):
+         return model.language_projection
+
+     @classmethod
+     def _update_rbln_config(
+         cls,
+         preprocessors: Optional[Union["AutoFeatureExtractor", "AutoProcessor", "AutoTokenizer"]],
+         model: Optional["PreTrainedModel"] = None,
+         model_config: Optional["PretrainedConfig"] = None,
+         rbln_config: Optional[RBLNModelConfig] = None,
+     ) -> RBLNModelConfig:
+         input_info = [
+             (
+                 "query_output",
+                 [
+                     rbln_config.batch_size,
+                     model_config.num_query_tokens,
+                     model_config.qformer_config.hidden_size,
+                 ],
+                 "float32",
+             ),
+         ]
+
+         rbln_compile_config = RBLNCompileConfig(input_info=input_info)
+         rbln_config.set_compile_cfgs([rbln_compile_config])
+
+         return rbln_config
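Note: a minimal usage sketch for the new Blip2 class. The `export=True` compile-and-load flow and the `rbln_batch_size` keyword follow the usual optimum-rbln `from_pretrained` convention, and the checkpoint name is only an example; both are assumptions, not part of this diff.

    from optimum.rbln import RBLNBlip2ForConditionalGeneration

    # Compiles the declared submodules (vision_model, qformer, language_model)
    # plus the language_projection, then saves the RBLN artifacts.
    model = RBLNBlip2ForConditionalGeneration.from_pretrained(
        "Salesforce/blip2-opt-2.7b",  # example checkpoint (assumption)
        export=True,
        rbln_batch_size=1,
    )
    model.save_pretrained("blip2-rbln")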

optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py
@@ -12,13 +12,13 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- from typing import Any, Dict, Optional
+ from typing import Any, Dict, List, Optional, Union

  import rebel

  from ....configuration_utils import RBLNModelConfig
  from ....utils.logging import get_logger
- from ...utils.rbln_quantization import QuantizationManager
+ from ...utils.rbln_quantization import RBLNQuantizationConfig


  logger = get_logger()
@@ -31,12 +31,14 @@ class RBLNDecoderOnlyModelForCausalLMConfig(RBLNModelConfig):
          max_seq_len: Optional[int] = None,
          use_inputs_embeds: Optional[bool] = None,
          use_attention_mask: Optional[bool] = None,
+         use_position_ids: Optional[bool] = None,
          attn_impl: Optional[str] = None,
          kvcache_partition_len: Optional[int] = None,
          kvcache_block_size: Optional[int] = None,
-         quantization: Optional[Dict[str, Any]] = None,
+         quantization: Optional[Union[Dict[str, Any], RBLNQuantizationConfig]] = None,
          prefill_chunk_size: Optional[int] = None,
          kvcache_num_blocks: Optional[int] = None,
+         decoder_batch_sizes: Optional[List[int]] = None,
          **kwargs,
      ):
          """
@@ -46,6 +48,7 @@ class RBLNDecoderOnlyModelForCausalLMConfig(RBLNModelConfig):
              use_inputs_embeds (Optional[bool]): Whether to use input embeddings directly. Defaults to False.
              use_attention_mask (Optional[bool]): Whether to use attention masks. This is automatically set to True
                  for RBLN-CA02 devices.
+             use_position_ids (Optional[bool]): Whether to use position IDs. Defaults to False.
              attn_impl (Optional[str]): The attention implementation to use.
              kvcache_partition_len (Optional[int]): The length of each KV cache partition.
              kvcache_block_size (Optional[int]): The block size for KV cache.
@@ -53,6 +56,13 @@ class RBLNDecoderOnlyModelForCausalLMConfig(RBLNModelConfig):
              prefill_chunk_size (Optional[int]): The chunk size for prefilling the KV cache. Defaults to 128,
                  and must be a positive integer divisible by 64.
              kvcache_num_blocks (Optional[int]): The number of blocks in the KV cache.
+             decoder_batch_sizes (Optional[List[int]]): A list of batch sizes for which separate decoder models will be compiled.
+                 This allows the model to handle varying batch sizes efficiently during generation. If not specified,
+                 defaults to a list containing only the model's main batch size. When specifying multiple batch sizes:
+                 1) All values must be less than or equal to the main batch size.
+                 2) The list will be sorted in descending order (larger batch sizes first).
+                 3) If using multiple decoders, at least one batch size should match the main batch size.
+
              **kwargs: Additional arguments passed to the parent RBLNModelConfig.

          Raises:
@@ -66,8 +76,9 @@ class RBLNDecoderOnlyModelForCausalLMConfig(RBLNModelConfig):
          self.max_seq_len = max_seq_len
          self.use_inputs_embeds = use_inputs_embeds or False
-
+         self.use_position_ids = use_position_ids or False
          self.use_attention_mask = use_attention_mask
+
          npu = self.npu or rebel.get_npu_name()
          if npu == "RBLN-CA02":
              if self.use_attention_mask is False:
@@ -76,15 +87,40 @@ class RBLNDecoderOnlyModelForCausalLMConfig(RBLNModelConfig):
          else:
              self.use_attention_mask = self.use_attention_mask or False

+         if self.use_position_ids and not self.use_attention_mask:
+             raise ValueError("Position IDs should be used with attention mask.")
+
          self.attn_impl = attn_impl
          self.kvcache_partition_len = kvcache_partition_len
          self.kvcache_block_size = kvcache_block_size
          self.quantization = quantization or {}
-         if self.quantization:
-             QuantizationManager.validate_quantization_config(self.quantization)
+         if self.quantization and isinstance(self.quantization, dict):
+             self.quantization = RBLNQuantizationConfig(**self.quantization)

          self.prefill_chunk_size = prefill_chunk_size or 128
          if self.prefill_chunk_size % 64 != 0 or self.prefill_chunk_size <= 0:
              raise ValueError("`prefill_chunk_size` must be a positive integer divisible by 64.")

          self.kvcache_num_blocks = kvcache_num_blocks
+         self.decoder_batch_sizes = decoder_batch_sizes
+         if self.decoder_batch_sizes is None:
+             self.decoder_batch_sizes = [self.batch_size]
+
+         if self.use_multiple_decoder:
+             if max(self.decoder_batch_sizes) > self.batch_size:
+                 raise ValueError(
+                     f"Decoder batch size ({max(self.decoder_batch_sizes)}) must be less than or equal to the runtime batch size ({self.batch_size})."
+                 )
+             if max(self.decoder_batch_sizes) < self.batch_size:
+                 logger.warning(
+                     f"Maximum decoder batch size ({max(self.decoder_batch_sizes)}) is less than the model's batch size ({self.batch_size}). "
+                     "Appending the model's batch size to the decoder batch size."
+                 )
+                 self.decoder_batch_sizes.append(self.batch_size)
+
+             # Larger batch size should be at the beginning of the list.
+             self.decoder_batch_sizes.sort(reverse=True)
+
+     @property
+     def use_multiple_decoder(self):
+         return isinstance(self.decoder_batch_sizes, list) and len(self.decoder_batch_sizes) > 1
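Note: the net effect of the new `decoder_batch_sizes` handling is easiest to see distilled outside the class. A sketch of the normalization rules exactly as implemented above (illustration only):

    # Distilled from the constructor above.
    batch_size = 4
    decoder_batch_sizes = [1, 2]  # user-supplied
    if max(decoder_batch_sizes) > batch_size:
        raise ValueError("decoder batch sizes must not exceed the runtime batch size")
    if max(decoder_batch_sizes) < batch_size:
        # The constructor warns, then appends the main batch size.
        decoder_batch_sizes.append(batch_size)
    decoder_batch_sizes.sort(reverse=True)   # larger batch sizes first
    assert decoder_batch_sizes == [4, 2, 1]  # three decoders will be compiled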

optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py
@@ -146,7 +146,10 @@ class DecoderOnlyWrapper(nn.Module):
          max_seq_len: int,
          use_rotary_emb: bool,
          attn_impl: str,
+         use_inputs_embeds: bool,
          use_attention_mask: bool,
+         use_position_ids: bool,
+         use_learned_pos_emb: Optional[bool] = None,
          kvcache_partition_len: Optional[int] = None,
          kvcache_block_size: Optional[int] = None,
      ):
@@ -161,6 +164,10 @@ class DecoderOnlyWrapper(nn.Module):
          self.attn_impl = attn_impl
          self.kvcache_block_size = kvcache_block_size
          self.use_attention_mask = use_attention_mask
+         self.use_position_ids = use_position_ids
+         self.use_inputs_embeds = use_inputs_embeds
+         self.use_learned_pos_emb = use_learned_pos_emb
+
          if self.attn_impl == "flash_attn":
              self.kvcache_partition_len = kvcache_partition_len or DEFAULT_FLASH_ATTN_PARTITION_LENGTH
          elif self.attn_impl == "eager":
@@ -209,6 +216,7 @@ class DecoderOnlyWrapper(nn.Module):
              partition_len=self.kvcache_partition_len,
              max_seq_len=max_seq_len,
              kvcache_block_size=self.kvcache_block_size,
+             use_learned_pos_emb=self.use_learned_pos_emb,
          )
          new_causal_lm = DecoderOnlyForCausalLM(causal_lm, new_model)
          return new_causal_lm
@@ -222,24 +230,16 @@ class DecoderOnlyWrapper(nn.Module):
          self._phase = phase
          self.causal_lm.phase = phase

-     def forward_common(
-         self,
-         input_ids_or_inputs_embeds: torch.Tensor,
-         cache_position: torch.Tensor,
-         attention_mask: torch.Tensor,
-         query_position: torch.Tensor,
-         block_tables: torch.Tensor,
-         rotary_emb: Union[nn.Module, torch.Tensor],
-         *past_key_values: List[torch.Tensor],
-     ):
-         if input_ids_or_inputs_embeds.ndim == 2:
-             input_ids = input_ids_or_inputs_embeds
-             inputs_embeds = None
-         elif input_ids_or_inputs_embeds.ndim == 3:
-             input_ids = None
-             inputs_embeds = input_ids_or_inputs_embeds
-         else:
-             raise NotImplementedError(f"Unknown ndim of input : {input_ids_or_inputs_embeds.ndim}")
+     def prepare_forward_args(self, *args):
+         args = list(args)
+         input_ids = None if self.use_inputs_embeds else args.pop(0)
+         inputs_embeds = args.pop(0) if self.use_inputs_embeds else None
+         cache_position = args.pop(0)
+         block_tables = args.pop(0)
+         query_position = args.pop(0) if self.phase == "prefill" else None
+         attention_mask = args.pop(0) if self.use_attention_mask else None
+         position_ids = args.pop(0) if self.use_position_ids else None
+         past_key_values = args

          if len(past_key_values) != 2 * self.num_hidden_layers:
              raise ValueError(
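Note: the phase-specific tuple unpacking deleted further below is replaced by this single flag-driven pop order. A sketch of the flat positional layout `prepare_forward_args` now consumes, derived from the pops above (the helper name is hypothetical):

    # Hypothetical helper mirroring the pop order of prepare_forward_args.
    def flatten_inputs(wrapper, main_input, cache_position, block_tables,
                       query_position=None, attention_mask=None,
                       position_ids=None, past_key_values=()):
        args = [main_input, cache_position, block_tables]  # input_ids OR inputs_embeds
        if wrapper.phase == "prefill":
            args.append(query_position)
        if wrapper.use_attention_mask:
            args.append(attention_mask)
        if wrapper.use_position_ids:
            args.append(position_ids)
        return (*args, *past_key_values)  # 2 * num_hidden_layers cache tensors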
@@ -256,11 +256,37 @@ class DecoderOnlyWrapper(nn.Module):
              _past_key_values.append(past_key_value)
          past_key_values = _past_key_values

+         return (
+             input_ids,
+             inputs_embeds,
+             cache_position,
+             block_tables,
+             query_position,
+             attention_mask,
+             position_ids,
+             past_key_values,
+             self.rotary_emb,
+         )
+
+     def forward(self, *args):
+         (
+             input_ids,
+             inputs_embeds,
+             cache_position,
+             block_tables,
+             query_position,
+             attention_mask,
+             position_ids,
+             past_key_values,
+             rotary_emb,
+         ) = self.prepare_forward_args(*args)
+
          logit = self.causal_lm(
              input_ids=input_ids,
              inputs_embeds=inputs_embeds,
              attention_mask=attention_mask,
              cache_position=cache_position,
+             position_ids=position_ids,
              query_position=query_position,
              past_key_values=past_key_values,
              rotary_emb=rotary_emb,
@@ -269,58 +295,6 @@ class DecoderOnlyWrapper(nn.Module):

          return logit

-     def forward(self, *args):
-         if self.phase == "decode":
-             if self.use_attention_mask:
-                 (
-                     input_ids_or_inputs_embeds,
-                     cache_position,
-                     attention_mask,
-                     block_tables,
-                     *past_key_values,
-                 ) = args
-             else:
-                 (
-                     input_ids_or_inputs_embeds,
-                     cache_position,
-                     block_tables,
-                     *past_key_values,
-                 ) = args
-                 attention_mask = None
-             query_position = None
-         elif self.phase == "prefill":
-             if self.use_attention_mask:
-                 (
-                     input_ids_or_inputs_embeds,
-                     cache_position,
-                     attention_mask,
-                     query_position,
-                     block_tables,
-                     *past_key_values,
-                 ) = args
-             else:
-                 (
-                     input_ids_or_inputs_embeds,
-                     cache_position,
-                     query_position,
-                     block_tables,
-                     *past_key_values,
-                 ) = args
-                 attention_mask = None
-
-         else:
-             raise ValueError(f"Unknown phase: {self.phase}")
-
-         return self.forward_common(
-             input_ids_or_inputs_embeds,
-             cache_position,
-             attention_mask,
-             query_position,
-             block_tables,
-             self.rotary_emb,
-             *past_key_values,
-         )
-

  class DecoderOnlyForCausalLM(nn.Module):
      """A specialized wrapper for Causal Language Models optimized for RBLN compilation.
@@ -367,6 +341,7 @@ class DecoderOnlyForCausalLM(nn.Module):
          inputs_embeds: torch.Tensor = None,
          attention_mask: torch.Tensor = None,
          cache_position: torch.Tensor = None,
+         position_ids: torch.Tensor = None,
          query_position: torch.Tensor = None,
          past_key_values: Tuple[Tuple[torch.Tensor]] = None,
          rotary_emb: nn.Module = None,
@@ -378,6 +353,7 @@ class DecoderOnlyForCausalLM(nn.Module):
              inputs_embeds=inputs_embeds,
              attention_mask=attention_mask,
              cache_position=cache_position,
+             position_ids=position_ids,
              past_key_values=past_key_values,
              rotary_emb=rotary_emb,
              block_tables=block_tables,
@@ -404,7 +380,13 @@ class DecoderOnlyModel(nn.Module):
      """

      def __init__(
-         self, model, layers: List["DecoderOnlyLayer"], partition_len=None, max_seq_len=None, kvcache_block_size=None
+         self,
+         model,
+         layers: List["DecoderOnlyLayer"],
+         partition_len=None,
+         max_seq_len=None,
+         kvcache_block_size=None,
+         use_learned_pos_emb=None,
      ):
          super().__init__()
          self._original_mod = model
@@ -413,6 +395,7 @@ class DecoderOnlyModel(nn.Module):
          self.partition_len = partition_len
          self.kvcache_block_size = kvcache_block_size
          self.max_seq_len = max_seq_len
+         self.use_learned_pos_emb = use_learned_pos_emb

      @property
      def phase(self):
@@ -457,11 +440,12 @@ class DecoderOnlyModel(nn.Module):
      def forward(
          self,
          input_ids: torch.Tensor = None,
-         inputs_embeds: torch.Tensor = None,
+         inputs_embeds: Optional[torch.Tensor] = None,
          attention_mask: torch.Tensor = None,
          cache_position: torch.Tensor = None,
+         position_ids: torch.Tensor = None,
          past_key_values: Tuple[Tuple[torch.Tensor]] = None,
-         rotary_emb: nn.Module = None,
+         rotary_emb: Optional[Union[nn.Module, torch.Tensor]] = None,
          block_tables: Optional[torch.Tensor] = None,
      ):
          # retrieve input_ids and inputs_embeds
@@ -477,24 +461,38 @@ class DecoderOnlyModel(nn.Module):
          hidden_states = inputs_embeds * self.hidden_multiplier

          # get cos,sin vector if needed
+         position_ids = position_ids if position_ids is not None else cache_position
          if rotary_emb is not None:
              if isinstance(rotary_emb, torch.Tensor):
                  cos = rotary_emb[0]
                  sin = rotary_emb[1]
              else:
                  cos, sin = rotary_emb(hidden_states, self.max_seq_len)  # dtype carrier, max_seq_len
-             cos, sin = slice_and_unsqueeze_cos_sin(cos, sin, cache_position)
+             cos, sin = slice_and_unsqueeze_cos_sin(cos, sin, position_ids)
+
+         elif self.use_learned_pos_emb:
+             batch_size = inputs_embeds.shape[0]
+             hidden_all = []
+             for i in range(batch_size):
+                 positions_idx = position_ids[i]
+                 position_weight = self.get_pos_embedding().weight[2:]
+                 position = position_weight[positions_idx]
+                 batch_hidden = position + inputs_embeds[i]
+                 hidden_all.append(batch_hidden)
+             hidden_states = torch.stack(hidden_all, dim=0)
+             cos, sin = None, None
+
          else:
              batch_size = inputs_embeds.shape[0]
-             if cache_position.shape[0] > 1:
+             if position_ids.shape[0] > 1:
                  position_embeds = []
                  for b_idx in range(batch_size):
-                     position_embed = self.get_pos_embedding()(cache_position[b_idx])
+                     position_embed = self.get_pos_embedding()(position_ids[b_idx])
                      position_embeds.append(position_embed)

                  position_embeds = torch.cat(position_embeds, dim=0).unsqueeze(1)
              else:
-                 position_embeds = self.get_pos_embedding()(cache_position)
+                 position_embeds = self.get_pos_embedding()(position_ids)
              hidden_states = hidden_states + position_embeds
              cos, sin = None, None
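Note: the `weight[2:]` slice in the new `use_learned_pos_emb` branch matches OPT's learned positional embedding, whose table reserves its first two rows (offset 2). A self-contained sketch of the equivalence:

    import torch

    emb = torch.nn.Embedding(10 + 2, 8)  # OPT-style table with 2 reserved rows
    position_ids = torch.tensor([0, 1, 2])
    vecs = emb.weight[2:][position_ids]  # what the branch above computes
    assert torch.equal(vecs, emb(position_ids + 2))  # same as an offset-2 lookup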
@@ -798,6 +796,7 @@ class AttentionOp(nn.Module):
                  scale=scale,
                  block_table=block_tables,
                  block_size=block_size,
+                 mask=None,
              )

          else:
@@ -825,6 +824,8 @@ class AttentionOp(nn.Module):
                  scale=scale,
                  block_table=block_tables,
                  block_size=block_size,
+                 is_bidirectional=False,
+                 mask=None,
              )

          attn_output = attn_output.view(batch_size, self.num_heads, -1, self.head_dim)
@@ -1058,6 +1059,7 @@ class FlashAttentionOp(AttentionOp):
                  block_table=block_tables,
                  block_size=kvcache_block_size,
                  partition=self.kvcache_partition_size,
+                 mask=None,
              )
          else:
              if self.use_attention_mask:
@@ -1086,6 +1088,8 @@ class FlashAttentionOp(AttentionOp):
                  block_table=block_tables,
                  block_size=kvcache_block_size,
                  partition=self.kvcache_partition_size,
+                 is_bidirectional=False,
+                 mask=None,
              )

          # reshape for removing repeat_kv