optimum-rbln 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (72)
  1. optimum/rbln/__init__.py +14 -7
  2. optimum/rbln/__version__.py +1 -1
  3. optimum/rbln/diffusers/models/autoencoder_kl.py +30 -63
  4. optimum/rbln/diffusers/models/controlnet.py +36 -62
  5. optimum/rbln/diffusers/models/unet_2d_condition.py +57 -156
  6. optimum/rbln/diffusers/pipelines/__init__.py +40 -12
  7. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +11 -0
  8. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +9 -187
  9. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +8 -192
  10. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +8 -206
  11. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +8 -207
  12. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +3 -111
  13. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +12 -117
  14. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +4 -123
  15. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +4 -126
  16. optimum/rbln/modeling_alias.py +4 -9
  17. optimum/rbln/modeling_base.py +117 -144
  18. optimum/rbln/modeling_config.py +51 -0
  19. optimum/rbln/modeling_diffusers.py +400 -0
  20. optimum/rbln/transformers/__init__.py +10 -0
  21. optimum/rbln/transformers/cache_utils.py +5 -9
  22. optimum/rbln/transformers/modeling_rope_utils.py +283 -0
  23. optimum/rbln/transformers/models/__init__.py +80 -28
  24. optimum/rbln/transformers/models/auto/modeling_auto.py +1 -0
  25. optimum/rbln/transformers/models/bart/__init__.py +1 -1
  26. optimum/rbln/transformers/models/bart/bart_architecture.py +18 -12
  27. optimum/rbln/transformers/models/bart/modeling_bart.py +25 -6
  28. optimum/rbln/transformers/models/bert/modeling_bert.py +1 -2
  29. optimum/rbln/transformers/models/clip/modeling_clip.py +13 -23
  30. optimum/rbln/transformers/models/decoderonly/__init__.py +0 -2
  31. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +376 -218
  32. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +246 -116
  33. optimum/rbln/transformers/models/dpt/modeling_dpt.py +0 -1
  34. optimum/rbln/transformers/models/exaone/__init__.py +32 -0
  35. optimum/rbln/transformers/models/exaone/exaone_architecture.py +81 -0
  36. optimum/rbln/transformers/models/exaone/hf_hub_cached/configuration_exaone.py +181 -0
  37. optimum/rbln/transformers/models/exaone/hf_hub_cached/modeling_exaone.py +1725 -0
  38. optimum/rbln/transformers/models/exaone/modeling_exaone.py +53 -0
  39. optimum/rbln/transformers/models/gemma/gemma_architecture.py +12 -2
  40. optimum/rbln/transformers/models/gemma/modeling_gemma.py +4 -28
  41. optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +4 -30
  42. optimum/rbln/transformers/models/llama/modeling_llama.py +4 -28
  43. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +166 -151
  44. optimum/rbln/transformers/models/midm/midm_architecture.py +4 -15
  45. optimum/rbln/transformers/models/midm/modeling_midm.py +8 -28
  46. optimum/rbln/transformers/models/mistral/modeling_mistral.py +4 -29
  47. optimum/rbln/transformers/models/phi/modeling_phi.py +5 -31
  48. optimum/rbln/transformers/models/phi/phi_architecture.py +75 -159
  49. optimum/rbln/transformers/models/qwen2/__init__.py +24 -0
  50. optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +43 -0
  51. optimum/rbln/transformers/models/qwen2/qwen2_architecture.py +29 -0
  52. optimum/rbln/transformers/models/seq2seq/__init__.py +24 -0
  53. optimum/rbln/{modeling_seq2seq.py → transformers/models/seq2seq/modeling_seq2seq.py} +107 -166
  54. optimum/rbln/transformers/models/t5/__init__.py +1 -0
  55. optimum/rbln/transformers/models/t5/modeling_t5.py +108 -0
  56. optimum/rbln/transformers/models/t5/t5_architecture.py +46 -32
  57. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +0 -1
  58. optimum/rbln/transformers/models/whisper/modeling_whisper.py +38 -13
  59. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +1 -2
  60. optimum/rbln/transformers/utils/rbln_quantization.py +8 -2
  61. optimum/rbln/utils/context.py +58 -0
  62. optimum/rbln/utils/decorator_utils.py +55 -0
  63. optimum/rbln/utils/import_utils.py +21 -0
  64. optimum/rbln/utils/logging.py +1 -1
  65. optimum/rbln/utils/runtime_utils.py +4 -4
  66. optimum/rbln/utils/timer_utils.py +26 -2
  67. {optimum_rbln-0.1.11.dist-info → optimum_rbln-0.1.13.dist-info}/METADATA +11 -9
  68. optimum_rbln-0.1.13.dist-info/RECORD +107 -0
  69. {optimum_rbln-0.1.11.dist-info → optimum_rbln-0.1.13.dist-info}/WHEEL +1 -1
  70. optimum_rbln-0.1.11.dist-info/RECORD +0 -93
  71. {optimum_rbln-0.1.11.dist-info → optimum_rbln-0.1.13.dist-info}/entry_points.txt +0 -0
  72. {optimum_rbln-0.1.11.dist-info → optimum_rbln-0.1.13.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/exaone/modeling_exaone.py
@@ -0,0 +1,53 @@
+ # Copyright 2024 Rebellions Inc.
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at:
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # Portions of this software are licensed under the Apache License,
+ # Version 2.0. See the NOTICE file distributed with this work for
+ # additional information regarding copyright ownership.
+
+ # All other portions of this software, including proprietary code,
+ # are the intellectual property of Rebellions Inc. and may not be
+ # copied, modified, or distributed without prior written permission
+ # from Rebellions Inc.
+
+ from ....utils import logging
+ from ..decoderonly import RBLNDecoderOnlyModelForCausalLM
+ from .exaone_architecture import ExaoneForCausalLMWrapper
+ from .hf_hub_cached.modeling_exaone import ExaoneForCausalLM
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class RBLNExaoneForCausalLM(RBLNDecoderOnlyModelForCausalLM):
+     """
+     The Exaone Model transformer with a language modeling head on top (linear layer with weights tied to the input
+     embeddings).
+
+     This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the
+     library implements for all its model.
+
+     It implements the methods to convert a pre-trained transformers Exaone model into a RBLN transformer model by:
+     - transferring the checkpoint weights of the original into an optimized RBLN graph,
+     - compiling the resulting graph using the RBLN compiler.
+
+     """
+
+     _decoder_wrapper_cls = ExaoneForCausalLMWrapper
+     _original_cls = ExaoneForCausalLM
+
+     @classmethod
+     def from_pretrained(cls, *args, **kwargs):
+         kwargs.setdefault("trust_remote_code", True)
+         return super().from_pretrained(*args, **kwargs)
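
Note (not part of the diff): the `from_pretrained` override above defaults `trust_remote_code=True` because the EXAONE checkpoint relies on custom modeling code from the Hugging Face Hub. A hypothetical usage sketch, assuming the class is re-exported from `optimum.rbln` and accepts the usual `export` / `rbln_max_seq_len` compile-time keywords (both are assumptions, not confirmed by this excerpt):

    # Hypothetical sketch; the model id and compile kwargs are assumptions.
    from optimum.rbln import RBLNExaoneForCausalLM

    model = RBLNExaoneForCausalLM.from_pretrained(
        "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",  # assumed checkpoint id
        export=True,                             # assumed: compile the HF checkpoint for RBLN
        rbln_max_seq_len=4096,                   # assumed: feeds rbln_config.model_cfg["max_seq_len"]
    )
    # trust_remote_code=True is injected automatically by the override via kwargs.setdefault(...).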
optimum/rbln/transformers/models/gemma/gemma_architecture.py
@@ -29,11 +29,11 @@ from transformers.modeling_outputs import (
  )
 
  from ...models.decoderonly import (
-     DecoderOnlyAttention,
      DecoderOnlyDecoderLayer,
      DecoderOnlyWrapper,
      slice_and_unsqueeze_cos_sin,
  )
+ from ...models.decoderonly.decoderonly_architecture import DECODERONLY_ATTENTION_CLASSES
 
 
  class GemmaWrapper(DecoderOnlyWrapper):
@@ -43,7 +43,7 @@ class GemmaWrapper(DecoderOnlyWrapper):
              {
                  "wrapper": GemmaModel.forward,
                  "model": DecoderOnlyDecoderLayer.forward,
-                 "decoder_layer": DecoderOnlyAttention.forward,
+                 "decoder_layer": DECODERONLY_ATTENTION_CLASSES[self.attn_implementation].forward,
              }
          )
          return forward_dict
@@ -61,9 +61,17 @@ class GemmaModel:
          use_cache: Optional[bool] = True,
          output_attentions: Optional[bool] = False,
          output_hidden_states: Optional[bool] = False,
+         cache_pos_for_partitions: Optional[torch.Tensor] = None,
+         kvcache_partition_size: Optional[torch.Tensor] = None,
          forward_dict: Optional[Dict[str, classmethod]] = None,
          rotary_pos_emb=None,
      ) -> Union[Tuple, BaseModelOutputWithPast]:
+         # retrieve input_ids and inputs_embeds
+         if (input_ids is None) ^ (inputs_embeds is not None):
+             raise ValueError(
+                 "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+             )
+
          # embed positions
          inputs_embeds = self.embed_tokens(input_ids)
          hidden_states = inputs_embeds
@@ -96,6 +104,8 @@ class GemmaModel:
                  batch_ids=batch_ids,
                  cos=cos,
                  sin=sin,
+                 cache_pos_for_partitions=cache_pos_for_partitions,
+                 kvcache_partition_size=kvcache_partition_size,
                  forward_dict=forward_dict,
              )
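
The gemma_architecture.py hunks above switch the wrapper from a hard-coded `DecoderOnlyAttention.forward` to a lookup in `DECODERONLY_ATTENTION_CLASSES`, keyed by `self.attn_implementation`, and thread the new `cache_pos_for_partitions` / `kvcache_partition_size` arguments through the layers. A minimal, self-contained sketch of that registry pattern (the class names and keys below are illustrative placeholders, not optimum-rbln internals):

    # Illustrative only: a string-keyed registry of attention classes whose unbound
    # .forward is resolved once and later called with an existing module as `self`.
    class EagerAttention:
        def forward(self, hidden_states):
            return f"eager({hidden_states})"

    class PartitionedAttention:
        def forward(self, hidden_states):
            return f"partitioned({hidden_states})"

    ATTENTION_CLASSES = {"eager": EagerAttention, "partitioned": PartitionedAttention}

    attn_implementation = "partitioned"
    forward = ATTENTION_CLASSES[attn_implementation].forward  # unbound function, as in forward_dict
    print(forward(PartitionedAttention(), "hidden_states"))   # the wrapper passes the HF module here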
 
optimum/rbln/transformers/models/gemma/modeling_gemma.py
@@ -21,28 +21,18 @@
  # copied, modified, or distributed without prior written permission
  # from Rebellions Inc.
 
- import inspect
- import logging
- from typing import TYPE_CHECKING, Any, Callable
-
- from transformers import GemmaForCausalLM
-
+ from ....utils import logging
  from ...models.decoderonly import RBLNDecoderOnlyModelForCausalLM
  from .gemma_architecture import GemmaWrapper
 
 
- if TYPE_CHECKING:
-     from transformers import PreTrainedModel
-
-     from ....modeling_config import RBLNConfig
-
- logger = logging.getLogger(__name__)
+ logger = logging.get_logger(__name__)
 
 
  class RBLNGemmaForCausalLM(RBLNDecoderOnlyModelForCausalLM):
      """
      The Gemma Model transformer with a language modeling head (linear layer) on top.
-     This model inherits from [`RBLNMultiModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
+     This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the library implements for all its models.
 
      A class to convert and run pre-trained transformers based GemmaForCausalLM model on RBLN devices.
      It implements the methods to convert a pre-trained transformers GemmaForCausalLM model into a RBLN transformer model by:
@@ -50,18 +40,4 @@ class RBLNGemmaForCausalLM(RBLNDecoderOnlyModelForCausalLM):
      - compiling the resulting graph using the RBLN compiler.
      """
 
-     @classmethod
-     def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
-         rbln_max_seq_len = rbln_config.model_cfg["max_seq_len"]
-         return GemmaWrapper(model, rbln_max_seq_len).eval()
-
-     def __getattr__(self, __name: str) -> Any:
-         def redirect(func):
-             return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
-
-         val = getattr(GemmaForCausalLM, __name)
-
-         if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
-             return redirect(val)
-
-         return val
+     _decoder_wrapper_cls = GemmaWrapper
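
The same refactor repeats for GPT2 and Llama below: the per-model `wrap_model_if_needed` / `__getattr__` boilerplate moves into the shared `RBLNDecoderOnlyModelForCausalLM` base class, and each subclass now only declares `_decoder_wrapper_cls`. A plausible sketch of how the base class could consume that attribute, mirroring the removed helper (this is an assumption about modeling_decoderonly.py, which is not shown in this excerpt):

    # Plausible sketch only; the real 0.1.13 base class may differ.
    class RBLNDecoderOnlyModelForCausalLM:
        _decoder_wrapper_cls = None  # e.g. GemmaWrapper, GPT2LMHeadModelWrapper, LlamaWrapper
        _original_cls = None         # optional custom HF class, e.g. ExaoneForCausalLM

        @classmethod
        def wrap_model_if_needed(cls, model, rbln_config):
            # Same body as the removed per-model helpers, written once for all decoder-only models.
            max_seq_len = rbln_config.model_cfg["max_seq_len"]
            return cls._decoder_wrapper_cls(model, max_seq_len).eval()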
optimum/rbln/transformers/models/gpt2/modeling_gpt2.py
@@ -21,20 +21,12 @@
  # copied, modified, or distributed without prior written permission
  # from Rebellions Inc.
 
- import inspect
- import logging
- from typing import TYPE_CHECKING, Any, Callable
-
- from transformers import GPT2LMHeadModel
-
- from ....modeling_config import RBLNConfig
+ from ....utils import logging
  from ...models.decoderonly import RBLNDecoderOnlyModelForCausalLM
  from .gpt2_architecture import GPT2LMHeadModelWrapper
 
 
- logger = logging.getLogger(__name__)
- if TYPE_CHECKING:
-     from transformers import PreTrainedModel
+ logger = logging.get_logger(__name__)
 
 
  class RBLNGPT2LMHeadModel(RBLNDecoderOnlyModelForCausalLM):
@@ -42,7 +34,7 @@ class RBLNGPT2LMHeadModel(RBLNDecoderOnlyModelForCausalLM):
      The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
      embeddings).
 
-     This model inherits from [`RBLNMultiModel`]. Check the superclass documentation for the generic methods the
+     This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the
      library implements for all its model.
 
      It implements the methods to convert a pre-trained transformers GPT2 model into a RBLN transformer model by:
@@ -51,22 +43,4 @@ class RBLNGPT2LMHeadModel(RBLNDecoderOnlyModelForCausalLM):
 
      """
 
-     @classmethod
-     def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
-         rbln_max_seq_len = rbln_config.model_cfg["max_seq_len"]
-         return GPT2LMHeadModelWrapper(model, rbln_max_seq_len).eval()
-
-     def __getattr__(self, __name: str) -> Any:
-         """This is the key method to implement RBLN-GPT2.
-
-         Returns:
-             Any: GPT2's corresponding method
-         """
-
-         def redirect(func):
-             return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
-
-         val = getattr(GPT2LMHeadModel, __name)
-         if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
-             return redirect(val)
-         return val
+     _decoder_wrapper_cls = GPT2LMHeadModelWrapper
optimum/rbln/transformers/models/llama/modeling_llama.py
@@ -21,28 +21,18 @@
  # copied, modified, or distributed without prior written permission
  # from Rebellions Inc.
 
- import inspect
- import logging
- from typing import TYPE_CHECKING, Any, Callable
-
- from transformers import LlamaForCausalLM
-
+ from ....utils import logging
  from ...models.decoderonly import RBLNDecoderOnlyModelForCausalLM
  from .llama_architecture import LlamaWrapper
 
 
- if TYPE_CHECKING:
-     from transformers import PreTrainedModel
-
-     from ....modeling_config import RBLNConfig
-
- logger = logging.getLogger(__name__)
+ logger = logging.get_logger(__name__)
 
 
  class RBLNLlamaForCausalLM(RBLNDecoderOnlyModelForCausalLM):
      """
      The Llama Model transformer with a language modeling head (linear layer) on top.
-     This model inherits from [`RBLNMultiModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
+     This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the library implements for all its models.
 
      A class to convert and run pre-trained transformers based LlamaForCausalLM model on RBLN devices.
      It implements the methods to convert a pre-trained transformers LlamaForCausalLM model into a RBLN transformer model by:
@@ -50,18 +40,4 @@ class RBLNLlamaForCausalLM(RBLNDecoderOnlyModelForCausalLM):
      - compiling the resulting graph using the RBLN compiler.
      """
 
-     @classmethod
-     def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
-         rbln_max_seq_len = rbln_config.model_cfg["max_seq_len"]
-         return LlamaWrapper(model, rbln_max_seq_len).eval()
-
-     def __getattr__(self, __name: str) -> Any:
-         def redirect(func):
-             return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
-
-         val = getattr(LlamaForCausalLM, __name)
-
-         if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
-             return redirect(val)
-
-         return val
+     _decoder_wrapper_cls = LlamaWrapper
optimum/rbln/transformers/models/llava_next/modeling_llava_next.py
@@ -114,7 +114,7 @@ class LoopProjector:
          return self.forward(*args, **kwds)
 
      def __repr__(self) -> str:
-         return repr(self.vision_tower)
+         return repr(self.multi_modal_projector)
 
 
  class RBLNLlavaNextForConditionalGeneration(RBLNModel):
@@ -228,29 +228,26 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
          pixel_values=None,
          image_sizes=None,
          attention_mask=None,
-         past_cached_length=None,
+         generate_idx=None,
          **kwargs,
      ):
          # Prepare HF generation
-         is_prefill_phase = past_cached_length is None
+         is_prefill_phase = generate_idx is None
          batch_size = input_ids.shape[0]
 
          model_inputs = self.language_model.prepare_inputs_for_generation(
              input_ids=input_ids,
              inputs_embeds=inputs_embeds,
-             past_cached_length=past_cached_length, # Not affect
+             generate_idx=generate_idx, # Not affect
              attention_mask=attention_mask,
              **kwargs,
          )
 
          if is_prefill_phase:
-             model_inputs["past_cached_length"] = torch.zeros((batch_size, 1), dtype=torch.int32)
-         else:
-             model_inputs["past_cached_length"] = past_cached_length + 1
+             model_inputs["generate_idx"] = torch.zeros((batch_size, 1), dtype=torch.int32)
 
          model_inputs.update(
              {
-                 # "position_ids": position_ids or cache_positions,
                  "pixel_values": pixel_values,
                  "image_sizes": image_sizes,
                  "attention_mask": attention_mask,
@@ -264,43 +261,28 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
          model_kwargs: Dict[str, Any],
          **kwargs,
      ) -> Dict[str, Any]:
-         # update past_cached_length
-         model_kwargs["past_cached_length"] = outputs.past_cached_length
+         # update generate_idx
+         model_kwargs["generate_idx"] = outputs.generate_idx
 
          return model_kwargs
 
-     def _merge_vllm_multimodal_embeddings(
+     def text_embedding(
          self,
-         input_ids: torch.Tensor,
-         inputs_embeds: torch.Tensor,
-         multimodal_embeddings: torch.Tensor,
-         placeholder_token_id: int,
+         input_ids: torch.LongTensor,
      ) -> torch.Tensor:
-         mask = input_ids == placeholder_token_id
-         num_expected_tokens = mask.sum().item()
-         assert isinstance(num_expected_tokens, int)
-
-         if multimodal_embeddings.shape[0] != num_expected_tokens:
-             raise ValueError(
-                 f"Attempted to assign {inputs_embeds[mask].shape} = {multimodal_embeddings.shape} "
-                 f"multimodal tokens to {num_expected_tokens} placeholders"
-             )
+         for_inputs_embeds_ids = input_ids.clone()
+         for_inputs_embeds_ids[(input_ids == self.config.image_token_index)] = 0
+         inputs_embeds = self.get_input_embeddings()(for_inputs_embeds_ids)
 
-         inputs_embeds[mask] = multimodal_embeddings
          return inputs_embeds
 
-     def _embed(
+     def image_embedding(
          self,
-         input_ids: torch.LongTensor,
          image_sizes: torch.LongTensor,
-         attention_mask: torch.Tensor,
         pixel_values: torch.FloatTensor,
          vision_feature_layer: int,
          vision_feature_select_strategy: str,
-         cache_position: torch.Tensor,
-         past_cached_length: torch.Tensor,
-         from_vllm_prefill: bool = False,
-     ) -> List[torch.Tensor]:
+     ) -> torch.Tensor:
          vision_feature_layer = (
              vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
          )
@@ -310,84 +292,137 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
              else self.config.vision_feature_select_strategy
          )
 
-         # 1. Extract the input embeddings
-         # In case image_token_index is not in the embeddings (extra token but embedding don't have it)
-         for_inputs_embeds_ids = input_ids.clone()
-         for_inputs_embeds_ids[(input_ids == self.config.image_token_index)] = 0
+         # ! infer image_num_patches from image_sizes
+         image_num_patches = [
+             image_size_to_num_patches(
+                 image_size=imsize,
+                 grid_pinpoints=self.config.image_grid_pinpoints,
+                 patch_size=self.config.vision_config.image_size,
+             )
+             for imsize in image_sizes
+         ]
 
-         inputs_embeds = self.get_input_embeddings()(for_inputs_embeds_ids)
+         # figure out if pixel_values is concatenated or stacked
+         if pixel_values.dim() == 5:
+             # stacking when input is (batch_size, num_patches, num_channels, height, width)
+             _pixel_values_list = [pix_val[:num_patch] for pix_val, num_patch in zip(pixel_values, image_num_patches)]
+             pixel_values = torch.cat(_pixel_values_list, dim=0)
+         elif pixel_values.dim() != 4:
+             # otherwise has to be stacked from list of (num_patches, num_channels, height, width)
+             raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions")
 
-         # 2. Merge text and images
-         if pixel_values is not None and input_ids.shape[1] != 1 and pixel_values.size(0) > 0:
-             # ! infer image_num_patches from image_sizes
-             image_num_patches = [
-                 image_size_to_num_patches(
-                     image_size=imsize,
-                     grid_pinpoints=self.config.image_grid_pinpoints,
-                     patch_size=self.config.vision_config.image_size,
-                 )
-                 for imsize in image_sizes
-             ]
-             # figure out if pixel_values is concatenated or stacked
-             if pixel_values.dim() == 5:
-                 # stacking when input is (batch_size, num_patches, num_channels, height, width)
-                 _pixel_values_list = [
-                     pix_val[:num_patch] for pix_val, num_patch in zip(pixel_values, image_num_patches)
-                 ]
-                 pixel_values = torch.cat(_pixel_values_list, dim=0)
-             elif pixel_values.dim() != 4:
-                 # otherwise has to be stacked from list of (num_patches, num_channels, height, width)
-                 raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions")
-
-             image_features = self.vision_tower(pixel_values, output_hidden_states=True)
-             selected_image_feature = image_features.hidden_states[vision_feature_layer]
-
-             if vision_feature_select_strategy == "default":
-                 selected_image_feature = selected_image_feature[:, 1:]
-             elif vision_feature_select_strategy == "full":
-                 selected_image_feature = selected_image_feature
-
-             image_features = self.multi_modal_projector(selected_image_feature)
-             image_features = torch.split(image_features, image_num_patches, dim=0)
-
-             # NOTE we only support multimodal_patch_merge_type == "spatial_unpad"
-             image_features, feature_lens = self.pack_image_features(
-                 image_features,
-                 image_sizes,
-                 image_newline=self.image_newline,
-             )
+         image_features = self.vision_tower(pixel_values, output_hidden_states=True)
+         selected_image_feature = image_features.hidden_states[vision_feature_layer]
 
-             inputs_embeds = inputs_embeds.to(image_features.dtype)
+         if vision_feature_select_strategy == "default":
+             selected_image_feature = selected_image_feature[:, 1:]
+         elif vision_feature_select_strategy == "full":
+             selected_image_feature = selected_image_feature
+
+         image_features = self.multi_modal_projector(selected_image_feature)
+         image_features = torch.split(image_features, image_num_patches, dim=0)
+
+         # NOTE we only support multimodal_patch_merge_type == "spatial_unpad"
+         image_features, feature_lens = self.pack_image_features(
+             image_features,
+             image_sizes,
+             image_newline=self.image_newline,
+         )
 
-         if from_vllm_prefill:
-             self._merge_vllm_multimodal_embeddings(
-                 input_ids, inputs_embeds, image_features, self.config.image_token_index
-             )
+         return image_features, feature_lens
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor = None,
+         attention_mask: torch.LongTensor = None,
+         pixel_values: torch.FloatTensor = None,
+         image_sizes: Optional[torch.LongTensor] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         vision_feature_layer: Optional[int] = None,
+         vision_feature_select_strategy: Optional[str] = None,
+         cache_position: torch.Tensor = None,
+         generate_idx: Optional[torch.Tensor] = None,
+         **kwargs,
+     ) -> Union[Tuple, LlavaNextCausalLMOutputWithPast]:
+         if inputs_embeds is not None:
+             raise NotImplementedError("Specifying inputs_embeds is not supported.")
+
+         is_prefill_phase = not generate_idx.bool().all()
+
+         if is_prefill_phase:
+             # if the number of image tokens is more than image embeddings seq length, then prob we expanded it in processing
+             # not very reliable, but we don't expect one to actually pass 500+ images for one prompt
+             # In case we're in decoding stage, legacy behavior is checked by presence of pixel values even if use_cache=True
+             legacy_processing = (
+                 (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
+             ) or (input_ids.shape[-1] == 1 and pixel_values is not None)
+
+             # Get the number of images in the prompt
+             special_image_token_masks = [input_id == self.config.image_token_index for input_id in input_ids]
+             if legacy_processing:
+                 num_special_image_tokens = [torch.sum(mask, dim=-1) for mask in special_image_token_masks]
              else:
-                 inputs_embeds, attention_mask, position_ids, labels, _ = self._merge_input_ids_with_image_features(
-                     image_features,
-                     feature_lens,
-                     inputs_embeds,
-                     input_ids,
-                     attention_mask,
+                 image_tokens_masks_diff = [
+                     torch.diff(mask, prepend=torch.tensor([0])) for mask in special_image_token_masks
+                 ]
+                 num_special_image_tokens = [int(torch.sum((diff == 1).int())) for diff in image_tokens_masks_diff]
+
+             # Split images for each prompt
+             if pixel_values is not None and pixel_values.size(0) > 0:
+                 pixel_values = pixel_values.split(num_special_image_tokens, dim=0)
+                 image_sizes = image_sizes.split(num_special_image_tokens, dim=0)
+
+             logits = []
+             for b_idx in range(input_ids.shape[0]):
+                 # Get text_embeds from input_id
+                 input_id = input_ids[b_idx : b_idx + 1, attention_mask[b_idx].bool()]
+                 inputs_embed = self.text_embedding(input_id)
+
+                 # If any images in the prompt, get image_embeds and merge with text
+                 if num_special_image_tokens[b_idx] > 0:
+                     image_features, feature_lens = self.image_embedding(
+                         image_sizes[b_idx], pixel_values[b_idx], vision_feature_layer, vision_feature_select_strategy
+                     )
+                     if legacy_processing:
+                         inputs_embed, _, _, _, _ = self._merge_input_ids_with_image_features(
+                             image_features,
+                             feature_lens,
+                             inputs_embed.to(image_features.dtype),
+                             input_id,
+                             torch.ones_like(input_id, dtype=torch.long),
+                         )
+                     else:
+                         special_image_mask = (
+                             (input_id == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embed)
+                         )
+                         inputs_embed = inputs_embed.masked_scatter(special_image_mask, image_features)
+
+                 # Update generate_idx according to inputs_embed
+                 generate_idx[b_idx] = inputs_embed.shape[1]
+
+                 logit = self.language_model._forward_prefill(
+                     inputs_embeds=inputs_embed,
+                     batch_idx=b_idx,
+                     cache_position=torch.arange(0, generate_idx[b_idx].item(), dtype=torch.int32).unsqueeze(0),
                  )
 
-             cache_position = torch.arange(0, inputs_embeds.shape[1], dtype=torch.int32).unsqueeze_(0)
+                 logits.append(logit)
 
-         # pixel_values is not None but is empty ---> text only cases
-         elif (
-             pixel_values is not None and input_ids.shape[1] != 1 and pixel_values.size(0) == 0 or pixel_values is None
-         ):
-             pass
+             logits = torch.cat(logits, dim=0)
+             outputs = RBLNDecoderOnlyOutput(logits=logits, generate_idx=generate_idx)
+
+         else:
+             inputs_embeds = self.text_embedding(input_ids)
 
-         # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
-         # generation with cache
-         elif pixel_values is not None and input_ids.shape[1] == 1 and past_cached_length is not None:
-             cache_position = past_cached_length
+             outputs: RBLNDecoderOnlyOutput = self.language_model(
+                 inputs_embeds=inputs_embeds,
+                 cache_position=cache_position,
+                 generate_idx=generate_idx,
+             )
 
-         return inputs_embeds, cache_position
+         return outputs
 
-     def forward(
+     def vllm_forward(
          self,
          input_ids: torch.LongTensor = None,
         pixel_values: torch.FloatTensor = None,
@@ -397,72 +432,52 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
          vision_feature_select_strategy: Optional[str] = None,
          cache_position: Union[List[torch.Tensor], torch.Tensor] = None, # vllm keyword argument
          batch_idx: Optional[int] = None,
-         past_cached_length: Optional[torch.Tensor] = None,
          **kwargs,
      ) -> Union[Tuple, LlavaNextCausalLMOutputWithPast]:
-         from_vllm_prefill = isinstance(cache_position, torch.Tensor) and cache_position.shape[-1] > 1
-         from_hf_generate_prefill = isinstance(input_ids, list)
+         is_prefill = cache_position.shape[-1] > 1
 
          if inputs_embeds is not None:
             raise NotImplementedError("Specifying inputs_embeds is not supported.")
 
-         if from_hf_generate_prefill:
-             inputs_embeds = []
-             batch_size = len(input_ids)
+         if is_prefill:
+             # Get text_embeds
+             inputs_embeds = self.text_embedding(input_ids)
 
-             # Get the number of images in the prompt
-             special_image_token_masks = [input_id == self.config.image_token_index for input_id in input_ids]
-             num_special_image_tokens = [torch.sum(mask, dim=-1) for mask in special_image_token_masks]
+             # If any images in the prompt, get image_embeds and merge with text
+             if pixel_values is not None and input_ids.shape[1] != 1 and pixel_values.size(0) > 0:
+                 image_features, _ = self.image_embedding(
+                     image_sizes, pixel_values, vision_feature_layer, vision_feature_select_strategy
+                 )
 
-             # Split images for each prompt
-             pixel_values = pixel_values.split(num_special_image_tokens, dim=0)
-             image_sizes = image_sizes.split(num_special_image_tokens, dim=0)
-
-             for b_idx in range(batch_size):
-                 embed, cache_pos = self._embed(
-                     input_ids=input_ids[b_idx],
-                     image_sizes=image_sizes[b_idx] if image_sizes is not None else None,
-                     attention_mask=torch.ones_like(input_ids[b_idx]),
-                     pixel_values=pixel_values[b_idx] if pixel_values is not None else None,
-                     vision_feature_layer=vision_feature_layer,
-                     vision_feature_select_strategy=vision_feature_select_strategy,
-                     cache_position=cache_position[b_idx],
-                     past_cached_length=past_cached_length[b_idx : b_idx + 1],
+                 def merge_vllm_multimodal_embeddings(
+                     input_ids: torch.Tensor,
+                     inputs_embeds: torch.Tensor,
+                     multimodal_embeddings: torch.Tensor,
+                     placeholder_token_id: int,
+                 ) -> torch.Tensor:
+                     mask = input_ids == placeholder_token_id
+                     num_expected_tokens = mask.sum().item()
+
+                     if multimodal_embeddings.shape[0] != num_expected_tokens:
+                         raise ValueError(
+                             f"Attempted to assign {inputs_embeds[mask].shape} = {multimodal_embeddings.shape} "
+                             f"multimodal tokens to {num_expected_tokens} placeholders"
+                         )
+
+                     inputs_embeds[mask] = multimodal_embeddings
+                     return inputs_embeds
+
+                 inputs_embeds = merge_vllm_multimodal_embeddings(
+                     input_ids, inputs_embeds, image_features, self.config.image_token_index
                  )
-                 inputs_embeds.append(embed)
-                 cache_position[b_idx] = cache_pos
-                 past_cached_length[b_idx] += embed.shape[1]
-
-         elif from_vllm_prefill:
-             inputs_embeds, cache_position = self._embed(
-                 input_ids=input_ids,
-                 image_sizes=image_sizes,
-                 attention_mask=torch.ones_like(input_ids),
-                 pixel_values=pixel_values,
-                 vision_feature_layer=vision_feature_layer,
-                 vision_feature_select_strategy=vision_feature_select_strategy,
-                 cache_position=cache_position,
-                 past_cached_length=past_cached_length,
-                 from_vllm_prefill=from_vllm_prefill,
-             )
+
          else:
-             # Decoding step
-             inputs_embeds, cache_position = self._embed(
-                 input_ids=input_ids,
-                 image_sizes=image_sizes,
-                 attention_mask=torch.ones_like(input_ids),
-                 pixel_values=pixel_values,
-                 vision_feature_layer=vision_feature_layer,
-                 vision_feature_select_strategy=vision_feature_select_strategy,
-                 cache_position=cache_position,
-                 past_cached_length=past_cached_length,
-             )
+             inputs_embeds = self.text_embedding(input_ids=input_ids)
 
-         outputs: RBLNDecoderOnlyOutput = self.language_model(
+         outputs: RBLNDecoderOnlyOutput = self.language_model.vllm_forward(
              inputs_embeds=inputs_embeds,
             batch_idx=batch_idx,
             cache_position=cache_position,
-             past_cached_length=past_cached_length,
          )
 
          return outputs
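
The non-legacy prefill path above merges the projected image features into the text embeddings with `masked_scatter` over a broadcast image-token mask. A self-contained toy illustration of that merge (the placeholder id and shapes are made up, not LLaVA-Next's real values):

    import torch

    image_token_index = 32000                        # assumed placeholder id
    input_id = torch.tensor([[1, 32000, 32000, 2]])  # (1, seq_len) with two image slots
    inputs_embed = torch.zeros(1, 4, 8)              # (1, seq_len, hidden) text embeddings
    image_features = torch.ones(2, 8)                # (num_image_tokens, hidden) from the projector

    # Broadcast the token mask over the hidden dimension, then scatter the image rows
    # into the masked positions, as the new forward() does for each batch element.
    special_image_mask = (input_id == image_token_index).unsqueeze(-1).expand_as(inputs_embed)
    merged = inputs_embed.masked_scatter(special_image_mask, image_features)

    assert torch.equal(merged[0, 1], torch.ones(8))   # image slot filled
    assert torch.equal(merged[0, 0], torch.zeros(8))  # text slot untouched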