optimum-rbln 0.8.2a0__py3-none-any.whl → 0.9.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +116 -9
- optimum/rbln/__version__.py +16 -3
- optimum/rbln/cli.py +660 -0
- optimum/rbln/configuration_utils.py +171 -43
- optimum/rbln/diffusers/__init__.py +19 -0
- optimum/rbln/diffusers/configurations/__init__.py +3 -0
- optimum/rbln/diffusers/configurations/models/__init__.py +2 -0
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +3 -3
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_cosmos.py +1 -1
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_temporal_decoder.py +67 -0
- optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +3 -3
- optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +4 -4
- optimum/rbln/diffusers/configurations/models/configuration_transformer_cosmos.py +12 -4
- optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +9 -4
- optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +3 -3
- optimum/rbln/diffusers/configurations/models/configuration_unet_spatio_temporal_condition.py +59 -0
- optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +3 -3
- optimum/rbln/diffusers/configurations/pipelines/__init__.py +3 -0
- optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +35 -19
- optimum/rbln/diffusers/configurations/pipelines/configuration_cosmos.py +14 -11
- optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +30 -20
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +13 -9
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +17 -13
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +17 -10
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py +114 -0
- optimum/rbln/diffusers/modeling_diffusers.py +33 -18
- optimum/rbln/diffusers/models/__init__.py +4 -0
- optimum/rbln/diffusers/models/autoencoders/__init__.py +1 -0
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +32 -3
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +32 -6
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +275 -0
- optimum/rbln/diffusers/models/autoencoders/vae.py +27 -8
- optimum/rbln/diffusers/models/autoencoders/vq_model.py +32 -3
- optimum/rbln/diffusers/models/controlnet.py +16 -1
- optimum/rbln/diffusers/models/transformers/prior_transformer.py +17 -3
- optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +26 -3
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +23 -2
- optimum/rbln/diffusers/models/unets/__init__.py +1 -0
- optimum/rbln/diffusers/models/unets/unet_2d_condition.py +23 -4
- optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py +201 -0
- optimum/rbln/diffusers/pipelines/__init__.py +15 -5
- optimum/rbln/diffusers/pipelines/auto_pipeline.py +307 -0
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +20 -0
- optimum/rbln/diffusers/pipelines/cosmos/configuration_cosmos_guardrail.py +23 -12
- optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +16 -46
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +31 -1
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +31 -1
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +1 -6
- optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py +15 -0
- optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +46 -0
- optimum/rbln/modeling.py +50 -24
- optimum/rbln/modeling_base.py +116 -35
- optimum/rbln/ops/attn.py +158 -0
- optimum/rbln/ops/flash_attn.py +166 -0
- optimum/rbln/ops/kv_cache_update.py +5 -0
- optimum/rbln/ops/linear.py +7 -0
- optimum/rbln/transformers/__init__.py +100 -0
- optimum/rbln/transformers/configuration_generic.py +7 -32
- optimum/rbln/transformers/modeling_attention_utils.py +385 -0
- optimum/rbln/transformers/modeling_generic.py +48 -65
- optimum/rbln/transformers/modeling_outputs.py +37 -0
- optimum/rbln/transformers/models/__init__.py +93 -30
- optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +28 -2
- optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +68 -5
- optimum/rbln/transformers/models/auto/__init__.py +2 -0
- optimum/rbln/transformers/models/auto/auto_factory.py +92 -17
- optimum/rbln/transformers/models/auto/modeling_auto.py +45 -0
- optimum/rbln/transformers/models/bart/bart_architecture.py +2 -7
- optimum/rbln/transformers/models/bart/configuration_bart.py +2 -0
- optimum/rbln/transformers/models/bart/modeling_bart.py +23 -2
- optimum/rbln/transformers/models/bert/bert_architecture.py +16 -0
- optimum/rbln/transformers/models/bert/modeling_bert.py +93 -4
- optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +42 -11
- optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +135 -44
- optimum/rbln/transformers/models/clip/configuration_clip.py +21 -7
- optimum/rbln/transformers/models/clip/modeling_clip.py +183 -27
- optimum/rbln/transformers/models/colpali/colpali_architecture.py +3 -6
- optimum/rbln/transformers/models/colpali/configuration_colpali.py +37 -21
- optimum/rbln/transformers/models/colpali/modeling_colpali.py +82 -104
- optimum/rbln/transformers/models/colqwen2/__init__.py +2 -0
- optimum/rbln/transformers/models/colqwen2/colqwen2_architecture.py +233 -0
- optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +74 -0
- optimum/rbln/transformers/models/colqwen2/modeling_colqwen2.py +446 -0
- optimum/rbln/transformers/models/decoderonly/__init__.py +3 -2
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +114 -37
- optimum/rbln/transformers/models/decoderonly/configuration_lora.py +411 -0
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +323 -316
- optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +508 -0
- optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +119 -0
- optimum/rbln/transformers/models/decoderonly/lora_architecture.py +204 -0
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +486 -892
- optimum/rbln/transformers/models/depth_anything/__init__.py +16 -0
- optimum/rbln/transformers/models/depth_anything/configuration_depth_anything.py +24 -0
- optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +42 -0
- optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +24 -0
- optimum/rbln/transformers/models/dpt/modeling_dpt.py +17 -0
- optimum/rbln/transformers/models/exaone/modeling_exaone.py +42 -4
- optimum/rbln/transformers/models/gemma/__init__.py +2 -2
- optimum/rbln/transformers/models/gemma/configuration_gemma.py +9 -1
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -4
- optimum/rbln/transformers/models/gemma/modeling_gemma.py +22 -1
- optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +49 -14
- optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +12 -2
- optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py +245 -0
- optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +212 -504
- optimum/rbln/transformers/models/gpt2/__init__.py +2 -2
- optimum/rbln/transformers/models/gpt2/configuration_gpt2.py +31 -3
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +10 -8
- optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +18 -1
- optimum/rbln/transformers/models/grounding_dino/__init__.py +10 -0
- optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +92 -0
- optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +599 -0
- optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +1048 -0
- optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +35 -7
- optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +29 -32
- optimum/rbln/transformers/models/llama/__init__.py +2 -2
- optimum/rbln/transformers/models/llama/configuration_llama.py +9 -1
- optimum/rbln/transformers/models/llama/modeling_llama.py +22 -1
- optimum/rbln/transformers/models/llava/__init__.py +16 -0
- optimum/rbln/transformers/models/llava/configuration_llava.py +72 -0
- optimum/rbln/transformers/models/llava/modeling_llava.py +490 -0
- optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +21 -6
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +234 -376
- optimum/rbln/transformers/models/midm/midm_architecture.py +4 -1
- optimum/rbln/transformers/models/midm/modeling_midm.py +42 -4
- optimum/rbln/transformers/models/mistral/__init__.py +2 -2
- optimum/rbln/transformers/models/mistral/configuration_mistral.py +9 -1
- optimum/rbln/transformers/models/mistral/mistral_architecture.py +1 -1
- optimum/rbln/transformers/models/mistral/modeling_mistral.py +26 -3
- optimum/rbln/transformers/models/opt/__init__.py +2 -2
- optimum/rbln/transformers/models/opt/configuration_opt.py +8 -1
- optimum/rbln/transformers/models/opt/modeling_opt.py +29 -17
- optimum/rbln/transformers/models/opt/opt_architecture.py +4 -4
- optimum/rbln/transformers/models/pegasus/__init__.py +17 -0
- optimum/rbln/transformers/models/pegasus/configuration_pegasus.py +38 -0
- optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +71 -0
- optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +161 -0
- optimum/rbln/transformers/models/phi/__init__.py +2 -2
- optimum/rbln/transformers/models/phi/configuration_phi.py +9 -1
- optimum/rbln/transformers/models/phi/modeling_phi.py +10 -1
- optimum/rbln/transformers/models/phi/phi_architecture.py +11 -7
- optimum/rbln/transformers/models/pixtral/__init__.py +16 -0
- optimum/rbln/transformers/models/pixtral/configuration_pixtral.py +43 -0
- optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +322 -0
- optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +73 -0
- optimum/rbln/transformers/models/qwen2/__init__.py +2 -2
- optimum/rbln/transformers/models/qwen2/configuration_qwen2.py +9 -1
- optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +27 -1
- optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +21 -6
- optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +15 -22
- optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +28 -7
- optimum/rbln/transformers/models/qwen2_vl/__init__.py +19 -0
- optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +88 -0
- optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +513 -0
- optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +165 -0
- optimum/rbln/transformers/models/qwen3/__init__.py +16 -0
- optimum/rbln/transformers/models/qwen3/configuration_qwen3.py +71 -0
- optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +133 -0
- optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +31 -0
- optimum/rbln/transformers/models/resnet/configuration_resnet.py +17 -0
- optimum/rbln/transformers/models/resnet/modeling_resnet.py +73 -0
- optimum/rbln/transformers/models/roberta/modeling_roberta.py +33 -0
- optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +21 -16
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +60 -13
- optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +2 -2
- optimum/rbln/transformers/models/siglip/__init__.py +2 -6
- optimum/rbln/transformers/models/siglip/configuration_siglip.py +1 -1
- optimum/rbln/transformers/models/siglip/modeling_siglip.py +21 -16
- optimum/rbln/transformers/models/swin/__init__.py +16 -0
- optimum/rbln/transformers/models/swin/configuration_swin.py +42 -0
- optimum/rbln/transformers/models/swin/modeling_swin.py +354 -0
- optimum/rbln/transformers/models/t5/configuration_t5.py +2 -0
- optimum/rbln/transformers/models/t5/modeling_t5.py +2 -2
- optimum/rbln/transformers/models/t5/t5_architecture.py +8 -1
- optimum/rbln/transformers/models/time_series_transformer/configuration_time_series_transformer.py +3 -3
- optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +22 -16
- optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py +7 -1
- optimum/rbln/transformers/models/vit/modeling_vit.py +19 -0
- optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +15 -3
- optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +61 -8
- optimum/rbln/transformers/models/whisper/configuration_whisper.py +12 -13
- optimum/rbln/transformers/models/whisper/generation_whisper.py +62 -6
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +32 -5
- optimum/rbln/transformers/models/xlm_roberta/__init__.py +2 -8
- optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +43 -0
- optimum/rbln/transformers/utils/rbln_quantization.py +400 -75
- optimum/rbln/transformers/utils/rbln_runtime_wrapper.py +79 -0
- optimum/rbln/utils/deprecation.py +213 -0
- optimum/rbln/utils/hub.py +22 -50
- optimum/rbln/utils/runtime_utils.py +85 -17
- optimum/rbln/utils/submodule.py +31 -9
- {optimum_rbln-0.8.2a0.dist-info → optimum_rbln-0.9.3.dist-info}/METADATA +8 -7
- optimum_rbln-0.9.3.dist-info/RECORD +264 -0
- {optimum_rbln-0.8.2a0.dist-info → optimum_rbln-0.9.3.dist-info}/WHEEL +1 -1
- optimum_rbln-0.9.3.dist-info/entry_points.txt +2 -0
- optimum_rbln-0.8.2a0.dist-info/RECORD +0 -211
- {optimum_rbln-0.8.2a0.dist-info → optimum_rbln-0.9.3.dist-info}/licenses/LICENSE +0 -0
--- a/optimum/rbln/transformers/models/bart/modeling_bart.py
+++ b/optimum/rbln/transformers/models/bart/modeling_bart.py
@@ -13,9 +13,11 @@
 # limitations under the License.

 import inspect
-from typing import Any, Callable
+from typing import Any, Callable, Optional, Tuple, Union

+import torch
 from transformers import BartForConditionalGeneration, PreTrainedModel
+from transformers.modeling_outputs import Seq2SeqModelOutput

 from ....utils.logging import get_logger
 from ...modeling_generic import RBLNTransformerEncoderForFeatureExtraction
@@ -35,6 +37,25 @@ class RBLNBartModel(RBLNTransformerEncoderForFeatureExtraction):
     on RBLN devices, optimized for feature extraction use cases.
     """

+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[Tuple, Seq2SeqModelOutput]:
+        """
+        Forward pass for the RBLN-optimized BART model for feature extraction tasks.
+
+        Args:
+            input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
+            attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
+
+        Returns:
+            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a Seq2SeqModelOutput object.
+        """
+
+        return super().forward(input_ids, attention_mask, **kwargs)
+

 class RBLNBartForConditionalGeneration(RBLNModelForSeq2SeqLM):
     """
@@ -48,7 +69,7 @@ class RBLNBartForConditionalGeneration(RBLNModelForSeq2SeqLM):
     support_causal_attn = True

     @classmethod
-    def
+    def _wrap_model_if_needed(self, model: PreTrainedModel, rbln_config: RBLNBartForConditionalGenerationConfig):
         return BartWrapper(
             model, enc_max_seq_len=rbln_config.enc_max_seq_len, use_attention_mask=rbln_config.use_attention_mask
         )
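The hunks above only change how RBLNBartModel documents and dispatches its inputs; loading is unchanged. A minimal usage sketch, not part of the diff, assuming the usual optimum-rbln `from_pretrained(..., export=True)` convention; the checkpoint id and padding length are illustrative:

    # Hypothetical usage sketch: compile a BART encoder for feature extraction and run it.
    from transformers import AutoTokenizer
    from optimum.rbln import RBLNBartModel

    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
    # export=True compiles the PyTorch checkpoint for the RBLN NPU (assumed convention)
    model = RBLNBartModel.from_pretrained("facebook/bart-base", export=True)

    enc = tokenizer("An example sentence.", return_tensors="pt", padding="max_length", max_length=512)
    out = model(input_ids=enc.input_ids, attention_mask=enc.attention_mask)
    print(out.last_hidden_state.shape)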
--- /dev/null
+++ b/optimum/rbln/transformers/models/bert/bert_architecture.py
@@ -0,0 +1,16 @@
+import torch
+
+
+class BertModelWrapper(torch.nn.Module):
+    def __init__(self, model, rbln_config):
+        super().__init__()
+        self.model = model
+        self.rbln_config = rbln_config
+
+    def forward(self, *args, **kwargs):
+        output = self.model(*args, **kwargs)
+        if isinstance(output, torch.Tensor):
+            return output
+        elif isinstance(output, tuple):
+            return tuple(x for x in output if x is not None)
+        return output
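A small standalone sketch, not from the package, of what this wrapper does: tuple outputs have their None entries dropped so the traced graph only carries real tensors. The deep import path is assumed from the file layout above; the toy module is purely illustrative.

    import torch
    from optimum.rbln.transformers.models.bert.bert_architecture import BertModelWrapper  # assumed path

    class ToyModel(torch.nn.Module):
        # Stands in for a BERT-like model that returns (hidden_states, None, attentions)
        def forward(self, x):
            return (x * 2, None, x + 1)

    wrapped = BertModelWrapper(ToyModel(), rbln_config=None)  # rbln_config is unused in forward()
    outputs = wrapped(torch.ones(1, 3))
    print(len(outputs))  # 2: the None entry was filtered out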
--- a/optimum/rbln/transformers/models/bert/modeling_bert.py
+++ b/optimum/rbln/transformers/models/bert/modeling_bert.py
@@ -12,15 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from
+from typing import Optional, Tuple, Union
+
+import torch
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    MaskedLMOutput,
+    QuestionAnsweringModelOutput,
+)
+
 from ...modeling_generic import (
     RBLNModelForMaskedLM,
     RBLNModelForQuestionAnswering,
     RBLNTransformerEncoderForFeatureExtraction,
 )
-
-
-logger = get_logger(__name__)
+from .bert_architecture import BertModelWrapper
+from .configuration_bert import RBLNBertModelConfig


 class RBLNBertModel(RBLNTransformerEncoderForFeatureExtraction):
@@ -34,6 +41,46 @@ class RBLNBertModel(RBLNTransformerEncoderForFeatureExtraction):

     rbln_model_input_names = ["input_ids", "attention_mask"]

+    @classmethod
+    def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNBertModelConfig) -> torch.nn.Module:
+        return BertModelWrapper(model, rbln_config)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[BaseModelOutputWithPoolingAndCrossAttentions, Tuple]:
+        """
+        Forward pass for the RBLN-optimized BERT model for feature extraction tasks.
+
+        Args:
+            input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
+            attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
+            token_type_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Segment token indices to indicate first and second portions of the inputs.
+            position_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of positions of each input sequence tokens in the position embeddings.
+
+        Returns:
+            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a BaseModelOutputWithPoolingAndCrossAttentions object.
+        """
+
+        input_map = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "token_type_ids": token_type_ids,
+            "position_ids": position_ids,
+        }
+
+        model_input_names = getattr(self.rbln_config, "model_input_names", None)
+        if model_input_names is None:
+            model_input_names = self.rbln_model_input_names
+
+        ordered_inputs = [input_map[name] for name in model_input_names if name in input_map]
+
+        return super().forward(*ordered_inputs, **kwargs)
+

 class RBLNBertForMaskedLM(RBLNModelForMaskedLM):
     """
@@ -46,6 +93,27 @@ class RBLNBertForMaskedLM(RBLNModelForMaskedLM):

     rbln_model_input_names = ["input_ids", "attention_mask", "token_type_ids"]

+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[MaskedLMOutput, Tuple]:
+        """
+        Forward pass for the RBLN-optimized BERT model for masked language modeling tasks.
+
+        Args:
+            input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
+            attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
+            token_type_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Segment token indices to indicate first and second portions of the inputs.
+
+        Returns:
+            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a MaskedLMOutput object.
+        """
+
+        return super().forward(input_ids, attention_mask, token_type_ids, **kwargs)
+

 class RBLNBertForQuestionAnswering(RBLNModelForQuestionAnswering):
     """
@@ -57,3 +125,24 @@ class RBLNBertForQuestionAnswering(RBLNModelForQuestionAnswering):
     """

     rbln_model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[QuestionAnsweringModelOutput, Tuple]:
+        """
+        Forward pass for the RBLN-optimized BERT model for question answering tasks.
+
+        Args:
+            input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
+            attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
+            token_type_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Segment token indices to indicate first and second portions of the inputs.
+
+        Returns:
+            The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a QuestionAnsweringModelOutput object.
+        """
+
+        return super().forward(input_ids, attention_mask, token_type_ids, **kwargs)
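The new RBLNBertModel.forward maps keyword arguments onto the positional order the compiled runtime was traced with, driven by rbln_model_input_names (or rbln_config.model_input_names when set). A standalone sketch of that ordering rule, with placeholder values:

    def order_inputs(input_map, model_input_names):
        # Keep only the inputs the compiled graph knows about, in that exact order.
        return [input_map[name] for name in model_input_names if name in input_map]

    kwargs = {
        "input_ids": "IDS",
        "attention_mask": "MASK",
        "token_type_ids": None,
        "position_ids": None,
    }
    print(order_inputs(kwargs, ["input_ids", "attention_mask"]))
    # ['IDS', 'MASK']: token_type_ids and position_ids are ignored unless they were compiled in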
--- a/optimum/rbln/transformers/models/blip_2/configuration_blip_2.py
+++ b/optimum/rbln/transformers/models/blip_2/configuration_blip_2.py
@@ -12,9 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Any,
+from typing import Any, Optional

 from ....configuration_utils import RBLNModelConfig
+from ....utils.logging import get_logger
+
+
+logger = get_logger(__name__)


 class RBLNBlip2VisionModelConfig(RBLNModelConfig):
@@ -25,6 +29,16 @@ class RBLNBlip2VisionModelConfig(RBLNModelConfig):
     RBLN-optimized BLIP-2 vision encoder models for multimodal tasks.
     """

+    def __init__(
+        self,
+        batch_size: Optional[int] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.batch_size = batch_size or 1
+        if not isinstance(self.batch_size, int) or self.batch_size < 0:
+            raise ValueError(f"batch_size must be a positive integer, got {self.batch_size}")
+

 class RBLNBlip2QFormerModelConfig(RBLNModelConfig):
     """
@@ -36,24 +50,34 @@ class RBLNBlip2QFormerModelConfig(RBLNModelConfig):

     def __init__(
         self,
+        batch_size: Optional[int] = None,
         num_query_tokens: Optional[int] = None,
         image_text_hidden_size: Optional[int] = None,
         **kwargs,
     ):
         """
         Args:
-
-
-
-        Raises:
-            ValueError: If batch_size is not a positive integer.
+            num_query_tokens (Optional[int]): The number of query tokens passed through the Transformer.
+            image_text_hidden_size (Optional[int]): Dimensionality of the hidden state of the image-text fusion layer.
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.
         """
         super().__init__(**kwargs)
+        self.batch_size = batch_size or 1
+        if not isinstance(self.batch_size, int) or self.batch_size < 0:
+            raise ValueError(f"batch_size must be a positive integer, got {self.batch_size}")
+
         self.num_query_tokens = num_query_tokens
         self.image_text_hidden_size = image_text_hidden_size


 class RBLNBlip2ForConditionalGenerationConfig(RBLNModelConfig):
+    """
+    Configuration class for RBLNBlip2ForConditionalGeneration.
+
+    This configuration class stores the configuration parameters specific to
+    RBLN-optimized BLIP-2 models for conditional generation tasks that involve both image and text inputs.
+    """
+
     submodules = ["vision_model", "qformer", "language_model"]

     def __init__(
@@ -62,14 +86,15 @@ class RBLNBlip2ForConditionalGenerationConfig(RBLNModelConfig):
         vision_model: Optional[RBLNModelConfig] = None,
         qformer: Optional[RBLNModelConfig] = None,
         language_model: Optional[RBLNModelConfig] = None,
-        **kwargs:
+        **kwargs: Any,
     ):
         """
         Args:
             batch_size (Optional[int]): The batch size for inference. Defaults to 1.
             vision_model (Optional[RBLNModelConfig]): Configuration for the vision encoder component.
+            qformer (Optional[RBLNModelConfig]): Configuration for the RBLN-optimized BLIP-2 Q-Former model.
             language_model (Optional[RBLNModelConfig]): Configuration for the language model component.
-
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.

         Raises:
             ValueError: If batch_size is not a positive integer.
@@ -79,6 +104,12 @@ class RBLNBlip2ForConditionalGenerationConfig(RBLNModelConfig):
         if not isinstance(self.batch_size, int) or self.batch_size < 0:
             raise ValueError(f"batch_size must be a positive integer, got {self.batch_size}")

-        self.
-
-
+        if self.batch_size != 1:
+            logger.warning("Ignore batch_size for Blip2 vision model. It will be set to 1.")
+            logger.warning("Ignore batch_size for Blip2 qformer. It will be set to 1.")
+
+        self.vision_model = self.initialize_submodule_config(
+            submodule_config=vision_model, batch_size=1, force_kwargs=True
+        )
+        self.qformer = self.initialize_submodule_config(submodule_config=qformer, batch_size=1, force_kwargs=True)
+        self.language_model = self.initialize_submodule_config(submodule_config=language_model)
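As the new constructor logic shows, the parent BLIP-2 config pins the vision and Q-Former submodules to batch size 1 and only propagates the requested batch size to the language model. A hypothetical configuration sketch; the top-level import path and keyword usage are assumed from this diff, not confirmed elsewhere:

    from optimum.rbln import RBLNBlip2ForConditionalGenerationConfig

    # batch_size applies to the language model; vision_model and qformer are pinned to batch_size=1,
    # and a warning is logged whenever batch_size != 1.
    cfg = RBLNBlip2ForConditionalGenerationConfig(batch_size=4)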
--- a/optimum/rbln/transformers/models/blip_2/modeling_blip_2.py
+++ b/optimum/rbln/transformers/models/blip_2/modeling_blip_2.py
@@ -14,7 +14,7 @@

 import inspect
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, List, Optional, Tuple, Union

 import torch
 from transformers import (
@@ -30,38 +30,31 @@ from transformers.utils import logging

 from ....configuration_utils import RBLNCompileConfig, RBLNModelConfig
 from ....modeling import RBLNModel
+from ...utils.rbln_runtime_wrapper import LoopProcessor
+from ..decoderonly.generation_decoderonly import RBLNDecoderOnlyGenerationMixin


 logger = logging.get_logger(__name__)

 if TYPE_CHECKING:
-
-
-        AutoProcessor,
-        AutoTokenizer,
-    )
+    import rebel
+    from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer


-class LoopProjector:
-    def __init__(self, language_projection
-
+class LoopProjector(LoopProcessor):
+    def __init__(self, language_projection: Union[RBLNModel, "rebel.Runtime"]):
+        super().__init__(model=language_projection)

-    def
-        query_output
+    def _get_batch_size(self, query_output, **kwargs):
+        return query_output.shape[0]

-
-
-
-            outputs.append(self.language_projection(query_output[i : i + 1]))
-
-        outputs = torch.cat(outputs, dim=0)
-        return outputs
+    def _prepare_inputs_for_iteration(self, index, common_inputs, query_output, **kwargs):
+        query_output_item = query_output[index : index + 1]
+        return ([query_output_item], {})

-    def
-
-
-    def __repr__(self) -> str:
-        return repr(self.language_projection)
+    def _process_outputs(self, outputs: list, **kwargs):
+        output = torch.cat(outputs, dim=0)
+        return output


 class RBLNBlip2VisionModel(RBLNModel):
@@ -72,11 +65,13 @@ class RBLNBlip2VisionModel(RBLNModel):
     on RBLN devices, supporting image encoding for multimodal vision-language tasks.
     """

+    _tp_support = False
+
     def get_input_embeddings(self):
         return self.embeddings

     @classmethod
-    def
+    def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
         class Blip2VisionModelWrapper(torch.nn.Module):
             def __init__(self, model: "Blip2VisionModel") -> None:
                 super().__init__()
@@ -100,8 +95,7 @@ class RBLNBlip2VisionModel(RBLNModel):
             (
                 "pixel_values",
                 [
-
-                    1,
+                    rbln_config.batch_size,
                     model_config.num_channels,
                     model_config.image_size,
                     model_config.image_size,
@@ -116,12 +110,21 @@ class RBLNBlip2VisionModel(RBLNModel):

     def forward(
         self,
-        pixel_values,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
+        pixel_values: torch.FloatTensor,
         interpolate_pos_encoding: bool = False,
+        return_dict: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutputWithPooling]:
+        """
+        Forward pass for the RBLN-optimized Blip2VisionModel model.
+
+        Args:
+            pixel_values (torch.FloatTensor of shape (batch_size, num_channels, height, width)): The tensors corresponding to the input images.
+            interpolate_pos_encoding (bool, optional): Whether to interpolate the positional encoding of the image embeddings. Defaults to False.
+            return_dict (bool, optional): Whether to return a ModelOutput instead of a plain tuple.
+
+        Returns:
+            BaseModelOutputWithPooling or tuple(torch.FloatTensor): The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a BaseModelOutputWithPooling object.
+        """
         batch_size = pixel_values.shape[0]
         outputs = []
         for i in range(batch_size):
@@ -151,11 +154,13 @@ class RBLNBlip2QFormerModel(RBLNModel):
     mechanisms for multimodal understanding tasks.
     """

+    _tp_support = False
+
     def get_input_embeddings(self):
         return self.embeddings.word_embeddings

     @classmethod
-    def
+    def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
         class Blip2QFormerModelWrapper(torch.nn.Module):
             def __init__(self, model: "Blip2QFormerModel"):
                 super().__init__()
@@ -178,7 +183,12 @@ class RBLNBlip2QFormerModel(RBLNModel):
         return Blip2QFormerModelWrapper(model).eval()

     @classmethod
-    def _update_submodule_config(
+    def _update_submodule_config(
+        cls,
+        model: "PreTrainedModel",
+        rbln_config: RBLNModelConfig,
+        preprocessors: Optional[Union["AutoFeatureExtractor", "AutoProcessor", "AutoTokenizer"]],
+    ):
         if rbln_config.num_query_tokens is None:
             rbln_config.num_query_tokens = model.config.num_query_tokens

@@ -199,7 +209,7 @@ class RBLNBlip2QFormerModel(RBLNModel):
             (
                 "query_embeds",
                 [
-
+                    rbln_config.batch_size,
                     rbln_config.num_query_tokens,
                     model_config.hidden_size,
                 ],
@@ -208,7 +218,7 @@ class RBLNBlip2QFormerModel(RBLNModel):
             (
                 "encoder_hidden_states",
                 [
-
+                    rbln_config.batch_size,
                     # image_text_hidden_size + cls token
                     rbln_config.image_text_hidden_size + 1,
                     model_config.encoder_hidden_size,
@@ -218,7 +228,7 @@ class RBLNBlip2QFormerModel(RBLNModel):
             (
                 "encoder_attention_mask",
                 # image_text_hidden_size + cls token
-                [
+                [rbln_config.batch_size, rbln_config.image_text_hidden_size + 1],
                 "int64",
             ),
         ]
@@ -230,17 +240,22 @@
     def forward(
         self,
         query_embeds: torch.FloatTensor,
-        query_length: Optional[int] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
         encoder_hidden_states: Optional[torch.FloatTensor] = None,
         encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+        """
+        The forward pass for the RBLN-optimized Blip2QFormerModel model.
+
+        Args:
+            query_embeds (torch.FloatTensor): Hidden states to be used in the attention computation.
+            encoder_hidden_states (torch.FloatTensor, optional): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder.
+            encoder_attention_mask (torch.FloatTensor, optional): Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder.
+            return_dict (bool, optional): Whether to return a ModelOutput instead of a plain tuple.
+
+        Returns:
+            BaseModelOutputWithPoolingAndCrossAttentions or tuple(torch.FloatTensor): The model outputs. If `return_dict=False` is passed, returns a tuple of tensors. Otherwise, returns a `BaseModelOutputWithPoolingAndCrossAttentions` object.
+        """
         batch_size = query_embeds.shape[0]
         outputs = []
         for i in range(batch_size):
@@ -265,7 +280,7 @@ class RBLNBlip2QFormerModel(RBLNModel):
         )


-class RBLNBlip2ForConditionalGeneration(RBLNModel):
+class RBLNBlip2ForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMixin):
     """
     RBLNBlip2ForConditionalGeneration is a multi-modal model that integrates vision and language processing capabilities,
     optimized for RBLN NPUs. It is designed for conditional generation tasks that involve both image and text inputs.
@@ -348,7 +363,7 @@ class RBLNBlip2ForConditionalGeneration(RBLNModel):
         return self.language_model.get_input_embeddings()

     @classmethod
-    def
+    def _wrap_model_if_needed(cls, model, rbln_config):
         return model.language_projection

     @classmethod
@@ -433,3 +448,79 @@
         )

         return inputs_embeds
+
+    @torch.no_grad()
+    def generate(
+        self,
+        pixel_values: torch.FloatTensor,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        interpolate_pos_encoding: bool = False,
+        **generate_kwargs,
+    ) -> List[torch.LongTensor]:
+        """
+        The generate function is utilized in its standard form as in the HuggingFace transformers library. User can use this function to generate text from the model.
+        Check the [HuggingFace transformers documentation](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/blip-2#transformers.Blip2ForConditionalGeneration.generate) for more details.
+
+        Args:
+            pixel_values (torch.FloatTensor): Input images to be processed.
+            input_ids (torch.LongTensor, optional): The sequence used as a prompt for the generation.
+            attention_mask (torch.LongTensor, optional): Mask to avoid performing attention on padding token indices
+            inputs_embeds (torch.FloatTensor, optional): Embedded representation of the inputs. Should be float, not int tokens.
+            interpolate_pos_encoding (bool, optional, defaults to False) — Whether to interpolate the positional encoding of the image embeddings.
+        Returns:
+            A list of strings of length batch_size * num_captions.
+        """
+        batch_size = pixel_values.shape[0]
+        image_embeds = self.vision_model(
+            pixel_values,
+            return_dict=True,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+        ).last_hidden_state
+        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
+
+        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
+        query_outputs = self.qformer(
+            query_embeds=query_tokens,
+            encoder_hidden_states=image_embeds,
+            encoder_attention_mask=image_attention_mask,
+            return_dict=True,
+        )
+        query_output = query_outputs.last_hidden_state
+
+        if query_output.dtype != image_embeds.dtype:
+            query_output = query_output.to(image_embeds.dtype)
+
+        language_model_inputs = self.language_projection(query_output)
+
+        if inputs_embeds is None:
+            if input_ids is None:
+                image_tokens = [self.config.image_token_index] * self.config.num_query_tokens
+                start_tokens = image_tokens + [self.config.text_config.bos_token_id]
+                input_ids = torch.tensor([start_tokens], dtype=torch.long, device=image_embeds.device)
+                input_ids = input_ids.repeat(batch_size, 1)
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
+        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)
+
+        inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
+        if not self.language_model.config.is_encoder_decoder:
+            inputs["input_ids"] = input_ids
+
+        outputs = self.language_model.generate(**inputs, **generate_kwargs)
+
+        return outputs
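A hypothetical end-to-end sketch of the new generate() path; the model id, image URL, and the export=True loading convention are illustrative assumptions, not taken from the diff:

    import requests
    from PIL import Image
    from transformers import Blip2Processor
    from optimum.rbln import RBLNBlip2ForConditionalGeneration

    model_id = "Salesforce/blip2-opt-2.7b"
    processor = Blip2Processor.from_pretrained(model_id)
    # Assumed convention: export=True compiles the checkpoint for the RBLN NPU
    model = RBLNBlip2ForConditionalGeneration.from_pretrained(model_id, export=True)

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    inputs = processor(images=image, text="Question: what is shown? Answer:", return_tensors="pt")

    generated_ids = model.generate(**inputs, max_new_tokens=20)
    print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip())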
--- a/optimum/rbln/transformers/models/clip/configuration_clip.py
+++ b/optimum/rbln/transformers/models/clip/configuration_clip.py
@@ -12,20 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Any,
+from typing import Any, Optional

 from ....configuration_utils import RBLNModelConfig


 class RBLNCLIPTextModelConfig(RBLNModelConfig):
-    def __init__(self, batch_size: Optional[int] = None, **kwargs:
+    def __init__(self, batch_size: Optional[int] = None, **kwargs: Any):
         """
         Args:
             batch_size (Optional[int]): The batch size for text processing. Defaults to 1.
-
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.

         Raises:
-            ValueError: If batch_size is not a positive integer.
+            ValueError: If `batch_size` is not a positive integer.
         """
         super().__init__(**kwargs)
         self.batch_size = batch_size or 1
@@ -43,16 +43,27 @@ class RBLNCLIPTextModelWithProjectionConfig(RBLNCLIPTextModelConfig):


 class RBLNCLIPVisionModelConfig(RBLNModelConfig):
-    def __init__(
+    def __init__(
+        self,
+        batch_size: Optional[int] = None,
+        image_size: Optional[int] = None,
+        interpolate_pos_encoding: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        **kwargs: Any,
+    ):
         """
         Args:
             batch_size (Optional[int]): The batch size for image processing. Defaults to 1.
             image_size (Optional[int]): The size of input images. Can be an integer for square images,
                 a tuple/list (height, width), or a dictionary with 'height' and 'width' keys.
-
+            interpolate_pos_encoding (Optional[bool]): Whether or not to interpolate pre-trained position encodings. Defaults to `False`.
+            output_hidden_states (Optional[bool]): Whether or not to return the hidden states of all layers.
+            output_attentions (Optional[bool]): Whether or not to return the attentions tensors of all attention layers
+            kwargs: Additional arguments passed to the parent RBLNModelConfig.

         Raises:
-            ValueError: If batch_size is not a positive integer.
+            ValueError: If `batch_size` is not a positive integer.
         """
         super().__init__(**kwargs)
         self.batch_size = batch_size or 1
@@ -60,6 +71,9 @@ class RBLNCLIPVisionModelConfig(RBLNModelConfig):
             raise ValueError(f"batch_size must be a positive integer, got {self.batch_size}")

         self.image_size = image_size
+        self.interpolate_pos_encoding = interpolate_pos_encoding or False
+        self.output_hidden_states = output_hidden_states
+        self.output_attentions = output_attentions

     @property
     def image_width(self):