optimum-rbln 0.9.3__py3-none-any.whl → 0.9.3rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. optimum/rbln/__init__.py +0 -12
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +2 -4
  4. optimum/rbln/diffusers/__init__.py +0 -12
  5. optimum/rbln/diffusers/configurations/__init__.py +0 -3
  6. optimum/rbln/diffusers/configurations/models/__init__.py +0 -2
  7. optimum/rbln/diffusers/configurations/pipelines/__init__.py +0 -3
  8. optimum/rbln/diffusers/models/__init__.py +3 -17
  9. optimum/rbln/diffusers/models/autoencoders/__init__.py +0 -1
  10. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +3 -3
  11. optimum/rbln/diffusers/models/autoencoders/vae.py +8 -27
  12. optimum/rbln/diffusers/models/controlnet.py +1 -16
  13. optimum/rbln/diffusers/models/transformers/prior_transformer.py +2 -16
  14. optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +1 -16
  15. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +1 -14
  16. optimum/rbln/diffusers/models/unets/__init__.py +0 -1
  17. optimum/rbln/diffusers/models/unets/unet_2d_condition.py +1 -17
  18. optimum/rbln/diffusers/pipelines/__init__.py +0 -4
  19. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +0 -20
  20. optimum/rbln/modeling.py +45 -20
  21. optimum/rbln/modeling_base.py +1 -0
  22. optimum/rbln/transformers/configuration_generic.py +27 -0
  23. optimum/rbln/transformers/modeling_attention_utils.py +109 -242
  24. optimum/rbln/transformers/modeling_generic.py +61 -2
  25. optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +2 -28
  26. optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +5 -68
  27. optimum/rbln/transformers/models/bart/modeling_bart.py +2 -23
  28. optimum/rbln/transformers/models/bert/modeling_bert.py +1 -86
  29. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +15 -42
  30. optimum/rbln/transformers/models/clip/modeling_clip.py +2 -40
  31. optimum/rbln/transformers/models/colpali/modeling_colpali.py +44 -5
  32. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +1 -6
  33. optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +2 -6
  34. optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +9 -17
  35. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +12 -36
  36. optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +0 -17
  37. optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +0 -24
  38. optimum/rbln/transformers/models/dpt/modeling_dpt.py +0 -17
  39. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +5 -3
  40. optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +8 -24
  41. optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +5 -3
  42. optimum/rbln/transformers/models/llava/modeling_llava.py +24 -36
  43. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +4 -2
  44. optimum/rbln/transformers/models/opt/modeling_opt.py +2 -2
  45. optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +1 -1
  46. optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +1 -13
  47. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +3 -2
  48. optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +3 -2
  49. optimum/rbln/transformers/models/resnet/configuration_resnet.py +0 -17
  50. optimum/rbln/transformers/models/resnet/modeling_resnet.py +0 -73
  51. optimum/rbln/transformers/models/roberta/modeling_roberta.py +0 -33
  52. optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +4 -2
  53. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +10 -34
  54. optimum/rbln/transformers/models/siglip/modeling_siglip.py +1 -17
  55. optimum/rbln/transformers/models/swin/modeling_swin.py +1 -14
  56. optimum/rbln/transformers/models/t5/modeling_t5.py +2 -2
  57. optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +2 -16
  58. optimum/rbln/transformers/models/vit/modeling_vit.py +0 -19
  59. optimum/rbln/transformers/models/wav2vec2/configuration_wav2vec2.py +3 -15
  60. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +8 -60
  61. optimum/rbln/transformers/models/whisper/generation_whisper.py +14 -48
  62. optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -2
  63. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +0 -43
  64. optimum/rbln/transformers/utils/rbln_quantization.py +0 -9
  65. optimum/rbln/utils/depreacate_utils.py +16 -0
  66. optimum/rbln/utils/hub.py +3 -14
  67. optimum/rbln/utils/runtime_utils.py +0 -32
  68. {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/METADATA +2 -2
  69. {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/RECORD +72 -79
  70. {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/WHEEL +1 -1
  71. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_temporal_decoder.py +0 -67
  72. optimum/rbln/diffusers/configurations/models/configuration_unet_spatio_temporal_condition.py +0 -59
  73. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_video_diffusion.py +0 -114
  74. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +0 -275
  75. optimum/rbln/diffusers/models/unets/unet_spatio_temporal_condition.py +0 -201
  76. optimum/rbln/diffusers/pipelines/stable_video_diffusion/__init__.py +0 -15
  77. optimum/rbln/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +0 -46
  78. optimum/rbln/utils/deprecation.py +0 -213
  79. {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/entry_points.txt +0 -0
  80. {optimum_rbln-0.9.3.dist-info → optimum_rbln-0.9.3rc0.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/bart/modeling_bart.py
@@ -13,11 +13,9 @@
  # limitations under the License.

  import inspect
- from typing import Any, Callable, Optional, Tuple, Union
+ from typing import Any, Callable

- import torch
  from transformers import BartForConditionalGeneration, PreTrainedModel
- from transformers.modeling_outputs import Seq2SeqModelOutput

  from ....utils.logging import get_logger
  from ...modeling_generic import RBLNTransformerEncoderForFeatureExtraction
@@ -37,25 +35,6 @@ class RBLNBartModel(RBLNTransformerEncoderForFeatureExtraction):
  on RBLN devices, optimized for feature extraction use cases.
  """

- def forward(
- self,
- input_ids: Optional[torch.Tensor] = None,
- attention_mask: Optional[torch.Tensor] = None,
- **kwargs,
- ) -> Union[Tuple, Seq2SeqModelOutput]:
- """
- Forward pass for the RBLN-optimized BART model for feature extraction tasks.
-
- Args:
- input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
- attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
-
- Returns:
- The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a Seq2SeqModelOutput object.
- """
-
- return super().forward(input_ids, attention_mask, **kwargs)
-

  class RBLNBartForConditionalGeneration(RBLNModelForSeq2SeqLM):
  """
@@ -69,7 +48,7 @@ class RBLNBartForConditionalGeneration(RBLNModelForSeq2SeqLM):
  support_causal_attn = True

  @classmethod
- def _wrap_model_if_needed(self, model: PreTrainedModel, rbln_config: RBLNBartForConditionalGenerationConfig):
+ def wrap_model_if_needed(self, model: PreTrainedModel, rbln_config: RBLNBartForConditionalGenerationConfig):
  return BartWrapper(
  model, enc_max_seq_len=rbln_config.enc_max_seq_len, use_attention_mask=rbln_config.use_attention_mask
  )
optimum/rbln/transformers/models/bert/modeling_bert.py
@@ -12,14 +12,7 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- from typing import Optional, Tuple, Union
-
  import torch
- from transformers.modeling_outputs import (
- BaseModelOutputWithPoolingAndCrossAttentions,
- MaskedLMOutput,
- QuestionAnsweringModelOutput,
- )

  from ...modeling_generic import (
  RBLNModelForMaskedLM,
@@ -42,45 +35,9 @@ class RBLNBertModel(RBLNTransformerEncoderForFeatureExtraction):
  rbln_model_input_names = ["input_ids", "attention_mask"]

  @classmethod
- def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNBertModelConfig) -> torch.nn.Module:
+ def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNBertModelConfig) -> torch.nn.Module:
  return BertModelWrapper(model, rbln_config)

- def forward(
- self,
- input_ids: Optional[torch.Tensor] = None,
- attention_mask: Optional[torch.Tensor] = None,
- token_type_ids: Optional[torch.Tensor] = None,
- position_ids: Optional[torch.Tensor] = None,
- **kwargs,
- ) -> Union[BaseModelOutputWithPoolingAndCrossAttentions, Tuple]:
- """
- Forward pass for the RBLN-optimized BERT model for feature extraction tasks.
-
- Args:
- input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
- attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
- token_type_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Segment token indices to indicate first and second portions of the inputs.
- position_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of positions of each input sequence tokens in the position embeddings.
-
- Returns:
- The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a BaseModelOutputWithPoolingAndCrossAttentions object.
- """
-
- input_map = {
- "input_ids": input_ids,
- "attention_mask": attention_mask,
- "token_type_ids": token_type_ids,
- "position_ids": position_ids,
- }
-
- model_input_names = getattr(self.rbln_config, "model_input_names", None)
- if model_input_names is None:
- model_input_names = self.rbln_model_input_names
-
- ordered_inputs = [input_map[name] for name in model_input_names if name in input_map]
-
- return super().forward(*ordered_inputs, **kwargs)
-

  class RBLNBertForMaskedLM(RBLNModelForMaskedLM):
  """
@@ -93,27 +50,6 @@ class RBLNBertForMaskedLM(RBLNModelForMaskedLM):

  rbln_model_input_names = ["input_ids", "attention_mask", "token_type_ids"]

- def forward(
- self,
- input_ids: Optional[torch.Tensor] = None,
- attention_mask: Optional[torch.Tensor] = None,
- token_type_ids: Optional[torch.Tensor] = None,
- **kwargs,
- ) -> Union[MaskedLMOutput, Tuple]:
- """
- Forward pass for the RBLN-optimized BERT model for masked language modeling tasks.
-
- Args:
- input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
- attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
- token_type_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Segment token indices to indicate first and second portions of the inputs.
-
- Returns:
- The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a MaskedLMOutput object.
- """
-
- return super().forward(input_ids, attention_mask, token_type_ids, **kwargs)
-

  class RBLNBertForQuestionAnswering(RBLNModelForQuestionAnswering):
  """
@@ -125,24 +61,3 @@ class RBLNBertForQuestionAnswering(RBLNModelForQuestionAnswering):
  """

  rbln_model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
-
- def forward(
- self,
- input_ids: Optional[torch.Tensor] = None,
- attention_mask: Optional[torch.Tensor] = None,
- token_type_ids: Optional[torch.Tensor] = None,
- **kwargs,
- ) -> Union[QuestionAnsweringModelOutput, Tuple]:
- """
- Forward pass for the RBLN-optimized BERT model for question answering tasks.
-
- Args:
- input_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Indices of input sequence tokens in the vocabulary.
- attention_mask (torch.Tensor of shape (batch_size, sequence_length), optional): Mask to avoid performing attention on padding token indices.
- token_type_ids (torch.Tensor of shape (batch_size, sequence_length), optional): Segment token indices to indicate first and second portions of the inputs.
-
- Returns:
- The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a QuestionAnsweringModelOutput object.
- """
-
- return super().forward(input_ids, attention_mask, token_type_ids, **kwargs)
optimum/rbln/transformers/models/blip_2/modeling_blip_2.py
@@ -14,7 +14,7 @@

  import inspect
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Callable, List, Optional, Tuple, Union
+ from typing import TYPE_CHECKING, Any, Callable, Optional, Tuple, Union

  import torch
  from transformers import (
@@ -71,7 +71,7 @@ class RBLNBlip2VisionModel(RBLNModel):
  return self.embeddings

  @classmethod
- def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
+ def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
  class Blip2VisionModelWrapper(torch.nn.Module):
  def __init__(self, model: "Blip2VisionModel") -> None:
  super().__init__()
@@ -111,20 +111,11 @@ class RBLNBlip2VisionModel(RBLNModel):
  def forward(
  self,
  pixel_values: torch.FloatTensor,
- interpolate_pos_encoding: bool = False,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
  ) -> Union[Tuple, BaseModelOutputWithPooling]:
- """
- Forward pass for the RBLN-optimized Blip2VisionModel model.
-
- Args:
- pixel_values (torch.FloatTensor of shape (batch_size, num_channels, height, width)): The tensors corresponding to the input images.
- interpolate_pos_encoding (bool, optional): Whether to interpolate the positional encoding of the image embeddings. Defaults to False.
- return_dict (bool, optional): Whether to return a ModelOutput instead of a plain tuple.
-
- Returns:
- BaseModelOutputWithPooling or tuple(torch.FloatTensor): The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a BaseModelOutputWithPooling object.
- """
  batch_size = pixel_values.shape[0]
  outputs = []
  for i in range(batch_size):
@@ -160,7 +151,7 @@ class RBLNBlip2QFormerModel(RBLNModel):
  return self.embeddings.word_embeddings

  @classmethod
- def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
+ def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNModelConfig) -> torch.nn.Module:
  class Blip2QFormerModelWrapper(torch.nn.Module):
  def __init__(self, model: "Blip2QFormerModel"):
  super().__init__()
@@ -240,22 +231,17 @@ class RBLNBlip2QFormerModel(RBLNModel):
  def forward(
  self,
  query_embeds: torch.FloatTensor,
+ query_length: Optional[int] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
  encoder_hidden_states: Optional[torch.FloatTensor] = None,
  encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
  ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
- """
- The forward pass for the RBLN-optimized Blip2QFormerModel model.
-
- Args:
- query_embeds (torch.FloatTensor): Hidden states to be used in the attention computation.
- encoder_hidden_states (torch.FloatTensor, optional): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder.
- encoder_attention_mask (torch.FloatTensor, optional): Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder.
- return_dict (bool, optional): Whether to return a ModelOutput instead of a plain tuple.
-
- Returns:
- BaseModelOutputWithPoolingAndCrossAttentions or tuple(torch.FloatTensor): The model outputs. If `return_dict=False` is passed, returns a tuple of tensors. Otherwise, returns a `BaseModelOutputWithPoolingAndCrossAttentions` object.
- """
  batch_size = query_embeds.shape[0]
  outputs = []
  for i in range(batch_size):
@@ -363,7 +349,7 @@ class RBLNBlip2ForConditionalGeneration(RBLNModel, RBLNDecoderOnlyGenerationMixi
  return self.language_model.get_input_embeddings()

  @classmethod
- def _wrap_model_if_needed(cls, model, rbln_config):
+ def wrap_model_if_needed(cls, model, rbln_config):
  return model.language_projection

  @classmethod
@@ -458,20 +444,7 @@
  inputs_embeds: Optional[torch.FloatTensor] = None,
  interpolate_pos_encoding: bool = False,
  **generate_kwargs,
- ) -> List[torch.LongTensor]:
- """
- The generate function is utilized in its standard form as in the HuggingFace transformers library. User can use this function to generate text from the model.
- Check the [HuggingFace transformers documentation](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/blip-2#transformers.Blip2ForConditionalGeneration.generate) for more details.
-
- Args:
- pixel_values (torch.FloatTensor): Input images to be processed.
- input_ids (torch.LongTensor, optional): The sequence used as a prompt for the generation.
- attention_mask (torch.LongTensor, optional): Mask to avoid performing attention on padding token indices
- inputs_embeds (torch.FloatTensor, optional): Embedded representation of the inputs. Should be float, not int tokens.
- interpolate_pos_encoding (bool, optional, defaults to False) — Whether to interpolate the positional encoding of the image embeddings.
- Returns:
- A list of strings of length batch_size * num_captions.
- """
+ ) -> torch.LongTensor:
  batch_size = pixel_values.shape[0]
  image_embeds = self.vision_model(
  pixel_values,
optimum/rbln/transformers/models/clip/modeling_clip.py
@@ -54,7 +54,7 @@ class RBLNCLIPTextModel(RBLNModel):
  _tp_support = False

  @classmethod
- def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNCLIPTextModelConfig) -> torch.nn.Module:
+ def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNCLIPTextModelConfig) -> torch.nn.Module:
  return _TextEncoder(model).eval()

  @classmethod
@@ -92,9 +92,6 @@ class RBLNCLIPTextModel(RBLNModel):
  Args:
  input_ids (torch.LongTensor): The input ids to the model.
  return_dict (Optional[bool]): Whether to return a dictionary of outputs.
-
- Returns:
- The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a CLIPTextModelOutput object.
  """

  # To ignore using attention_mask, we override forward method.
@@ -160,7 +157,7 @@ class RBLNCLIPVisionModel(RBLNModel):
  _tp_support = False

  @classmethod
- def _wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNCLIPVisionModelConfig) -> torch.nn.Module:
+ def wrap_model_if_needed(cls, model: torch.nn.Module, rbln_config: RBLNCLIPVisionModelConfig) -> torch.nn.Module:
  wrapper_cfg = {
  "interpolate_pos_encoding": rbln_config.interpolate_pos_encoding,
  "output_hidden_states": rbln_config.output_hidden_states,
@@ -233,9 +230,6 @@ class RBLNCLIPVisionModel(RBLNModel):
  output_attentions (Optional[bool]): Whether to return attentions.
  output_hidden_states (Optional[bool]): Whether to return hidden states.
  interpolate_pos_encoding (bool): Whether to interpolate position encoding.
-
- Returns:
- The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a BaseModelOutputWithPooling object.
  """

  if len(kwargs) > 0 and any(value is not None for value in kwargs.values()):
@@ -313,38 +307,6 @@ class RBLNCLIPVisionModelWithProjection(RBLNCLIPVisionModel):
  multimodal embedding alignment tasks.
  """

- def forward(
- self,
- pixel_values: torch.FloatTensor,
- return_dict: bool = True,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- interpolate_pos_encoding: bool = False,
- **kwargs,
- ) -> Union[Tuple, CLIPVisionModelOutput]:
- """
- Forward pass for the RBLN-optimized CLIP vision encoder model with projection.
-
- Args:
- pixel_values (torch.Tensor): The pixel values to the model.
- return_dict (bool): Whether to return a dictionary of outputs.
- output_attentions (Optional[bool]): Whether to return attentions.
- output_hidden_states (Optional[bool]): Whether to return hidden states.
- interpolate_pos_encoding (bool): Whether to interpolate position encoding.
-
- Returns:
- The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a CLIPVisionModelOutput object.
- """
-
- return super().forward(
- pixel_values=pixel_values,
- return_dict=return_dict,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- interpolate_pos_encoding=interpolate_pos_encoding,
- **kwargs,
- )
-
  def _prepare_output(self, output, return_dict):
  # Prepare model output based on return_dict flag.
  # This method can be overridden by subclasses to provide task-specific output handling.
optimum/rbln/transformers/models/colpali/modeling_colpali.py
@@ -14,7 +14,8 @@

  import bisect
  from pathlib import Path
- from typing import TYPE_CHECKING, Optional, Tuple, Union
+ from tempfile import TemporaryDirectory
+ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union

  import torch
  from transformers import PretrainedConfig, PreTrainedModel
@@ -181,7 +182,7 @@ class RBLNColPaliForRetrieval(RBLNModel):
  return multi_modal_projector

  @classmethod
- def _wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNModelConfig):
+ def wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: RBLNModelConfig):
  return RBLNColPaliForRetrievalWrapper(
  causal_lm=model.vlm,
  embedding_proj_layer=model.embedding_proj_layer,
@@ -235,11 +236,49 @@
  return rbln_config

  @classmethod
- def _reconstruct_model_if_needed(cls, model: "PreTrainedModel"):
- if hasattr(model, "vlm"):
+ def from_model(
+ cls,
+ model: "PreTrainedModel",
+ config: Optional[PretrainedConfig] = None,
+ rbln_config: Optional[Union[RBLNModelConfig, Dict]] = None,
+ model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+ subfolder: str = "",
+ **kwargs: Any,
+ ) -> "RBLNModel":
+ """
+ Converts and compiles a pre-trained HuggingFace library model into a RBLN model.
+ This method performs the actual model conversion and compilation process.
+
+ Args:
+ model (PreTrainedModel): The PyTorch model to be compiled.
+ The object must be an instance of the HuggingFace transformers PreTrainedModel class.
+ config (Optional[PretrainedConfig]): The configuration object associated with the model.
+ rbln_config (Optional[Union[RBLNModelConfig, Dict]]): Configuration for RBLN model compilation and runtime.
+ This can be provided as a dictionary or an instance of the model's configuration class (e.g., `RBLNLlamaForCausalLMConfig` for Llama models).
+ For detailed configuration options, see the specific model's configuration class documentation.
+ kwargs: Additional keyword arguments. Arguments with the prefix `rbln_` are passed to rbln_config, while the remaining arguments are passed to the HuggingFace library.
+
+ The method performs the following steps:
+
+ 1. Compiles the PyTorch model into an optimized RBLN graph
+ 2. Configures the model for the specified NPU device
+ 3. Creates the necessary runtime objects if requested
+ 4. Saves the compiled model and configurations
+
+ Returns:
+ (RBLNModel): A RBLN model instance ready for inference on RBLN NPU devices.
+ """
+ if not hasattr(model, "vision_tower"):
  model.vision_tower = model.vlm.vision_tower
  del model.vlm.model.vision_tower
- return model
+ model = super().from_model(model, config, rbln_config, model_save_dir, subfolder, **kwargs)
+ return model
+
+ @classmethod
+ def get_pytorch_model(cls, *args, **kwargs):
+ model = super().get_pytorch_model(*args, **kwargs)
+ model.vision_tower = model.vlm.vision_tower
+ del model.vlm.model.vision_tower
  return model

  def get_image_features(self, pixel_values: torch.Tensor):
optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py
@@ -274,18 +274,13 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):

  @property
  def use_lora(self):
+ """Check if LoRA is enabled for this configuration."""
  return self.lora_config is not None

  @property
  def can_generate(self) -> bool:
  return "decode" in self.phases

- @property
- def nbits_per_param(self) -> int:
- if self.quantization:
- return self.quantization.nbits_per_param
- return 16
-

  class RBLNDecoderOnlyModelForCausalLMConfig(RBLNDecoderOnlyModelConfig):
  """
optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py
@@ -46,12 +46,6 @@ class RBLNPageTableManager:
  """
  If the block is empty (empty_block), allocates a block from the free_block_pool.
  """
- if batch_idx >= len(self.block_tables) or block_idx >= len(self.block_tables[batch_idx]):
- raise IndexError(
- f"Invalid index(batch_idx={batch_idx}, block_idx={block_idx}): \n \
- BlockTable Shape(batch_axis, block_axis): {self.block_tables.shape}, BlockSize: {self.rbln_config.kvcache_block_size}"
- )
-
  if self.block_tables[batch_idx][block_idx] == self.EMPTY_BLOCK:
  if self.free_block_pool:
  block = self.free_block_pool.popleft()
@@ -102,6 +96,8 @@ class RBLNPageTableManager:
  s, e = cache_position[0][0].item(), cache_position[0][-1].item()
  for position in range(s, e + 1, self.rbln_config.kvcache_block_size):
  block_idx = position // self.rbln_config.kvcache_block_size
+ if batch_idx >= len(self.block_tables) or block_idx >= len(self.block_tables[batch_idx]):
+ raise IndexError(f"Invalid index: batch_idx={batch_idx}, block_idx={block_idx}")
  self.update_block(batch_idx, block_idx)

  return self.replace_empty_block(self.block_tables[batch_idx])
optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py
@@ -12,12 +12,10 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- from typing import TYPE_CHECKING, Any, Dict, Optional, Union
+ from typing import TYPE_CHECKING, Any, Dict, Optional

  import torch
- from transformers import GenerationConfig
  from transformers.generation.utils import GenerationMixin
- from transformers.modeling_outputs import ModelOutput


  if TYPE_CHECKING:
@@ -93,26 +91,20 @@ class RBLNDecoderOnlyGenerationMixin(GenerationMixin):
  self,
  input_ids: torch.LongTensor,
  attention_mask: Optional[torch.LongTensor] = None,
- generation_config: Optional[GenerationConfig] = None,
+ max_length: Optional[int] = None,
  **kwargs,
- ) -> Union[ModelOutput, torch.LongTensor]:
+ ):
  """
  The generate function is utilized in its standard form as in the HuggingFace transformers library. User can use this function to generate text from the model.
- Check the [HuggingFace transformers documentation](https://huggingface.co/docs/transformers/v4.57.1/en/main_classes/text_generation#transformers.GenerationMixin.generate) for more details.

  Args:
- input_ids (torch.LongTensor): The input ids to the model.
- attention_mask (torch.LongTensor, optional): The attention mask to the model.
- generation_config (GenerationConfig, optional): The generation configuration to be used as base parametrization for the generation call. **kwargs passed to generate matching the attributes of generation_config will override them.
- If generation_config is not provided, the default will be used, which had the following loading priority: 1) from the generation_config.json model file, if it exists; 2) from the model configuration.
- Please note that unspecified parameters will inherit [GenerationConfig](https://huggingface.co/docs/transformers/v4.57.1/en/main_classes/text_generation#transformers.GenerationConfig)’s default values.
- kwargs (dict[str, Any], optional): Additional arguments passed to the generate function. See the HuggingFace transformers documentation for more details.
-
- Returns:
- A ModelOutput (if return_dict_in_generate=True or when config.return_dict_in_generate=True) or a torch.LongTensor.
+ input_ids: The input ids to the model.
+ attention_mask: The attention mask to the model.
+ max_length: The maximum length of the generated text.
+ kwargs: Additional arguments passed to the generate function. See the HuggingFace transformers documentation for more details.
  """
- if generation_config is not None:
- kwargs["generation_config"] = generation_config
+ if max_length is not None:
+ kwargs["max_length"] = max_length
  if attention_mask is not None:
  kwargs["attention_mask"] = attention_mask

optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py
@@ -216,7 +216,7 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
  return self.rbln_config.kvcache_num_blocks

  @classmethod
- def _wrap_model_if_needed(cls, model: PreTrainedModel, rbln_config: "RBLNDecoderOnlyModelConfig"):
+ def wrap_model_if_needed(cls, model: PreTrainedModel, rbln_config: "RBLNDecoderOnlyModelConfig"):
  return cls._decoder_wrapper_cls(model, rbln_config, cls._use_rotary_emb).eval()

  @classmethod
@@ -272,7 +272,7 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
  @classmethod
  @torch.inference_mode()
  def get_compiled_model(cls, model: PreTrainedModel, rbln_config: RBLNDecoderOnlyModelForCausalLMConfig):
- wrapped_model = cls._wrap_model_if_needed(model, rbln_config)
+ wrapped_model = cls.wrap_model_if_needed(model, rbln_config)
  prefill_compile_config = rbln_config.compile_cfgs[0]

  # Here we use meta tensor, for the memory efficiency.
@@ -466,8 +466,13 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):

  # Update kvcache_num_blocks based on the attention implementation.
  if rbln_config.attn_impl == "flash_attn":
- estimated_max_num_blocks = cls.get_maximum_num_blocks_by_model(
- model=model, model_config=model_config, rbln_config=rbln_config
+ estimated_max_num_blocks = cls.get_maximum_num_blocks(
+ config=model_config,
+ tensor_parallel_size=rbln_config.tensor_parallel_size or 1,
+ kvcache_block_size=rbln_config.kvcache_block_size,
+ nbits_per_param=16 if not rbln_config.quantization else 4, # TODO(jongho): FIX Ad-hoc
+ n_model_params=sum(p.numel() for p in model.parameters()),
+ num_runtimes=1 if not rbln_config.can_generate else 1 + len(rbln_config.decoder_batch_sizes),
  )

  if rbln_config.kvcache_num_blocks is None:
@@ -506,6 +511,7 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
  f" than the required number of blocks ({num_full_blocks})."
  "This can cause a failure during model compilation."
  )
+
  logger.info(f"[KVCache] Compiling with num_blocks: {rbln_config.kvcache_num_blocks}")

  return rbln_config
@@ -602,21 +608,11 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
  input_ids: Optional[torch.LongTensor] = None,
  inputs_embeds: Optional[torch.Tensor] = None,
  attention_mask: Optional[torch.LongTensor] = None,
+ position_embed: Optional[torch.Tensor] = None,
  **kwargs,
- ) -> BaseModelOutputWithPast:
- """
- Args:
- input_ids (torch.LongTensor, optional): The input IDs to the model.
- inputs_embeds (torch.Tensor, optional): The input embeddings to the model.
- attention_mask (torch.LongTensor, optional): The attention mask to the model.
- kwargs (dict[str, Any], optional): Additional keyword arguments.
-
- Returns:
- Dataclass containing the last hidden states of the model.
- """
+ ) -> Tuple[torch.FloatTensor]:
  inputs = inputs_embeds if inputs_embeds is not None else input_ids
  batch_size = inputs.shape[0]
- position_embed = kwargs.get("position_embed", None)

  if batch_size != self.rbln_config.batch_size:
  raise ValueError(
@@ -639,7 +635,6 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
  all_last_hidden_states.append(last_hidden_states)

  last_hidden_states = torch.concat(all_last_hidden_states, dim=0)
-
  return BaseModelOutputWithPast(last_hidden_state=last_hidden_states)


@@ -764,16 +759,6 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNDecoderOnlyModel, RBLNDecoderOnlyGener
  logits = []
  inputs = inputs_embeds if inputs_embeds is not None else input_ids
  batch_size = inputs.shape[0]
- input_len = inputs.shape[1]
- if batch_size > self.rbln_config.batch_size:
- raise ValueError(
- f"Input's batch({batch_size}) exceeds compiled batch_size({self.rbln_config.batch_size})"
- )
- if input_len > self.rbln_config.max_seq_len:
- raise ValueError(
- f"Input's length({input_len}) exceeds compiled max_seq_len({self.rbln_config.max_seq_len})."
- )
-
  for b_idx in range(batch_size):
  cache_position = torch.arange(0, generate_idx[b_idx].item(), dtype=torch.int32).unsqueeze(0)
  output = self.prefill_decoder(
@@ -798,15 +783,6 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNDecoderOnlyModel, RBLNDecoderOnlyGener
  f"Available batch sizes are: {list(self.decoders.keys())}. "
  f"Please run your model with one of these batch sizes or add support for batch size {batch_size}."
  )
- if max(cache_position.reshape(-1)) >= self.rbln_config.max_seq_len:
- raise ValueError(
- f"Cache position exceeds the maximum sequence length.\n"
- f" - Current max cache position: {int(torch.max(cache_position).item())}\n"
- f" - Allowed max_seq_len: {self.rbln_config.max_seq_len}\n"
- f"Solution: Reduce the generation length by adjusting `max_new_tokens` "
- f"or `max_length` in the generation config."
- )
-
  logits = self.decoders[batch_size](
  input_ids=input_ids,
  inputs_embeds=inputs_embeds,
optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py
@@ -13,11 +13,6 @@
  # limitations under the License.


- from typing import Tuple, Union
-
- import torch
- from transformers.modeling_outputs import DepthEstimatorOutput
-
  from ...modeling_generic import RBLNModelForDepthEstimation


@@ -28,15 +23,3 @@ class RBLNDepthAnythingForDepthEstimation(RBLNModelForDepthEstimation):
  This class provides hardware-accelerated inference for Depth Anything V2
  models on RBLN devices, providing the most capable monocular depth estimation (MDE) model.
  """
-
- def forward(self, pixel_values: torch.Tensor, **kwargs) -> Union[Tuple, DepthEstimatorOutput]:
- """
- Forward pass for the RBLN-optimized DepthAnythingForDepthEstimation model.
-
- Args:
- pixel_values (torch.FloatTensor of shape (batch_size, num_channels, height, width)): The tensors corresponding to the input images.
-
- Returns:
- The model outputs. If return_dict=False is passed, returns a tuple of tensors. Otherwise, returns a DepthEstimatorOutput object.
- """
- return super().forward(pixel_values, **kwargs)