optimum-rbln 0.8.4a6__py3-none-any.whl → 0.8.4a8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (66)
  1. optimum/rbln/__version__.py +2 -2
  2. optimum/rbln/configuration_utils.py +41 -3
  3. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +1 -1
  4. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_cosmos.py +3 -3
  5. optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +1 -1
  6. optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +2 -2
  7. optimum/rbln/diffusers/configurations/models/configuration_transformer_cosmos.py +7 -2
  8. optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +7 -2
  9. optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +1 -1
  10. optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +1 -1
  11. optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +2 -2
  12. optimum/rbln/diffusers/configurations/pipelines/configuration_cosmos.py +1 -1
  13. optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +3 -3
  14. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +1 -1
  15. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +1 -1
  16. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +1 -1
  17. optimum/rbln/diffusers/modeling_diffusers.py +7 -3
  18. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +31 -3
  19. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +28 -3
  20. optimum/rbln/diffusers/models/autoencoders/vq_model.py +31 -3
  21. optimum/rbln/diffusers/models/transformers/prior_transformer.py +1 -1
  22. optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +9 -1
  23. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +9 -1
  24. optimum/rbln/diffusers/models/unets/unet_2d_condition.py +6 -3
  25. optimum/rbln/diffusers/pipelines/auto_pipeline.py +45 -8
  26. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +1 -1
  27. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +1 -1
  28. optimum/rbln/modeling.py +17 -13
  29. optimum/rbln/modeling_base.py +11 -9
  30. optimum/rbln/transformers/configuration_generic.py +3 -3
  31. optimum/rbln/transformers/modeling_generic.py +1 -0
  32. optimum/rbln/transformers/models/auto/auto_factory.py +67 -7
  33. optimum/rbln/transformers/models/auto/modeling_auto.py +31 -0
  34. optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +5 -6
  35. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +1 -1
  36. optimum/rbln/transformers/models/clip/configuration_clip.py +7 -4
  37. optimum/rbln/transformers/models/clip/modeling_clip.py +23 -4
  38. optimum/rbln/transformers/models/colpali/configuration_colpali.py +2 -2
  39. optimum/rbln/transformers/models/colpali/modeling_colpali.py +38 -6
  40. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +1 -1
  41. optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +23 -0
  42. optimum/rbln/transformers/models/exaone/modeling_exaone.py +42 -4
  43. optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +17 -2
  44. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +7 -8
  45. optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +12 -6
  46. optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +6 -2
  47. optimum/rbln/transformers/models/llava/configuration_llava.py +6 -2
  48. optimum/rbln/transformers/models/llava/modeling_llava.py +1 -0
  49. optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +2 -2
  50. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +57 -78
  51. optimum/rbln/transformers/models/midm/modeling_midm.py +42 -4
  52. optimum/rbln/transformers/models/pixtral/configuration_pixtral.py +1 -1
  53. optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +18 -3
  54. optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +2 -2
  55. optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +1 -1
  56. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +2 -2
  57. optimum/rbln/transformers/models/siglip/configuration_siglip.py +1 -1
  58. optimum/rbln/transformers/models/swin/configuration_swin.py +1 -1
  59. optimum/rbln/transformers/models/time_series_transformer/configuration_time_series_transformer.py +1 -1
  60. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +1 -0
  61. optimum/rbln/transformers/models/whisper/configuration_whisper.py +1 -1
  62. optimum/rbln/transformers/models/whisper/modeling_whisper.py +1 -0
  63. {optimum_rbln-0.8.4a6.dist-info → optimum_rbln-0.8.4a8.dist-info}/METADATA +1 -1
  64. {optimum_rbln-0.8.4a6.dist-info → optimum_rbln-0.8.4a8.dist-info}/RECORD +66 -66
  65. {optimum_rbln-0.8.4a6.dist-info → optimum_rbln-0.8.4a8.dist-info}/WHEEL +0 -0
  66. {optimum_rbln-0.8.4a6.dist-info → optimum_rbln-0.8.4a8.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/colpali/modeling_colpali.py
@@ -14,7 +14,8 @@
 
  import bisect
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Optional, Union
+ from tempfile import TemporaryDirectory
+ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
 
  import torch
  from transformers import PretrainedConfig, PreTrainedModel
@@ -126,8 +127,8 @@ class RBLNColPaliForRetrieval(RBLNModel):
  The ColPali Model transformer for document retrieval using vision-language models.
  This model inherits from [`RBLNModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
 
- A class to convert and run pre-trained transformers based ColPaliForRetrieval model on RBLN devices.
- It implements the methods to convert a pre-trained transformers ColPaliForRetrieval model into a RBLN transformer model by:
+ A class to convert and run pre-trained transformers based `ColPaliForRetrieval` model on RBLN devices.
+ It implements the methods to convert a pre-trained transformers `ColPaliForRetrieval` model into a RBLN transformer model by:
 
  - transferring the checkpoint weights of the original into an optimized RBLN graph,
  - compiling the resulting graph using the RBLN compiler.
@@ -263,11 +264,42 @@ class RBLNColPaliForRetrieval(RBLNModel):
  return rbln_config
 
  @classmethod
- def from_model(cls, model: "PreTrainedModel", *args, **kwargs):
+ def from_model(
+ cls,
+ model: "PreTrainedModel",
+ config: Optional[PretrainedConfig] = None,
+ rbln_config: Optional[Union[RBLNModelConfig, Dict]] = None,
+ model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+ subfolder: str = "",
+ **kwargs: Any,
+ ) -> "RBLNModel":
+ """
+ Converts and compiles a pre-trained HuggingFace library model into a RBLN model.
+ This method performs the actual model conversion and compilation process.
+
+ Args:
+ model (PreTrainedModel): The PyTorch model to be compiled.
+ The object must be an instance of the HuggingFace transformers PreTrainedModel class.
+ config (Optional[PretrainedConfig]): The configuration object associated with the model.
+ rbln_config (Optional[Union[RBLNModelConfig, Dict]]): Configuration for RBLN model compilation and runtime.
+ This can be provided as a dictionary or an instance of the model's configuration class (e.g., `RBLNLlamaForCausalLMConfig` for Llama models).
+ For detailed configuration options, see the specific model's configuration class documentation.
+ kwargs: Additional keyword arguments. Arguments with the prefix `rbln_` are passed to rbln_config, while the remaining arguments are passed to the HuggingFace library.
+
+ The method performs the following steps:
+
+ 1. Compiles the PyTorch model into an optimized RBLN graph
+ 2. Configures the model for the specified NPU device
+ 3. Creates the necessary runtime objects if requested
+ 4. Saves the compiled model and configurations
+
+ Returns:
+ (RBLNModel): A RBLN model instance ready for inference on RBLN NPU devices.
+ """
  if not hasattr(model, "vision_tower"):
  model.vision_tower = model.vlm.vision_tower
  del model.vlm.vision_tower
- model = super().from_model(model, *args, **kwargs)
+ model = super().from_model(model, config, rbln_config, model_save_dir, subfolder, **kwargs)
  return model
 
  @classmethod
@@ -334,7 +366,7 @@ class RBLNColPaliForRetrieval(RBLNModel):
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
  **kwargs,
- ) -> ColPaliForRetrievalOutput:
+ ) -> Union[Tuple, ColPaliForRetrievalOutput]:
  if pixel_values is not None:
  pixel_values = pixel_values.to(dtype=self.dtype)
 
optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py
@@ -104,7 +104,7 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
  ["prefill", "decode"] if DecoderOnlyModelForCausalLM is used.
  logits_to_keep (Optional[int]): The number of logits to keep for the decoder. If set to 0, the decoder will keep all logits.
  Defaults to 0 if DecoderOnlyModel is used, 1 if DecoderOnlyModelForCausalLM is used.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
  Raises:
  ValueError: If `batch_size` is not a positive integer.
optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py
@@ -86,3 +86,26 @@ class RBLNDecoderOnlyGenerationMixin(GenerationMixin):
  model_kwargs["generate_idx"] = outputs.generate_idx
  model_kwargs["padded_cache_lengths"] = outputs.padded_cache_lengths
  return model_kwargs
+
+ def generate(
+ self,
+ input_ids: torch.LongTensor,
+ attention_mask: Optional[torch.LongTensor] = None,
+ max_length: Optional[int] = None,
+ **kwargs,
+ ):
+ """
+ The generate function is utilized in its standard form as in the HuggingFace transformers library. User can use this function to generate text from the model.
+
+ Args:
+ input_ids: The input ids to the model.
+ attention_mask: The attention mask to the model.
+ max_length: The maximum length of the generated text.
+ kwargs: Additional arguments passed to the generate function. See the HuggingFace transformers documentation for more details.
+ """
+ if max_length is not None:
+ kwargs["max_length"] = max_length
+ if attention_mask is not None:
+ kwargs["attention_mask"] = attention_mask
+
+ return super().generate(input_ids, **kwargs)
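For reference, the new `generate()` override simply folds `attention_mask` and `max_length` into `kwargs` before delegating to the standard HuggingFace `GenerationMixin.generate()`. A minimal usage sketch follows; the model id, prompt, and generation settings are illustrative and not part of this diff (`RBLNLlamaForCausalLM` is just one decoder-only model that uses this mixin):

```python
from transformers import AutoTokenizer
from optimum.rbln import RBLNLlamaForCausalLM  # any RBLN decoder-only causal LM

# Assumes a model that was already compiled and saved for RBLN NPUs.
model = RBLNLlamaForCausalLM.from_pretrained("./llama-rbln", export=False)
tokenizer = AutoTokenizer.from_pretrained("./llama-rbln")

inputs = tokenizer("RBLN NPUs are", return_tensors="pt")
output_ids = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,  # forwarded into kwargs by the new override
    max_length=64,                         # likewise forwarded as kwargs["max_length"]
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```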
optimum/rbln/transformers/models/exaone/modeling_exaone.py
@@ -14,11 +14,13 @@
 
 
  import inspect
- from typing import Any, Callable
+ from pathlib import Path
+ from typing import Any, Callable, Dict, Optional, Union
 
  from transformers import AutoModelForCausalLM
  from transformers.generation.utils import GenerationMixin
 
+ from ....configuration_utils import RBLNModelConfig
  from ....utils import logging
  from ..decoderonly import RBLNDecoderOnlyModelForCausalLM
  from .exaone_architecture import ExaoneForCausalLMWrapper
@@ -92,9 +94,45 @@ class RBLNExaoneForCausalLM(RBLNDecoderOnlyModelForCausalLM):
  _supports_cache_class = True
 
  @classmethod
- def from_pretrained(cls, *args, **kwargs):
- kwargs.setdefault("trust_remote_code", True)
- return super().from_pretrained(*args, **kwargs)
+ def from_pretrained(
+ cls,
+ model_id: Union[str, Path],
+ *,
+ export: Optional[bool] = None,
+ rbln_config: Optional[Union[Dict, RBLNModelConfig]] = None,
+ trust_remote_code: Optional[bool] = None,
+ **kwargs: Any,
+ ):
+ """
+ The `from_pretrained()` function is utilized in its standard form as in the HuggingFace transformers library.
+ User can use this function to load a pre-trained model from the HuggingFace library and convert it to a RBLN model to be run on RBLN NPUs.
+
+ Args:
+ model_id (Union[str, Path]): The model id of the pre-trained model to be loaded.
+ It can be downloaded from the HuggingFace model hub or a local path, or a model id of a compiled model using the RBLN Compiler.
+ export (Optional[bool]): A boolean flag to indicate whether the model should be compiled.
+ If None, it will be determined based on the existence of the compiled model files in the model_id.
+ rbln_config (Optional[Union[Dict, RBLNModelConfig]]): Configuration for RBLN model compilation and runtime.
+ This can be provided as a dictionary or an instance of the model's configuration class (e.g., `RBLNExaoneForCausalLMConfig` for EXAONE models).
+ For detailed configuration options, see the specific model's configuration class documentation.
+ trust_remote_code (bool): Whether or not to trust the remote code when loading a model from the Hub.
+ kwargs: Additional keyword arguments. Arguments with the prefix `rbln_` are passed to rbln_config, while the remaining arguments are passed to the HuggingFace library.
+
+ Returns:
+ (RBLNModel): A RBLN model instance ready for inference on RBLN NPU devices.
+ """
+
+ if trust_remote_code is not None:
+ kwargs["trust_remote_code"] = trust_remote_code
+ elif "trust_remote_code" not in kwargs:
+ kwargs["trust_remote_code"] = True
+
+ return super().from_pretrained(
+ model_id=model_id,
+ export=export,
+ rbln_config=rbln_config,
+ **kwargs,
+ )
 
  def __getattr__(self, __name: str) -> Any:
  def redirect(func):
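The expanded `from_pretrained()` signature keeps the old behavior (`trust_remote_code` still defaults to True for EXAONE) while exposing `export` and `rbln_config` explicitly. A hedged compile-and-save sketch; the model id and rbln_config values below are illustrative, not taken from this diff:

```python
from optimum.rbln import RBLNExaoneForCausalLM

# Compile the HuggingFace checkpoint for RBLN NPUs (trust_remote_code is enabled by default).
model = RBLNExaoneForCausalLM.from_pretrained(
    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",  # illustrative model id
    export=True,                              # force compilation from the PyTorch checkpoint
    rbln_config={"max_seq_len": 4096, "tensor_parallel_size": 4},  # illustrative options
)
model.save_pretrained("./exaone-rbln")
```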
optimum/rbln/transformers/models/gemma3/configuration_gemma3.py
@@ -27,6 +27,21 @@ class RBLNGemma3ForCausalLMConfig(RBLNDecoderOnlyModelForCausalLMConfig):
  image_prefill_chunk_size: Optional[int] = None,
  **kwargs: Any,
  ):
+ """
+ Args:
+ use_position_ids (Optional[bool]): Whether or not to use `position_ids`, which is indices of positions of each input sequence tokens in the position embeddings.
+ use_attention_mask (Optional[bool]): Whether or not to use `attention_mask` to to avoid performing attention on padding token indices.
+ prefill_chunk_size (Optional[int]): The chunk size used during the prefill phase for
+ processing input sequences. Defaults to 256. Must be a positive integer
+ divisible by 64. Affects prefill performance and memory usage.
+ image_prefill_chunk_size (Optional[int]): The chunk size used during the prefill phase for
+ processing images. This config is used when `use_image_prefill` is True.
+ Currently, the `prefill_chunk_size` and `image_prefill_chunk_size` should be the same value.
+ kwargs: Additional arguments passed to the parent `RBLNDecoderOnlyModelForCausalLMConfig`.
+
+ Raises:
+ ValueError: If `use_attention_mask` or `use_position_ids` are False.
+ """
  # use_attention_mask and use_position_ids are always True for Gemma3
  use_attention_mask = use_attention_mask or True
  use_position_ids = use_position_ids or True
@@ -64,10 +79,10 @@ class RBLNGemma3ForConditionalGenerationConfig(RBLNModelConfig):
  batch_size (Optional[int]): The batch size for inference. Defaults to 1.
  vision_tower (Optional[RBLNModelConfig]): Configuration for the vision encoder component.
  language_model (Optional[RBLNModelConfig]): Configuration for the language model component.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
  Raises:
- ValueError: If batch_size is not a positive integer.
+ ValueError: If `batch_size` is not a positive integer.
  """
  super().__init__(**kwargs)
  self.batch_size = batch_size or 1
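Per the new docstring above, `prefill_chunk_size` defaults to 256, must be a positive multiple of 64, and `image_prefill_chunk_size` is currently expected to match it. A minimal sketch, assuming the config class is exported from `optimum.rbln` like the other RBLN configuration classes:

```python
from optimum.rbln import RBLNGemma3ForCausalLMConfig

# prefill_chunk_size must be a positive integer divisible by 64 (default 256);
# attention mask and position ids are always enabled for Gemma3.
cfg = RBLNGemma3ForCausalLMConfig(
    prefill_chunk_size=256,
    image_prefill_chunk_size=256,  # currently expected to equal prefill_chunk_size
)
```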
optimum/rbln/transformers/models/gemma3/modeling_gemma3.py
@@ -201,16 +201,15 @@ class RBLNGemma3ForConditionalGeneration(RBLNModel):
  return model_kwargs
 
  def get_image_features(self, pixel_values: torch.Tensor) -> torch.Tensor:
- """
- Projects the last hidden state from the vision model into language model space.
+ # Projects the last hidden state from the vision model into language model space.
 
- Args:
- pixel_values: (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`)
- The tensors corresponding to the input images.
+ # Args:
+ # pixel_values: (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`)
+ # The tensors corresponding to the input images.
+
+ # Returns:
+ # Image feature tensor of shape `(num_images, image_length, embed_dim)`.
 
- Returns:
- Image feature tensor of shape `(num_images, image_length, embed_dim)`.
- """
  vision_outputs = self.vision_tower(pixel_values).last_hidden_state
  image_features = self.multi_modal_projector(vision_outputs)
  return image_features
optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py
@@ -32,14 +32,20 @@ class RBLNGroundingDinoForObjectDetectionConfig(RBLNImageModelConfig):
  decoder: Optional["RBLNGroundingDinoDecoderConfig"] = None,
  text_backbone: Optional["RBLNModelConfig"] = None,
  backbone: Optional["RBLNModelConfig"] = None,
- output_attentions: Optional[bool] = False,
- output_hidden_states: Optional[bool] = False,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
  **kwargs: Any,
  ):
  """
  Args:
- batch_size (Optional[int]): The batch size for text processing. Defaults to 1.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ batch_size (Optional[int]): The batch size for image and text processing. Defaults to 1.
+ encoder (Optional["RBLNModelConfig"]): The encoder configuration. Defaults to None.
+ decoder (Optional["RBLNModelConfig"]): The decoder configuration. Defaults to None.
+ text_backbone (Optional["RBLNModelConfig"]): The text backbone configuration. Defaults to None.
+ backbone (Optional["RBLNModelConfig"]): The backbone configuration. Defaults to None.
+ output_attentions (Optional[bool]): Whether to output attentions. Defaults to None.
+ output_hidden_states (Optional[bool]): Whether to output hidden states. Defaults to None.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
  Raises:
  ValueError: If batch_size is not a positive integer.
@@ -49,8 +55,8 @@ class RBLNGroundingDinoForObjectDetectionConfig(RBLNImageModelConfig):
  self.decoder = decoder
  self.text_backbone = text_backbone
  self.backbone = backbone
- self.output_attentions = output_attentions
- self.output_hidden_states = output_hidden_states
+ self.output_attentions = output_attentions if output_attentions is not None else False
+ self.output_hidden_states = output_hidden_states if output_hidden_states is not None else False
 
  if not isinstance(self.batch_size, int) or self.batch_size < 0:
  raise ValueError(f"batch_size must be a positive integer, got {self.batch_size}")
optimum/rbln/transformers/models/idefics3/configuration_idefics3.py
@@ -45,11 +45,15 @@ class RBLNIdefics3ForConditionalGenerationConfig(RBLNModelConfig):
  Args:
  batch_size (Optional[int]): The batch size for inference. Defaults to 1.
  vision_model (Optional[RBLNModelConfig]): Configuration for the vision transformer component.
+ This can include settings specific to the vision encoder, such as input resolution or other vision-related parameters.
+ If not provided, default settings will be used.
  text_model (Optional[RBLNModelConfig]): Configuration for the text model component.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ This can include settings specific to the language model, such as tensor parallelism or other text-related parameters.
+ If not provided, default settings will be used.
+ kwargs: Additional arguments passed to the parent `RBLNModelConfig`.
 
  Raises:
- ValueError: If batch_size is not a positive integer.
+ ValueError: If `batch_size` is not a positive integer.
  """
 
  super().__init__(**kwargs)
optimum/rbln/transformers/models/llava/configuration_llava.py
@@ -39,11 +39,15 @@ class RBLNLlavaForConditionalGenerationConfig(RBLNModelConfig):
  Args:
  batch_size (Optional[int]): The batch size for inference. Defaults to 1.
  vision_tower (Optional[RBLNModelConfig]): Configuration for the vision encoder component.
+ This can include settings specific to the vision encoder, such as input resolution or other vision-related parameters.
+ If not provided, default settings will be used.
  language_model (Optional[RBLNModelConfig]): Configuration for the language model component.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ This can include settings specific to the language model, such as tensor parallelism or other text-related parameters.
+ If not provided, default settings will be used.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
  Raises:
- ValueError: If batch_size is not a positive integer.
+ ValueError: If `batch_size` is not a positive integer.
  """
  super().__init__(**kwargs)
  self.batch_size = batch_size or 1
optimum/rbln/transformers/models/llava/modeling_llava.py
@@ -105,6 +105,7 @@ class RBLNLlavaForConditionalGeneration(RBLNModel):
  RBLNLlavaForConditionalGeneration is a multi-modal model that combines vision and language processing capabilities,
  optimized for RBLN NPUs. It is designed for conditional generation tasks that involve both image and text inputs.
  This model inherits from [`RBLNModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
+
  Important Note:
  This model includes a Large Language Model (LLM) as a submodule. For optimal performance, it is highly recommended to use
  tensor parallelism for the language model. This can be achieved by using the `rbln_config` parameter in the
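As the note above recommends, tensor parallelism for the LLM submodule is requested through the nested `language_model` entry of `rbln_config`. A hedged sketch; the model id and parallelism degree are illustrative, not part of this diff:

```python
from optimum.rbln import RBLNLlavaForConditionalGeneration

model = RBLNLlavaForConditionalGeneration.from_pretrained(
    "llava-hf/llava-1.5-7b-hf",                        # illustrative model id
    export=True,
    rbln_config={
        "language_model": {"tensor_parallel_size": 4}, # illustrative: run the LLM across 4 NPUs
    },
)
```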
optimum/rbln/transformers/models/llava_next/configuration_llava_next.py
@@ -45,10 +45,10 @@ class RBLNLlavaNextForConditionalGenerationConfig(RBLNModelConfig):
  batch_size (Optional[int]): The batch size for inference. Defaults to 1.
  vision_tower (Optional[RBLNModelConfig]): Configuration for the vision encoder component.
  language_model (Optional[RBLNModelConfig]): Configuration for the language model component.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
  Raises:
- ValueError: If batch_size is not a positive integer.
+ ValueError: If `batch_size` is not a positive integer.
  """
  super().__init__(**kwargs)
  self.batch_size = batch_size or 1
optimum/rbln/transformers/models/llava_next/modeling_llava_next.py
@@ -287,18 +287,15 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
  Obtains image last hidden states from the vision tower and apply multimodal projection.
 
  Args:
- pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`)
- The tensors corresponding to the input images.
- image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
- Actual image size of each images (H, W).
- vision_feature_layer (`int`):
- The index of the layer to select the vision feature.
- vision_feature_select_strategy (`str`):
- The feature selection strategy used to select the vision feature from the vision backbone.
+ pixel_values (torch.FloatTensor): The tensors corresponding to the input images
+ whose shape is `(batch_size, num_patches, channels, height, width)`.
+ image_sizes (torch.Tensor): Actual image size of each images (H, W).
+ vision_feature_layer (int): The index of the layer to select the vision feature.
+ vision_feature_select_strategy (str): The feature selection strategy used to select the vision feature from the vision backbone.
  Can be one of `"default"` or `"full"`
  Returns:
- image_features (List[`torch.Tensor`]): List of image feature tensor, each contains all the visual feature of all patches
- and are of shape `(num_patches, image_length, embed_dim)`).
+ image_features (List[torch.Tensor]): List of image feature tensor, each contains all the visual feature of all patches
+ and are of shape `(num_patches, image_length, embed_dim)`).
  """
  # ! infer image_num_patches from image_sizes
  image_num_patches = [
@@ -412,23 +409,19 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
 
  # Almost copied from : https://github.com/huggingface/transformers/blob/6b550462139655d488d4c663086a63e98713c6b9/src/transformers/models/llava_next/modeling_llava_next.py
  def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
- """
- Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
+ # Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
+
+ # Args:
+ # image_features (List[torch.Tensor]): List of image feature tensor, each contains all the visual feature of all patches.
+ # Its length is num_images, and each of shape is `(num_patches, image_length, embed_dim)`
+ # image_sizes (torch.Tensor): Actual image size of each images (H, W).
+ # vision_feature_select_strategy (str): The feature selection strategy used to select the vision feature from the vision backbone.
+ # image_newline (torch.Tensor): New line embedding vector whose shape is `embed_dim`.
+
+ # Returns:
+ # image_features (torch.Tensor): A torch.Tensor of shape `(all_feat_len, embed_dim)`)
+ # feature_lens (List[int]): A token length of each image in image_features
 
- Args:
- image_features (`List[torch.Tensor]` of length num_images, each of shape `(num_patches, image_length, embed_dim)`)
- List of image feature tensor, each contains all the visual feature of all patches.
- image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
- Actual image size of each images (H, W).
- vision_feature_select_strategy (`str`)
- The feature selection strategy used to select the vision feature from the vision backbone.
- image_newline (`torch.Tensor` of shape `(embed_dim)`)
- New line embedding vector.
- Returns:
- image_features (`torch.Tensor` of shape `(all_feat_len, embed_dim)`)
- feature_lens (`List[int]`)
- token length of each image in image_features
- """
  new_image_features = []
  feature_lens = []
  for image_idx, image_feature in enumerate(image_features):
@@ -478,21 +471,17 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
 
 
  # Almost copied from : https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/llava_next/modeling_llava_next.py
  def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
- """
- Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
-
- Args:
- image_size (`tuple`):
- The size of the input image in the format (width, height).
- grid_pinpoints (`List`):
- A list containing possible resolutions. Each item in the list should be a tuple or list
- of the form `(height, width)`.
- patch_size (`int`):
- The size of each image patch.
-
- Returns:
- tuple: The shape of the image patch grid in the format (width, height).
- """
+ # Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
+
+ # Args:
+ # image_size (tuple): The size of the input image in the format (width, height).
+ # grid_pinpoints (list): A list containing possible resolutions.
+ # Each item in the list should be a tuple or list of the form `(height, width)`.
+ # patch_size (int): The size of each image patch.
+
+ # Returns:
+ # tuple: The shape of the image patch grid in the format (width, height).
+
  if not isinstance(grid_pinpoints, list):
  raise TypeError("grid_pinpoints should be a list of tuples or lists")
@@ -510,18 +499,15 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
 
  # Almost copied from : https://github.com/huggingface/transformers/blob/1feebb5b4150882deabddd190a541f336f3be817/src/transformers/models/llava_next/modeling_llava_next.py#L115C1-L152C1
  def unpad_image(tensor, original_size):
- """
- Unpads a PyTorch tensor of a padded and resized image.
+ # Unpads a PyTorch tensor of a padded and resized image.
 
- Args:
- tensor (`torch.Tensor`):
- The image tensor, assumed to be of shape (num_channels, height, width).
- original_size (`tuple`):
- The original size of the image (height, width).
+ # Args:
+ # tensor (torch.Tensor): The image tensor, assumed to be of shape (num_channels, height, width).
+ # original_size (tuple): The original size of the image (height, width).
+
+ # Returns:
+ # (torch.Tensor): The unpadded image tensor.
 
- Returns:
- `torch.Tensor`: The unpadded image tensor.
- """
  if not isinstance(original_size, (list, tuple)):
  if not isinstance(original_size, (torch.Tensor, np.ndarray)):
  raise TypeError(
@@ -550,22 +536,19 @@ def unpad_image(tensor, original_size):
 
  # Almost copied from : https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/llava_next/modeling_llava_next.py
  def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
- """
- Selects the best resolution from a list of possible resolutions based on the original size.
+ # Selects the best resolution from a list of possible resolutions based on the original size.
 
- This is done by calculating the effective and wasted resolution for each possible resolution.
+ # This is done by calculating the effective and wasted resolution for each possible resolution.
 
- The best fit resolution is the one that maximizes the effective resolution and minimizes the wasted resolution.
+ # The best fit resolution is the one that maximizes the effective resolution and minimizes the wasted resolution.
 
- Args:
- original_size (tuple):
- The original size of the image in the format (height, width).
- possible_resolutions (list):
- A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].
+ # Args:
+ # original_size (tuple): The original size of the image in the format (height, width).
+ # possible_resolutions (List(tuple)): A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].
+
+ # Returns:
+ # (tuple): The best fit resolution in the format (height, width).
 
- Returns:
- tuple: The best fit resolution in the format (height, width).
- """
  original_height, original_width = original_size
  best_fit = None
  max_effective_resolution = 0
@@ -589,21 +572,17 @@ def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
 
 
  # Almost copied from : https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/llava_next/modeling_llava_next.py
  def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int):
- """
- Calculate the number of patches after the preprocessing for images of any resolution.
-
- Args:
- image_size (`torch.LongTensor` or `np.ndarray` or `Tuple[int, int]`):
- The size of the input image in the format (height, width). ?
- grid_pinpoints (`List`):
- A list containing possible resolutions. Each item in the list should be a tuple or list
- of the form `(height, width)`.
- patch_size (`int`):
- The size of each image patch.
-
- Returns:
- int: the number of patches
- """
+ # Calculate the number of patches after the preprocessing for images of any resolution.
+
+ # Args:
+ # image_size (Union[torch.LongTensor, np.ndarray, Tuple[int, int]): The size of the input image in the format (height, width).
+ # grid_pinpoints (list): A list containing possible resolutions.
+ # Each item in the list should be a tuple or list of the form `(height, width)`.
+ # patch_size (int): The size of each image patch.
+
+ # Returns:
+ # (int): the number of patches.
+
  if not isinstance(grid_pinpoints, list):
  raise TypeError("grid_pinpoints should be a list of tuples or lists")
optimum/rbln/transformers/models/midm/modeling_midm.py
@@ -13,11 +13,13 @@
  # limitations under the License.
 
  import inspect
- from typing import Any, Callable
+ from pathlib import Path
+ from typing import Any, Callable, Dict, Optional, Union
 
  from transformers import AutoModelForCausalLM
  from transformers.generation.utils import GenerationMixin
 
+ from ....configuration_utils import RBLNModelConfig
  from ....utils import logging
  from ..decoderonly import RBLNDecoderOnlyModelForCausalLM
  from .midm_architecture import MidmLMHeadModelWrapper
@@ -91,9 +93,45 @@ class RBLNMidmLMHeadModel(RBLNDecoderOnlyModelForCausalLM):
  _supports_cache_class = True
 
  @classmethod
- def from_pretrained(cls, *args, **kwargs):
- kwargs.setdefault("trust_remote_code", True)
- return super().from_pretrained(*args, **kwargs)
+ def from_pretrained(
+ cls,
+ model_id: Union[str, Path],
+ *,
+ export: Optional[bool] = None,
+ rbln_config: Optional[Union[Dict, RBLNModelConfig]] = None,
+ trust_remote_code: Optional[bool] = None,
+ **kwargs: Any,
+ ):
+ """
+ The `from_pretrained()` function is utilized in its standard form as in the HuggingFace transformers library.
+ User can use this function to load a pre-trained model from the HuggingFace library and convert it to a RBLN model to be run on RBLN NPUs.
+
+ Args:
+ model_id (Union[str, Path]): The model id of the pre-trained model to be loaded.
+ It can be downloaded from the HuggingFace model hub or a local path, or a model id of a compiled model using the RBLN Compiler.
+ export (Optional[bool]): A boolean flag to indicate whether the model should be compiled.
+ If None, it will be determined based on the existence of the compiled model files in the model_id.
+ rbln_config (Optional[Union[Dict, RBLNModelConfig]]): Configuration for RBLN model compilation and runtime.
+ This can be provided as a dictionary or an instance of the model's configuration class (e.g., `RBLNMidmLMHeadModelConfig` for Mi:dm models).
+ For detailed configuration options, see the specific model's configuration class documentation.
+ trust_remote_code (bool): Whether or not to trust the remote code when loading a model from the Hub.
+ kwargs: Additional keyword arguments. Arguments with the prefix `rbln_` are passed to rbln_config, while the remaining arguments are passed to the HuggingFace library.
+
+ Returns:
+ (RBLNModel): A RBLN model instance ready for inference on RBLN NPU devices.
+ """
+
+ if trust_remote_code is not None:
+ kwargs["trust_remote_code"] = trust_remote_code
+ elif "trust_remote_code" not in kwargs:
+ kwargs["trust_remote_code"] = True
+
+ return super().from_pretrained(
+ model_id=model_id,
+ export=export,
+ rbln_config=rbln_config,
+ **kwargs,
+ )
 
  def __getattr__(self, __name: str) -> Any:
  def redirect(func):
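`RBLNMidmLMHeadModel.from_pretrained()` gains the same explicit signature as the EXAONE variant above. A short sketch of reloading an already-compiled model; the local path is hypothetical:

```python
from optimum.rbln import RBLNMidmLMHeadModel

# Load a previously compiled model directory; export=False skips recompilation,
# and trust_remote_code still defaults to True when not supplied.
model = RBLNMidmLMHeadModel.from_pretrained("./midm-rbln", export=False)
```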
optimum/rbln/transformers/models/pixtral/configuration_pixtral.py
@@ -29,7 +29,7 @@ class RBLNPixtralVisionModelConfig(RBLNModelConfig):
  Args:
  max_image_size (Tuple): The size of max input images. A tuple (max_height, max_width)
  batch_size (Optional[int]): The batch size for image processing. Defaults to 1.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
  Raises:
  ValueError: If batch_size is not a positive integer.
optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py
@@ -31,10 +31,22 @@ class RBLNQwen2_5_VLForConditionalGenerationConfig(RBLNDecoderOnlyModelForCausal
 
  def __init__(
  self,
- visual: Optional[RBLNModelConfig] = None,
  use_inputs_embeds: bool = True,
+ visual: Optional[RBLNModelConfig] = None,
  **kwargs: Any,
  ):
+ """
+ Args:
+ use_inputs_embeds (bool): Whether or not to use `inputs_embeds` as input. Defaults to `True`.
+ visual (Optional[RBLNModelConfig]): Configuration for the vision encoder component.
+ kwargs: Additional arguments passed to the parent `RBLNDecoderOnlyModelForCausalLMConfig`.
+
+ Raises:
+ ValueError: If `use_inputs_embeds` is False.
+ ValueError: If the visual configuration is provided but contains invalid settings, such as an invalid max_seq_lens (e.g., not a positive integer, not a multiple of the window-based attention unit, or insufficient for the expected resolution).
+ ValueError: If visual is None and no default vision configuration can be inferred for the model architecture.
+ ValueError: If any inherited parameters violate constraints defined in the parent class, such as batch_size not being a positive integer, prefill_chunk_size not being divisible by 64, or max_seq_len not meeting requirements for Flash Attention.
+ """
  super().__init__(use_inputs_embeds=use_inputs_embeds, **kwargs)
  if not self.use_inputs_embeds:
  raise ValueError(
@@ -66,10 +78,13 @@ class RBLNQwen2_5_VisionTransformerPretrainedModelConfig(RBLNModelConfig):
  making 256 (64 * 4) valid. RBLN optimization runs inference per image or video
  frame, so set `max_seq_len` to match the maximum expected resolution to reduce
  computation. If not provided, a `ValueError` is raised.
- **kwargs: Additional arguments passed to the parent RBLNModelConfig.
+ kwargs: Additional arguments passed to the parent RBLNModelConfig.
 
  Raises:
- ValueError: If batch_size is not a positive integer.
+ ValueError: If `max_seq_lens` is None or not provided.
+ ValueError: If `max_seq_lens` (or any value in the list) is not a positive integer.
+ ValueError: If `max_seq_lens` is not a multiple of (window_size / patch_size)^2 for window-based attention, or is insufficient for the expected image/video resolution.
+ ValueError: If `batch_size` (inherited from RBLNModelConfig) is not a positive integer.
 
  Max Seq Lens:
  Since `Qwen2_5_VLForConditionalGeneration` performs inference on a per-image or per-frame basis,
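Tying the two Qwen2.5-VL hunks together: `use_inputs_embeds` must stay `True`, and the vision encoder's `max_seq_lens` must be a positive multiple of `(window_size / patch_size) ** 2` that covers the largest expected image or video frame. A hedged sketch; the model id, `max_seq_lens`, and parallelism values are illustrative and not taken from this diff:

```python
from optimum.rbln import RBLNQwen2_5_VLForConditionalGeneration

model = RBLNQwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct",         # illustrative model id
    export=True,
    rbln_config={
        "visual": {"max_seq_lens": 6400},  # illustrative: multiple of (window_size / patch_size) ** 2
        "tensor_parallel_size": 4,         # illustrative
        "max_seq_len": 32768,              # illustrative
    },
)
```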