optimum-rbln 0.1.12__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. optimum/rbln/__init__.py +5 -1
  2. optimum/rbln/__version__.py +1 -1
  3. optimum/rbln/diffusers/models/autoencoder_kl.py +30 -61
  4. optimum/rbln/diffusers/models/controlnet.py +36 -56
  5. optimum/rbln/diffusers/models/unet_2d_condition.py +57 -153
  6. optimum/rbln/diffusers/pipelines/__init__.py +40 -12
  7. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +7 -0
  8. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +9 -185
  9. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +8 -190
  10. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +8 -191
  11. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +8 -192
  12. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +3 -110
  13. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +12 -115
  14. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +4 -122
  15. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +4 -125
  16. optimum/rbln/modeling_base.py +12 -5
  17. optimum/rbln/modeling_diffusers.py +400 -0
  18. optimum/rbln/transformers/__init__.py +2 -0
  19. optimum/rbln/transformers/cache_utils.py +5 -9
  20. optimum/rbln/transformers/modeling_rope_utils.py +283 -0
  21. optimum/rbln/transformers/models/__init__.py +80 -31
  22. optimum/rbln/transformers/models/clip/modeling_clip.py +13 -22
  23. optimum/rbln/transformers/models/decoderonly/__init__.py +0 -2
  24. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +376 -218
  25. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +74 -16
  26. optimum/rbln/transformers/models/exaone/exaone_architecture.py +18 -9
  27. optimum/rbln/transformers/models/exaone/modeling_exaone.py +4 -29
  28. optimum/rbln/transformers/models/gemma/gemma_architecture.py +12 -2
  29. optimum/rbln/transformers/models/gemma/modeling_gemma.py +4 -28
  30. optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +4 -30
  31. optimum/rbln/transformers/models/llama/modeling_llama.py +4 -28
  32. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +27 -8
  33. optimum/rbln/transformers/models/midm/midm_architecture.py +4 -15
  34. optimum/rbln/transformers/models/midm/modeling_midm.py +4 -29
  35. optimum/rbln/transformers/models/mistral/modeling_mistral.py +4 -29
  36. optimum/rbln/transformers/models/phi/modeling_phi.py +5 -31
  37. optimum/rbln/transformers/models/phi/phi_architecture.py +75 -159
  38. optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +5 -29
  39. optimum/rbln/transformers/models/t5/__init__.py +1 -1
  40. optimum/rbln/transformers/models/t5/modeling_t5.py +57 -4
  41. optimum/rbln/transformers/models/whisper/modeling_whisper.py +1 -1
  42. optimum/rbln/transformers/utils/rbln_quantization.py +8 -2
  43. optimum/rbln/utils/context.py +58 -0
  44. optimum/rbln/utils/decorator_utils.py +55 -0
  45. optimum/rbln/utils/import_utils.py +7 -0
  46. optimum/rbln/utils/runtime_utils.py +4 -4
  47. optimum/rbln/utils/timer_utils.py +2 -2
  48. {optimum_rbln-0.1.12.dist-info → optimum_rbln-0.1.13.dist-info}/METADATA +8 -7
  49. {optimum_rbln-0.1.12.dist-info → optimum_rbln-0.1.13.dist-info}/RECORD +52 -48
  50. {optimum_rbln-0.1.12.dist-info → optimum_rbln-0.1.13.dist-info}/WHEEL +0 -0
  51. {optimum_rbln-0.1.12.dist-info → optimum_rbln-0.1.13.dist-info}/entry_points.txt +0 -0
  52. {optimum_rbln-0.1.12.dist-info → optimum_rbln-0.1.13.dist-info}/licenses/LICENSE +0 -0
@@ -22,14 +22,15 @@
  # from Rebellions Inc.
  import functools
  import glob
+ import inspect
  import os
- from abc import ABC
  from dataclasses import dataclass
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

- import rebel  # noqa: F401
- import torch  # noqa: F401
+ import rebel
+ import torch
+ import transformers
  from safetensors.torch import load_file
  from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel
  from transformers.modeling_utils import no_init_weights
@@ -40,6 +41,7 @@ from ....modeling_config import DEFAULT_COMPILED_MODEL_NAME, RBLNCompileConfig,
  from ....utils.logging import get_logger
  from ....utils.runtime_utils import RBLNPytorchRuntime
  from ....utils.timer_utils import rbln_timer
+ from .decoderonly_architecture import DecoderOnlyWrapper


  logger = get_logger()
@@ -102,19 +104,47 @@ class RBLNDecoderOnlyOutput(ModelOutput):
      generate_idx: torch.Tensor = None


- class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
+ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
      """
-     The DecoderOnly Model transformer with a language modeling head (linear layer) on top.
-     This model inherits from [`RBLNMultiModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
-
-     A class to convert and run pre-trained transformers based DecoderOnlyForCausalLM model on RBLN devices.
-     It implements the methods to convert a pre-trained transformers DecoderOnlyForCausalLM model into a RBLN transformer model by:
-     - transferring the checkpoint weights of the original into an optimized RBLN graph,
-     - compiling the resulting graph using the RBLN compiler.
+     A base class for decoder-only transformer models optimized for causal language modeling tasks on RBLN devices.
+     This class serves as the foundation for various decoder-only architectures like GPT, LLaMA, etc.
+
+     The class provides core functionality for:
+     1. Converting pre-trained transformer models to RBLN-optimized format
+     2. Handling the compilation process for RBLN devices
+     3. Managing inference operations for causal language modeling
+
+     This class inherits from RBLNModel and implements specific methods required for
+     decoder-only architectures and causal language modeling tasks.
+
+     Note:
+         - This class is designed to be subclassed by specific model implementations
+           (e.g., RBLNLlamaForCausalLM, RBLNGPT2LMHeadModel)
+         - Subclasses should implement model-specific conversion logic.
+         - The class handles RBLN-specific optimizations automatically during compilation
      """

      main_input_name = "input_ids"
      auto_model_class = AutoModelForCausalLM
+     _decoder_wrapper_cls = DecoderOnlyWrapper
+     _original_cls = None
+
+     @classmethod
+     @property
+     def original_cls(cls):
+         """
+         Lazily loads and caches the corresponding Hugging Face model class.
+         Removes 'RBLN' prefix from the class name to get the original class name
+         (e.g., RBLNLlamaForCausalLM -> LlamaForCausalLM) and imports it from
+         the transformers module.
+
+         Returns:
+             type: The original Hugging Face model class
+         """
+         if cls._original_cls is None:
+             hf_original_cls_name = cls.__name__[4:]
+             cls._original_cls = getattr(transformers, hf_original_cls_name)
+         return cls._original_cls

      def __post_init__(self, **kwargs):
          self.batch_size = self.rbln_config.model_cfg["batch_size"]
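The new `original_cls` hook resolves the backing Hugging Face class from the subclass name on first use. Below is a standalone sketch of that lookup (illustrative only, not part of the diff); it uses a plain classmethod instead of the stacked `@classmethod`/`@property` so the snippet stays portable across Python versions:

# Sketch of the lazy Hugging Face class lookup; names mirror the hunk above but
# this mixin and the demo subclass are made up for illustration.
import transformers


class _LazyOriginalClassMixin:
    _original_cls = None

    @classmethod
    def get_original_cls(cls):
        # Strip the "RBLN" prefix and resolve the class from transformers,
        # caching the result on the class for later lookups.
        if cls._original_cls is None:
            cls._original_cls = getattr(transformers, cls.__name__[len("RBLN"):])
        return cls._original_cls


class RBLNLlamaForCausalLM(_LazyOriginalClassMixin):
    """Stand-in with the same name shape as the real optimum-rbln class."""


assert RBLNLlamaForCausalLM.get_original_cls() is transformers.LlamaForCausalLM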
@@ -233,6 +263,26 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):

          return 0

+     def __getattr__(self, __name: str) -> Any:
+         """
+         Special method to delegate attribute access to the original Huggingface LM class.
+         This method is called when an attribute is not found in the current instance's dictionary.
+         It enables transparent access to the original model's attributes and methods while maintaining
+         proper method binding.
+
+         The method implements a delegation pattern that:
+         1. For methods: Creates a wrapper that properly binds 'self' to method calls
+         2. For other attributes: Returns them directly from the original class
+         """
+
+         def redirect(func):
+             return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
+
+         val = getattr(self.original_cls, __name)
+         if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
+             return redirect(val)
+         return val
+
      @classmethod
      def get_pytorch_model(cls, *args, **kwargs) -> "PreTrainedModel":
          rbln_kwargs = kwargs.get("rbln_kwargs", {})
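This shared `__getattr__` replaces the per-model copies that later hunks delete. A self-contained sketch of the same delegation pattern, with made-up classes, shows the rebinding behavior:

# Unknown attributes are looked up on a "source" class; functions that take
# `self` are re-bound to the delegating instance. Source/Delegator are illustrative.
import inspect
from typing import Any, Callable


class Source:
    greeting = "hello"

    def describe(self, name: str) -> str:
        return f"{self.greeting}, {name}"


class Delegator:
    _original_cls = Source

    def __getattr__(self, name: str) -> Any:
        val = getattr(self._original_cls, name)  # raises AttributeError if missing
        if isinstance(val, Callable) and "self" in inspect.signature(val).parameters:
            # Bind the *delegating* instance as `self` for the borrowed function.
            return lambda *args, **kwargs: val(self, *args, **kwargs)
        return val


d = Delegator()
d.greeting = "hi"          # instance attribute, found without __getattr__
print(d.describe("rbln"))  # "hi, rbln" -- Source.describe runs with d as self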
@@ -293,6 +343,16 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):

          return wrapper

+     @classmethod
+     def wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
+         wrapper_cfg = {"max_seq_len": rbln_config.model_cfg["max_seq_len"]}
+
+         # If the model wrapper supports rbln-custom-flash-attention
+         if "kvcache_partition_len" in inspect.signature(cls._decoder_wrapper_cls.__init__).parameters:
+             wrapper_cfg["kvcache_partition_len"] = rbln_config.model_cfg.get("kvcache_partition_len")
+
+         return cls._decoder_wrapper_cls(model, **wrapper_cfg).eval()
+
      @classmethod
      @torch.inference_mode()
      def get_compiled_model(cls, model: "PreTrainedModel", rbln_config: RBLNConfig):
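The base `wrap_model_if_needed` probes the wrapper's `__init__` signature before forwarding `kvcache_partition_len`, so wrappers that predate rbln flash attention keep working. A minimal sketch of that capability check, with stand-in wrapper classes (not optimum-rbln code):

import inspect


class LegacyWrapper:
    def __init__(self, model, max_seq_len):
        self.model, self.max_seq_len = model, max_seq_len


class FlashAttnWrapper:
    def __init__(self, model, max_seq_len, kvcache_partition_len=None):
        self.model, self.max_seq_len = model, max_seq_len
        self.kvcache_partition_len = kvcache_partition_len


def build_wrapper(wrapper_cls, model, model_cfg: dict):
    # Only forward the optional kwarg if the target __init__ declares it.
    cfg = {"max_seq_len": model_cfg["max_seq_len"]}
    if "kvcache_partition_len" in inspect.signature(wrapper_cls.__init__).parameters:
        cfg["kvcache_partition_len"] = model_cfg.get("kvcache_partition_len")
    return wrapper_cls(model, **cfg)


cfg = {"max_seq_len": 4096, "kvcache_partition_len": 1024}
assert not hasattr(build_wrapper(LegacyWrapper, object(), cfg), "kvcache_partition_len")
assert build_wrapper(FlashAttnWrapper, object(), cfg).kvcache_partition_len == 1024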
@@ -305,11 +365,9 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel, ABC):
          @rbln_timer("JIT trace")
          def get_scripted_model():
              # This function is nested to dealloc the example inputs before compilation.
+             # FIXME: 3rd dummy_input(batch_idx) should be fill zero to compile flash_attn.
              prefill_example_inputs = prefill_rbln_compile_config.get_dummy_inputs(fill=0)
-             dec_example_inputs = dec_rbln_compile_config.get_dummy_inputs(fill=4)
-
-             batch_index = 3
-             dec_example_inputs[batch_index].fill_(-1)  # fill batch_position -1 to indicate it is decoder.
+             dec_example_inputs = dec_rbln_compile_config.get_dummy_inputs(fill=0)

              prefill_scripted_model = torch.jit.trace(
                  wrapped_model, prefill_example_inputs, check_trace=False, _store_inputs=False
@@ -20,31 +20,40 @@
  # are the intellectual property of Rebellions Inc. and may not be
  # copied, modified, or distributed without prior written permission
  # from Rebellions Inc.
+ import torch

-
+ from ....utils import logging
  from ...models.decoderonly import (
      DecoderOnlyAttention,
      DecoderOnlyDecoderLayer,
      DecoderOnlyModel,
      DecoderOnlyWrapper,
+     RotaryEmbedding,
  )


+ logger = logging.get_logger(__name__)
+
+
  class ExaoneForCausalLMWrapper(DecoderOnlyWrapper):
      """A wrapper class for the Exaone model with a language modeling head."""

-     def __init__(self, model, max_seq_len):
+     def __init__(self, model, max_seq_len, kvcache_partition_len=None):
          super(DecoderOnlyWrapper, self).__init__()
          self.config = model.config
          self.model = self.convert_attribute_name(model.transformer)
          self.lm_head = model.lm_head
-         self.head_dim = self.config.hidden_size // self.config.num_attention_heads
-         self.max_position_embeddings = (
-             self.config.max_position_embeddings if max_seq_len > self.config.max_position_embeddings else max_seq_len
-         )
-         self.max_seq_len = max_seq_len
-         self.rope_scaling = getattr(self.config, "rope_scaling", None)
-         self.rotary_emb = self._init_rope()
+         self.rotary_emb = RotaryEmbedding(config=self.config, max_seq_len_cached=max_seq_len)
+
+         if kvcache_partition_len is not None:
+             # WORKAROUND : for passing partition length as a value to the rbln compiler.
+             # What is actually used is the shape of this tensor.
+             self.kvcache_partition_size = torch.zeros(kvcache_partition_len, dtype=torch.int32)
+             self.attn_implementation = "flash_attn_rbln"
+             logger.info(f"Using rbln-flash-attention. (partition length : {kvcache_partition_len})")
+         else:
+             self.kvcache_partition_size = None
+             self.attn_implementation = "eager"

      @staticmethod
      def convert_attribute_name(model):
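The `kvcache_partition_size` tensor above exists only so its shape reaches the compiler, per the WORKAROUND comment. A generic illustration of that trick using plain PyTorch tracing (not the RBLN compiler; the module and names are made up):

import torch


class PartitionedModule(torch.nn.Module):
    def __init__(self, partition_len: int):
        super().__init__()
        # The values are never read; only the shape carries information.
        self.register_buffer("kvcache_partition_size", torch.zeros(partition_len, dtype=torch.int32))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        partition_len = self.kvcache_partition_size.shape[0]  # becomes a constant at trace time
        return x.reshape(-1, partition_len).sum(dim=-1)


traced = torch.jit.trace(PartitionedModule(4), torch.arange(8.0))
print(traced(torch.arange(8.0)))  # tensor([ 6., 22.])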
@@ -21,21 +21,13 @@
  # copied, modified, or distributed without prior written permission
  # from Rebellions Inc.

- import inspect
- import logging
- from typing import TYPE_CHECKING, Any, Callable
-
- from ....modeling_config import RBLNConfig
+ from ....utils import logging
  from ..decoderonly import RBLNDecoderOnlyModelForCausalLM
  from .exaone_architecture import ExaoneForCausalLMWrapper
  from .hf_hub_cached.modeling_exaone import ExaoneForCausalLM


- logger = logging.getLogger(__name__)
- if TYPE_CHECKING:
-     from transformers import (
-         PreTrainedModel,
-     )
+ logger = logging.get_logger(__name__)


  class RBLNExaoneForCausalLM(RBLNDecoderOnlyModelForCausalLM):
@@ -52,25 +44,8 @@ class RBLNExaoneForCausalLM(RBLNDecoderOnlyModelForCausalLM):

      """

-     @classmethod
-     def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
-         rbln_max_seq_len = rbln_config.model_cfg["max_seq_len"]
-         return ExaoneForCausalLMWrapper(model, rbln_max_seq_len).eval()
-
-     def __getattr__(self, __name: str) -> Any:
-         """This is the key method to implement RBLN-Exaone.
-
-         Returns:
-             Any: Exaone's corresponding method
-         """
-
-         def redirect(func):
-             return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
-
-         val = getattr(ExaoneForCausalLM, __name)
-         if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
-             return redirect(val)
-         return val
+     _decoder_wrapper_cls = ExaoneForCausalLMWrapper
+     _original_cls = ExaoneForCausalLM

      @classmethod
      def from_pretrained(cls, *args, **kwargs):
@@ -29,11 +29,11 @@ from transformers.modeling_outputs import (
  )

  from ...models.decoderonly import (
-     DecoderOnlyAttention,
      DecoderOnlyDecoderLayer,
      DecoderOnlyWrapper,
      slice_and_unsqueeze_cos_sin,
  )
+ from ...models.decoderonly.decoderonly_architecture import DECODERONLY_ATTENTION_CLASSES


  class GemmaWrapper(DecoderOnlyWrapper):
@@ -43,7 +43,7 @@ class GemmaWrapper(DecoderOnlyWrapper):
              {
                  "wrapper": GemmaModel.forward,
                  "model": DecoderOnlyDecoderLayer.forward,
-                 "decoder_layer": DecoderOnlyAttention.forward,
+                 "decoder_layer": DECODERONLY_ATTENTION_CLASSES[self.attn_implementation].forward,
              }
          )
          return forward_dict
@@ -61,9 +61,17 @@ class GemmaModel:
          use_cache: Optional[bool] = True,
          output_attentions: Optional[bool] = False,
          output_hidden_states: Optional[bool] = False,
+         cache_pos_for_partitions: Optional[torch.Tensor] = None,
+         kvcache_partition_size: Optional[torch.Tensor] = None,
          forward_dict: Optional[Dict[str, classmethod]] = None,
          rotary_pos_emb=None,
      ) -> Union[Tuple, BaseModelOutputWithPast]:
+         # retrieve input_ids and inputs_embeds
+         if (input_ids is None) ^ (inputs_embeds is not None):
+             raise ValueError(
+                 "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+             )
+
          # embed positions
          inputs_embeds = self.embed_tokens(input_ids)
          hidden_states = inputs_embeds
@@ -96,6 +104,8 @@ class GemmaModel:
                  batch_ids=batch_ids,
                  cos=cos,
                  sin=sin,
+                 cache_pos_for_partitions=cache_pos_for_partitions,
+                 kvcache_partition_size=kvcache_partition_size,
                  forward_dict=forward_dict,
              )

@@ -21,28 +21,18 @@
  # copied, modified, or distributed without prior written permission
  # from Rebellions Inc.

- import inspect
- import logging
- from typing import TYPE_CHECKING, Any, Callable
-
- from transformers import GemmaForCausalLM
-
+ from ....utils import logging
  from ...models.decoderonly import RBLNDecoderOnlyModelForCausalLM
  from .gemma_architecture import GemmaWrapper


- if TYPE_CHECKING:
-     from transformers import PreTrainedModel
-
-     from ....modeling_config import RBLNConfig
-
- logger = logging.getLogger(__name__)
+ logger = logging.get_logger(__name__)


  class RBLNGemmaForCausalLM(RBLNDecoderOnlyModelForCausalLM):
      """
      The Gemma Model transformer with a language modeling head (linear layer) on top.
-     This model inherits from [`RBLNMultiModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
+     This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the library implements for all its models.

      A class to convert and run pre-trained transformers based GemmaForCausalLM model on RBLN devices.
      It implements the methods to convert a pre-trained transformers GemmaForCausalLM model into a RBLN transformer model by:
@@ -50,18 +40,4 @@ class RBLNGemmaForCausalLM(RBLNDecoderOnlyModelForCausalLM):
      - compiling the resulting graph using the RBLN compiler.
      """

-     @classmethod
-     def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
-         rbln_max_seq_len = rbln_config.model_cfg["max_seq_len"]
-         return GemmaWrapper(model, rbln_max_seq_len).eval()
-
-     def __getattr__(self, __name: str) -> Any:
-         def redirect(func):
-             return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
-
-         val = getattr(GemmaForCausalLM, __name)
-
-         if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
-             return redirect(val)
-
-         return val
+     _decoder_wrapper_cls = GemmaWrapper
@@ -21,20 +21,12 @@
  # copied, modified, or distributed without prior written permission
  # from Rebellions Inc.

- import inspect
- import logging
- from typing import TYPE_CHECKING, Any, Callable
-
- from transformers import GPT2LMHeadModel
-
- from ....modeling_config import RBLNConfig
+ from ....utils import logging
  from ...models.decoderonly import RBLNDecoderOnlyModelForCausalLM
  from .gpt2_architecture import GPT2LMHeadModelWrapper


- logger = logging.getLogger(__name__)
- if TYPE_CHECKING:
-     from transformers import PreTrainedModel
+ logger = logging.get_logger(__name__)


  class RBLNGPT2LMHeadModel(RBLNDecoderOnlyModelForCausalLM):
@@ -42,7 +34,7 @@ class RBLNGPT2LMHeadModel(RBLNDecoderOnlyModelForCausalLM):
      The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
      embeddings).

-     This model inherits from [`RBLNMultiModel`]. Check the superclass documentation for the generic methods the
+     This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the
      library implements for all its model.

      It implements the methods to convert a pre-trained transformers GPT2 model into a RBLN transformer model by:
@@ -51,22 +43,4 @@ class RBLNGPT2LMHeadModel(RBLNDecoderOnlyModelForCausalLM):

      """

-     @classmethod
-     def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
-         rbln_max_seq_len = rbln_config.model_cfg["max_seq_len"]
-         return GPT2LMHeadModelWrapper(model, rbln_max_seq_len).eval()
-
-     def __getattr__(self, __name: str) -> Any:
-         """This is the key method to implement RBLN-GPT2.
-
-         Returns:
-             Any: GPT2's corresponding method
-         """
-
-         def redirect(func):
-             return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
-
-         val = getattr(GPT2LMHeadModel, __name)
-         if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
-             return redirect(val)
-         return val
+     _decoder_wrapper_cls = GPT2LMHeadModelWrapper
@@ -21,28 +21,18 @@
  # copied, modified, or distributed without prior written permission
  # from Rebellions Inc.

- import inspect
- import logging
- from typing import TYPE_CHECKING, Any, Callable
-
- from transformers import LlamaForCausalLM
-
+ from ....utils import logging
  from ...models.decoderonly import RBLNDecoderOnlyModelForCausalLM
  from .llama_architecture import LlamaWrapper


- if TYPE_CHECKING:
-     from transformers import PreTrainedModel
-
-     from ....modeling_config import RBLNConfig
-
- logger = logging.getLogger(__name__)
+ logger = logging.get_logger(__name__)


  class RBLNLlamaForCausalLM(RBLNDecoderOnlyModelForCausalLM):
      """
      The Llama Model transformer with a language modeling head (linear layer) on top.
-     This model inherits from [`RBLNMultiModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
+     This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the library implements for all its models.

      A class to convert and run pre-trained transformers based LlamaForCausalLM model on RBLN devices.
      It implements the methods to convert a pre-trained transformers LlamaForCausalLM model into a RBLN transformer model by:
@@ -50,18 +40,4 @@ class RBLNLlamaForCausalLM(RBLNDecoderOnlyModelForCausalLM):
      - compiling the resulting graph using the RBLN compiler.
      """

-     @classmethod
-     def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
-         rbln_max_seq_len = rbln_config.model_cfg["max_seq_len"]
-         return LlamaWrapper(model, rbln_max_seq_len).eval()
-
-     def __getattr__(self, __name: str) -> Any:
-         def redirect(func):
-             return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
-
-         val = getattr(LlamaForCausalLM, __name)
-
-         if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
-             return redirect(val)
-
-         return val
+     _decoder_wrapper_cls = LlamaWrapper
@@ -350,9 +350,22 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
          is_prefill_phase = not generate_idx.bool().all()

          if is_prefill_phase:
+             # if the number of image tokens is more than image embeddings seq length, then prob we expanded it in processing
+             # not very reliable, but we don't expect one to actually pass 500+ images for one prompt
+             # In case we're in decoding stage, legacy behavior is checked by presence of pixel values even if use_cache=True
+             legacy_processing = (
+                 (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
+             ) or (input_ids.shape[-1] == 1 and pixel_values is not None)
+
              # Get the number of images in the prompt
              special_image_token_masks = [input_id == self.config.image_token_index for input_id in input_ids]
-             num_special_image_tokens = [torch.sum(mask, dim=-1) for mask in special_image_token_masks]
+             if legacy_processing:
+                 num_special_image_tokens = [torch.sum(mask, dim=-1) for mask in special_image_token_masks]
+             else:
+                 image_tokens_masks_diff = [
+                     torch.diff(mask, prepend=torch.tensor([0])) for mask in special_image_token_masks
+                 ]
+                 num_special_image_tokens = [int(torch.sum((diff == 1).int())) for diff in image_tokens_masks_diff]

              # Split images for each prompt
              if pixel_values is not None and pixel_values.size(0) > 0:
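The non-legacy branch above counts images by rising edges in the image-token mask rather than by summing tokens, since each image may already be expanded into a run of placeholder tokens. A standalone illustration with made-up token ids:

import torch

IMAGE_TOKEN_INDEX = 32000  # illustrative value
input_id = torch.tensor([1, 32000, 32000, 32000, 9, 32000, 32000, 7])

mask = (input_id == IMAGE_TOKEN_INDEX).int()
diff = torch.diff(mask, prepend=torch.tensor([0]))
num_images = int(torch.sum((diff == 1).int()))  # count 0 -> 1 transitions

print(int(mask.sum()))  # 5 image *tokens* ...
print(num_images)       # ... but only 2 images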
@@ -370,13 +383,19 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
                  image_features, feature_lens = self.image_embedding(
                      image_sizes[b_idx], pixel_values[b_idx], vision_feature_layer, vision_feature_select_strategy
                  )
-                 inputs_embed, _, _, _, _ = self._merge_input_ids_with_image_features(
-                     image_features,
-                     feature_lens,
-                     inputs_embed.to(image_features.dtype),
-                     input_id,
-                     torch.ones_like(input_id, dtype=torch.long),
-                 )
+                 if legacy_processing:
+                     inputs_embed, _, _, _, _ = self._merge_input_ids_with_image_features(
+                         image_features,
+                         feature_lens,
+                         inputs_embed.to(image_features.dtype),
+                         input_id,
+                         torch.ones_like(input_id, dtype=torch.long),
+                     )
+                 else:
+                     special_image_mask = (
+                         (input_id == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embed)
+                     )
+                     inputs_embed = inputs_embed.masked_scatter(special_image_mask, image_features)

                  # Update generate_idx according to inputs_embed
                  generate_idx[b_idx] = inputs_embed.shape[1]
@@ -58,23 +58,12 @@ class MidmLMHeadModelWrapper(torch.nn.Module):
          self.model = model.transformer
          self.lm_head = model.lm_head
          self.config = model.config
-         self.head_dim = self.config.n_embd // self.config.n_head
-         self.max_position_embeddings = (
-             self.config.max_position_embeddings if max_seq_len > self.config.max_position_embeddings else max_seq_len
-         )
          self.max_seq_len = max_seq_len
-         self.rotary_dim = int(
-             model.config.hidden_size // model.config.num_attention_heads * model.config.rotary_percentage
-         )
-         self.rotary_emb = self._init_rope()

-     def _init_rope(self):
-         """Initializes the Rotary Position Embeddings."""
-         rotary_emb = RotaryEmbedding(
-             self.rotary_dim,
-             max_position_embeddings=self.max_position_embeddings,
-         )
-         return rotary_emb
+         self.config.partial_rotary_factor = model.config.rotary_percentage
+         self.config.head_dim = self.config.n_embd // self.config.n_head
+         self.config.rope_theta = 10000
+         self.rotary_emb = RotaryEmbedding(config=self.config, max_seq_len_cached=max_seq_len)

      def forward(
          self,
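The Midm wrapper now describes its rotary setup through standard config fields consumed by the shared `RotaryEmbedding` (added in `modeling_rope_utils.py`). Under the usual Hugging Face convention, `partial_rotary_factor` reproduces the rotary dimension the removed code computed by hand; a quick check with illustrative values (not Midm's actual config):

n_embd, n_head, rotary_percentage = 4096, 32, 0.5

head_dim = n_embd // n_head                     # 128, matches config.head_dim above
rotary_dim = int(head_dim * rotary_percentage)  # 64, same as the removed
                                                # hidden_size // num_attention_heads * rotary_percentage
print(head_dim, rotary_dim)                     # 128 64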
@@ -21,11 +21,7 @@
  # copied, modified, or distributed without prior written permission
  # from Rebellions Inc.

- import inspect
- import logging
- from typing import TYPE_CHECKING, Any, Callable
-
- from ....modeling_config import RBLNConfig
+ from ....utils import logging
  from ...models.decoderonly import RBLNDecoderOnlyModelForCausalLM
  from .hf_hub_cached.modeling_midm import MidmLMHeadModel
  from .midm_architecture import (
@@ -33,11 +29,7 @@ from .midm_architecture import (
  )


- logger = logging.getLogger(__name__)
- if TYPE_CHECKING:
-     from transformers import (
-         PreTrainedModel,
-     )
+ logger = logging.get_logger(__name__)


  class RBLNMidmLMHeadModel(RBLNDecoderOnlyModelForCausalLM):
@@ -54,25 +46,8 @@ class RBLNMidmLMHeadModel(RBLNDecoderOnlyModelForCausalLM):

      """

-     @classmethod
-     def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
-         rbln_max_seq_len = rbln_config.model_cfg["max_seq_len"]
-         return MidmLMHeadModelWrapper(model, rbln_max_seq_len).eval()
-
-     def __getattr__(self, __name: str) -> Any:
-         """This is the key method to implement RBLN-Midm.
-
-         Returns:
-             Any: Midm's corresponding method
-         """
-
-         def redirect(func):
-             return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
-
-         val = getattr(MidmLMHeadModel, __name)
-         if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
-             return redirect(val)
-         return val
+     _decoder_wrapper_cls = MidmLMHeadModelWrapper
+     _original_cls = MidmLMHeadModel

      @classmethod
      def from_pretrained(cls, *args, **kwargs):
@@ -21,29 +21,18 @@
  # copied, modified, or distributed without prior written permission
  # from Rebellions Inc.

- import inspect
- import logging
- from typing import TYPE_CHECKING, Any, Callable
-
- from transformers import MistralForCausalLM
-
+ from ....utils import logging
  from ...models.decoderonly import RBLNDecoderOnlyModelForCausalLM
  from .mistral_architecture import MistralForCausalLMWrapper


- if TYPE_CHECKING:
-     from transformers import PreTrainedModel
-
-     from ....modeling_config import RBLNConfig
-
-
- logger = logging.getLogger(__name__)
+ logger = logging.get_logger(__name__)


  class RBLNMistralForCausalLM(RBLNDecoderOnlyModelForCausalLM):
      """
      The Llama Model transformer with a language modeling head (linear layer) on top.
-     This model inherits from [`RBLNMultiModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
+     This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the library implements for all its models.

      A class to convert and run pre-trained transformers based LlamaForCausalLM model on RBLN devices.
      It implements the methods to convert a pre-trained transformers LlamaForCausalLM model into a RBLN transformer model by:
@@ -51,18 +40,4 @@ class RBLNMistralForCausalLM(RBLNDecoderOnlyModelForCausalLM):
      - compiling the resulting graph using the RBLN compiler.
      """

-     @classmethod
-     def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
-         rbln_max_seq_len = rbln_config.model_cfg["max_seq_len"]
-         return MistralForCausalLMWrapper(model, rbln_max_seq_len).eval()
-
-     def __getattr__(self, __name: str) -> Any:
-         def redirect(func):
-             return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
-
-         val = getattr(MistralForCausalLM, __name)
-
-         if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
-             return redirect(val)
-
-         return val
+     _decoder_wrapper_cls = MistralForCausalLMWrapper