optimum-rbln 0.1.12__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +27 -13
- optimum/rbln/__version__.py +16 -1
- optimum/rbln/diffusers/__init__.py +22 -2
- optimum/rbln/diffusers/models/__init__.py +34 -3
- optimum/rbln/{transformers/generation → diffusers/models/autoencoders}/__init__.py +1 -2
- optimum/rbln/diffusers/models/{autoencoder_kl.py → autoencoders/autoencoder_kl.py} +66 -111
- optimum/rbln/diffusers/models/autoencoders/vae.py +84 -0
- optimum/rbln/diffusers/models/controlnet.py +85 -65
- optimum/rbln/diffusers/models/transformers/__init__.py +24 -0
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +203 -0
- optimum/rbln/diffusers/models/unets/__init__.py +24 -0
- optimum/rbln/diffusers/models/{unet_2d_condition.py → unets/unet_2d_condition.py} +129 -163
- optimum/rbln/diffusers/pipelines/__init__.py +60 -12
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +11 -25
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +9 -185
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +9 -190
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +9 -191
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +9 -192
- optimum/rbln/diffusers/pipelines/stable_diffusion/__init__.py +1 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +4 -110
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +4 -118
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +32 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/__init__.py +26 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +32 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +32 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +32 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/__init__.py +1 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +18 -128
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +18 -131
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +32 -0
- optimum/rbln/modeling.py +572 -0
- optimum/rbln/modeling_alias.py +1 -1
- optimum/rbln/modeling_base.py +176 -763
- optimum/rbln/modeling_diffusers.py +329 -0
- optimum/rbln/transformers/__init__.py +2 -2
- optimum/rbln/transformers/cache_utils.py +5 -9
- optimum/rbln/transformers/modeling_rope_utils.py +283 -0
- optimum/rbln/transformers/models/__init__.py +80 -31
- optimum/rbln/transformers/models/auto/auto_factory.py +117 -23
- optimum/rbln/transformers/models/auto/modeling_auto.py +37 -12
- optimum/rbln/transformers/models/bart/modeling_bart.py +3 -6
- optimum/rbln/transformers/models/bert/modeling_bert.py +3 -6
- optimum/rbln/transformers/models/clip/modeling_clip.py +8 -34
- optimum/rbln/transformers/models/decoderonly/__init__.py +0 -5
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +779 -361
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +83 -142
- optimum/rbln/transformers/models/dpt/modeling_dpt.py +1 -1
- optimum/rbln/transformers/models/exaone/exaone_architecture.py +64 -39
- optimum/rbln/transformers/models/exaone/modeling_exaone.py +6 -29
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +31 -92
- optimum/rbln/transformers/models/gemma/modeling_gemma.py +4 -28
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +50 -238
- optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +6 -31
- optimum/rbln/transformers/models/llama/modeling_llama.py +4 -28
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +29 -83
- optimum/rbln/transformers/models/midm/midm_architecture.py +88 -253
- optimum/rbln/transformers/models/midm/modeling_midm.py +8 -33
- optimum/rbln/transformers/models/mistral/modeling_mistral.py +4 -29
- optimum/rbln/transformers/models/phi/modeling_phi.py +5 -31
- optimum/rbln/transformers/models/phi/phi_architecture.py +61 -345
- optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +5 -29
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +1 -46
- optimum/rbln/transformers/models/t5/__init__.py +1 -1
- optimum/rbln/transformers/models/t5/modeling_t5.py +157 -6
- optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +1 -1
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +2 -2
- optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +3 -35
- optimum/rbln/transformers/utils/rbln_quantization.py +128 -5
- optimum/rbln/utils/decorator_utils.py +59 -0
- optimum/rbln/utils/hub.py +131 -0
- optimum/rbln/utils/import_utils.py +21 -0
- optimum/rbln/utils/model_utils.py +53 -0
- optimum/rbln/utils/runtime_utils.py +5 -5
- optimum/rbln/utils/submodule.py +114 -0
- optimum/rbln/utils/timer_utils.py +2 -2
- optimum_rbln-0.1.15.dist-info/METADATA +106 -0
- optimum_rbln-0.1.15.dist-info/RECORD +110 -0
- {optimum_rbln-0.1.12.dist-info → optimum_rbln-0.1.15.dist-info}/WHEEL +1 -1
- optimum/rbln/transformers/generation/streamers.py +0 -139
- optimum/rbln/transformers/generation/utils.py +0 -397
- optimum/rbln/transformers/models/exaone/hf_hub_cached/configuration_exaone.py +0 -181
- optimum/rbln/transformers/models/exaone/hf_hub_cached/modeling_exaone.py +0 -1725
- optimum/rbln/transformers/models/midm/hf_hub_cached/configuration_midm.py +0 -22
- optimum/rbln/transformers/models/midm/hf_hub_cached/midm_bitext_tokenization.py +0 -304
- optimum/rbln/transformers/models/midm/hf_hub_cached/modeling_midm.py +0 -1469
- optimum/rbln/transformers/models/midm/hf_hub_cached/rotary_position_embedding.py +0 -98
- optimum_rbln-0.1.12.dist-info/METADATA +0 -119
- optimum_rbln-0.1.12.dist-info/RECORD +0 -103
- optimum_rbln-0.1.12.dist-info/entry_points.txt +0 -4
- {optimum_rbln-0.1.12.dist-info → optimum_rbln-0.1.15.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/llava_next/modeling_llava_next.py

@@ -23,7 +23,7 @@
 import inspect
 import logging
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Dict,
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union

 import numpy as np
 import torch

@@ -36,7 +36,7 @@ from transformers import (
 from transformers.modeling_outputs import BaseModelOutputWithPooling
 from transformers.models.llava_next.modeling_llava_next import LlavaNextCausalLMOutputWithPast

-from ....
+from ....modeling import RBLNModel
 from ....modeling_config import RBLNCompileConfig, RBLNConfig
 from ..decoderonly.modeling_decoderonly import RBLNDecoderOnlyOutput

@@ -166,19 +166,6 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
         self._padding_side = "left"  # set it to left by default, user can use setter to change padding_sides
         return super().__post_init__(**kwargs)

-    @classmethod
-    def get_pytorch_model(
-        cls,
-        model_id: str,
-        *args,
-        rbln_kwargs: Optional[Dict[str, Any]] = None,
-        **kwargs,
-    ) -> "PreTrainedModel":
-        # Optimum's TasksManager does not handle Llava.
-        kwargs = cls.update_kwargs(kwargs)
-        model = LlavaNextForConditionalGeneration.from_pretrained(model_id, *args, **kwargs)
-        return model
-
     def get_input_embeddings(self):
         return self.language_model.get_input_embeddings()

@@ -350,9 +337,22 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
         is_prefill_phase = not generate_idx.bool().all()

         if is_prefill_phase:
+            # if the number of image tokens is more than image embeddings seq length, then prob we expanded it in processing
+            # not very reliable, but we don't expect one to actually pass 500+ images for one prompt
+            # In case we're in decoding stage, legacy behavior is checked by presence of pixel values even if use_cache=True
+            legacy_processing = (
+                (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
+            ) or (input_ids.shape[-1] == 1 and pixel_values is not None)
+
             # Get the number of images in the prompt
             special_image_token_masks = [input_id == self.config.image_token_index for input_id in input_ids]
-
+            if legacy_processing:
+                num_special_image_tokens = [torch.sum(mask, dim=-1) for mask in special_image_token_masks]
+            else:
+                image_tokens_masks_diff = [
+                    torch.diff(mask, prepend=torch.tensor([0])) for mask in special_image_token_masks
+                ]
+                num_special_image_tokens = [int(torch.sum((diff == 1).int())) for diff in image_tokens_masks_diff]

             # Split images for each prompt
             if pixel_values is not None and pixel_values.size(0) > 0:

@@ -370,13 +370,19 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
                 image_features, feature_lens = self.image_embedding(
                     image_sizes[b_idx], pixel_values[b_idx], vision_feature_layer, vision_feature_select_strategy
                 )
-
-
-
-
-
-
-
+                if legacy_processing:
+                    inputs_embed, _, _, _, _ = self._merge_input_ids_with_image_features(
+                        image_features,
+                        feature_lens,
+                        inputs_embed.to(image_features.dtype),
+                        input_id,
+                        torch.ones_like(input_id, dtype=torch.long),
+                    )
+                else:
+                    special_image_mask = (
+                        (input_id == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embed)
+                    )
+                    inputs_embed = inputs_embed.masked_scatter(special_image_mask, image_features)

                 # Update generate_idx according to inputs_embed
                 generate_idx[b_idx] = inputs_embed.shape[1]
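In the non-legacy branch added above, image features are merged into the text embedding with `torch.Tensor.masked_scatter`. A minimal standalone sketch of that merge step, with toy tensor sizes and a made-up `IMAGE_TOKEN_ID` standing in for `config.image_token_index` (illustration only, not the pipeline's actual shapes):

```python
import torch

# Toy dimensions: one prompt of 6 tokens, hidden size 4, two image placeholder tokens.
IMAGE_TOKEN_ID = 32000  # hypothetical placeholder id; the model uses config.image_token_index
input_id = torch.tensor([1, IMAGE_TOKEN_ID, IMAGE_TOKEN_ID, 5, 6, 7])
inputs_embed = torch.zeros(1, 6, 4)   # text embeddings for the prompt
image_features = torch.ones(2, 4)     # one feature row per image placeholder token

# Mask that is True at every image-token position, broadcast over the hidden dimension
# so it matches inputs_embed's shape.
special_image_mask = (input_id == IMAGE_TOKEN_ID).unsqueeze(-1).expand_as(inputs_embed)

# masked_scatter consumes image_features element by element, writing wherever the mask is True.
merged = inputs_embed.masked_scatter(special_image_mask, image_features)
print(merged[0, 1])  # -> tensor([1., 1., 1., 1.])
```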
@@ -403,66 +409,6 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):

         return outputs

-    def vllm_forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        pixel_values: torch.FloatTensor = None,
-        image_sizes: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        vision_feature_layer: Optional[int] = None,
-        vision_feature_select_strategy: Optional[str] = None,
-        cache_position: Union[List[torch.Tensor], torch.Tensor] = None,  # vllm keyword argument
-        batch_idx: Optional[int] = None,
-        **kwargs,
-    ) -> Union[Tuple, LlavaNextCausalLMOutputWithPast]:
-        is_prefill = cache_position.shape[-1] > 1
-
-        if inputs_embeds is not None:
-            raise NotImplementedError("Specifying inputs_embeds is not supported.")
-
-        if is_prefill:
-            # Get text_embeds
-            inputs_embeds = self.text_embedding(input_ids)
-
-            # If any images in the prompt, get image_embeds and merge with text
-            if pixel_values is not None and input_ids.shape[1] != 1 and pixel_values.size(0) > 0:
-                image_features, _ = self.image_embedding(
-                    image_sizes, pixel_values, vision_feature_layer, vision_feature_select_strategy
-                )
-
-                def merge_vllm_multimodal_embeddings(
-                    input_ids: torch.Tensor,
-                    inputs_embeds: torch.Tensor,
-                    multimodal_embeddings: torch.Tensor,
-                    placeholder_token_id: int,
-                ) -> torch.Tensor:
-                    mask = input_ids == placeholder_token_id
-                    num_expected_tokens = mask.sum().item()
-
-                    if multimodal_embeddings.shape[0] != num_expected_tokens:
-                        raise ValueError(
-                            f"Attempted to assign {inputs_embeds[mask].shape} = {multimodal_embeddings.shape} "
-                            f"multimodal tokens to {num_expected_tokens} placeholders"
-                        )
-
-                    inputs_embeds[mask] = multimodal_embeddings
-                    return inputs_embeds
-
-                inputs_embeds = merge_vllm_multimodal_embeddings(
-                    input_ids, inputs_embeds, image_features, self.config.image_token_index
-                )
-
-        else:
-            inputs_embeds = self.text_embedding(input_ids=input_ids)
-
-        outputs: RBLNDecoderOnlyOutput = self.language_model.vllm_forward(
-            inputs_embeds=inputs_embeds,
-            batch_idx=batch_idx,
-            cache_position=cache_position,
-        )
-
-        return outputs
-
     # Almost copied from : https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/llava_next/modeling_llava_next.py
     def pack_image_features(self, image_features, image_sizes, image_newline=None):
         """
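For comparison, the `merge_vllm_multimodal_embeddings` helper deleted above performed the same merge with a boolean-index assignment guarded by a placeholder-count check. A reduced sketch of that pattern (toy tensors, hypothetical `PLACEHOLDER_ID`), shown only to contrast it with the `masked_scatter` path that remains in `forward`:

```python
import torch

PLACEHOLDER_ID = 32000  # stand-in for config.image_token_index
input_ids = torch.tensor([[1, PLACEHOLDER_ID, PLACEHOLDER_ID, 4]])
inputs_embeds = torch.zeros(1, 4, 3)
multimodal_embeddings = torch.ones(2, 3)  # one embedding row per placeholder

mask = input_ids == PLACEHOLDER_ID
num_expected_tokens = mask.sum().item()
if multimodal_embeddings.shape[0] != num_expected_tokens:
    raise ValueError(
        f"Got {multimodal_embeddings.shape[0]} multimodal rows for {num_expected_tokens} placeholders"
    )

# Boolean-index assignment writes one embedding row per True position in the mask,
# which is equivalent to the masked_scatter form for correctly sized inputs.
inputs_embeds[mask] = multimodal_embeddings
```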
optimum/rbln/transformers/models/midm/midm_architecture.py

@@ -21,18 +21,24 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.

-from typing import
+from typing import TYPE_CHECKING, Tuple

 import torch
 import torch.nn as nn
-from transformers.modeling_outputs import BaseModelOutputWithPast

-from ....transformers.models.decoderonly.decoderonly_architecture import
-
-
-
+from ....transformers.models.decoderonly.decoderonly_architecture import rotate_half
+from ..decoderonly.decoderonly_architecture import (
+    DecoderOnlyAttention,
+    DecoderOnlyForCausalLM,
+    DecoderOnlyLayer,
+    DecoderOnlyModel,
+    DecoderOnlyWrapper,
+    apply_rotary_pos_emb_partial,
 )
-
+
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel as MidmLMHeadModel


 def apply_rotary_to_tensor(tensor, cos, sin, rot_dim):
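The new imports above (`rotate_half`, `apply_rotary_pos_emb_partial`) reflect that Midm applies rotary embeddings to only a fraction of each head dimension (`rotary_percentage`). A self-contained sketch of the partial-rotary idea, assuming the usual `rotate_half` convention; this is illustrative, not the library's exact implementation:

```python
import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # Standard RoPE helper: split the last dim in half and map (x1, x2) -> (-x2, x1).
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_partial(q, k, cos, sin, rot_dim):
    # Rotate only the first rot_dim channels; pass the remaining channels through unchanged.
    q_rot, q_pass = q[..., :rot_dim], q[..., rot_dim:]
    k_rot, k_pass = k[..., :rot_dim], k[..., rot_dim:]
    q_rot = q_rot * cos + rotate_half(q_rot) * sin
    k_rot = k_rot * cos + rotate_half(k_rot) * sin
    return torch.cat([q_rot, q_pass], dim=-1), torch.cat([k_rot, k_pass], dim=-1)

# Toy shapes: batch=1, heads=2, seq=4, head_dim=8, rotary_percentage=0.5 -> rot_dim=4.
q = torch.randn(1, 2, 4, 8)
k = torch.randn(1, 2, 4, 8)
cos = torch.randn(1, 1, 4, 4)  # broadcast over heads; last dim equals rot_dim
sin = torch.randn(1, 1, 4, 4)
q_embed, k_embed = apply_rotary_partial(q, k, cos, sin, rot_dim=4)
```

This is consistent with the rewritten attention below passing `ndim=cos.shape[-1]` to `apply_rotary_pos_emb_partial`.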
@@ -50,264 +56,93 @@ def apply_rotary_pos_emb(q, k, cos, sin):
     return q_embed, k_embed


-class MidmLMHeadModelWrapper(
-
+class MidmLMHeadModelWrapper(DecoderOnlyWrapper):
+    def get_rotary_emb(self, max_seq_len):
+        self.config.rope_theta = 10000
+        self.config.head_dim = self.config.n_embd // self.config.n_head
+        self.config.partial_rotary_factor = self.config.rotary_percentage
+        return super().get_rotary_emb(max_seq_len=max_seq_len)
+
+    def convert_to_rbln_causal_lm(self, causal_lm: "MidmLMHeadModel"):
+        if self.attn_impl != "eager":
+            raise NotImplementedError(f"flash attention ({self.attn_impl}) is not implemented for {self.__class__}")
+        new_layers = []
+        for layer in causal_lm.transformer.h:
+            new_self_attn = MidmAttention(layer.attn)
+            new_layer = MidmLayer(layer, new_self_attn)
+            new_layers.append(new_layer)
+        new_model = MidmModel(causal_lm.transformer, new_layers)
+        new_causal_lm = DecoderOnlyForCausalLM(causal_lm, new_model)
+        return new_causal_lm
+
+
+class MidmModel(DecoderOnlyModel):
+    mask_fmin = -10000.0
+
+    def get_layernorm1p(self, module: nn.LayerNorm):
+        def layernorm1p(input: torch.Tensor):
+            """Applies Layer Normalization with a slight modification on the weights."""
+            return torch.nn.functional.layer_norm(
+                input, module.normalized_shape, module.weight + 1, module.bias, module.eps
+            )

-
-        super().__init__()
-        self.model = model.transformer
-        self.lm_head = model.lm_head
-        self.config = model.config
-        self.head_dim = self.config.n_embd // self.config.n_head
-        self.max_position_embeddings = (
-            self.config.max_position_embeddings if max_seq_len > self.config.max_position_embeddings else max_seq_len
-        )
-        self.max_seq_len = max_seq_len
-        self.rotary_dim = int(
-            model.config.hidden_size // model.config.num_attention_heads * model.config.rotary_percentage
-        )
-        self.rotary_emb = self._init_rope()
+        return layernorm1p

-    def
-
-
-            self.rotary_dim,
-            max_position_embeddings=self.max_position_embeddings,
-        )
-        return rotary_emb
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        attention_mask: torch.Tensor,
-        cache_position: torch.LongTensor,
-        batch_position: int,
-        query_idx: int,
-        *past_key_values,
-    ):
-        """Defines the forward pass for the wrapper model."""
-        if input_ids.shape[1] == 1:
-            rbln_batch_position = None
+    def get_last_layernorm(self) -> nn.LayerNorm:
+        if self._original_mod.use_layernorm1p:
+            return self.get_layernorm1p(self._original_mod.ln_f)
         else:
-
-
-        past_key_values = RebelDynamicCache_4D.from_input_format(
-            cache_position,
-            self.config.num_hidden_layers,
-            *past_key_values,
-        )
-
-        outputs = _MidmModel.forward(
-            self.model,
-            input_ids=input_ids,
-            past_key_values=past_key_values,
-            attention_mask=attention_mask,
-            position_ids=cache_position,
-            rotary_pos_emb=self.rotary_emb,
-            batch_ids=rbln_batch_position,
-        )
+            return self._original_mod.ln_f

-
-
-        hidden_states = hidden_states[:, query_idx].unsqueeze(1)
+    def get_embedding(self) -> nn.Embedding:
+        return self._original_mod.wte

-
-
+    def get_pos_embedding(self) -> nn.Embedding:
+        return self._original_mod.wpe

-        return output, batch_position + query_idx

-
-    def
-
-
-
-
-class _MidmAttention:
-    """Custom implementation of the MidmAttention class with specific modifications."""
-
-    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
-        """Computes the attention weights and output."""
-        attn_weights = torch.matmul(query, key.transpose(-1, -2))
-
-        if self.scale_attn_weights:
-            attn_weights = attn_weights / torch.full(
-                [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device
+class MidmLayer(DecoderOnlyLayer):
+    def get_layernorm1p(self, module: nn.LayerNorm):
+        def layernorm1p(input: torch.Tensor):
+            """Applies Layer Normalization with a slight modification on the weights."""
+            return torch.nn.functional.layer_norm(
+                input, module.normalized_shape, module.weight + 1, module.bias, module.eps
             )

-
-        attn_weights = attn_weights / float(self.layer_idx + 1)
-
-        if attention_mask is not None:
-            attn_weights = attn_weights + attention_mask
-
-        if self.scale_qk_by_inverse_layer_idx:
-            attn_weights = attn_weights * float(self.layer_idx + 1)
-
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
-        attn_weights = attn_weights.type(value.dtype)
-
-        if head_mask is not None:
-            attn_weights = attn_weights * head_mask
-
-        attn_output = torch.matmul(attn_weights, value)
-        return attn_output, attn_weights
-
-    def forward(
-        self,
-        hidden_states: Optional[Tuple[torch.FloatTensor]],
-        attention_mask: Optional[torch.FloatTensor] = None,
-        past_key_value: Optional[RebelDynamicCache_4D] = None,
-        batch_index: Optional[int] = None,
-        cos: Optional[torch.Tensor] = None,
-        sin: Optional[torch.Tensor] = None,
-    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
-        """Defines the forward pass for the attention mechanism."""
-        bsz, q_len, _ = hidden_states.size()
-
-        querys, keys, values = self.c_attn(hidden_states).split(self.split_size, dim=2)
-
-        querys = self._split_heads(querys, self.num_heads, self.head_dim).contiguous()
-        keys = self._split_heads(keys, self.num_heads, self.head_dim).contiguous()
-        values = self._split_heads(values, self.num_heads, self.head_dim).contiguous()
-
-        querys, keys = apply_rotary_pos_emb(querys, keys, cos, sin)
-
-        # Decoder
-        if (batch_index is None or batch_index == -1) and bsz > 1:
-            all_key_states = []
-            all_value_states = []
-            all_attn_output = []
-
-            for b in range(bsz):
-                query = querys[b].unsqueeze(0)
-                attn_mask = attention_mask[b].unsqueeze(0)
-                key = keys[b].unsqueeze(0)
-                value = values[b].unsqueeze(0)
-
-                key, value = past_key_value.update(
-                    key,
-                    value,
-                    self.layer_idx,
-                    b,
-                )
-
-                attn_output, _ = _MidmAttention._attn(self, query, key, value, attn_mask)
-                attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
-
-                all_key_states.append(key)
-                all_value_states.append(value)
-                all_attn_output.append(attn_output)
-
-            keys = torch.cat(all_key_states, dim=0)
-            values = torch.cat(all_value_states, dim=0)
-            attn_output = torch.cat(all_attn_output, dim=0)
+        return layernorm1p

+    def get_pre_attention_layernorm(self) -> nn.LayerNorm:
+        if self._original_mod.use_layernorm1p:
+            return self.get_layernorm1p(self._original_mod.ln_1)
         else:
-
-            batch_index = 0
-
-            keys, values = past_key_value.update(
-                keys,
-                values,
-                self.layer_idx,
-                batch_index,
-                read_first_step=True,
-            )
+            return self._original_mod.ln_1

-
-
-
-        attn_output = self.c_proj(attn_output)
-        return attn_output, keys, values
-
-
-class _MidmBlock:
-    """Custom implementation of the MidmBlock class with specific modifications."""
-
-    def forward(
-        self,
-        hidden_states: Optional[Tuple[torch.FloatTensor]],
-        layer_idx: int,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        past_key_value: Optional[RebelDynamicCache_4D] = None,
-        batch_ids: Optional[torch.LongTensor] = None,
-        cos: Optional[torch.Tensor] = None,
-        sin: Optional[torch.Tensor] = None,
-    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
-        """Defines the forward pass for the block."""
-        residual = hidden_states
-        if self.use_layernorm1p:
-            hidden_states = layernorm1p(self.ln_1, hidden_states)
+    def get_post_attention_layernorm(self) -> nn.LayerNorm:
+        if self._original_mod.use_layernorm1p:
+            return self.get_layernorm1p(self._original_mod.ln_2)
         else:
-
-
-
-
-
-
-
-
-
-
+            return self._original_mod.ln_2
+
+
+class MidmAttention(DecoderOnlyAttention):
+    def __post_init__(self):
+        self.c_attn = self._original_mod.c_attn
+        self.o_proj = self._original_mod.c_proj
+        self.split_size = self._original_mod.split_size
+        self.num_key_value_heads = self._original_mod.num_heads
+
+    def projection(self, hidden_states) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        query_states, key_states, value_states = self.c_attn(hidden_states).split(self.split_size, dim=2)
+        return query_states, key_states, value_states
+
+    def rbln_attention(self, *args, **kwargs):
+        return super().rbln_attention(
+            *args,
+            **kwargs,
+            layer_idx=self.layer_idx,
+            scale_attn_weights=self._original_mod.scale_attn_weights,
+            scale_attn_by_inverse_layer_idx=self._original_mod.scale_attn_by_inverse_layer_idx,
         )
-        past_key_value.assign(k, v, layer_idx)
-
-        hidden_states = hidden_states + residual

-
-
-            hidden_states = layernorm1p(self.ln_2, hidden_states)
-        else:
-            hidden_states = self.ln_2(hidden_states)
-
-        feed_forward_hidden_states = self.mlp(hidden_states)
-        hidden_states = residual + feed_forward_hidden_states
-
-        return hidden_states, past_key_value
-
-
-class _MidmModel:
-    """Custom implementation of the MidmModel class with specific modifications."""
-
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[RebelDynamicCache_4D] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        rotary_pos_emb=None,
-        batch_ids: Optional[torch.LongTensor] = None,
-    ) -> Union[Tuple, BaseModelOutputWithPast]:
-        """Defines the forward pass for the model."""
-        input_shape = input_ids.size()
-
-        attention_mask = (1.0 - attention_mask) * -10000.0
-
-        inputs_embeds = self.wte(input_ids)
-
-        cos, sin = rotary_pos_emb(inputs_embeds, attention_mask.shape[-1])
-        cos, sin = slice_and_unsqueeze_cos_sin(cos, sin, position_ids)
-        hidden_states = inputs_embeds
-
-        for layer_idx, (block, _) in enumerate(zip(self.h, past_key_values)):
-            hidden_states, updated_cache = _MidmBlock.forward(
-                block,
-                hidden_states,
-                layer_idx,
-                attention_mask=attention_mask,
-                past_key_value=past_key_values,
-                batch_ids=batch_ids,
-                cos=cos,
-                sin=sin,
-            )
-
-        hidden_states = layernorm1p(self.ln_f, hidden_states)
-        output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),)
-        hidden_states = hidden_states.view(output_shape)
-
-        next_cache = updated_cache.to_legacy_cache()
-
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            past_key_values=next_cache,
-        )
+    def apply_rotary_pos_embed(self, query_states, key_states, cos, sin):
+        return apply_rotary_pos_emb_partial(query_states, key_states, cos, sin, ndim=cos.shape[-1])
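The `get_layernorm1p` closures in the rewritten architecture above compute LayerNorm with the module's `weight + 1` rather than `weight`. A quick standalone check of what that offset means (toy module, not RBLN code):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

ln = nn.LayerNorm(8)
nn.init.normal_(ln.weight)  # make the effect visible; the default weight is all ones
x = torch.randn(2, 8)

# layernorm1p: identical to LayerNorm except the learned scale is offset by +1,
# so a zero-initialized weight corresponds to an identity scale.
out_1p = F.layer_norm(x, ln.normalized_shape, ln.weight + 1, ln.bias, ln.eps)
out_std = F.layer_norm(x, ln.normalized_shape, ln.weight, ln.bias, ln.eps)

# The two outputs differ by exactly the plain normalized input.
diff = out_1p - out_std
print(torch.allclose(diff, F.layer_norm(x, ln.normalized_shape)))  # True
```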
optimum/rbln/transformers/models/midm/modeling_midm.py

@@ -21,23 +21,15 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.

-import inspect
-import logging
-from typing import TYPE_CHECKING, Any, Callable

-from
-from ...models.decoderonly import RBLNDecoderOnlyModelForCausalLM
-from .hf_hub_cached.modeling_midm import MidmLMHeadModel
-from .midm_architecture import (
-    MidmLMHeadModelWrapper,
-)
+from transformers import AutoModelForCausalLM

+from ....utils import logging
+from ..decoderonly import RBLNDecoderOnlyModelForCausalLM
+from .midm_architecture import MidmLMHeadModelWrapper

-
-
-from transformers import (
-    PreTrainedModel,
-)
+
+logger = logging.get_logger(__name__)


 class RBLNMidmLMHeadModel(RBLNDecoderOnlyModelForCausalLM):

@@ -54,25 +46,8 @@ class RBLNMidmLMHeadModel(RBLNDecoderOnlyModelForCausalLM):

     """

-
-
-        rbln_max_seq_len = rbln_config.model_cfg["max_seq_len"]
-        return MidmLMHeadModelWrapper(model, rbln_max_seq_len).eval()
-
-    def __getattr__(self, __name: str) -> Any:
-        """This is the key method to implement RBLN-Midm.
-
-        Returns:
-            Any: Midm's corresponding method
-        """
-
-        def redirect(func):
-            return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
-
-        val = getattr(MidmLMHeadModel, __name)
-        if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
-            return redirect(val)
-        return val
+    _decoder_wrapper_cls = MidmLMHeadModelWrapper
+    _hf_class = AutoModelForCausalLM

     @classmethod
     def from_pretrained(cls, *args, **kwargs):
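After this change the Midm class (and Mistral, below) only declares `_decoder_wrapper_cls` and, where needed, `_hf_class`; the shared export logic lives in `RBLNDecoderOnlyModelForCausalLM`, whose internals are not part of this diff. The following is a hypothetical sketch of how a base class can consume such declarative attributes — the method names (`get_pytorch_model`, `wrap_model_if_needed`) mirror the removed per-model code and are illustrative, not the library's confirmed API:

```python
# Hypothetical sketch only: RBLNDecoderOnlyModelForCausalLM's real implementation is not shown in this diff.
from transformers import AutoModelForCausalLM


class DecoderOnlyBaseSketch:
    _decoder_wrapper_cls = None        # set by each subclass, e.g. MidmLMHeadModelWrapper
    _hf_class = AutoModelForCausalLM   # overridable, e.g. for trust_remote_code checkpoints

    @classmethod
    def get_pytorch_model(cls, model_id: str, **kwargs):
        # One shared loading path replaces the per-model __getattr__ redirection.
        return cls._hf_class.from_pretrained(model_id, **kwargs)

    @classmethod
    def wrap_model_if_needed(cls, model, max_seq_len: int):
        # The wrapper class converts the HF module graph into the RBLN-compilable form.
        return cls._decoder_wrapper_cls(model, max_seq_len).eval()
```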
optimum/rbln/transformers/models/mistral/modeling_mistral.py

@@ -21,29 +21,18 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.

-import
-import logging
-from typing import TYPE_CHECKING, Any, Callable
-
-from transformers import MistralForCausalLM
-
+from ....utils import logging
 from ...models.decoderonly import RBLNDecoderOnlyModelForCausalLM
 from .mistral_architecture import MistralForCausalLMWrapper


-
-from transformers import PreTrainedModel
-
-from ....modeling_config import RBLNConfig
-
-
-logger = logging.getLogger(__name__)
+logger = logging.get_logger(__name__)


 class RBLNMistralForCausalLM(RBLNDecoderOnlyModelForCausalLM):
     """
     The Llama Model transformer with a language modeling head (linear layer) on top.
-    This model inherits from [`
+    This model inherits from [`RBLNDecoderOnlyModelForCausalLM`]. Check the superclass documentation for the generic methods the library implements for all its models.

     A class to convert and run pre-trained transformers based LlamaForCausalLM model on RBLN devices.
     It implements the methods to convert a pre-trained transformers LlamaForCausalLM model into a RBLN transformer model by:

@@ -51,18 +40,4 @@ class RBLNMistralForCausalLM(RBLNDecoderOnlyModelForCausalLM):
     - compiling the resulting graph using the RBLN compiler.
     """

-
-    def wrap_model_if_needed(self, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
-        rbln_max_seq_len = rbln_config.model_cfg["max_seq_len"]
-        return MistralForCausalLMWrapper(model, rbln_max_seq_len).eval()
-
-    def __getattr__(self, __name: str) -> Any:
-        def redirect(func):
-            return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
-
-        val = getattr(MistralForCausalLM, __name)
-
-        if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
-            return redirect(val)
-
-        return val
+    _decoder_wrapper_cls = MistralForCausalLMWrapper
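End-user code is unaffected by the wrapper refactor; compilation still goes through `from_pretrained(..., export=True)`. A hedged usage sketch — the checkpoint id and the `rbln_*` values are placeholders, and the exact set of supported `rbln_*` options should be checked against the optimum-rbln documentation for this release:

```python
from transformers import AutoTokenizer
from optimum.rbln import RBLNMistralForCausalLM

model_id = "mistralai/Mistral-7B-Instruct-v0.2"  # placeholder checkpoint

# export=True converts and compiles the Hugging Face checkpoint for RBLN NPUs;
# the rbln_* kwargs feed the compile-time RBLNConfig (values here are examples).
model = RBLNMistralForCausalLM.from_pretrained(
    model_id,
    export=True,
    rbln_max_seq_len=4096,
    rbln_batch_size=1,
)
model.save_pretrained("mistral-7b-rbln")  # reload later without re-compiling

tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer("Hello, RBLN!", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```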