optimum-rbln 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +22 -12
- optimum/rbln/__version__.py +16 -1
- optimum/rbln/diffusers/__init__.py +22 -2
- optimum/rbln/diffusers/models/__init__.py +34 -3
- optimum/rbln/{transformers/generation → diffusers/models/autoencoders}/__init__.py +1 -2
- optimum/rbln/diffusers/models/{autoencoder_kl.py → autoencoders/autoencoder_kl.py} +44 -58
- optimum/rbln/diffusers/models/autoencoders/vae.py +84 -0
- optimum/rbln/diffusers/models/controlnet.py +54 -14
- optimum/rbln/diffusers/models/transformers/__init__.py +24 -0
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +203 -0
- optimum/rbln/diffusers/models/unets/__init__.py +24 -0
- optimum/rbln/diffusers/models/{unet_2d_condition.py → unets/unet_2d_condition.py} +78 -16
- optimum/rbln/diffusers/pipelines/__init__.py +22 -2
- optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +5 -26
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +1 -0
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +1 -0
- optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +1 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion/__init__.py +1 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +1 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +0 -11
- optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +32 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/__init__.py +26 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +32 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +32 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +32 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/__init__.py +1 -0
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +14 -6
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +14 -6
- optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +32 -0
- optimum/rbln/modeling.py +572 -0
- optimum/rbln/modeling_alias.py +1 -1
- optimum/rbln/modeling_base.py +164 -758
- optimum/rbln/modeling_diffusers.py +51 -122
- optimum/rbln/transformers/__init__.py +0 -2
- optimum/rbln/transformers/models/auto/auto_factory.py +117 -23
- optimum/rbln/transformers/models/auto/modeling_auto.py +37 -12
- optimum/rbln/transformers/models/bart/modeling_bart.py +3 -6
- optimum/rbln/transformers/models/bert/modeling_bert.py +3 -6
- optimum/rbln/transformers/models/clip/modeling_clip.py +8 -25
- optimum/rbln/transformers/models/decoderonly/__init__.py +0 -3
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +672 -412
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +38 -155
- optimum/rbln/transformers/models/dpt/modeling_dpt.py +1 -1
- optimum/rbln/transformers/models/exaone/exaone_architecture.py +61 -45
- optimum/rbln/transformers/models/exaone/modeling_exaone.py +4 -2
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +33 -104
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +50 -238
- optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +3 -2
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +2 -75
- optimum/rbln/transformers/models/midm/midm_architecture.py +88 -242
- optimum/rbln/transformers/models/midm/modeling_midm.py +6 -6
- optimum/rbln/transformers/models/phi/phi_architecture.py +61 -261
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +1 -46
- optimum/rbln/transformers/models/t5/modeling_t5.py +102 -4
- optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +1 -1
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +1 -1
- optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +3 -35
- optimum/rbln/transformers/utils/rbln_quantization.py +120 -3
- optimum/rbln/utils/decorator_utils.py +10 -6
- optimum/rbln/utils/hub.py +131 -0
- optimum/rbln/utils/import_utils.py +15 -1
- optimum/rbln/utils/model_utils.py +53 -0
- optimum/rbln/utils/runtime_utils.py +1 -1
- optimum/rbln/utils/submodule.py +114 -0
- optimum_rbln-0.1.15.dist-info/METADATA +106 -0
- {optimum_rbln-0.1.13.dist-info → optimum_rbln-0.1.15.dist-info}/RECORD +69 -66
- {optimum_rbln-0.1.13.dist-info → optimum_rbln-0.1.15.dist-info}/WHEEL +1 -1
- optimum/rbln/transformers/generation/streamers.py +0 -139
- optimum/rbln/transformers/generation/utils.py +0 -397
- optimum/rbln/transformers/models/exaone/hf_hub_cached/configuration_exaone.py +0 -181
- optimum/rbln/transformers/models/exaone/hf_hub_cached/modeling_exaone.py +0 -1725
- optimum/rbln/transformers/models/midm/hf_hub_cached/configuration_midm.py +0 -22
- optimum/rbln/transformers/models/midm/hf_hub_cached/midm_bitext_tokenization.py +0 -304
- optimum/rbln/transformers/models/midm/hf_hub_cached/modeling_midm.py +0 -1469
- optimum/rbln/transformers/models/midm/hf_hub_cached/rotary_position_embedding.py +0 -98
- optimum/rbln/utils/context.py +0 -58
- optimum_rbln-0.1.13.dist-info/METADATA +0 -120
- optimum_rbln-0.1.13.dist-info/entry_points.txt +0 -4
- {optimum_rbln-0.1.13.dist-info → optimum_rbln-0.1.15.dist-info}/licenses/LICENSE +0 -0
@@ -20,27 +20,23 @@
 # are the intellectual property of Rebellions Inc. and may not be
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
-import functools
-import glob
 import inspect
-import os
 from dataclasses import dataclass
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 
 import rebel
 import torch
-import transformers
-from safetensors.torch import load_file
 from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel
 from transformers.modeling_utils import no_init_weights
 from transformers.utils import ModelOutput
 
-from ....
+from ....modeling import RBLNModel
 from ....modeling_config import DEFAULT_COMPILED_MODEL_NAME, RBLNCompileConfig, RBLNConfig
 from ....utils.logging import get_logger
 from ....utils.runtime_utils import RBLNPytorchRuntime
 from ....utils.timer_utils import rbln_timer
+from ...utils.rbln_quantization import QuantizationManager
 from .decoderonly_architecture import DecoderOnlyWrapper
 
 
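Judging by its imports, the hunk above is `modeling_decoderonly.py`: it swaps the truncated `from ....` import for the new top-level `optimum/rbln/modeling.py` module introduced in this release (see `modeling.py +572` and `modeling_base.py -758` in the file list). A minimal sketch of the same import in absolute form, assuming the module is importable at that path:

```python
# Hedged sketch: absolute form of the relative import added above.
# Assumes optimum/rbln/modeling.py (new in 0.1.15 per the file list) now exposes RBLNModel,
# most of which previously lived in modeling_base.py.
from optimum.rbln.modeling import RBLNModel
```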
@@ -54,12 +50,6 @@ if TYPE_CHECKING:
         PretrainedConfig,
     )
 
-SUPPORTED_QUANTIZATIONS = {
-    "rbln": [
-        "w4a16",
-    ],
-}
-
 
 class RBLNRuntimeModel(RBLNPytorchRuntime):
     mandatory_members = ["main_input_name", "embed_tokens"]
@@ -127,24 +117,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
     main_input_name = "input_ids"
     auto_model_class = AutoModelForCausalLM
     _decoder_wrapper_cls = DecoderOnlyWrapper
-
-
-    @classmethod
-    @property
-    def original_cls(cls):
-        """
-        Lazily loads and caches the corresponding Hugging Face model class.
-        Removes 'RBLN' prefix from the class name to get the original class name
-        (e.g., RBLNLlamaForCausalLM -> LlamaForCausalLM) and imports it from
-        the transformers module.
-
-        Returns:
-            type: The original Hugging Face model class
-        """
-        if cls._original_cls is None:
-            hf_original_cls_name = cls.__name__[4:]
-            cls._original_cls = getattr(transformers, hf_original_cls_name)
-        return cls._original_cls
+    _use_rotary_emb = True
 
     def __post_init__(self, **kwargs):
         self.batch_size = self.rbln_config.model_cfg["batch_size"]
@@ -203,6 +176,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
     def get_quantized_model(
         cls,
         model_id: str,
+        config: Optional[PretrainedConfig] = None,
         use_auth_token: Optional[Union[bool, str]] = None,
         revision: Optional[str] = None,
         force_download: bool = False,
@@ -212,57 +186,28 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         trust_remote_code: bool = False,
         **kwargs,
     ):
-        from ...utils.rbln_quantization import
+        from ...utils.rbln_quantization import prepare_model_for_quantization
 
         kwargs = cls.update_kwargs(kwargs)
 
-        config
-        [… 8 removed lines not captured in this view …]
+        if config is None:
+            config = AutoConfig.from_pretrained(
+                model_id,
+                use_auth_token=use_auth_token,
+                revision=revision,
+                force_download=force_download,
+                cache_dir=cache_dir,
+                trust_remote_code=trust_remote_code,
+                **kwargs,
+            )
 
         with no_init_weights():
             model = AutoModelForCausalLM.from_config(config)
 
-
-
-        n_layer = kwargs.get("num_hidden_layers", None)
-        cls._load_weights_directly_to_model(model, model_id, n_layer)
+        prepare_model_for_quantization(model, model_id, kwargs.get("num_hidden_layers"))
 
         return model
 
-    def _load_weights_directly_to_model(model, model_id, n_layer=None):
-        """
-        Load safetensor file data directly into the model, filtering by layer if n_layer is provided.
-        """
-
-        model_params = dict(model.named_parameters(recurse=True))
-        model_buffers = dict(model.named_buffers(recurse=True))
-        safetensor_files = glob.glob(f"{model_id}/*.safetensors")
-
-        target_layers = list(range(n_layer)) if n_layer is not None else None
-
-        for safetensor_file in safetensor_files:
-            file_data = load_file(safetensor_file)
-            for key, value in file_data.items():
-                if target_layers is not None:
-                    parts = key.split(".")
-
-                    if len(parts) > 2 and parts[2].isdigit() and (int(parts[2]) not in target_layers):
-                        continue
-
-                if key in model_params:
-                    model_params[key].data.copy_(value)
-                elif key in model_buffers:
-                    model_buffers[key].data.copy_(value)
-
-        return 0
-
     def __getattr__(self, __name: str) -> Any:
         """
         Special method to delegate attribute access to the original Huggingface LM class.
@@ -278,7 +223,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         def redirect(func):
             return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
 
-        val = getattr(self.
+        val = getattr(self.hf_class, __name, None) or getattr(PreTrainedModel, __name)
         if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
             return redirect(val)
         return val
@@ -295,54 +240,6 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
 
         return model
 
-    def validate_quantization_config(quantize_config):
-        if quantize_config is not None:
-            q_format = quantize_config.get("format")
-            q_precision = quantize_config.get("precision")
-
-            if q_format not in SUPPORTED_QUANTIZATIONS:
-                raise ValueError(
-                    f"Invalid quantization format: {q_format}. "
-                    f"Supported formats are: {list(SUPPORTED_QUANTIZATIONS.keys())}"
-                )
-
-            if q_precision not in SUPPORTED_QUANTIZATIONS[q_format]:
-                raise ValueError(
-                    f"Invalid precision: {q_precision} for format: {q_format}. "
-                    f"Supported precisions are: {SUPPORTED_QUANTIZATIONS[q_format]}"
-                )
-
-        return quantize_config
-
-    @classmethod
-    def set_quantize_env(cls, quantize_config):
-        RBLN_QUANT_BITS_ENV = "RBLN_QUANT_BITS"
-        quantize_config = cls.validate_quantization_config(quantize_config)
-        if quantize_config is not None:
-            q_precision = quantize_config.get("precision")
-            quant_bits = q_precision.split("w")[1].split("a")[0]
-            os.environ[RBLN_QUANT_BITS_ENV] = quant_bits
-            return RBLN_QUANT_BITS_ENV
-        return None
-
-    @classmethod
-    def reset_quantize_env(cls, env_var_name):
-        if env_var_name is not None and env_var_name in os.environ:
-            del os.environ[env_var_name]
-
-    @classmethod
-    def manage_quantize_env(cls, func):
-        @functools.wraps(func)
-        def wrapper(*args, **kwargs):
-            quantize_config = kwargs.get("quantize_config")
-            quantize_env_var = cls.set_quantize_env(quantize_config)
-            try:
-                return func(*args, **kwargs)
-            finally:
-                cls.reset_quantize_env(quantize_env_var)
-
-        return wrapper
-
     @classmethod
     def wrap_model_if_needed(cls, model: "PreTrainedModel", rbln_config: "RBLNConfig"):
         wrapper_cfg = {"max_seq_len": rbln_config.model_cfg["max_seq_len"]}
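The hunk above removes the per-class quantization helpers (`validate_quantization_config`, `set_quantize_env`, `reset_quantize_env`, `manage_quantize_env`); later hunks delegate the same work to `QuantizationManager` in `utils/rbln_quantization.py` (`+120` in the file list). Below is a self-contained sketch of the environment-scoping pattern the removed helpers implemented and which `QuantizationManager.with_quantization_env` presumably centralizes; everything except the `RBLN_QUANT_BITS` variable name and the `w4a16` precision string (both taken from the removed code) is illustrative, not the library's API.

```python
import functools
import os


def with_quant_bits_env(func):
    """Set RBLN_QUANT_BITS around a call, then restore the environment (illustrative only)."""

    @functools.wraps(func)
    def wrapper(*args, quantize_config=None, **kwargs):
        env_set = False
        if quantize_config is not None:
            # "w4a16" -> weight bit width "4", mirroring the removed set_quantize_env()
            quant_bits = quantize_config["precision"].split("w")[1].split("a")[0]
            os.environ["RBLN_QUANT_BITS"] = quant_bits
            env_set = True
        try:
            return func(*args, **kwargs)
        finally:
            if env_set:
                os.environ.pop("RBLN_QUANT_BITS", None)

    return wrapper


@with_quant_bits_env
def compile_model(**kwargs):
    return os.environ.get("RBLN_QUANT_BITS")  # visible only while compiling


print(compile_model(quantize_config={"format": "rbln", "precision": "w4a16"}))  # prints "4"
```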
@@ -351,6 +248,8 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         if "kvcache_partition_len" in inspect.signature(cls._decoder_wrapper_cls.__init__).parameters:
             wrapper_cfg["kvcache_partition_len"] = rbln_config.model_cfg.get("kvcache_partition_len")
 
+        wrapper_cfg["use_rotary_emb"] = cls._use_rotary_emb
+
         return cls._decoder_wrapper_cls(model, **wrapper_cfg).eval()
 
     @classmethod
@@ -369,9 +268,11 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         prefill_example_inputs = prefill_rbln_compile_config.get_dummy_inputs(fill=0)
         dec_example_inputs = dec_rbln_compile_config.get_dummy_inputs(fill=0)
 
+        wrapped_model.phase = "prefill"
         prefill_scripted_model = torch.jit.trace(
             wrapped_model, prefill_example_inputs, check_trace=False, _store_inputs=False
         )
+        wrapped_model.phase = "decode"
         dec_scripted_model = torch.jit.trace(
             wrapped_model, dec_example_inputs, check_trace=False, _store_inputs=False
         )
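The two added `wrapped_model.phase = ...` lines let a single wrapper module produce two specialized TorchScript graphs: Python control flow is resolved at trace time, so each `torch.jit.trace` call bakes in whichever branch the flag selects. A minimal, generic sketch of that pattern; `ToyWrapper` is hypothetical, only the flag-then-trace flow mirrors the diff.

```python
import torch


class ToyWrapper(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.phase = "prefill"
        self.proj = torch.nn.Linear(8, 8)

    def forward(self, x):
        if self.phase == "prefill":
            return self.proj(x)           # whole-prompt path
        return self.proj(x[:, -1:, :])    # single-step decode path


wrapper = ToyWrapper().eval()
example = torch.zeros(1, 4, 8)

wrapper.phase = "prefill"
prefill_graph = torch.jit.trace(wrapper, example, check_trace=False)
wrapper.phase = "decode"
decode_graph = torch.jit.trace(wrapper, example, check_trace=False)
```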
@@ -394,6 +295,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         prefill_ir, dec_ir = scripted_model_to_ir()
         # Caching prefill_decoder/decoder I/O
         cache_index_offset = 5
+
         connections = [
             (prefill_ir.outputs[1 + i], prefill_ir.inputs[cache_index_offset + i])
             for i in range(model.config.num_hidden_layers * 2)
@@ -402,7 +304,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         # Extract quantize_config from rbln_config
         quantize_config = rbln_config.model_cfg.get("quantization", None)
 
-        @
+        @QuantizationManager.with_quantization_env
         def compile_model(*args, **kwargs):
             # Remove quantize_config from kwargs
             kwargs.pop("quantize_config", None)
@@ -432,10 +334,8 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
     ) -> RBLNConfig:
         rbln_max_seq_len = rbln_kwargs.get("max_seq_len", None)
         rbln_batch_size = rbln_kwargs.get("batch_size", None)
-        rbln_quantization = rbln_kwargs.get("quantization", None)
         rbln_use_inputs_embeds = rbln_kwargs.get("use_inputs_embeds", None)
-
-        rbln_quantization = cls.validate_quantization_config(rbln_quantization)
+        rbln_quantization = QuantizationManager.validate_quantization_config(rbln_kwargs.get("quantization", None))
 
         prefill_chunk_size = 128
         if rbln_max_seq_len is None:
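With validation now routed through `QuantizationManager`, the `quantization` entry still arrives via the `rbln_`-prefixed keyword arguments that `_get_rbln_config` reads (`max_seq_len`, `batch_size`, `use_inputs_embeds`, `quantization`). A hedged usage sketch: the model class, checkpoint id, and `export=True` flag are assumptions carried over from earlier optimum-rbln releases, and the format/precision pair is the one combination the removed `SUPPORTED_QUANTIZATIONS` table listed.

```python
from optimum.rbln import RBLNLlamaForCausalLM  # assumed export, as in earlier releases

model = RBLNLlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",  # illustrative checkpoint
    export=True,                 # compile for the RBLN NPU at load time (assumed flag)
    rbln_quantization={"format": "rbln", "precision": "w4a16"},
    rbln_max_seq_len=4096,
    rbln_batch_size=1,
)
```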
@@ -610,8 +510,6 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         cache_position: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.LongTensor] = None,
         generate_idx: Optional[torch.Tensor] = None,
-        # from llava_next forward args
-        batch_idx: Optional[int] = None,
         **kwargs,
     ) -> Tuple[torch.FloatTensor]:
         # prefll
@@ -633,7 +531,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
                     input_ids=input_tensor if inputs_embeds is None else None,
                     inputs_embeds=input_tensor if inputs_embeds is not None else None,
                     cache_position=cache_position,
-                    batch_idx=b_idx
+                    batch_idx=b_idx,
                 )
                 logits.append(logit)
             logits = torch.cat(logits, dim=0)
@@ -734,11 +632,24 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         cache_position: torch.Tensor = None,
     ) -> torch.FloatTensor:
         input_tensors = inputs_embeds if inputs_embeds is not None else input_ids
+        if input_tensors is None:
+            raise ValueError("Either `input_ids` or `inputs_embeds` must be provided.")
 
         batch_size = input_tensors.shape[0]
+        if batch_size != self.batch_size:
+            raise RuntimeError(
+                f"Batch size mismatch: got {batch_size}, expected {self.batch_size} (compiled batch size)."
+            )
+
+        if batch_size != cache_position.shape[0]:
+            raise RuntimeError(f"Cache position size mismatch: got {cache_position.shape[0]}, expected {batch_size}.")
 
         for b_idx in range(batch_size):
             decoding_step = cache_position[b_idx].item()
+            if not (0 <= decoding_step < self.dec_attn_mask.shape[-1]):
+                raise ValueError(
+                    f"Decoding step {decoding_step} out of bounds for attention mask with shape {self.dec_attn_mask.shape}."
+                )
             self.dec_attn_mask[b_idx, :, :, decoding_step] = 1
 
         logits, _ = self.decoder(
@@ -751,31 +662,3 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         )
 
         return logits
-
-    def vllm_forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        inputs_embeds: torch.Tensor = None,
-        cache_position: torch.Tensor = None,
-        batch_idx: Optional[int] = None,
-        **kwargs,
-    ) -> Tuple[torch.FloatTensor]:
-        # prefll
-        if cache_position.shape[-1] > 1:
-            logits = self._forward_prefill(
-                input_ids=input_ids,
-                inputs_embeds=inputs_embeds,
-                cache_position=cache_position,
-                batch_idx=batch_idx,
-            )
-        # decoder
-        else:
-            logits = self._forward_decoder(
-                input_ids=input_ids,
-                inputs_embeds=inputs_embeds,
-                cache_position=cache_position,
-            )
-
-        return RBLNDecoderOnlyOutput(
-            logits=logits,
-        )
@@ -27,7 +27,7 @@ from typing import TYPE_CHECKING, Any, Dict, Iterable, Optional, Union
 from transformers import AutoModelForDepthEstimation
 from transformers.modeling_outputs import DepthEstimatorOutput
 
-from ....
+from ....modeling import RBLNModel
 from ....modeling_config import RBLNCompileConfig, RBLNConfig
 
 
@@ -20,62 +20,78 @@
 # are the intellectual property of Rebellions Inc. and may not be
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
-import
+from typing import TYPE_CHECKING
+
+import torch.nn as nn
 
 from ....utils import logging
-from ...models.decoderonly import (
+from ...models.decoderonly.decoderonly_architecture import (
     DecoderOnlyAttention,
-
+    DecoderOnlyFlashAttention,
+    DecoderOnlyForCausalLM,
+    DecoderOnlyLayer,
     DecoderOnlyModel,
     DecoderOnlyWrapper,
-    RotaryEmbedding,
 )
 
 
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel as ExaoneForCausalLM
+
 logger = logging.get_logger(__name__)
 
 
 class ExaoneForCausalLMWrapper(DecoderOnlyWrapper):
     """A wrapper class for the Exaone model with a language modeling head."""
 
-    def
-    [… 28 removed lines not captured in this view …]
-    return
-
-    def
-    [… 9 removed lines not captured in this view …]
+    def convert_to_rbln_causal_lm(self, causal_lm: "ExaoneForCausalLM"):
+        new_layers = []
+        for layer in causal_lm.transformer.h:
+            if self.attn_impl == "eager":
+                new_self_attn = ExaoneAttention(layer.attn.attention)
+            elif self.attn_impl == "flash_attn":
+                new_self_attn = ExaoneFlashAttention(
+                    layer.attn.attention, kvcache_partition_len=self.kvcache_partition_len
+                )
+            else:
+                raise NotImplementedError(f"Unknwon attn : {self.attn_impl}")
+
+            new_layer = ExaoneLayer(layer, new_self_attn)
+            new_layers.append(new_layer)
+        new_model = ExaoneModel(causal_lm.transformer, new_layers)
+        new_causal_lm = DecoderOnlyForCausalLM(causal_lm, new_model)
+        return new_causal_lm
+
+
+class ExaoneModel(DecoderOnlyModel):
+    def get_embedding(self) -> nn.Embedding:
+        return self._original_mod.wte
+
+    def get_last_layernorm(self) -> nn.LayerNorm:
+        return self._original_mod.ln_f
+
+
+class ExaoneLayer(DecoderOnlyLayer):
+    def get_pre_attention_layernorm(self) -> nn.LayerNorm:
+        return self._original_mod.ln_1
+
+    def get_post_attention_layernorm(self) -> nn.LayerNorm:
+        return self._original_mod.ln_2
+
+
+class ExaoneAttention(DecoderOnlyAttention):
+    def __post_init__(self):
+        self.q_proj = self._original_mod.q_proj
+        self.k_proj = self._original_mod.k_proj
+        self.v_proj = self._original_mod.v_proj
+        self.o_proj = self._original_mod.out_proj
+        self.num_key_value_heads = self._original_mod.num_key_value_heads
+
+
+class ExaoneFlashAttention(DecoderOnlyFlashAttention):
+    def __post_init__(self):
+        self.q_proj = self._original_mod.q_proj
+        self.k_proj = self._original_mod.k_proj
+        self.v_proj = self._original_mod.v_proj
+        self.o_proj = self._original_mod.out_proj
+        self.num_key_value_heads = self._original_mod.num_key_value_heads
@@ -21,10 +21,12 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
 
+
+from transformers import AutoModelForCausalLM
+
 from ....utils import logging
 from ..decoderonly import RBLNDecoderOnlyModelForCausalLM
 from .exaone_architecture import ExaoneForCausalLMWrapper
-from .hf_hub_cached.modeling_exaone import ExaoneForCausalLM
 
 
 logger = logging.get_logger(__name__)
@@ -45,7 +47,7 @@ class RBLNExaoneForCausalLM(RBLNDecoderOnlyModelForCausalLM):
     """
 
     _decoder_wrapper_cls = ExaoneForCausalLMWrapper
-
+    _hf_class = AutoModelForCausalLM
 
     @classmethod
     def from_pretrained(cls, *args, **kwargs):
@@ -21,113 +21,42 @@
 # copied, modified, or distributed without prior written permission
 # from Rebellions Inc.
 
-from typing import
-
-import
-[… 5 removed lines not captured in this view …]
-    DecoderOnlyDecoderLayer,
+from typing import TYPE_CHECKING
+
+from ...models.decoderonly.decoderonly_architecture import (
+    DecoderOnlyAttention,
+    DecoderOnlyFlashAttention,
+    DecoderOnlyForCausalLM,
+    DecoderOnlyLayer,
+    DecoderOnlyModel,
     DecoderOnlyWrapper,
-    slice_and_unsqueeze_cos_sin,
 )
-from ...models.decoderonly.decoderonly_architecture import DECODERONLY_ATTENTION_CLASSES
-
-
-class GemmaWrapper(DecoderOnlyWrapper):
-    def get_forward_dict(self):
-        forward_dict = {}
-        forward_dict.update(
-            {
-                "wrapper": GemmaModel.forward,
-                "model": DecoderOnlyDecoderLayer.forward,
-                "decoder_layer": DECODERONLY_ATTENTION_CLASSES[self.attn_implementation].forward,
-            }
-        )
-        return forward_dict
-
-
-class GemmaModel:
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        batch_ids: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = True,
-        output_attentions: Optional[bool] = False,
-        output_hidden_states: Optional[bool] = False,
-        cache_pos_for_partitions: Optional[torch.Tensor] = None,
-        kvcache_partition_size: Optional[torch.Tensor] = None,
-        forward_dict: Optional[Dict[str, classmethod]] = None,
-        rotary_pos_emb=None,
-    ) -> Union[Tuple, BaseModelOutputWithPast]:
-        # retrieve input_ids and inputs_embeds
-        if (input_ids is None) ^ (inputs_embeds is not None):
-            raise ValueError(
-                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
-            )
-
-        # embed positions
-        inputs_embeds = self.embed_tokens(input_ids)
-        hidden_states = inputs_embeds
 
-        ##### GEMMA change from llama#####
-        hidden_states = hidden_states * (self.config.hidden_size**0.5)
 
-
+if TYPE_CHECKING:
+    from transformers import GemmaForCausalLM
 
-        # get cos,sin vector
-        cos, sin = rotary_pos_emb(inputs_embeds, attention_mask.shape[-1])
-        cos, sin = slice_and_unsqueeze_cos_sin(cos, sin, position_ids)
 
-[… 24 removed lines not captured in this view …]
-            hidden_states = layer_outputs[0]
-
-            updated_cache = layer_outputs[2 if output_attentions else 1]
-
-            if output_attentions:
-                all_self_attns += (layer_outputs[1],)
-
-        hidden_states = self.norm(hidden_states)
-
-        # add hidden states from the last decoder layer
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-
-        # convert RebelDynamicCache to legacy Tuple[Tuple[torch.Tensor]]
-        next_cache = updated_cache.to_legacy_cache()
-
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            past_key_values=next_cache,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attns,
-        )
+class GemmaWrapper(DecoderOnlyWrapper):
+    def convert_to_rbln_causal_lm(self, causal_lm: "GemmaForCausalLM"):
+        new_layers = []
+        for layer in causal_lm.model.layers:
+            if self.attn_impl == "eager":
+                new_self_attn = DecoderOnlyAttention(layer.self_attn)
+            elif self.attn_impl == "flash_attn":
+                new_self_attn = DecoderOnlyFlashAttention(
+                    layer.self_attn, kvcache_partition_len=self.kvcache_partition_len
+                )
+            else:
+                raise NotImplementedError(f"Unknwon attn : {self.attn_impl}")
+            new_layer = DecoderOnlyLayer(layer, new_self_attn)
+            new_layers.append(new_layer)
+        new_model = GemmaModel(causal_lm.model, new_layers)
+        new_causal_lm = DecoderOnlyForCausalLM(causal_lm, new_model)
+        return new_causal_lm
+
+
+class GemmaModel(DecoderOnlyModel):
+    @property
+    def hidden_multiplier(self):
+        return self._original_mod.config.hidden_size**0.5