optimum-rbln 0.8.2a4__py3-none-any.whl → 0.9.3rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +96 -9
- optimum/rbln/__version__.py +16 -3
- optimum/rbln/cli.py +660 -0
- optimum/rbln/configuration_utils.py +153 -42
- optimum/rbln/diffusers/__init__.py +7 -0
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +3 -3
- optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_cosmos.py +1 -1
- optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +3 -3
- optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +4 -4
- optimum/rbln/diffusers/configurations/models/configuration_transformer_cosmos.py +9 -4
- optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +9 -4
- optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +3 -3
- optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +3 -3
- optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +35 -19
- optimum/rbln/diffusers/configurations/pipelines/configuration_cosmos.py +14 -11
- optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +30 -20
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +13 -9
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +17 -13
- optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +17 -10
- optimum/rbln/diffusers/modeling_diffusers.py +30 -14
- optimum/rbln/diffusers/models/__init__.py +3 -13
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +31 -3
- optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +28 -3
- optimum/rbln/diffusers/models/autoencoders/vq_model.py +31 -3
- optimum/rbln/diffusers/models/transformers/prior_transformer.py +1 -1
- optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +9 -1
- optimum/rbln/diffusers/models/transformers/transformer_sd3.py +9 -1
- optimum/rbln/diffusers/models/unets/unet_2d_condition.py +6 -3
- optimum/rbln/diffusers/pipelines/__init__.py +11 -5
- optimum/rbln/diffusers/pipelines/auto_pipeline.py +307 -0
- optimum/rbln/diffusers/pipelines/cosmos/configuration_cosmos_guardrail.py +19 -16
- optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +14 -18
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +31 -1
- optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +31 -1
- optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +1 -6
- optimum/rbln/modeling.py +71 -19
- optimum/rbln/modeling_base.py +99 -21
- optimum/rbln/ops/attn.py +158 -0
- optimum/rbln/ops/flash_attn.py +166 -0
- optimum/rbln/ops/kv_cache_update.py +5 -0
- optimum/rbln/ops/linear.py +7 -0
- optimum/rbln/transformers/__init__.py +92 -0
- optimum/rbln/transformers/configuration_generic.py +9 -7
- optimum/rbln/transformers/modeling_attention_utils.py +252 -0
- optimum/rbln/transformers/modeling_generic.py +51 -9
- optimum/rbln/transformers/modeling_outputs.py +37 -0
- optimum/rbln/transformers/models/__init__.py +91 -30
- optimum/rbln/transformers/models/auto/__init__.py +2 -0
- optimum/rbln/transformers/models/auto/auto_factory.py +92 -17
- optimum/rbln/transformers/models/auto/modeling_auto.py +45 -0
- optimum/rbln/transformers/models/bart/bart_architecture.py +1 -3
- optimum/rbln/transformers/models/bart/configuration_bart.py +2 -0
- optimum/rbln/transformers/models/bert/bert_architecture.py +16 -0
- optimum/rbln/transformers/models/bert/modeling_bert.py +8 -4
- optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +42 -11
- optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +94 -30
- optimum/rbln/transformers/models/clip/configuration_clip.py +10 -7
- optimum/rbln/transformers/models/clip/modeling_clip.py +27 -4
- optimum/rbln/transformers/models/colpali/colpali_architecture.py +3 -6
- optimum/rbln/transformers/models/colpali/configuration_colpali.py +37 -21
- optimum/rbln/transformers/models/colpali/modeling_colpali.py +113 -96
- optimum/rbln/transformers/models/colqwen2/__init__.py +2 -0
- optimum/rbln/transformers/models/colqwen2/colqwen2_architecture.py +233 -0
- optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +74 -0
- optimum/rbln/transformers/models/colqwen2/modeling_colqwen2.py +446 -0
- optimum/rbln/transformers/models/decoderonly/__init__.py +3 -2
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +109 -37
- optimum/rbln/transformers/models/decoderonly/configuration_lora.py +411 -0
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +318 -309
- optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +504 -0
- optimum/rbln/transformers/models/decoderonly/generation_decoderonly.py +111 -0
- optimum/rbln/transformers/models/decoderonly/lora_architecture.py +204 -0
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +453 -897
- optimum/rbln/transformers/models/depth_anything/__init__.py +16 -0
- optimum/rbln/transformers/models/depth_anything/configuration_depth_anything.py +24 -0
- optimum/rbln/transformers/models/depth_anything/modeling_depth_anything.py +25 -0
- optimum/rbln/transformers/models/exaone/modeling_exaone.py +42 -4
- optimum/rbln/transformers/models/gemma/__init__.py +2 -2
- optimum/rbln/transformers/models/gemma/configuration_gemma.py +9 -1
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -4
- optimum/rbln/transformers/models/gemma/modeling_gemma.py +22 -1
- optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +49 -13
- optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +12 -2
- optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py +245 -0
- optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +201 -349
- optimum/rbln/transformers/models/gpt2/__init__.py +2 -2
- optimum/rbln/transformers/models/gpt2/configuration_gpt2.py +31 -3
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +10 -8
- optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +18 -1
- optimum/rbln/transformers/models/grounding_dino/__init__.py +10 -0
- optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +92 -0
- optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +599 -0
- optimum/rbln/transformers/models/grounding_dino/modeling_grounding_dino.py +1032 -0
- optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +35 -7
- optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +26 -27
- optimum/rbln/transformers/models/llama/__init__.py +2 -2
- optimum/rbln/transformers/models/llama/configuration_llama.py +9 -1
- optimum/rbln/transformers/models/llama/modeling_llama.py +22 -1
- optimum/rbln/transformers/models/llava/__init__.py +16 -0
- optimum/rbln/transformers/models/llava/configuration_llava.py +72 -0
- optimum/rbln/transformers/models/llava/modeling_llava.py +478 -0
- optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +15 -17
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +235 -375
- optimum/rbln/transformers/models/midm/midm_architecture.py +4 -1
- optimum/rbln/transformers/models/midm/modeling_midm.py +42 -4
- optimum/rbln/transformers/models/mistral/__init__.py +2 -2
- optimum/rbln/transformers/models/mistral/configuration_mistral.py +9 -1
- optimum/rbln/transformers/models/mistral/mistral_architecture.py +1 -1
- optimum/rbln/transformers/models/mistral/modeling_mistral.py +26 -3
- optimum/rbln/transformers/models/opt/__init__.py +2 -2
- optimum/rbln/transformers/models/opt/configuration_opt.py +8 -1
- optimum/rbln/transformers/models/opt/modeling_opt.py +28 -16
- optimum/rbln/transformers/models/opt/opt_architecture.py +4 -4
- optimum/rbln/transformers/models/pegasus/__init__.py +17 -0
- optimum/rbln/transformers/models/pegasus/configuration_pegasus.py +38 -0
- optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +71 -0
- optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +161 -0
- optimum/rbln/transformers/models/phi/__init__.py +2 -2
- optimum/rbln/transformers/models/phi/configuration_phi.py +9 -1
- optimum/rbln/transformers/models/phi/modeling_phi.py +10 -1
- optimum/rbln/transformers/models/phi/phi_architecture.py +11 -7
- optimum/rbln/transformers/models/pixtral/__init__.py +16 -0
- optimum/rbln/transformers/models/pixtral/configuration_pixtral.py +43 -0
- optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +310 -0
- optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +73 -0
- optimum/rbln/transformers/models/qwen2/__init__.py +2 -2
- optimum/rbln/transformers/models/qwen2/configuration_qwen2.py +9 -1
- optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +27 -1
- optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +21 -6
- optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +15 -21
- optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +28 -7
- optimum/rbln/transformers/models/qwen2_vl/__init__.py +19 -0
- optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +88 -0
- optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +514 -0
- optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +165 -0
- optimum/rbln/transformers/models/qwen3/configuration_qwen3.py +2 -2
- optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +86 -330
- optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +1 -245
- optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +20 -13
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +24 -3
- optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +2 -2
- optimum/rbln/transformers/models/siglip/__init__.py +2 -6
- optimum/rbln/transformers/models/siglip/configuration_siglip.py +1 -1
- optimum/rbln/transformers/models/siglip/modeling_siglip.py +5 -16
- optimum/rbln/transformers/models/swin/__init__.py +16 -0
- optimum/rbln/transformers/models/swin/configuration_swin.py +42 -0
- optimum/rbln/transformers/models/swin/modeling_swin.py +341 -0
- optimum/rbln/transformers/models/t5/configuration_t5.py +2 -0
- optimum/rbln/transformers/models/t5/t5_architecture.py +8 -1
- optimum/rbln/transformers/models/time_series_transformer/configuration_time_series_transformer.py +3 -3
- optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -14
- optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py +7 -1
- optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +1 -0
- optimum/rbln/transformers/models/whisper/configuration_whisper.py +12 -13
- optimum/rbln/transformers/models/whisper/generation_whisper.py +28 -6
- optimum/rbln/transformers/models/whisper/modeling_whisper.py +28 -3
- optimum/rbln/transformers/models/xlm_roberta/__init__.py +2 -8
- optimum/rbln/transformers/utils/rbln_quantization.py +391 -75
- optimum/rbln/transformers/utils/rbln_runtime_wrapper.py +79 -0
- optimum/rbln/utils/depreacate_utils.py +16 -0
- optimum/rbln/utils/runtime_utils.py +28 -18
- optimum/rbln/utils/submodule.py +31 -9
- {optimum_rbln-0.8.2a4.dist-info → optimum_rbln-0.9.3rc0.dist-info}/METADATA +8 -7
- {optimum_rbln-0.8.2a4.dist-info → optimum_rbln-0.9.3rc0.dist-info}/RECORD +167 -125
- optimum_rbln-0.9.3rc0.dist-info/entry_points.txt +2 -0
- {optimum_rbln-0.8.2a4.dist-info → optimum_rbln-0.9.3rc0.dist-info}/WHEEL +0 -0
- {optimum_rbln-0.8.2a4.dist-info → optimum_rbln-0.9.3rc0.dist-info}/licenses/LICENSE +0 -0
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import math
-from typing import List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union
 
 import torch
 from torch import nn
@@ -21,106 +21,16 @@ from transformers import PretrainedConfig, PreTrainedModel
 
 from ....utils import logging
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
-from .
+from ...utils.rbln_quantization import RBLNQuantizationConfig
+from .configuration_lora import RBLNLoRAConfig
+from .lora_architecture import LoRALinear
 
 
-
-
-DEFAULT_FLASH_ATTN_PARTITION_LENGTH = 16_384
-DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH = 32_768
-MIN_FLASH_ATTN_MAX_SEQ_LEN = 8_192
-MIN_FLASH_ATTN_PARTITION_LENGTH = 4_096
-MAX_FLASH_ATTN_PARTITION_LENGTH = 32_768
-MAX_SLIDING_WINDOW_SIZE = 32_768
-
-
-def set_default_values(
-    attn_impl: Optional[str] = None,
-    kvcache_partition_len: Optional[int] = None,
-    kvcache_block_size: Optional[int] = None,
-    max_seq_len: Optional[int] = None,
-) -> Tuple[str, int, int]:
-    if attn_impl is None:
-        attn_impl = "eager"
-
-    if kvcache_partition_len is not None:
-        if attn_impl == "eager":
-            attn_impl = "flash_attn"
-            logger.warning(
-                "A non-null `kvcache_partition_len` was provided, but `attn_impl` was not explicitly set or "
-                "set to 'eager'. Since KV cache partitioning is only supported with flash attention, "
-                "`attn_impl` has been automatically switched to 'flash_attn'."
-            )
-
-    if kvcache_partition_len is None and attn_impl == "flash_attn":
-        kvcache_partition_len = DEFAULT_FLASH_ATTN_PARTITION_LENGTH
-
-    if kvcache_block_size is None:
-        if attn_impl == "eager":
-            kvcache_block_size = max_seq_len
-        else:
-            kvcache_block_size = kvcache_partition_len
-
-    return attn_impl, kvcache_partition_len, kvcache_block_size
-
-
-def validate_attention_method(attn_impl: str, kvcache_partition_len: int, kvcache_block_size: int, max_seq_len: int):
-    if attn_impl not in ["eager", "flash_attn"]:
-        raise ValueError(f"Unknown `attn_impl` : {attn_impl}. (Available : 'eager', 'flash_attn`)")
-
-    ## Checking Constraints...
-    # Constraint of eager attention:
-    # - `max_seq_len` <= 32k
-
-    # Constraints of flash attention:
-    # 1. `max_seq_len` should be multiple of `partition_len`.
-    # 2. 4k <= `partition_len` <= 32k.
-    # 3. `max_seq_len` should be larger then 8k.
-    if attn_impl == "eager" and max_seq_len > DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH:
-        raise ValueError(
-            f"`max_seq_len` is set to {max_seq_len}, "
-            f"which exceeds the limit of {DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH} for 'eager' attention. "
-            f"Please reduce the `max_seq_len` to {DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH} or lower,"
-            " or consider switching `attn_impl` to 'flash_attn' for larger sequence lengths."
-        )
-
-    if attn_impl == "flash_attn":
-        if max_seq_len // kvcache_partition_len < 2 or max_seq_len % kvcache_partition_len != 0:
-            raise ValueError(
-                f"`max_seq_len` ({max_seq_len}) must be a multiple of `kvcache_partition_len` ({kvcache_partition_len}) "
-                f"when using 'flash_attn'. Please adjust either value to meet this requirement."
-            )
-        elif not (MIN_FLASH_ATTN_PARTITION_LENGTH <= kvcache_partition_len <= MAX_FLASH_ATTN_PARTITION_LENGTH):
-            raise ValueError(
-                f"`kvcache_partition_len` ({kvcache_partition_len}) is out of the supported range for 'flash_attn' "
-                f"({MIN_FLASH_ATTN_PARTITION_LENGTH} <= `kvcache_partition_len` <= {MAX_FLASH_ATTN_PARTITION_LENGTH}). "
-                f"Please provide a valid value within this range."
-            )
-        elif max_seq_len < MIN_FLASH_ATTN_MAX_SEQ_LEN:
-            raise ValueError(
-                f"`max_seq_len` ({max_seq_len}) is too small for 'flash_attn'. The minimum "
-                f"supported value is {MIN_FLASH_ATTN_MAX_SEQ_LEN}. Please increase `max_seq_len` to meet "
-                "this requirement, or consider switching `attn_impl` to 'eager' for shorter lengths."
-            )
-
-    if kvcache_block_size is not None:
-        if attn_impl == "flash_attn" and kvcache_partition_len != kvcache_block_size:
-            raise ValueError(
-                f" When using 'flash attention', the `kvcache_block_size` ({kvcache_block_size}) "
-                f"must always be set equal to the `kvcache_partition_len` {kvcache_partition_len}."
-            )
-        elif attn_impl == "eager" and kvcache_block_size != max_seq_len:
-            raise ValueError(
-                f" When using 'eager attention', the `kvcache_block_size` ({kvcache_block_size}) "
-                f"must always be set equal to the `max_seq_len` {max_seq_len}."
-            )
+if TYPE_CHECKING:
+    from .configuration_decoderonly import RBLNDecoderOnlyModelConfig
 
 
-
-    if sliding_window > MAX_SLIDING_WINDOW_SIZE - prefill_chunk_size:
-        raise ValueError(
-            f"Sliding window size ({sliding_window}) must be less than 32768 - prefill_chunk_size ({32768 - prefill_chunk_size})"
-        )
+logger = logging.get_logger(__name__)
 
 
 class DecoderOnlyWrapper(nn.Module):
@@ -137,40 +47,22 @@ class DecoderOnlyWrapper(nn.Module):
     - Wrapper should not contain neural network graph operations (including memory view handling)
 
     Args:
-
-
+        model (PreTrainedModel): The Huggingface causal language model to wrap
+        rbln_config: The RBLN model configuration containing all necessary parameters
         use_rotary_emb (bool): Whether to use rotary position embeddings
-        attn_impl (str): The attention implementation to use.
-            - "eager": Uses the standard attention.
-            - "flash_attn": Uses flash attention. When set,
-                the key/value cache is partitioned into chunks of length
-                `kvcache_partition_len`.
-        kvcache_partition_len (Optional[int]): Length of KV cache partitions for flash attention.
-            This is only relevant if `attn_impl` is set to "flash_attn`
     """
 
     _use_learned_pos_emb = False
 
-    def __init__(
-        self,
-        causal_lm: PreTrainedModel,
-        max_seq_len: int,
-        use_rotary_emb: bool,
-        attn_impl: str,
-        cache_impl: CacheImplType,
-        use_inputs_embeds: bool,
-        use_attention_mask: bool,
-        use_position_ids: bool,
-        kvcache_partition_len: Optional[int] = None,
-        kvcache_block_size: Optional[int] = None,
-        sliding_window: Optional[int] = None,
-        sliding_window_layers: Optional[List[int]] = None,
-    ):
+    def __init__(self, model: PreTrainedModel, rbln_config: "RBLNDecoderOnlyModelConfig", use_rotary_emb: bool):
         super().__init__()
-        self.
+        self.quantization = rbln_config.quantization
+        self.config = model.config
+        self.is_causal_lm = getattr(model, "lm_head", None) is not None
+        self.rbln_config = rbln_config
 
         if use_rotary_emb:
-            rotary_embs = self.get_rotary_emb(max_seq_len=max_seq_len)
+            rotary_embs = self.get_rotary_emb(max_seq_len=rbln_config.max_seq_len)
             if isinstance(rotary_embs, tuple):
                 self.rotary_emb_global, self.rotary_emb_local = rotary_embs
             else:
@@ -178,43 +70,27 @@ class DecoderOnlyWrapper(nn.Module):
         else:
             self.rotary_emb = None
 
-
-        self.kvcache_block_size = kvcache_block_size
-        self.use_attention_mask = use_attention_mask
-        self.use_position_ids = use_position_ids
-        self.use_inputs_embeds = use_inputs_embeds
-        self.sliding_window_layers = sliding_window_layers
-        self.cache_impl = cache_impl
-        self.sliding_window = sliding_window
-
-        if self.attn_impl == "flash_attn":
-            self.kvcache_partition_len = kvcache_partition_len or DEFAULT_FLASH_ATTN_PARTITION_LENGTH
-        elif self.attn_impl == "eager":
-            self.kvcache_partition_len = None
-        else:
-            raise ValueError(f"Unknown attn_impl : {self.attn_impl}")
-
-        if kvcache_partition_len and kvcache_partition_len > max_seq_len:
+        if rbln_config.kvcache_partition_len and rbln_config.kvcache_partition_len > rbln_config.max_seq_len:
             raise ValueError(
-                f"kvcache_partition_len({kvcache_partition_len}) should be lower"
-                f" or equal to max_seq_len({max_seq_len})!"
+                f"kvcache_partition_len({rbln_config.kvcache_partition_len}) should be lower"
+                f" or equal to max_seq_len({rbln_config.max_seq_len})!"
             )
 
-        self.
+        self.model = self.convert_to_rbln_class(model, rbln_config.max_seq_len)
         self.num_hidden_layers = getattr(self.config, "num_hidden_layers", None) or getattr(self.config, "n_layer")
         self._phase = "prefill"
 
     def get_rotary_emb(self, max_seq_len):
         return RotaryEmbedding(config=self.config, max_seq_len_cached=max_seq_len)
 
-    def get_decoder_layers(self,
-        return
+    def get_decoder_layers(self, model: PreTrainedModel):
+        return model.model.layers if self.is_causal_lm else model.layers
 
     def get_attn_layer(self, layer: nn.Module):
         return layer.self_attn
 
-    def get_model_layer(self,
-        return
+    def get_model_layer(self, model: PreTrainedModel):
+        return model.model if self.is_causal_lm else model
 
     def get_rbln_attn_class(self):
         return DecoderOnlyAttention
@@ -228,35 +104,28 @@ class DecoderOnlyWrapper(nn.Module):
     def get_rbln_causal_lm_class(self):
         return DecoderOnlyForCausalLM
 
-    def
+    def convert_to_rbln_class(self, model: PreTrainedModel, max_seq_len: int):
         new_layers = []
-        for layer_idx, layer in enumerate(self.get_decoder_layers(
-            is_sliding = layer_idx in self.sliding_window_layers
+        for layer_idx, layer in enumerate(self.get_decoder_layers(model)):
+            is_sliding = layer_idx in self.rbln_config.sliding_window_layers
             new_self_attn = self.get_rbln_attn_class()(
-                self.get_attn_layer(layer),
-                self.use_attention_mask if not is_sliding else True,
-                self.use_position_ids,
-                kvcache_block_size=self.sliding_window
-                if layer_idx in self.sliding_window_layers
-                else self.kvcache_block_size,
-                is_sliding=is_sliding,
-                attn_impl=self.attn_impl if not is_sliding else "eager",
-                kvcache_partition_len=self.kvcache_partition_len,
+                self.get_attn_layer(layer), self.rbln_config, is_sliding=is_sliding
             )
-            new_layer = self.get_rbln_layer_class()(layer, new_self_attn)
+            new_layer = self.get_rbln_layer_class()(layer, new_self_attn, lora_config=self.rbln_config.lora_config)
            new_layers.append(new_layer)
 
         new_model = self.get_rbln_model_class()(
-            self.get_model_layer(
+            self.get_model_layer(model),
             new_layers,
-
-            max_seq_len=max_seq_len,
-            kvcache_block_size=self.kvcache_block_size,
+            self.rbln_config,
             use_learned_pos_emb=self.__class__._use_learned_pos_emb,
-            sliding_window_layers=self.sliding_window_layers,
         )
-
-
+
+        if self.is_causal_lm:
+            new_model = self.get_rbln_causal_lm_class()(model, new_model)
+            return new_model
+        else:
+            return new_model
 
     @property
     def phase(self) -> str:
@@ -265,18 +134,24 @@ class DecoderOnlyWrapper(nn.Module):
     @phase.setter
     def phase(self, phase: str):
         self._phase = phase
-        self.
+        self.model.phase = phase
 
     def prepare_forward_args(self, *args):
         args = list(args)
-        input_ids = None if self.use_inputs_embeds else args.pop(0)
-        inputs_embeds = args.pop(0) if self.use_inputs_embeds else None
+        input_ids = None if self.rbln_config.use_inputs_embeds else args.pop(0)
+        inputs_embeds = args.pop(0) if self.rbln_config.use_inputs_embeds else None
         cache_position = args.pop(0)
-        global_block_tables = args.pop(0) if self.
-        local_block_tables = args.pop(0) if self.
-        query_position =
-
-
+        global_block_tables = args.pop(0) if self.rbln_config.use_global_attention else None
+        local_block_tables = args.pop(0) if self.rbln_config.use_local_attention else None
+        query_position = (
+            args.pop(0)
+            # query_position usage: 1. causal_lm prefill or 2. sliding_window cache_position
+            if ("prefill" in self.phase and (self.is_causal_lm or self.rbln_config.use_local_attention))
+            else None
+        )
+        attention_mask = args.pop(0) if self.rbln_config.use_attention_mask else None
+        position_ids = args.pop(0) if self.rbln_config.use_position_ids else None
+        lora_int_id = args.pop(0) if self.rbln_config.lora_config else None
         past_key_values = args
 
         if len(past_key_values) != 2 * self.num_hidden_layers:
@@ -308,6 +183,7 @@ class DecoderOnlyWrapper(nn.Module):
             query_position,
             attention_mask,
             position_ids,
+            lora_int_id,
             past_key_values,
             rotary_emb,
         )
@@ -322,11 +198,12 @@ class DecoderOnlyWrapper(nn.Module):
             query_position,
             attention_mask,
             position_ids,
+            lora_int_id,
             past_key_values,
             rotary_emb,
         ) = self.prepare_forward_args(*args)
 
-        logit = self.
+        logit = self.model(
             input_ids=input_ids,
             inputs_embeds=inputs_embeds,
             attention_mask=attention_mask,
@@ -337,6 +214,7 @@ class DecoderOnlyWrapper(nn.Module):
             rotary_emb=rotary_emb,
             global_block_tables=global_block_tables,
             local_block_tables=local_block_tables,
+            lora_int_id=lora_int_id,
         )
 
         return logit
@@ -393,6 +271,7 @@ class DecoderOnlyForCausalLM(nn.Module):
         rotary_emb: nn.Module = None,
         global_block_tables: Optional[torch.Tensor] = None,
         local_block_tables: Optional[torch.Tensor] = None,
+        lora_int_id: Optional[torch.Tensor] = None,
     ):
         # outputs
         hidden_states = self.model(
@@ -406,6 +285,7 @@ class DecoderOnlyForCausalLM(nn.Module):
             rotary_emb=rotary_emb,
             global_block_tables=global_block_tables,
             local_block_tables=local_block_tables,
+            lora_int_id=lora_int_id,
         )
 
         if "prefill" in self.phase:
@@ -428,6 +308,8 @@ class DecoderOnlyModel(nn.Module):
     Args:
         model: Original Huggingface model to adapt
         layers (List[DecoderOnlyLayer]): Modified transformer layers optimized for RBLN
+        rbln_config: RBLN model configuration
+        use_learned_pos_emb: Whether to use learned position embeddings (class-specific override)
 
     Attributes:
         _original_mod: Reference to original Huggingface model
@@ -439,21 +321,19 @@ class DecoderOnlyModel(nn.Module):
         self,
         model,
         layers: List["DecoderOnlyLayer"],
-
-        max_seq_len=None,
-        kvcache_block_size=None,
+        rbln_config: "RBLNDecoderOnlyModelConfig",
         use_learned_pos_emb=None,
-        sliding_window_layers=None,
     ):
         super().__init__()
         self._original_mod = model
         self.layers = nn.ModuleList(layers)
+        self.rbln_config = rbln_config
         self._phase = "prefill"
-        self.partition_len =
-        self.kvcache_block_size = kvcache_block_size
-        self.max_seq_len = max_seq_len
+        self.partition_len = rbln_config.kvcache_partition_len
+        self.kvcache_block_size = rbln_config.kvcache_block_size
+        self.max_seq_len = rbln_config.max_seq_len
         self.use_learned_pos_emb = use_learned_pos_emb
-        self.sliding_window_layers = sliding_window_layers
+        self.sliding_window_layers = rbln_config.sliding_window_layers
 
     @property
     def phase(self):
@@ -517,6 +397,7 @@ class DecoderOnlyModel(nn.Module):
         rotary_emb: Optional[Union[nn.Module, torch.Tensor]] = None,
         global_block_tables: Optional[torch.Tensor] = None,
         local_block_tables: Optional[torch.Tensor] = None,
+        lora_int_id: Optional[torch.Tensor] = None,
     ):
         # retrieve input_ids and inputs_embeds
         if (input_ids is None) ^ (inputs_embeds is not None):
@@ -589,6 +470,7 @@ class DecoderOnlyModel(nn.Module):
                 cos=cos,
                 sin=sin,
                 block_tables=local_block_tables if is_sliding else global_block_tables,
+                lora_int_id=lora_int_id,
             )
 
         hidden_states = self.get_last_layernorm()(hidden_states)
@@ -620,11 +502,27 @@ class DecoderOnlyLayer(nn.Module):
         phase: Current operation phase ("prefill" or "decode")
     """
 
-    def __init__(self, layer, self_attn: "DecoderOnlyAttention"):
+    def __init__(self, layer, self_attn: "DecoderOnlyAttention", lora_config: Optional[RBLNLoRAConfig] = None):
         super().__init__()
         self._original_mod = layer
         self.self_attn = self_attn
         self._phase = "prefill"
+        self.lora_config = lora_config
+
+        # Replace target Linear modules in MLP with LoRALinear if configured
+        if self.lora_config:
+            mlp = self.get_mlp()
+            for proj_name in ["gate_proj", "up_proj", "down_proj"]:
+                if hasattr(mlp, proj_name):
+                    original_linear = getattr(mlp, proj_name)
+                    if isinstance(original_linear, nn.Linear):
+                        lora_linear = LoRALinear(
+                            original_linear=original_linear,
+                            lora_config=self.lora_config,
+                            projection_name=proj_name,
+                            layer_idx=self.self_attn.layer_idx,
+                        )
+                        setattr(mlp, proj_name, lora_linear)
 
     @property
     def phase(self):
@@ -641,6 +539,25 @@ class DecoderOnlyLayer(nn.Module):
     def get_post_attention_layernorm(self) -> nn.LayerNorm:
         return self._original_mod.post_attention_layernorm
 
+    def get_mlp(self) -> nn.Module:
+        return self._original_mod.mlp
+
+    def forward_mlp(self, hidden_states: torch.Tensor, lora_int_id: Optional[torch.Tensor] = None) -> torch.Tensor:
+        mlp = self.get_mlp()
+        if self.lora_config and lora_int_id is not None:
+            gate = mlp.gate_proj(hidden_states, lora_int_id)
+            up = mlp.up_proj(hidden_states, lora_int_id)
+            act_fn = getattr(mlp, "act_fn", None) or getattr(mlp, "activation_fn", None)
+            if act_fn is None:
+                gate = torch.nn.functional.silu(gate)
+            else:
+                gate = act_fn(gate)
+            fused = gate * up
+            hidden_states = mlp.down_proj(fused, lora_int_id)
+        else:
+            hidden_states = mlp(hidden_states)
+        return hidden_states
+
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -650,6 +567,7 @@ class DecoderOnlyLayer(nn.Module):
         cos: Optional[torch.Tensor] = None,
         sin: Optional[torch.Tensor] = None,
         block_tables: Optional[torch.Tensor] = None,
+        lora_int_id: Optional[torch.Tensor] = None,
     ):
         residual = hidden_states
         hidden_states = self.get_pre_attention_layernorm()(hidden_states)
@@ -662,13 +580,14 @@ class DecoderOnlyLayer(nn.Module):
             cos=cos,
             sin=sin,
             block_tables=block_tables,
+            lora_int_id=lora_int_id,
         )
         hidden_states = residual + hidden_states
 
         # Fully Connected
         residual = hidden_states
         hidden_states = self.get_post_attention_layernorm()(hidden_states)
-        hidden_states = self.
+        hidden_states = self.forward_mlp(hidden_states, lora_int_id)
         hidden_states = residual + hidden_states
 
         return hidden_states
@@ -683,32 +602,27 @@ class DecoderOnlyAttention(nn.Module):
 
     Args:
         self_attn: Original attention module from the base model
-
-        use_position_ids: Whether to use position ids
-        kvcache_block_size: Block size for KV cache
+        rbln_config: RBLN model configuration containing attention parameters
         is_sliding: Whether this is sliding window attention
-        attn_impl: Attention implementation type ("eager" or "flash_attn")
     """
 
     def __init__(
         self,
         self_attn,
-
-        use_position_ids,
-        kvcache_block_size,
+        rbln_config: "RBLNDecoderOnlyModelConfig",
         is_sliding=False,
-        attn_impl="eager",
-        kvcache_partition_len=None,
     ):
         super().__init__()
         self._original_mod = self_attn
+        self.rbln_config = rbln_config
         self.layer_idx = self_attn.layer_idx
         self.num_heads = getattr(self._original_mod, "num_heads", None) or getattr(
             self._original_mod.config, "num_attention_heads"
         )
         self.head_dim = self._original_mod.head_dim
         self._phase = "prefill"
-        self.scale = torch.tensor(self.get_attn_scale())
+        self.scale = torch.nn.Parameter(torch.tensor(self.get_attn_scale()))
+        self.quantization = rbln_config.quantization
 
         if hasattr(self._original_mod, "num_key_value_heads"):
             self.num_key_value_heads = self._original_mod.num_key_value_heads
@@ -717,16 +631,29 @@ class DecoderOnlyAttention(nn.Module):
         else:
             self.num_key_value_heads = self.num_heads
 
-        self.use_attention_mask = use_attention_mask
-        self.use_position_ids = use_position_ids
+        self.use_attention_mask = rbln_config.use_attention_mask if not is_sliding else True
+        self.use_position_ids = rbln_config.use_position_ids
         self.is_sliding = is_sliding
-        self.attn_impl = attn_impl
-        self.kvcache_partition_len = kvcache_partition_len
+        self.attn_impl = rbln_config.attn_impl if not is_sliding else "eager"
+        self.kvcache_partition_len = getattr(rbln_config, "kvcache_partition_len", None)
+        self.kvcache_block_size = rbln_config.sliding_window if is_sliding else rbln_config.kvcache_block_size
+        self.lora_config = rbln_config.lora_config
 
         setattr(self, self.get_attention_name(), self.create_attention_op())
-        self.kvcache_block_size = kvcache_block_size
         self.__post_init__()
 
+    def _init_lora_weights(self):
+        """Initialize LoRA adapter weights by replacing linear layers with LoRALinear."""
+        for proj_name in ["q_proj", "k_proj", "v_proj", "o_proj"]:
+            original_linear = getattr(self._original_mod, proj_name)
+            lora_linear = LoRALinear(
+                original_linear=original_linear,
+                lora_config=self.lora_config,
+                projection_name=proj_name,
+                layer_idx=self.layer_idx,
+            )
+            setattr(self, proj_name, lora_linear)
+
     def get_attention_name(self):
         if self.is_sliding:
             return "sliding_window_attention"
@@ -764,6 +691,7 @@ class DecoderOnlyAttention(nn.Module):
                 self.kvcache_partition_len,
                 self.use_attention_mask,
                 self.use_position_ids,
+                self.quantization,
             )
         elif self.attn_impl == "eager":
             return AttentionOp(
@@ -772,28 +700,46 @@ class DecoderOnlyAttention(nn.Module):
                 self.num_key_value_heads,
                 self.use_attention_mask,
                 self.use_position_ids,
+                self.quantization,
             )
         else:
             raise NotImplementedError(f"Unknown attention implementation: {self.attn_impl}")
 
     def __post_init__(self):
-
-
-
-
-
-
+        # Initialize LoRA weights if configured, which will replace linear layers
+        if self.lora_config:
+            self._init_lora_weights()
+        else:
+            # Use original linear layers if no LoRA
+            self.q_proj = self._original_mod.q_proj
+            self.k_proj = self._original_mod.k_proj
+            self.v_proj = self._original_mod.v_proj
+            self.o_proj = self._original_mod.o_proj
+
+    def projection(
+        self, hidden_states, lora_int_id: Optional[torch.Tensor] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """Projects input hidden states into query, key, and value representations.
 
         Args:
            hidden_states: Input tensor of shape [batch_size, seq_len, hidden_dim]
+            lora_int_id: Adapter ID tensor for LoRA selection [batch_size]
 
         Returns:
             Tuple of (query_states, key_states, value_states)
         """
-
-
-
+        # Check if using LoRALinear (which accepts lora_int_id) or standard linear layers
+        if self.lora_config:
+            # LoRALinear handles both base projection and LoRA in one forward pass
+            query_states = self.q_proj(hidden_states, lora_int_id)
+            key_states = self.k_proj(hidden_states, lora_int_id)
+            value_states = self.v_proj(hidden_states, lora_int_id)
+        else:
+            # Standard linear projection without LoRA
+            query_states = self.q_proj(hidden_states)
+            key_states = self.k_proj(hidden_states)
+            value_states = self.v_proj(hidden_states)
+
         return query_states, key_states, value_states
 
     def apply_rotary_pos_embed(self, query_states, key_states, cos, sin):
@@ -802,6 +748,16 @@ class DecoderOnlyAttention(nn.Module):
     def get_attn_scale(self):
         return 1 / math.sqrt(self.head_dim)
 
+    def maybe_get_kvcache_scale(self) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
+        if hasattr(self, "k_proj") and hasattr(self, "v_proj"):
+            k_scale = getattr(self.k_proj, "k_scale", None)
+            v_scale = getattr(self.v_proj, "v_scale", None)
+        else:
+            k_scale = None
+            v_scale = None
+
+        return k_scale, v_scale
+
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -811,10 +767,11 @@ class DecoderOnlyAttention(nn.Module):
         cos: Optional[torch.Tensor] = None,
         sin: Optional[torch.Tensor] = None,
         block_tables: Optional[torch.Tensor] = None,
+        lora_int_id: Optional[torch.Tensor] = None,
     ):
         batch_size, query_length, _ = hidden_states.size()
 
-        query_states, key_states, value_states = self.projection(hidden_states=hidden_states)
+        query_states, key_states, value_states = self.projection(hidden_states=hidden_states, lora_int_id=lora_int_id)
 
         query_states = query_states.view(batch_size, query_length, self.num_heads, self.head_dim).transpose(1, 2)
         key_states = key_states.view(batch_size, query_length, self.num_key_value_heads, self.head_dim).transpose(1, 2)
@@ -831,6 +788,8 @@ class DecoderOnlyAttention(nn.Module):
         if batch_size > 1 and "prefill" in self.phase:
             raise NotImplementedError(f"batch size should be 1 if prefill phase, but got {batch_size}.")
 
+        k_scale, v_scale = self.maybe_get_kvcache_scale()
+
         attn_output = self.get_attention_op()(
             query_states,
             key_states,
@@ -842,9 +801,18 @@ class DecoderOnlyAttention(nn.Module):
             scale=self.scale,
             block_tables=block_tables,
             block_size=self.kvcache_block_size,
+            k_scale=k_scale,
+            v_scale=v_scale,
         )
 
-
+        # Check if using LoRALinear (which accepts lora_int_id) or standard linear layers
+        if self.lora_config:
+            # LoRALinear handles both base projection and LoRA in one forward pass
+            attn_outputs = self.o_proj(attn_output, lora_int_id)
+        else:
+            # Standard linear projection without LoRA
+            attn_outputs = self.o_proj(attn_output)
+
         return attn_outputs
 
 
@@ -858,7 +826,13 @@ class DecoderOnlyFlashAttention(DecoderOnlyAttention):
 
 class AttentionOp(nn.Module):
     def __init__(
-        self,
+        self,
+        num_heads: int,
+        head_dim: int,
+        num_key_value_heads: int,
+        use_attention_mask: bool,
+        use_position_ids: bool,
+        quantization: Optional[RBLNQuantizationConfig] = None,
     ):
         super().__init__()
         self.num_heads = num_heads
@@ -867,10 +841,10 @@ class AttentionOp(nn.Module):
         self.phase = "prefill"
         self.use_attention_mask = use_attention_mask
         self.use_position_ids = use_position_ids
+        self.quantization = quantization
 
     def get_attn_op_name(self):
         phase = "decode" if self.phase == "decode" else "prefill"
-
         if self.use_attention_mask and not self.use_position_ids:
             attn_op_name = "paged_attn_"
         else:
@@ -878,6 +852,9 @@ class AttentionOp(nn.Module):
 
         attn_op_name += phase
 
+        if self.quantization and self.quantization.kv_caches == "fp8":
+            attn_op_name += "_kv_fp8"
+
         return attn_op_name
 
     def forward(
@@ -892,6 +869,8 @@ class AttentionOp(nn.Module):
         scale: torch.Tensor,
         block_tables: torch.Tensor,
         block_size: int,
+        k_scale: Optional[torch.Tensor] = None,
+        v_scale: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """Compute attention with static shapes and explicit cache management.
 
@@ -904,6 +883,10 @@ class AttentionOp(nn.Module):
             past_value_state: Previous value cache states
             seq_position: Current position in sequence
             scale: Scale applied to attn weights
+            block_tables: Block tables for paged attention
+            block_size: Block size for paged attention
+            k_scale: Scale applied to key
+            v_scale: Scale applied to value
 
         Returns:
             Tensor: attention_output: [batch, num_heads, seq_len, head_dim]
@@ -940,13 +923,19 @@ class AttentionOp(nn.Module):
             "block_size": block_size,
         }
 
-        if self.use_attention_mask
+        if self.use_attention_mask:
             op_args["mask"] = attn_mask
 
         if self.phase == "prefill" or self.phase == "image_prefill":
             if not self.use_attention_mask or self.use_position_ids:
                 op_args["is_bidirectional"] = self.phase == "image_prefill"  # FIXME, Hard-coded for Gemma3.
 
+        if self.quantization and self.quantization.kv_caches == "fp8":
+            if past_key_state.dtype != torch.float8_e4m3fn:
+                raise ValueError(f"Unsupported KVCaches type: {past_key_state.dtype}")
+            op_args["k_scale"] = k_scale
+            op_args["v_scale"] = v_scale
+
         attn_op_name = self.get_attn_op_name()
         attn_op = getattr(torch.ops.rbln_custom_ops, attn_op_name, None)
         if attn_op is None:
@@ -960,97 +949,6 @@ class AttentionOp(nn.Module):
         return attn_output
 
 
-def slice_and_unsqueeze_cos_sin(cos, sin, cache_position, unsqueeze_dim=1):
-    """Slice cos[cache_position], sin[cache_position] vector for the query."""
-    if cache_position.shape[0] > 1:
-        cos_all = []
-        sin_all = []
-        for i in range(cache_position.shape[0]):
-            cos_all.append(cos[cache_position[i : i + 1]].unsqueeze(unsqueeze_dim))
-            sin_all.append(sin[cache_position[i : i + 1]].unsqueeze(unsqueeze_dim))
-        cos = torch.cat(cos_all, dim=0)
-        sin = torch.cat(sin_all, dim=0)
-    else:
-        cos = cos[cache_position].unsqueeze(unsqueeze_dim)
-        sin = sin[cache_position].unsqueeze(unsqueeze_dim)
-
-    return cos, sin
-
-
-def rotate_half(x):
-    """Rotates half the hidden dims of the input."""
-    x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2 :]
-    return torch.cat((-x2, x1), dim=-1)
-
-
-def apply_rotary_pos_emb(q, k, cos, sin):
-    """Applies Rotary Position Embedding to the query and key tensors."""
-    q_embed = (q * cos) + (rotate_half(q) * sin)
-    k_embed = (k * cos) + (rotate_half(k) * sin)
-    return q_embed, k_embed
-
-
-def apply_rotary_pos_emb_partial(query_states, key_states, cos, sin, ndim) -> Tuple[torch.Tensor, torch.Tensor]:
-    # Partial rotary embedding
-    query_rot, query_pass = (
-        query_states[..., :ndim],
-        query_states[..., ndim:],
-    )
-    key_rot, key_pass = (
-        key_states[..., :ndim],
-        key_states[..., ndim:],
-    )
-
-    # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor]
-    query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin)
-
-    # [batch_size, seq_length, num_heads, head_dim]
-    query_states = torch.cat((query_rot, query_pass), dim=-1)
-    key_states = torch.cat((key_rot, key_pass), dim=-1)
-    return query_states, key_states
-
-
-class RotaryEmbedding(nn.Module):
-    """RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        max_seq_len_cached: int,
-    ):
-        super().__init__()
-
-        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
-            rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
-        else:
-            rope_type = "default"
-
-        inv_freq, attention_scaling = ROPE_INIT_FUNCTIONS[rope_type](config, max_seq_len_cached)
-        cache_position = torch.arange(0, max_seq_len_cached, dtype=torch.float32)
-        cache_position_expanded = cache_position[:, None]
-
-        if rope_type == "dynamic":
-            freqs = cache_position_expanded.float() * inv_freq.float()
-        else:
-            inv_freq_expanded = inv_freq[None, :]
-            freqs = cache_position_expanded.float() @ inv_freq_expanded.float()
-
-        emb = torch.cat((freqs, freqs), dim=-1)
-
-        cos = emb.cos() * attention_scaling
-        sin = emb.sin() * attention_scaling
-
-        self.register_buffer("_cos_cached", cos, persistent=False)
-        self.register_buffer("_sin_cached", sin, persistent=False)
-
-    def forward(self, x, seq_len):
-        return (
-            self._cos_cached[:seq_len].to(dtype=x.dtype),
-            self._sin_cached[:seq_len].to(dtype=x.dtype),
-        )
-
-
 class FlashAttentionOp(AttentionOp):
     def __init__(
         self,
@@ -1060,6 +958,7 @@ class FlashAttentionOp(AttentionOp):
         kvcache_partition_len: int,
         use_attention_mask: bool,
         use_position_ids: bool,
+        quantization: Optional[RBLNQuantizationConfig] = None,
     ):
         super().__init__(
             num_heads=num_heads,
@@ -1067,6 +966,7 @@ class FlashAttentionOp(AttentionOp):
             num_key_value_heads=num_key_value_heads,
             use_attention_mask=use_attention_mask,
             use_position_ids=use_position_ids,
+            quantization=quantization,
         )
         self.kvcache_partition_size = kvcache_partition_len
 
@@ -1079,6 +979,9 @@ class FlashAttentionOp(AttentionOp):
 
         attn_op_name += phase
 
+        if self.quantization and self.quantization.kv_caches == "fp8":
+            attn_op_name += "_kv_fp8"
+
         return attn_op_name
 
     def forward(
@@ -1093,6 +996,8 @@ class FlashAttentionOp(AttentionOp):
         scale,
         block_tables,
         block_size,
+        k_scale=None,
+        v_scale=None,
     ):
         # reshape for removing repeat_kv (batch=1 , num_head, 1, q_len=1, head_dim)
         key_state = key_state.unsqueeze(2)
@@ -1133,6 +1038,12 @@ class FlashAttentionOp(AttentionOp):
            if not self.use_attention_mask or self.use_position_ids:
                 op_args["is_bidirectional"] = self.phase == "image_prefill"  # FIXME, Hard-coded for Gemma3.
 
+        if self.quantization and self.quantization.kv_caches == "fp8":
+            if past_key_state.dtype != torch.float8_e4m3fn:
+                raise ValueError(f"Unsupported KVCaches type: {past_key_state.dtype}")
+            op_args["k_scale"] = k_scale
+            op_args["v_scale"] = v_scale
+
         attn_op_name = self.get_attn_op_name()
         attn_op = getattr(torch.ops.rbln_custom_ops, attn_op_name, None)
         if attn_op is None:
@@ -1160,14 +1071,19 @@ class SlidingWindowAttentionOp(AttentionOp):
         query_state: torch.Tensor,
         key_state: torch.Tensor,
         value_state: torch.Tensor,
-        attn_mask: torch.Tensor,
+        attn_mask: Optional[torch.Tensor],
         past_key_state: torch.Tensor,
         past_value_state: torch.Tensor,
         seq_position: Tuple[torch.Tensor],
         scale: torch.Tensor,
         block_tables: torch.Tensor,
         block_size: int,
+        k_scale: Optional[torch.Tensor] = None,
+        v_scale: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        assert self.quantization is None, "Sliding window attention does not support quantization"
+        assert k_scale is None and v_scale is None, "Sliding window attention does not support quantization"
+
         # reshape for removing repeat_kv (batch=1 , num_head, 1, q_len=1, head_dim)
         key_state = key_state.unsqueeze(2)
         value_state = value_state.unsqueeze(2)
@@ -1199,8 +1115,7 @@ class SlidingWindowAttentionOp(AttentionOp):
         }
 
         if self.phase == "prefill" or self.phase == "image_prefill":
-
-            op_args["is_bidirectional"] = self.phase == "image_prefill"  # FIXME, Hard-coded for Gemma3.
+            op_args["is_bidirectional"] = self.phase == "image_prefill"  # FIXME, Hard-coded for Gemma3.
 
         attn_op_name = self.get_attn_op_name()
         attn_op = getattr(torch.ops.rbln_custom_ops, attn_op_name, None)
@@ -1213,3 +1128,97 @@ class SlidingWindowAttentionOp(AttentionOp):
         attn_output = attn_output.reshape(batch_size, -1, self.num_heads * self.head_dim)
 
         return attn_output
+
+
+class RotaryEmbedding(nn.Module):
+    """RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        max_seq_len_cached: int,
+    ):
+        super().__init__()
+
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            rope_type = "default"
+
+        inv_freq, attention_scaling = ROPE_INIT_FUNCTIONS[rope_type](config, max_seq_len_cached)
+        cache_position = torch.arange(0, max_seq_len_cached)
+        cache_position_expanded = cache_position[:, None]
+
+        if rope_type == "dynamic":
+            freqs = cache_position_expanded.float() * inv_freq.float()
+        else:
+            inv_freq_expanded = inv_freq[None, :]
+            freqs = cache_position_expanded.float() @ inv_freq_expanded.float()
+
+        emb = torch.cat((freqs, freqs), dim=-1)
+
+        cos = emb.cos() * attention_scaling
+        sin = emb.sin() * attention_scaling
+
+        self.register_buffer("_cos_cached", cos, persistent=False)
+        self.register_buffer("_sin_cached", sin, persistent=False)
+
+    def forward(self, x, seq_len):
+        return (
+            self._cos_cached[:seq_len].to(dtype=torch.float32),
+            self._sin_cached[:seq_len].to(dtype=torch.float32),
+        )
+
+
+def slice_and_unsqueeze_cos_sin(cos, sin, cache_position, unsqueeze_dim=1):
+    """Slice cos[cache_position], sin[cache_position] vector for the query."""
+    if cache_position.shape[0] > 1:
+        cos_all = []
+        sin_all = []
+        for i in range(cache_position.shape[0]):
+            cos_all.append(cos[cache_position[i : i + 1]].unsqueeze(unsqueeze_dim))
+            sin_all.append(sin[cache_position[i : i + 1]].unsqueeze(unsqueeze_dim))
+        cos = torch.cat(cos_all, dim=0)
+        sin = torch.cat(sin_all, dim=0)
+    else:
+        cos = cos[cache_position].unsqueeze(unsqueeze_dim)
+        sin = sin[cache_position].unsqueeze(unsqueeze_dim)
+
+    return cos, sin
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin):
+    """Applies Rotary Position Embedding to the query and key tensors."""
+    dtype = q.dtype
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    q_embed = q_embed.to(dtype)
+    k_embed = k_embed.to(dtype)
+    return q_embed, k_embed
+
+
+def apply_rotary_pos_emb_partial(query_states, key_states, cos, sin, ndim) -> Tuple[torch.Tensor, torch.Tensor]:
+    # Partial rotary embedding
+    query_rot, query_pass = (
+        query_states[..., :ndim],
+        query_states[..., ndim:],
+    )
+    key_rot, key_pass = (
+        key_states[..., :ndim],
+        key_states[..., ndim:],
+    )
+
+    # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor]
+    query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin)
+
+    # [batch_size, seq_length, num_heads, head_dim]
+    query_states = torch.cat((query_rot, query_pass), dim=-1)
+    key_states = torch.cat((key_rot, key_pass), dim=-1)
+    return query_states, key_states