optimum-rbln 0.9.2a2__py3-none-any.whl → 0.9.2a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__init__.py +4 -0
- optimum/rbln/__version__.py +2 -2
- optimum/rbln/configuration_utils.py +3 -0
- optimum/rbln/transformers/__init__.py +4 -0
- optimum/rbln/transformers/models/__init__.py +4 -0
- optimum/rbln/transformers/models/colpali/modeling_colpali.py +1 -1
- optimum/rbln/transformers/models/decoderonly/__init__.py +1 -0
- optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +34 -0
- optimum/rbln/transformers/models/decoderonly/configuration_lora.py +411 -0
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +100 -20
- optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +33 -0
- optimum/rbln/transformers/models/decoderonly/lora_architecture.py +204 -0
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +60 -0
- optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +12 -2
- optimum/rbln/transformers/models/gemma3/gemma3_runtime_utils.py +31 -3
- optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +7 -0
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +4 -1
- optimum/rbln/transformers/models/midm/midm_architecture.py +4 -1
- optimum/rbln/transformers/models/phi/phi_architecture.py +5 -1
- optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +2 -0
- optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +2 -0
- optimum/rbln/transformers/models/whisper/generation_whisper.py +15 -5
- {optimum_rbln-0.9.2a2.dist-info → optimum_rbln-0.9.2a4.dist-info}/METADATA +1 -1
- {optimum_rbln-0.9.2a2.dist-info → optimum_rbln-0.9.2a4.dist-info}/RECORD +26 -24
- {optimum_rbln-0.9.2a2.dist-info → optimum_rbln-0.9.2a4.dist-info}/WHEEL +0 -0
- {optimum_rbln-0.9.2a2.dist-info → optimum_rbln-0.9.2a4.dist-info}/licenses/LICENSE +0 -0
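The headline change in this release is static multi-LoRA support for decoder-only models: a new RBLNLoRAConfig/LoRALinear pair, a `lora_int_ids` input threaded through the compiled graphs and runtimes, and `set_adapter()` / `set_lora_int_ids()` helpers on RBLNDecoderOnlyModelForCausalLM. Adapters and their integer ids are fixed when the model is compiled; at inference time you only select among them. Below is a rough usage sketch based solely on the methods visible in this diff; the concrete class, paths, and adapter names ("summarize", "translate") are assumptions, not something the diff specifies.

import torch
from transformers import AutoTokenizer
from optimum.rbln import RBLNLlamaForCausalLM  # concrete model class is an assumption

# Load a model that was already compiled with a LoRA config containing two adapters,
# e.g. "summarize" -> id 0 and "translate" -> id 1 (hypothetical names/ids).
model = RBLNLlamaForCausalLM.from_pretrained("path/to/compiled-model")
tokenizer = AutoTokenizer.from_pretrained("path/to/compiled-model")

inputs = tokenizer(["Summarize: ...", "Translate: ..."], return_tensors="pt", padding=True)

# Select adapters by name; the ids are pushed down to the prefill and decode runtimes.
model.set_adapter(["summarize", "translate"])
# Equivalently, pass the compile-time integer ids directly (one per sequence in the batch).
model.set_lora_int_ids([0, 1])

outputs = model.generate(**inputs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))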
optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py

@@ -22,6 +22,8 @@ from transformers import PretrainedConfig, PreTrainedModel
 from ....utils import logging
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
 from ...utils.rbln_quantization import RBLNQuantizationConfig
+from .configuration_lora import RBLNLoRAConfig
+from .lora_architecture import LoRALinear


 if TYPE_CHECKING:

@@ -52,12 +54,7 @@ class DecoderOnlyWrapper(nn.Module):

     _use_learned_pos_emb = False

-    def __init__(
-        self,
-        model: PreTrainedModel,
-        rbln_config: "RBLNDecoderOnlyModelConfig",
-        use_rotary_emb: bool,
-    ):
+    def __init__(self, model: PreTrainedModel, rbln_config: "RBLNDecoderOnlyModelConfig", use_rotary_emb: bool):
         super().__init__()
         self.quantization = rbln_config.quantization
         self.config = model.config

@@ -114,7 +111,7 @@ class DecoderOnlyWrapper(nn.Module):
             new_self_attn = self.get_rbln_attn_class()(
                 self.get_attn_layer(layer), self.rbln_config, is_sliding=is_sliding
             )
-            new_layer = self.get_rbln_layer_class()(layer, new_self_attn)
+            new_layer = self.get_rbln_layer_class()(layer, new_self_attn, lora_config=self.rbln_config.lora_config)
             new_layers.append(new_layer)

         new_model = self.get_rbln_model_class()(

@@ -154,6 +151,7 @@ class DecoderOnlyWrapper(nn.Module):
         )
         attention_mask = args.pop(0) if self.rbln_config.use_attention_mask else None
         position_ids = args.pop(0) if self.rbln_config.use_position_ids else None
+        lora_int_id = args.pop(0) if self.rbln_config.lora_config else None
         past_key_values = args

         if len(past_key_values) != 2 * self.num_hidden_layers:

@@ -185,6 +183,7 @@ class DecoderOnlyWrapper(nn.Module):
             query_position,
             attention_mask,
             position_ids,
+            lora_int_id,
             past_key_values,
             rotary_emb,
         )

@@ -199,6 +198,7 @@ class DecoderOnlyWrapper(nn.Module):
             query_position,
             attention_mask,
             position_ids,
+            lora_int_id,
             past_key_values,
             rotary_emb,
         ) = self.prepare_forward_args(*args)

@@ -214,6 +214,7 @@ class DecoderOnlyWrapper(nn.Module):
             rotary_emb=rotary_emb,
             global_block_tables=global_block_tables,
             local_block_tables=local_block_tables,
+            lora_int_id=lora_int_id,
         )

         return logit

@@ -270,6 +271,7 @@ class DecoderOnlyForCausalLM(nn.Module):
         rotary_emb: nn.Module = None,
         global_block_tables: Optional[torch.Tensor] = None,
         local_block_tables: Optional[torch.Tensor] = None,
+        lora_int_id: Optional[torch.Tensor] = None,
     ):
         # outputs
         hidden_states = self.model(

@@ -283,6 +285,7 @@ class DecoderOnlyForCausalLM(nn.Module):
             rotary_emb=rotary_emb,
             global_block_tables=global_block_tables,
             local_block_tables=local_block_tables,
+            lora_int_id=lora_int_id,
         )

         if "prefill" in self.phase:

@@ -394,6 +397,7 @@ class DecoderOnlyModel(nn.Module):
         rotary_emb: Optional[Union[nn.Module, torch.Tensor]] = None,
         global_block_tables: Optional[torch.Tensor] = None,
         local_block_tables: Optional[torch.Tensor] = None,
+        lora_int_id: Optional[torch.Tensor] = None,
     ):
         # retrieve input_ids and inputs_embeds
         if (input_ids is None) ^ (inputs_embeds is not None):

@@ -466,6 +470,7 @@ class DecoderOnlyModel(nn.Module):
                 cos=cos,
                 sin=sin,
                 block_tables=local_block_tables if is_sliding else global_block_tables,
+                lora_int_id=lora_int_id,
             )

         hidden_states = self.get_last_layernorm()(hidden_states)

@@ -497,11 +502,27 @@ class DecoderOnlyLayer(nn.Module):
        phase: Current operation phase ("prefill" or "decode")
    """

-    def __init__(self, layer, self_attn: "DecoderOnlyAttention"):
+    def __init__(self, layer, self_attn: "DecoderOnlyAttention", lora_config: Optional[RBLNLoRAConfig] = None):
        super().__init__()
        self._original_mod = layer
        self.self_attn = self_attn
        self._phase = "prefill"
+        self.lora_config = lora_config
+
+        # Replace target Linear modules in MLP with LoRALinear if configured
+        if self.lora_config:
+            mlp = self.get_mlp()
+            for proj_name in ["gate_proj", "up_proj", "down_proj"]:
+                if hasattr(mlp, proj_name):
+                    original_linear = getattr(mlp, proj_name)
+                    if isinstance(original_linear, nn.Linear):
+                        lora_linear = LoRALinear(
+                            original_linear=original_linear,
+                            lora_config=self.lora_config,
+                            projection_name=proj_name,
+                            layer_idx=self.self_attn.layer_idx,
+                        )
+                        setattr(mlp, proj_name, lora_linear)

    @property
    def phase(self):

@@ -518,6 +539,25 @@ class DecoderOnlyLayer(nn.Module):
    def get_post_attention_layernorm(self) -> nn.LayerNorm:
        return self._original_mod.post_attention_layernorm

+    def get_mlp(self) -> nn.Module:
+        return self._original_mod.mlp
+
+    def forward_mlp(self, hidden_states: torch.Tensor, lora_int_id: Optional[torch.Tensor] = None) -> torch.Tensor:
+        mlp = self.get_mlp()
+        if self.lora_config and lora_int_id is not None:
+            gate = mlp.gate_proj(hidden_states, lora_int_id)
+            up = mlp.up_proj(hidden_states, lora_int_id)
+            act_fn = getattr(mlp, "act_fn", None) or getattr(mlp, "activation_fn", None)
+            if act_fn is None:
+                gate = torch.nn.functional.silu(gate)
+            else:
+                gate = act_fn(gate)
+            fused = gate * up
+            hidden_states = mlp.down_proj(fused, lora_int_id)
+        else:
+            hidden_states = mlp(hidden_states)
+        return hidden_states
+
    def forward(
        self,
        hidden_states: torch.Tensor,

@@ -527,6 +567,7 @@ class DecoderOnlyLayer(nn.Module):
        cos: Optional[torch.Tensor] = None,
        sin: Optional[torch.Tensor] = None,
        block_tables: Optional[torch.Tensor] = None,
+        lora_int_id: Optional[torch.Tensor] = None,
    ):
        residual = hidden_states
        hidden_states = self.get_pre_attention_layernorm()(hidden_states)

@@ -539,13 +580,14 @@ class DecoderOnlyLayer(nn.Module):
            cos=cos,
            sin=sin,
            block_tables=block_tables,
+            lora_int_id=lora_int_id,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.get_post_attention_layernorm()(hidden_states)
-        hidden_states = self.
+        hidden_states = self.forward_mlp(hidden_states, lora_int_id)
        hidden_states = residual + hidden_states

        return hidden_states

@@ -595,10 +637,23 @@ class DecoderOnlyAttention(nn.Module):
        self.attn_impl = rbln_config.attn_impl if not is_sliding else "eager"
        self.kvcache_partition_len = getattr(rbln_config, "kvcache_partition_len", None)
        self.kvcache_block_size = rbln_config.sliding_window if is_sliding else rbln_config.kvcache_block_size
+        self.lora_config = rbln_config.lora_config

        setattr(self, self.get_attention_name(), self.create_attention_op())
        self.__post_init__()

+    def _init_lora_weights(self):
+        """Initialize LoRA adapter weights by replacing linear layers with LoRALinear."""
+        for proj_name in ["q_proj", "k_proj", "v_proj", "o_proj"]:
+            original_linear = getattr(self._original_mod, proj_name)
+            lora_linear = LoRALinear(
+                original_linear=original_linear,
+                lora_config=self.lora_config,
+                projection_name=proj_name,
+                layer_idx=self.layer_idx,
+            )
+            setattr(self, proj_name, lora_linear)
+
    def get_attention_name(self):
        if self.is_sliding:
            return "sliding_window_attention"

@@ -651,23 +706,40 @@ class DecoderOnlyAttention(nn.Module):
            raise NotImplementedError(f"Unknown attention implementation: {self.attn_impl}")

    def __post_init__(self):
-
-
-
-
-
-
+        # Initialize LoRA weights if configured, which will replace linear layers
+        if self.lora_config:
+            self._init_lora_weights()
+        else:
+            # Use original linear layers if no LoRA
+            self.q_proj = self._original_mod.q_proj
+            self.k_proj = self._original_mod.k_proj
+            self.v_proj = self._original_mod.v_proj
+            self.o_proj = self._original_mod.o_proj
+
+    def projection(
+        self, hidden_states, lora_int_id: Optional[torch.Tensor] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Projects input hidden states into query, key, and value representations.

        Args:
            hidden_states: Input tensor of shape [batch_size, seq_len, hidden_dim]
+            lora_int_id: Adapter ID tensor for LoRA selection [batch_size]

        Returns:
            Tuple of (query_states, key_states, value_states)
        """
-
-
-
+        # Check if using LoRALinear (which accepts lora_int_id) or standard linear layers
+        if self.lora_config:
+            # LoRALinear handles both base projection and LoRA in one forward pass
+            query_states = self.q_proj(hidden_states, lora_int_id)
+            key_states = self.k_proj(hidden_states, lora_int_id)
+            value_states = self.v_proj(hidden_states, lora_int_id)
+        else:
+            # Standard linear projection without LoRA
+            query_states = self.q_proj(hidden_states)
+            key_states = self.k_proj(hidden_states)
+            value_states = self.v_proj(hidden_states)
+
        return query_states, key_states, value_states

    def apply_rotary_pos_embed(self, query_states, key_states, cos, sin):

@@ -695,10 +767,11 @@ class DecoderOnlyAttention(nn.Module):
        cos: Optional[torch.Tensor] = None,
        sin: Optional[torch.Tensor] = None,
        block_tables: Optional[torch.Tensor] = None,
+        lora_int_id: Optional[torch.Tensor] = None,
    ):
        batch_size, query_length, _ = hidden_states.size()

-        query_states, key_states, value_states = self.projection(hidden_states=hidden_states)
+        query_states, key_states, value_states = self.projection(hidden_states=hidden_states, lora_int_id=lora_int_id)

        query_states = query_states.view(batch_size, query_length, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(batch_size, query_length, self.num_key_value_heads, self.head_dim).transpose(1, 2)

@@ -732,7 +805,14 @@ class DecoderOnlyAttention(nn.Module):
            v_scale=v_scale,
        )

-
+        # Check if using LoRALinear (which accepts lora_int_id) or standard linear layers
+        if self.lora_config:
+            # LoRALinear handles both base projection and LoRA in one forward pass
+            attn_outputs = self.o_proj(attn_output, lora_int_id)
+        else:
+            # Standard linear projection without LoRA
+            attn_outputs = self.o_proj(attn_output)
+
        return attn_outputs

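For reference, `forward_mlp` keeps the usual gated-MLP dataflow of LLaMA-style models (down_proj of the activated gate multiplied by the up projection) and only routes each projection through `LoRALinear` so it can take the extra `lora_int_id` argument. A minimal sketch of that dataflow with plain `nn.Linear` layers, assuming a SiLU activation and arbitrary hidden sizes, matches the non-LoRA branch above:

import torch
from torch import nn

class GatedMLP(nn.Module):
    # Plain reference for the path forward_mlp takes when no LoRA adapter is active.
    def __init__(self, hidden_size: int = 64, intermediate_size: int = 128):
        super().__init__()
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
        self.act_fn = nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))

x = torch.randn(2, 8, 64)
print(GatedMLP()(x).shape)  # torch.Size([2, 8, 64])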
optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py

@@ -187,6 +187,8 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
             torch.ones(1, 1, self.rbln_config.prefill_chunk_size, self.rbln_config.prefill_chunk_size), diagonal=1
         )

+        self.lora_int_ids = None
+
     def inputs_embeddings_if_needed(
         self, input_ids: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None
     ):

@@ -210,6 +212,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
         position_ids: Optional[torch.Tensor] = None,
         token_type_ids: Optional[torch.Tensor] = None,
         local_block_tables: Optional[torch.Tensor] = None,
+        lora_int_ids: Optional[torch.Tensor] = None,
     ):
         inputs = self.inputs_embeddings_if_needed(input_ids, inputs_embeds)
         block_tables, local_block_tables, is_external_block_tables = (

@@ -233,6 +236,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
                 position_embed=position_embed,
                 position_ids=position_ids,
                 local_block_tables=local_block_tables,
+                lora_int_ids=lora_int_ids,
             )
         else:
             return self.prefill_forward(

@@ -245,6 +249,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
                 position_embed=position_embed,
                 token_type_ids=token_type_ids,
                 local_block_tables=local_block_tables,
+                lora_int_ids=lora_int_ids,
             )

     def decode_forward(

@@ -257,7 +262,20 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
         position_embed: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.Tensor] = None,
         local_block_tables: Optional[torch.Tensor] = None,
+        lora_int_ids: Optional[torch.Tensor] = None,
     ) -> torch.FloatTensor:
+        if self.rbln_config.use_lora and lora_int_ids is None:
+            if self.lora_int_ids is None:
+                raise ValueError(
+                    "lora_int_id is required when using LoRA. "
+                    "You should call set_lora_int_ids() before forward() or pass lora_int_id to forward()."
+                )
+
+            lora_int_ids = self.lora_int_ids
+
+        if lora_int_ids is not None and lora_int_ids.shape[0] != self.batch_size:
+            raise ValueError(f"lora_int_ids size mismatch: got {lora_int_ids.shape[0]}, expected {self.batch_size}.")
+
         if self.batch_size != cache_position.shape[0]:
             raise RuntimeError(
                 f"Cache position size mismatch: got {cache_position.shape[0]}, expected {self.batch_size}."

@@ -287,6 +305,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
             position_embed,
             attention_mask if self.rbln_config.use_attention_mask else None,
             position_ids if self.rbln_config.use_position_ids else None,
+            lora_int_ids if self.rbln_config.use_lora else None,
         )

         return RBLNDecoderOnlyOutput(logits=logits)

@@ -369,12 +388,25 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
         position_embed: Optional[torch.Tensor] = None,
         token_type_ids: Optional[torch.Tensor] = None,
         local_block_tables: Optional[torch.Tensor] = None,
+        lora_int_ids: Optional[torch.Tensor] = None,
     ) -> torch.FloatTensor:
         """
         Performs chunked prefill for efficient KV-cache updates and memory optimization.
         Instead of processing the entire sequence at once, the input is divided into chunks of size `prefill_chunk_size`,
         and each chunk is processed sequentially. This allows for better memory utilization and compatibility with continuous batching.
         """
+        if self.rbln_config.use_lora and lora_int_ids is None:
+            if self.lora_int_ids is None:
+                raise ValueError(
+                    "lora_int_id is required when using LoRA. "
+                    "You should call set_lora_int_ids() before forward() or pass lora_int_id to forward()."
+                )
+
+            if batch_idx is not None:
+                lora_int_ids = self.lora_int_ids[batch_idx : batch_idx + 1].clone()
+            else:
+                lora_int_ids = self.lora_int_ids.clone()
+
         (
             inputs,
             cache_position,

@@ -426,6 +458,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
                 query_position,
                 chunked_attention_mask if self.rbln_config.use_attention_mask else None,
                 position_ids_chunk,
+                lora_int_ids if self.rbln_config.use_lora else None,
                 out=self.out_buffers,
             )
             output_logits.append(output_logit)
optimum/rbln/transformers/models/decoderonly/lora_architecture.py (new file)

@@ -0,0 +1,204 @@
+import math
+from pathlib import Path
+from typing import Optional
+
+import safetensors.torch
+import torch
+from torch import nn
+
+from ....utils import logging
+from .configuration_lora import RBLNLoRAConfig
+
+
+logger = logging.get_logger()
+
+
+class LoRALinear(nn.Module):
+    """
+    A linear layer that supports multiple LoRA adapters compiled at static time.
+
+    This class replaces the original linear layer and handles both base weights
+    and multiple LoRA adapters in a single forward pass using custom ops.
+    """
+
+    def __init__(
+        self,
+        original_linear: nn.Linear,
+        lora_config: RBLNLoRAConfig,
+        projection_name: str = "",
+        layer_idx: int = 0,
+    ):
+        """
+        Args:
+            original_linear: The original linear layer to be replaced
+            lora_config: LoRA configuration containing all adapters
+            projection_name: Name of the projection (e.g., "q_proj", "k_proj")
+            layer_idx: Layer index for loading the correct LoRA weights
+        """
+        super().__init__()
+
+        self.in_features = original_linear.in_features
+        self.out_features = original_linear.out_features
+        self.projection_name = projection_name
+        self.layer_idx = layer_idx
+        self.lora_config = lora_config
+
+        # Store original linear weights and bias directly without cloning
+        self.register_buffer("weight", original_linear.weight.data)
+        if original_linear.bias is not None:
+            self.register_buffer("bias", original_linear.bias.data)
+        else:
+            self.bias = None
+
+        # Initialize LoRA weights
+        self._init_lora_weights()
+
+    def _should_apply_lora(self) -> bool:
+        """Check if this projection should have LoRA applied."""
+        # Check if any adapter targets this projection
+        return any(self.projection_name in adapter.target_modules for adapter in self.lora_config.adapters)
+
+    def _load_adapter_weights(self, adapter_path: Path):
+        """
+        Load adapter weights from local directory.
+
+        Args:
+            adapter_path: Path to local directory containing adapter weights
+
+        Returns:
+            Dictionary containing adapter weights
+
+        Raises:
+            FileNotFoundError: If no adapter weights are found in the directory
+        """
+        if not adapter_path.is_dir():
+            raise ValueError(f"Adapter path must be a directory, got: {adapter_path}")
+
+        # Try to load weights in order of preference
+        weight_files = [
+            ("adapter_model.safetensors", lambda p: safetensors.torch.load_file(p)),
+            ("adapter_model.bin", lambda p: torch.load(p, map_location="cpu")),
+            ("pytorch_model.bin", lambda p: torch.load(p, map_location="cpu")),
+        ]
+
+        for filename, load_fn in weight_files:
+            weight_path = adapter_path / filename
+            if weight_path.exists():
+                return load_fn(weight_path)
+
+        raise FileNotFoundError(
+            f"No adapter weights found in {adapter_path}. "
+            f"Expected one of: {', '.join(filename for filename, _ in weight_files)}"
+        )
+
+    def _init_lora_weights(self):
+        """Initialize LoRA adapter weights by loading and stacking them."""
+
+        lora_a_weights = []
+        lora_b_weights = []
+
+        for adapter in self.lora_config.adapters:
+            if self.projection_name not in adapter.target_modules:
+                # Create zero weights for adapters that don't target this projection
+                lora_a_weights.append(torch.zeros(adapter.r, self.in_features))
+                lora_b_weights.append(torch.zeros(self.out_features, adapter.r))
+                continue
+
+            adapter_weights = self._load_adapter_weights(adapter.local_adapter_path)
+
+            # Determine module type from projection name
+            attn_projs = {"q_proj", "k_proj", "v_proj", "o_proj"}
+            mlp_projs = {"gate_proj", "up_proj", "down_proj"}
+            if self.projection_name in attn_projs:
+                module_type = "self_attn"
+            elif self.projection_name in mlp_projs:
+                module_type = "mlp"
+            else:
+                module_type = "self_attn"
+
+            layer_key = f"base_model.model.model.layers.{self.layer_idx}.{module_type}.{self.projection_name}"
+            lora_a_key = f"{layer_key}.lora_A.weight"
+            lora_b_key = f"{layer_key}.lora_B.weight"
+
+            if lora_a_key in adapter_weights and lora_b_key in adapter_weights:
+                # Calculate scaling factor and fold it into lora_b
+                scaling = adapter.lora_alpha / adapter.r
+                if adapter.use_rslora:
+                    scaling = scaling / math.sqrt(adapter.r)
+                scaling = scaling * adapter.scaling_factor
+
+                lora_a_weights.append(adapter_weights[lora_a_key])
+                # scaling is pre-applied to lora_b_weights
+                lora_b_weights.append(adapter_weights[lora_b_key] * scaling)
+            else:
+                logger.warning(f"No LoRA weights found for {lora_a_key} or {lora_b_key}")
+                lora_a_weights.append(torch.zeros(adapter.r, self.in_features))
+                lora_b_weights.append(torch.zeros(self.out_features, adapter.r))
+
+        # Stack weights along adapter dimension
+        max_rank = self.lora_config.max_lora_rank
+
+        # Pad smaller ranks to max_rank
+        padded_lora_a = []
+        padded_lora_b = []
+
+        for i, (lora_a, lora_b) in enumerate(zip(lora_a_weights, lora_b_weights)):
+            current_rank = lora_a.shape[0]
+            if current_rank < max_rank:
+                # Pad with zeros
+                padded_a = torch.zeros(max_rank, self.in_features)
+                padded_b = torch.zeros(self.out_features, max_rank)
+                padded_a[:current_rank] = lora_a
+                padded_b[:, :current_rank] = lora_b
+                padded_lora_a.append(padded_a)
+                padded_lora_b.append(padded_b)
+            else:
+                padded_lora_a.append(lora_a)
+                padded_lora_b.append(lora_b)
+
+        lora_a_transposed = [lora_a.transpose(0, 1) for lora_a in padded_lora_a]  # [in_features, rank]
+        lora_b_transposed = [lora_b.transpose(0, 1) for lora_b in padded_lora_b]  # [rank, out_features]
+
+        self.register_buffer(
+            "lora_a_weights", torch.stack(lora_a_transposed, dim=0).to(self.weight.dtype)
+        )  # [num_adapters, in_features, rank]
+        self.register_buffer(
+            "lora_b_weights", torch.stack(lora_b_transposed, dim=0).to(self.weight.dtype)
+        )  # [num_adapters, rank, out_features]
+
+    def forward(self, x: torch.Tensor, lora_int_id: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """
+        Forward pass that combines base linear transformation with LoRA.
+
+        Args:
+            x: Input tensor [batch_size, seq_len, in_features]
+            lora_int_id: Adapter ID tensor [batch_size] indicating which adapter to use
+
+        Returns:
+            Output tensor [batch_size, seq_len, out_features]
+        """
+        # Base linear transformation
+        output = torch.nn.functional.linear(x, self.weight, self.bias)
+
+        # Apply LoRA if enabled and adapter ID is provided
+        if self._should_apply_lora() and lora_int_id is not None:
+            # Gather LoRA weights for each batch item
+            # lora_int_id: [batch_size] -> use as indices to select weights
+            selected_lora_a = self.lora_a_weights[lora_int_id]  # [batch_size, in_features, rank]
+            selected_lora_b = self.lora_b_weights[lora_int_id]  # [batch_size, rank, out_features]
+
+            # Batched matrix multiplication for LoRA computation
+            # x: [batch_size, seq_len, in_features]
+            # selected_lora_a: [batch_size, in_features, rank] (already transposed)
+            # selected_lora_b: [batch_size, rank, out_features] (already transposed)
+
+            # First matmul: x @ lora_a -> [batch_size, seq_len, rank]
+            temp = torch.bmm(x, selected_lora_a)
+
+            # Second matmul: temp @ lora_b -> [batch_size, seq_len, out_features]
+            lora_delta = torch.bmm(temp, selected_lora_b)
+
+            # Add LoRA delta to base output
+            output = output + lora_delta
+
+        return output
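The core of `LoRALinear.forward` is a per-batch adapter gather followed by two batched matmuls against the stacked, pre-transposed A/B buffers. A standalone sketch of that math with random tensors (shapes follow the buffer comments above; no RBLN-specific code involved, and the scaling factor is assumed to be pre-folded into B, as the loader does):

import torch

batch, seq, d_in, d_out, rank, num_adapters = 2, 4, 16, 32, 8, 3

x = torch.randn(batch, seq, d_in)
weight = torch.randn(d_out, d_in)                 # base weight, as in nn.Linear
lora_a = torch.randn(num_adapters, d_in, rank)    # stacked, already transposed
lora_b = torch.randn(num_adapters, rank, d_out)   # scaling assumed pre-folded into B

lora_int_id = torch.tensor([0, 2])                # one adapter id per batch item

base = torch.nn.functional.linear(x, weight)      # [batch, seq, d_out]
delta = torch.bmm(torch.bmm(x, lora_a[lora_int_id]), lora_b[lora_int_id])
out = base + delta

# Equivalent per-sample computation for batch item 0 using adapter 0:
ref = torch.nn.functional.linear(x[0], weight) + (x[0] @ lora_a[0]) @ lora_b[0]
print(torch.allclose(out[0], ref, atol=1e-5))     # True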
optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py

@@ -375,6 +375,9 @@ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
         if rbln_config.use_position_ids:
             input_info.append(("position_ids", [batch_size, query_length], "int32"))

+        if rbln_config.use_lora:
+            input_info.append(("lora_int_ids", [batch_size], "int32"))
+
         kvcache_dtype = rbln_config.torch_dtype
         if rbln_config.quantization and rbln_config.quantization.kv_caches == "fp8":
             kvcache_dtype = "float8_e4m3fn"

@@ -667,6 +670,53 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNDecoderOnlyModel, RBLNDecoderOnlyGener
     def use_query_position(cls, use_local_attention: bool, is_prefill: bool = True):
         return is_prefill

+    def set_lora_int_ids(self, lora_int_ids: Optional[torch.Tensor]):
+        if isinstance(lora_int_ids, int):
+            lora_int_ids = torch.tensor([lora_int_ids], dtype=torch.int32)
+        elif isinstance(lora_int_ids, list):
+            lora_int_ids = torch.tensor(lora_int_ids, dtype=torch.int32)
+
+        self.lora_int_ids = lora_int_ids
+
+        self.prefill_decoder.lora_int_ids = lora_int_ids
+        if self.rbln_config.can_generate:
+            for batch_size in self.rbln_config.decoder_batch_sizes:
+                self.decoders[batch_size].lora_int_ids = lora_int_ids
+
+    def set_adapter(self, adapter_name: Union[str, List[str]]) -> None:
+        """
+        Sets the active adapter(s) for the model using adapter name(s).
+
+        Args:
+            adapter_name (Union[str, List[str]]): The name(s) of the adapter(s) to be activated.
+                Can be a single adapter name or a list of adapter names.
+
+        Raises:
+            ValueError: If the model is not configured with LoRA or if the adapter name is not found.
+        """
+        if not hasattr(self.rbln_config, "lora_config") or self.rbln_config.lora_config is None:
+            raise ValueError("Model is not configured with LoRA. Cannot set adapter.")
+
+        # Convert single adapter name to list for uniform processing
+        if isinstance(adapter_name, str):
+            adapter_names = [adapter_name]
+        else:
+            adapter_names = adapter_name
+
+        # Validate that all adapter names exist
+        available_adapters = {
+            adapter.lora_name: adapter.lora_int_id for adapter in self.rbln_config.lora_config.adapters
+        }
+        missing_adapters = [name for name in adapter_names if name not in available_adapters]
+        if missing_adapters:
+            raise ValueError(
+                f"Adapter(s) {missing_adapters} not found. Available adapters: {list(available_adapters.keys())}"
+            )
+
+        # Get the adapter IDs and set them
+        lora_int_ids = [available_adapters[name] for name in adapter_names]
+        self.set_lora_int_ids(torch.tensor(lora_int_ids, dtype=torch.int32))
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,

@@ -677,6 +727,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNDecoderOnlyModel, RBLNDecoderOnlyGener
         padded_cache_lengths: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.Tensor] = None,
         token_type_ids: Optional[torch.Tensor] = None,
+        lora_int_ids: Optional[torch.Tensor] = None,
         return_dict: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> Tuple[torch.FloatTensor]:

@@ -684,6 +735,13 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNDecoderOnlyModel, RBLNDecoderOnlyGener
         # For continuous batching, the prefill stage processes one batch at a time and updates the KV cache using batch_idx.
         # A for-loop ensures synchronization with the HuggingFace generate API.
         # The decoder stage operates as usual, processing inputs in batch mode.
+        if self.rbln_config.use_lora and lora_int_ids is None:
+            if self.lora_int_ids is None:
+                raise ValueError(
+                    "lora_int_id is required when using LoRA. "
+                    "You should call set_lora_int_ids() before forward() or pass lora_int_id to forward()."
+                )
+            lora_int_ids = self.lora_int_ids

         # for only use forward
         if generate_idx is None:

@@ -708,6 +766,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNDecoderOnlyModel, RBLNDecoderOnlyGener
                     cache_position=cache_position,
                     batch_idx=b_idx,
                     token_type_ids=token_type_ids[b_idx : b_idx + 1] if token_type_ids is not None else None,
+                    lora_int_ids=lora_int_ids[b_idx : b_idx + 1] if lora_int_ids is not None else None,
                 )
                 padded_cache_lengths[b_idx] += output.padded_cache_lengths
                 logits.append(output.logits)

@@ -727,6 +786,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNDecoderOnlyModel, RBLNDecoderOnlyGener
                 inputs_embeds=inputs_embeds,
                 cache_position=cache_position,
                 position_ids=position_ids if self.rbln_config.use_position_ids else None,
+                lora_int_ids=lora_int_ids,
             ).logits

         if not return_dict: