optimum-rbln 0.9.4a2__py3-none-any.whl → 0.9.5a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. optimum/rbln/__init__.py +36 -0
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +35 -16
  4. optimum/rbln/modeling_base.py +6 -6
  5. optimum/rbln/ops/__init__.py +1 -0
  6. optimum/rbln/ops/attn.py +10 -0
  7. optimum/rbln/ops/flash_attn.py +8 -0
  8. optimum/rbln/ops/moe.py +180 -0
  9. optimum/rbln/ops/sliding_window_attn.py +9 -0
  10. optimum/rbln/transformers/__init__.py +36 -0
  11. optimum/rbln/transformers/modeling_attention_utils.py +118 -222
  12. optimum/rbln/transformers/modeling_outputs.py +25 -0
  13. optimum/rbln/transformers/modeling_rope_utils.py +78 -42
  14. optimum/rbln/transformers/models/__init__.py +28 -0
  15. optimum/rbln/transformers/models/bart/bart_architecture.py +24 -24
  16. optimum/rbln/transformers/models/colpali/colpali_architecture.py +14 -20
  17. optimum/rbln/transformers/models/colpali/configuration_colpali.py +12 -17
  18. optimum/rbln/transformers/models/colpali/modeling_colpali.py +66 -182
  19. optimum/rbln/transformers/models/colqwen2/configuration_colqwen2.py +38 -21
  20. optimum/rbln/transformers/models/colqwen2/modeling_colqwen2.py +107 -371
  21. optimum/rbln/transformers/models/decoderonly/__init__.py +2 -0
  22. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +118 -16
  23. optimum/rbln/transformers/models/decoderonly/configuration_lora.py +1 -1
  24. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +121 -48
  25. optimum/rbln/transformers/models/decoderonly/decoderonly_runtime_utils.py +5 -7
  26. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +75 -107
  27. optimum/rbln/transformers/models/exaone/exaone_architecture.py +0 -36
  28. optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -1
  29. optimum/rbln/transformers/models/gemma2/__init__.py +16 -0
  30. optimum/rbln/transformers/models/gemma2/configuration_gemma2.py +45 -0
  31. optimum/rbln/transformers/models/gemma2/gemma2_architecture.py +83 -0
  32. optimum/rbln/transformers/models/gemma2/modeling_gemma2.py +101 -0
  33. optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +16 -18
  34. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +1 -1
  35. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +8 -34
  36. optimum/rbln/transformers/models/gpt_oss/__init__.py +16 -0
  37. optimum/rbln/transformers/models/gpt_oss/configuration_gpt_oss.py +41 -0
  38. optimum/rbln/transformers/models/gpt_oss/gpt_oss_architecture.py +122 -0
  39. optimum/rbln/transformers/models/gpt_oss/modeling_gpt_oss.py +165 -0
  40. optimum/rbln/transformers/models/grounding_dino/configuration_grounding_dino.py +8 -5
  41. optimum/rbln/transformers/models/grounding_dino/grounding_dino_architecture.py +6 -4
  42. optimum/rbln/transformers/models/llava/modeling_llava.py +0 -1
  43. optimum/rbln/transformers/models/midm/midm_architecture.py +29 -22
  44. optimum/rbln/transformers/models/opt/opt_architecture.py +1 -44
  45. optimum/rbln/transformers/models/paligemma/__init__.py +16 -0
  46. optimum/rbln/transformers/models/paligemma/configuration_paligemma.py +129 -0
  47. optimum/rbln/transformers/models/paligemma/modeling_paligemma.py +564 -0
  48. optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +24 -24
  49. optimum/rbln/transformers/models/phi/phi_architecture.py +13 -21
  50. optimum/rbln/transformers/models/qwen2_5_vl/__init__.py +6 -1
  51. optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +11 -1
  52. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +271 -122
  53. optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +43 -39
  54. optimum/rbln/transformers/models/qwen2_moe/__init__.py +16 -0
  55. optimum/rbln/transformers/models/qwen2_moe/configuration_qwen2_moe.py +38 -0
  56. optimum/rbln/transformers/models/qwen2_moe/modeling_qwen2_moe.py +68 -0
  57. optimum/rbln/transformers/models/qwen2_moe/qwen2_moe_architecture.py +94 -0
  58. optimum/rbln/transformers/models/qwen2_vl/__init__.py +6 -1
  59. optimum/rbln/transformers/models/qwen2_vl/configuration_qwen2_vl.py +11 -1
  60. optimum/rbln/transformers/models/qwen2_vl/modeling_qwen2_vl.py +263 -105
  61. optimum/rbln/transformers/models/qwen2_vl/qwen2_vl_architecture.py +26 -34
  62. optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +7 -7
  63. optimum/rbln/transformers/models/qwen3_moe/__init__.py +16 -0
  64. optimum/rbln/transformers/models/qwen3_moe/configuration_qwen3_moe.py +38 -0
  65. optimum/rbln/transformers/models/qwen3_moe/modeling_qwen3_moe.py +68 -0
  66. optimum/rbln/transformers/models/qwen3_moe/qwen3_moe_architecture.py +100 -0
  67. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +14 -12
  68. optimum/rbln/transformers/models/siglip/modeling_siglip.py +4 -18
  69. optimum/rbln/transformers/models/swin/configuration_swin.py +1 -6
  70. optimum/rbln/transformers/models/t5/t5_architecture.py +15 -16
  71. optimum/rbln/transformers/models/time_series_transformer/time_series_transformers_architecture.py +0 -3
  72. optimum/rbln/transformers/models/whisper/whisper_architecture.py +0 -3
  73. optimum/rbln/transformers/utils/rbln_quantization.py +20 -12
  74. optimum/rbln/utils/import_utils.py +16 -1
  75. optimum/rbln/utils/runtime_utils.py +10 -6
  76. optimum/rbln/utils/submodule.py +24 -0
  77. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.9.5a4.dist-info}/METADATA +6 -6
  78. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.9.5a4.dist-info}/RECORD +81 -62
  79. optimum/rbln/transformers/models/colqwen2/colqwen2_architecture.py +0 -233
  80. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.9.5a4.dist-info}/WHEEL +0 -0
  81. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.9.5a4.dist-info}/entry_points.txt +0 -0
  82. {optimum_rbln-0.9.4a2.dist-info → optimum_rbln-0.9.5a4.dist-info}/licenses/LICENSE +0 -0
@@ -12,9 +12,10 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

+ from dataclasses import asdict, dataclass
  from typing import Any, Dict, List, Literal, Optional, Union, get_args

- from ....configuration_utils import RBLNModelConfig
+ from ....configuration_utils import RBLNModelConfig, RBLNSerializableConfigProtocol
  from ....utils.logging import get_logger
  from ...utils.rbln_quantization import RBLNQuantizationConfig
  from .configuration_lora import RBLNLoRAConfig
@@ -59,6 +60,7 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
  phases: Optional[List[PhaseType]] = None,
  logits_to_keep: Optional[int] = None,
  output_hidden_states: Optional[bool] = None,
+ kvcache_metas: Optional[List["KVCacheMeta"]] = None,
  **kwargs,
  ):
  """
@@ -93,8 +95,8 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
  processing input sequences. Defaults to 128. Must be a positive integer
  divisible by 64. Affects prefill performance and memory usage.
  kvcache_num_blocks (Optional[int]): The total number of blocks to allocate for the
- PagedAttention KV cache. See the "KV Cache Number of Blocks (`kvcache_num_blocks`)"
- section below for details.
+ PagedAttention KV cache at compile time. Defaults to 0 (automatically determined).
+ See the "KV Cache Number of Blocks (`kvcache_num_blocks`)" section below for details.
  decoder_batch_sizes (Optional[List[int]]): A list of batch sizes for which separate decoder models will be compiled.
  This allows the model to handle varying batch sizes efficiently during generation. If not specified,
  defaults to a list containing only the model's main batch size. When specifying multiple batch sizes:
@@ -114,6 +116,7 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
  logits_to_keep (Optional[int]): The number of logits to keep for the decoder. If set to 0, the decoder will keep all logits.
  Defaults to 0 if DecoderOnlyModel is used, 1 if DecoderOnlyModelForCausalLM is used.
  output_hidden_states (Optional[bool]): Whether to output the hidden states of the decoder. Defaults to False.
+ kvcache_metas (Optional[List["KVCacheMeta"]]): The metadata for the KV cache tensors. Handled internally if not provided. Defaults to None.
  kwargs: Additional arguments passed to the parent RBLNModelConfig.

  Raises:
@@ -152,17 +155,15 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):


  KV Cache Number of Blocks:
- `kvcache_num_blocks` controls the total number of memory blocks allocated for the PagedAttention KV cache.
- Each block holds `kvcache_block_size` tokens of Key and Value states.
-
- - **Automatic Estimation (Default)**: If `kvcache_num_blocks` is `None`, the system estimates
- the maximum number of blocks that can fit into the available RBLN device memory. This
- calculation considers the model size (kernel memory), required buffer memory, the number
- of layers and heads, `kvcache_block_size`, tensor parallelism, and available RBLN NPU DRAM.
- This aims to maximize cache capacity for potentially better performance with long sequences
- or larger batches without manual tuning.
- - **Manual Setting**: You can explicitly set the number of blocks. This provides finer control
- but requires careful consideration of memory limits. Setting it too high may lead to
+ `kvcache_num_blocks` controls the total number of memory blocks allocated for the PagedAttention KV cache
+ at compile time. Each block holds `kvcache_block_size` tokens of Key and Value states.
+
+ - **Automatic Determination (Default)**: If `kvcache_num_blocks` is `0` (default), the number of blocks
+ is automatically determined during compilation to fit within the available DRAM on the NPU. This allows
+ the model to utilize the remaining memory after compilation without manual tuning, providing optimal
+ cache capacity for better performance with long sequences or larger batches.
+ - **Manual Setting**: You can explicitly set the number of blocks to a positive integer. This provides
+ finer control but requires careful consideration of memory limits. Setting it too high may lead to
  compilation errors if it exceeds available memory. The system will issue warnings if your
  setting exceeds the estimated maximum.
  - **Performance Impact**: A larger number of blocks reduces the likelihood of cache eviction,
@@ -175,7 +176,8 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
  are violated (e.g., if `kvcache_num_blocks` is less than `batch_size` when using Flash Attention).

  The optimal value depends on the specific model, task, hardware, and desired trade-off
- between performance and memory usage. The automatic estimation provides a robust starting point.
+ between performance and memory usage. Automatic determination (default) provides a robust starting point
+ that adapts to the available DRAM on the NPU at compile time.
  """

  super().__init__(**kwargs)
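To make the two modes above concrete, here is a minimal sketch of a user-facing configuration. It assumes the public `RBLNLlamaForCausalLMConfig` export (the class name is taken from the LoRA docstring example later in this diff); the sequence length, batch size, and block count are illustrative values, not defaults.

```python
# Sketch only; class name taken from the LoRA docstring example in this diff,
# values are illustrative, not package defaults.
from optimum.rbln import RBLNLlamaForCausalLMConfig

# Automatic determination (default): leave kvcache_num_blocks unset, and the
# compiler sizes the PagedAttention cache to fit the DRAM left on the NPU.
auto_cfg = RBLNLlamaForCausalLMConfig(max_seq_len=8192, batch_size=1)
print(auto_cfg.kvcache_num_blocks)  # 0 -> decided at compile time

# Manual setting: pin the block count; too large a value can fail compilation.
manual_cfg = RBLNLlamaForCausalLMConfig(max_seq_len=8192, batch_size=1, kvcache_num_blocks=64)
print(manual_cfg.kvcache_num_blocks)  # 64
```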
@@ -222,7 +224,7 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
  if self.prefill_chunk_size % 64 != 0 or self.prefill_chunk_size <= 0:
  raise ValueError("`prefill_chunk_size` must be a positive integer divisible by 64.")

- self.kvcache_num_blocks = kvcache_num_blocks
+ self.kvcache_num_blocks = kvcache_num_blocks if kvcache_num_blocks is not None else 0
  self.cache_impl = cache_impl or "static"
  self.sliding_window = sliding_window
  self.sliding_window_layers = sliding_window_layers or []
@@ -257,6 +259,8 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
  # Larger batch size should be at the beginning of the list.
  self.decoder_batch_sizes.sort(reverse=True)

+ self.kvcache_metas: List["KVCacheMeta"] = kvcache_metas or []
+
  @staticmethod
  def validate_phases_type(phases: List[PhaseType]):
  if not isinstance(phases, list):
@@ -290,6 +294,21 @@ class RBLNDecoderOnlyModelConfig(RBLNModelConfig):
  return self.quantization.nbits_per_param
  return 16

+ @property
+ def is_auto_num_blocks(self) -> bool:
+ """Returns True if kvcache_num_blocks will be automatically determined during compilation to fit within the available DRAM on the NPU."""
+ return self.kvcache_num_blocks == 0
+
+ @property
+ def num_full_blocks(self) -> int:
+ return (self.max_seq_len // self.kvcache_block_size) * self.batch_size
+
+ @property
+ def num_min_blocks(self) -> int:
+ if self.attn_impl == "flash_attn":
+ return min(self.max_seq_len // self.kvcache_block_size + 1, self.num_full_blocks)
+ return self.batch_size
+

  class RBLNDecoderOnlyModelForCausalLMConfig(RBLNDecoderOnlyModelConfig):
  """
@@ -302,3 +321,86 @@ class RBLNDecoderOnlyModelForCausalLMConfig(RBLNDecoderOnlyModelConfig):

  _default_phases = ["prefill", "decode"]
  _default_logits_to_keep = 1
+
+
+ @dataclass
+ class KVCacheMeta(RBLNSerializableConfigProtocol):
+ """
+ KVCacheMeta contains metadata describing the key-value (KV) cache tensor for a specific transformer layer.
+
+ This is used during compilation and runtime on RBLN devices to manage memory and configure the
+ static or dynamic characteristics of the cache implementation for decoder-only models.
+
+ Attributes:
+ name (str): Logical name of the KV cache tensor.
+ layer_index (int): Index of the transformer layer corresponding to this cache.
+ shape (list[int]): The 4D shape of the cache tensor:
+ [num_blocks, num_heads, block_size, head_dim]. The number of blocks may be dynamic or static
+ depending on model configuration.
+ layer_type (str): String describing the attention/cache algorithm (e.g., "full_attention", "sliding_attention").
+ is_auto (bool): Whether the number of blocks is automatically determined during compilation (True) or manually specified (False).
+ In both cases, the KV cache size is fixed at compile time.
+ dtype (str): Data type of the cache buffer ("float16", "float32", etc.).
+ """
+
+ name: str
+ layer_index: int
+ shape: list[int] # (num_blocks, num_heads, block_size(seq), head_dim)
+ layer_type: str
+ is_auto: bool
+ dtype: str
+
+ def _prepare_for_serialization(self) -> dict[str, Any]:
+ return asdict(self)
+
+ @property
+ def compile_shape(self):
+ return [1, self.shape[1], self.shape[2], self.shape[3]] if self.can_resize else self.shape
+
+ @property
+ def can_resize(self):
+ return self.is_auto and self.layer_type == "full_attention"
+
+ @property
+ def num_blocks(self) -> int:
+ return self.shape[0]
+
+ @property
+ def block_size(self) -> int:
+ return self.shape[2]
+
+ @staticmethod
+ def make(
+ name: str,
+ layer_index: int,
+ num_key_value_heads: int,
+ head_dim: int,
+ dtype: str,
+ rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
+ ) -> "KVCacheMeta":
+ assert len(rbln_config.compile_cfgs) == 0, "KVCacheMeta cannot be created from rbln_config with compile_cfgs"
+
+ if rbln_config.sliding_window is not None and layer_index in rbln_config.sliding_window_layers:
+ layer_type = "sliding_attention"
+ block_size = rbln_config.sliding_window
+ num_blocks = rbln_config.batch_size
+ is_auto = False
+
+ else:
+ layer_type = "full_attention"
+ block_size = rbln_config.kvcache_block_size
+
+ if rbln_config.is_auto_num_blocks:
+ num_blocks = rbln_config.num_full_blocks
+ is_auto = True
+ else:
+ num_blocks = rbln_config.kvcache_num_blocks
+ is_auto = False
+
+ shape = [num_blocks, num_key_value_heads, block_size, head_dim]
+ if num_blocks <= 0:
+ raise ValueError("`num_blocks` must be greater than 0 when using KV cache.")
+
+ return KVCacheMeta(
+ name=name, layer_index=layer_index, shape=shape, layer_type=layer_type, is_auto=is_auto, dtype=dtype
+ )
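A brief standalone sketch of the new `KVCacheMeta`. The import path is an assumption based on where this hunk appears to live (`configuration_decoderonly.py`), and the field values are illustrative.

```python
# Assumed import path; KVCacheMeta itself is introduced in the hunk above.
from optimum.rbln.transformers.models.decoderonly.configuration_decoderonly import KVCacheMeta

meta = KVCacheMeta(
    name="past_key_0",            # illustrative tensor name
    layer_index=0,
    shape=[16, 8, 1024, 128],     # [num_blocks, num_kv_heads, block_size, head_dim]
    layer_type="full_attention",
    is_auto=True,                 # block count decided at compile time
    dtype="float16",
)

print(meta.num_blocks, meta.block_size)   # 16 1024
print(meta.can_resize)                    # True: auto + full attention
print(meta.compile_shape)                 # [1, 8, 1024, 128] when resizable
print(meta._prepare_for_serialization())  # plain dict via dataclasses.asdict
```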
@@ -46,7 +46,7 @@ class RBLNLoRAAdapterConfig(RBLNSerializableConfigProtocol):
  model = RBLNLlamaForCausalLM.from_pretrained(
  model_id,
  rbln_config=RBLNLlamaForCausalLMConfig(lora_config=lora_config, tensor_parallel_size=tp_size, max_seq_len=8192),
- torch_dtype="auto",
+ dtype="auto",
  )


@@ -75,7 +75,7 @@ class DecoderOnlyWrapper(nn.Module):
  f" or equal to max_seq_len({rbln_config.max_seq_len})!"
  )

- self.model = self.convert_to_rbln_class(model, rbln_config.max_seq_len)
+ self.model = self.convert_to_rbln_class(model, rbln_config.max_seq_len, use_rotary_emb)
  self.num_hidden_layers = getattr(self.config, "num_hidden_layers", None) or self.config.n_layer
  self._phase = "prefill"

@@ -103,7 +103,7 @@ class DecoderOnlyWrapper(nn.Module):
  def get_rbln_causal_lm_class(self):
  return DecoderOnlyForCausalLM

- def convert_to_rbln_class(self, model: PreTrainedModel, max_seq_len: int):
+ def convert_to_rbln_class(self, model: PreTrainedModel, max_seq_len: int, use_rotary_emb: bool):
  new_layers = []
  for layer_idx, layer in enumerate(self.get_decoder_layers(model)):
  is_sliding = layer_idx in self.rbln_config.sliding_window_layers
@@ -118,6 +118,7 @@ class DecoderOnlyWrapper(nn.Module):
  new_layers,
  self.rbln_config,
  use_learned_pos_emb=self.__class__._use_learned_pos_emb,
+ use_rotary_emb=use_rotary_emb,
  )

  if self.is_causal_lm:
@@ -144,8 +145,11 @@ class DecoderOnlyWrapper(nn.Module):
  local_block_tables = args.pop(0) if self.rbln_config.use_local_attention else None
  query_position = (
  args.pop(0)
- # query_position usage: 1. causal_lm prefill or 2. sliding_window cache_position
- if ("prefill" in self.phase and (self.is_causal_lm or self.rbln_config.use_local_attention))
+ # query_position usage: prefill & (logits_to_keep == 1 or use_local_attention)
+ if (
+ "prefill" in self.phase
+ and (self.rbln_config.logits_to_keep == 1 or self.rbln_config.use_local_attention)
+ )
  else None
  )
  attention_mask = args.pop(0) if self.rbln_config.use_attention_mask else None
@@ -240,7 +244,6 @@ class DecoderOnlyForCausalLM(nn.Module):

  Attributes:
  config: Configuration from the original causal language model
- _original_mod: Reference to the original model for components like lm_head
  model: RBLN-optimized decoder model instance
  _phase: Current processing phase ("prefill" or "decode")
  """
@@ -248,10 +251,9 @@ class DecoderOnlyForCausalLM(nn.Module):
  def __init__(self, causal_lm: PreTrainedModel, model: nn.Module):
  super().__init__()
  self.config = causal_lm.config
- self._original_mod = causal_lm
  self.model = model
  self._phase = "prefill"
- self.lm_head = self._original_mod.lm_head
+ self.lm_head = causal_lm.lm_head

  @property
  def phase(self):
@@ -293,7 +295,7 @@ class DecoderOnlyForCausalLM(nn.Module):
  output_hidden_states=output_hidden_states,
  )

- if "prefill" in self.phase:
+ if "prefill" in self.phase and query_position is not None:
  hidden_states = hidden_states[:, query_position.to(torch.int).unsqueeze(0)]

  logits = self.lm_head(hidden_states)
@@ -317,20 +319,35 @@ class DecoderOnlyModel(nn.Module):
  use_learned_pos_emb: Whether to use learned position embeddings (class-specific override)

  Attributes:
- _original_mod: Reference to original Huggingface model
  layers: ModuleList of RBLN-optimized transformer layers
  _phase: Current processing phase ("prefill" or "decode")
  """

+ _EMBEDDING_ATTRS = ["embed_tokens", "wte"]
+ _POSITION_ATTRS = ["embed_positions", "wpe"]
+ _LAYERNORM_ATTRS = ["norm", "final_layer_norm", "final_layernorm", "ln_f", "layer_norm"]
+ _PRE_FF_LAYERNORM_ATTRS = None
+ _POST_FF_LAYERNORM_ATTRS = None
+
  def __init__(
  self,
  model,
  layers: List["DecoderOnlyLayer"],
  rbln_config: "RBLNDecoderOnlyModelConfig",
  use_learned_pos_emb=None,
+ use_rotary_emb=True,
  ):
  super().__init__()
- self._original_mod = model
+ self.config = model.config
+ # Keep commonly-used original submodules registered on this wrapper so their weights
+ # are preserved in state_dict even if the original model object is not kept.
+ # Different HF model families use different attribute names; we register what we can
+ # and allow subclasses to override getters when needed.
+ self.embed_tokens = _get_attr_from_candidates(model, self._EMBEDDING_ATTRS)
+ # hasattr(model, "rotary_emb") is workaround for Qwen2VL
+ if not (use_rotary_emb or hasattr(model, "rotary_emb")):
+ self.embed_positions = _get_attr_from_candidates(model, self._POSITION_ATTRS)
+ self.norm = _get_attr_from_candidates(model, self._LAYERNORM_ATTRS)
  self.layers = nn.ModuleList(layers)
  self.rbln_config = rbln_config
  self._phase = "prefill"
@@ -369,26 +386,28 @@ class DecoderOnlyModel(nn.Module):
  cache_pos_for_partitions = torch.clamp(cs - pidx * partition_len, 0, partition_len)
  return cache_pos_for_partitions

- def get_local_cache_positions(self, position_ids, query_position):
- max_cache_len = self._original_mod.config.sliding_window
+ def get_swa_custom_op_args(self, position_ids, query_position):
+ max_cache_len = self.config.sliding_window
  valid_input_len = 1 if query_position is None else query_position + 1
- cache_seq_len = torch.clamp(position_ids, max=max_cache_len)[:, :1] # past seen tokens
+ cache_seq_len = torch.clamp(position_ids.to(torch.int32), max=max_cache_len)[:, :1] # past seen tokens
  cache_offset = (
  torch.clamp(position_ids, max=max_cache_len)[:, :1] + valid_input_len
  ) # cache offset for next steps

- return cache_seq_len, cache_offset
+ # Causal mask for sliding window attention
+ attn_mask = torch.arange(max_cache_len)[None, :] - cache_seq_len
+ attn_mask = torch.where(attn_mask > 0, 0.0, 1.0)[:, None, None, :]
+
+ return cache_seq_len, cache_offset, attn_mask

  def get_last_layernorm(self) -> nn.LayerNorm:
- return self._original_mod.norm
+ return self.norm

  def get_embedding(self) -> nn.Embedding:
- return self._original_mod.embed_tokens
+ return self.embed_tokens

  def get_pos_embedding(self) -> nn.Embedding:
- raise NotImplementedError(
- "The 'get_pos_embedding' method is not implemented. Please define this method in a subclass."
- )
+ return self.embed_positions

  def forward(
  self,
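The decode-phase mask that `get_swa_custom_op_args` now builds can be reproduced in isolation. A toy run with a window of 8 and 5 previously seen tokens (values chosen only for illustration):

```python
import torch

max_cache_len = 8                                   # sliding window size (illustrative)
cache_seq_len = torch.tensor([[5]])                 # tokens already in the window, batch of 1

# Same arithmetic as get_swa_custom_op_args in the hunk above:
attn_mask = torch.arange(max_cache_len)[None, :] - cache_seq_len
attn_mask = torch.where(attn_mask > 0, 0.0, 1.0)[:, None, None, :]

print(attn_mask.shape)      # torch.Size([1, 1, 1, 8])
print(attn_mask[0, 0, 0])   # tensor([1., 1., 1., 1., 1., 1., 0., 0.])
```

Window positions up to and including the current token get 1.0, the rest get 0.0, which is what the `torch.where(attn_mask > 0, 0.0, 1.0)` line encodes.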
@@ -464,7 +483,8 @@ class DecoderOnlyModel(nn.Module):

  # Get local cache positions for sliding window layers
  if len(self.sliding_window_layers) > 0:
- sliding_cache_pos = self.get_local_cache_positions(position_ids, query_position)
+ cache_seq_len, cache_offset, swa_attn_mask = self.get_swa_custom_op_args(position_ids, query_position)
+ sliding_cache_pos = (cache_seq_len, cache_offset)

  all_hidden_states = () if output_hidden_states else None
  for layer_idx, layer in enumerate(self.layers):
@@ -472,9 +492,10 @@ class DecoderOnlyModel(nn.Module):
  all_hidden_states += (hidden_states,)

  is_sliding = True if layer_idx in self.sliding_window_layers else False
+ is_sliding_decode = is_sliding and self.phase == "decode"
  hidden_states = layer(
  hidden_states=hidden_states,
- attention_mask=attention_mask,
+ attention_mask=swa_attn_mask if is_sliding_decode else attention_mask,
  seq_positions=sliding_cache_pos if is_sliding else seq_positions,
  past_key_values=past_key_values,
  cos=cos,
@@ -510,14 +531,23 @@ class DecoderOnlyLayer(nn.Module):
  self_attn (DecoderOnlyAttention): Modified attention module optimized for RBLN

  Attributes:
- _original_mod: Reference to original layer for accessing components
  self_attn: Modified attention mechanism mapped to RBLN ops at compile time
  phase: Current operation phase ("prefill" or "decode")
  """

+ _PRE_ATTN_LAYERNORM = ["input_layernorm", "ln_1", "self_attn_layer_norm", "pre_feedforward_layernorm"]
+ _POST_ATTN_LAYERNORM = ["post_attention_layernorm", "ln_2", "final_layer_norm", "post_feedforward_layernorm"]
+ _PRE_FF_LAYERNORM_ATTRS = None
+ _POST_FF_LAYERNORM_ATTRS = None
+
  def __init__(self, layer, self_attn: "DecoderOnlyAttention", lora_config: Optional[RBLNLoRAConfig] = None):
  super().__init__()
- self._original_mod = layer
+
+ self.pre_attention_layernorm = _get_attr_from_candidates(layer, self._PRE_ATTN_LAYERNORM)
+ self.post_attention_layernorm = _get_attr_from_candidates(layer, self._POST_ATTN_LAYERNORM)
+ self.pre_feedforward_layernorm = _get_attr_from_candidates(layer, self._PRE_FF_LAYERNORM_ATTRS)
+ self.post_feedforward_layernorm = _get_attr_from_candidates(layer, self._POST_FF_LAYERNORM_ATTRS)
+ self.mlp = layer.mlp
  self.self_attn = self_attn
  self._phase = "prefill"
  self.lora_config = lora_config
@@ -547,13 +577,19 @@ class DecoderOnlyLayer(nn.Module):
  self.self_attn.phase = phase

  def get_pre_attention_layernorm(self) -> nn.LayerNorm:
- return self._original_mod.input_layernorm
+ return self.pre_attention_layernorm

  def get_post_attention_layernorm(self) -> nn.LayerNorm:
- return self._original_mod.post_attention_layernorm
+ return self.post_attention_layernorm
+
+ def get_pre_feedforward_layernorm(self) -> nn.LayerNorm:
+ return self.pre_feedforward_layernorm
+
+ def get_post_feedforward_layernorm(self) -> nn.LayerNorm:
+ return self.post_feedforward_layernorm

  def get_mlp(self) -> nn.Module:
- return self._original_mod.mlp
+ return self.mlp

  def forward_mlp(self, hidden_states: torch.Tensor, lora_int_id: Optional[torch.Tensor] = None) -> torch.Tensor:
  mlp = self.get_mlp()
@@ -619,6 +655,8 @@ class DecoderOnlyAttention(nn.Module):
  is_sliding: Whether this is sliding window attention
  """

+ _O_PROJ_ATTRS = ["o_proj", "out_proj", "dense"]
+
  def __init__(
  self,
  self_attn,
@@ -626,20 +664,18 @@ class DecoderOnlyAttention(nn.Module):
  is_sliding=False,
  ):
  super().__init__()
- self._original_mod = self_attn
+ self.config = getattr(self_attn, "config", None)
  self.rbln_config = rbln_config
  self.layer_idx = self_attn.layer_idx
- self.num_heads = (
- getattr(self._original_mod, "num_heads", None) or self._original_mod.config.num_attention_heads
- )
- self.head_dim = self._original_mod.head_dim
+ self.num_heads = getattr(self_attn, "num_heads", None) or self_attn.config.num_attention_heads
+ self.head_dim = self_attn.head_dim
  self._phase = "prefill"
- self.scale = torch.nn.Parameter(torch.tensor(self.get_attn_scale()))
+ self.scale = torch.nn.Parameter(torch.tensor(self.get_attn_scale(self_attn)))

- if hasattr(self._original_mod, "num_key_value_heads"):
- self.num_key_value_heads = self._original_mod.num_key_value_heads
- elif hasattr(self._original_mod, "config") and hasattr(self._original_mod.config, "num_key_value_heads"):
- self.num_key_value_heads = self._original_mod.config.num_key_value_heads
+ if hasattr(self_attn, "num_key_value_heads"):
+ self.num_key_value_heads = self_attn.num_key_value_heads
+ elif hasattr(self_attn, "config") and hasattr(self_attn.config, "num_key_value_heads"):
+ self.num_key_value_heads = self_attn.config.num_key_value_heads
  else:
  self.num_key_value_heads = self.num_heads

@@ -649,13 +685,16 @@ class DecoderOnlyAttention(nn.Module):
  self.kvcache_block_size = rbln_config.sliding_window if is_sliding else rbln_config.kvcache_block_size
  self.lora_config = rbln_config.lora_config

+ if hasattr(self_attn, "sinks"):
+ self.sinks = self_attn.sinks.data[:, None]
+
  setattr(self, self.get_attention_name(), self.create_attention_op())
- self.__post_init__()
+ self.__post_init__(self_attn)

  def _init_lora_weights(self):
  """Initialize LoRA adapter weights by replacing linear layers with LoRALinear."""
  for proj_name in ["q_proj", "k_proj", "v_proj", "o_proj"]:
- original_linear = getattr(self._original_mod, proj_name)
+ original_linear = getattr(self, proj_name)
  lora_linear = LoRALinear(
  original_linear=original_linear,
  lora_config=self.lora_config,
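For context on the new `sinks` handling above: the `[:, None]` indexing turns a 1-D per-head sink parameter (as used by GPT-OSS-style attention) into a column so it can broadcast against per-head scores when passed as `s_aux`. A minimal torch illustration with a made-up head count:

```python
import torch

num_heads = 4                                        # made-up value
sinks = torch.nn.Parameter(torch.zeros(num_heads))   # 1-D per-head sink logits

s_aux = sinks.data[:, None]                          # same reshape as in the hunk above
print(sinks.shape, s_aux.shape)                      # torch.Size([4]) torch.Size([4, 1])
```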
@@ -712,16 +751,15 @@ class DecoderOnlyAttention(nn.Module):
  else:
  raise NotImplementedError(f"Unknown attention implementation: {self.attn_impl}")

- def __post_init__(self):
+ def __post_init__(self, self_attn=None):
+ self.q_proj = self_attn.q_proj
+ self.k_proj = self_attn.k_proj
+ self.v_proj = self_attn.v_proj
+ self.o_proj = _get_attr_from_candidates(self_attn, self._O_PROJ_ATTRS)
+
  # Initialize LoRA weights if configured, which will replace linear layers
  if self.lora_config:
  self._init_lora_weights()
- else:
- # Use original linear layers if no LoRA
- self.q_proj = self._original_mod.q_proj
- self.k_proj = self._original_mod.k_proj
- self.v_proj = self._original_mod.v_proj
- self.o_proj = self._original_mod.o_proj

  def projection(
  self, hidden_states, lora_int_id: Optional[torch.Tensor] = None
@@ -752,8 +790,8 @@ class DecoderOnlyAttention(nn.Module):
  def apply_rotary_pos_embed(self, query_states, key_states, cos, sin):
  return apply_rotary_pos_emb(query_states, key_states, cos, sin)

- def get_attn_scale(self):
- return 1 / math.sqrt(self.head_dim)
+ def get_attn_scale(self, self_attn):
+ return 1 / math.sqrt(self_attn.head_dim)

  def maybe_get_kvcache_scale(self) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
  if hasattr(self, "k_proj") and hasattr(self, "v_proj"):
@@ -810,6 +848,7 @@ class DecoderOnlyAttention(nn.Module):
  block_size=self.kvcache_block_size,
  k_scale=k_scale,
  v_scale=v_scale,
+ s_aux=getattr(self, "sinks", None),
  )

  # Check if using LoRALinear (which accepts lora_int_id) or standard linear layers
@@ -882,6 +921,7 @@ class AttentionOp(nn.Module):
  block_size: int,
  k_scale: Optional[torch.Tensor] = None,
  v_scale: Optional[torch.Tensor] = None,
+ s_aux: Optional[torch.Tensor] = None,
  ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
  """Compute attention with static shapes and explicit cache management.

@@ -898,6 +938,7 @@ class AttentionOp(nn.Module):
  block_size: Block size for paged attention
  k_scale: Scale applied to key
  v_scale: Scale applied to value
+ s_aux: Auxiliary states for attention sinks

  Returns:
  Tensor: attention_output: [batch, num_heads, seq_len, head_dim]
@@ -953,6 +994,9 @@ class AttentionOp(nn.Module):
  op_args["k_scale"] = k_scale
  op_args["v_scale"] = v_scale

+ if s_aux is not None:
+ op_args["s_aux"] = s_aux
+
  attn_op_name = self.get_attn_op_name()
  attn_op = getattr(torch.ops.rbln_custom_ops, attn_op_name, None)
  if attn_op is None:
@@ -1017,6 +1061,7 @@ class FlashAttentionOp(AttentionOp):
  block_size,
  k_scale=None,
  v_scale=None,
+ s_aux=None,
  ):
  # reshape for removing repeat_kv (batch=1 , num_head, 1, q_len=1, head_dim)
  key_state = key_state.unsqueeze(2)
@@ -1070,6 +1115,9 @@ class FlashAttentionOp(AttentionOp):
  op_args["k_scale"] = k_scale
  op_args["v_scale"] = v_scale

+ if s_aux is not None:
+ op_args["s_aux"] = s_aux
+
  attn_op_name = self.get_attn_op_name()
  attn_op = getattr(torch.ops.rbln_custom_ops, attn_op_name, None)
  if attn_op is None:
@@ -1122,6 +1170,7 @@ class SlidingWindowAttentionOp(AttentionOp):
  block_size: int,
  k_scale: Optional[torch.Tensor] = None,
  v_scale: Optional[torch.Tensor] = None,
+ s_aux: Optional[torch.Tensor] = None,
  ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
  assert self.quantization is None, "Sliding window attention does not support quantization"
  assert k_scale is None and v_scale is None, "Sliding window attention does not support quantization"
@@ -1165,6 +1214,11 @@ class SlidingWindowAttentionOp(AttentionOp):
  op_args["is_bidirectional"] = True
  else:
  op_args["is_bidirectional"] = False
+ elif self.phase == "decode":
+ op_args["attn_mask"] = attn_mask
+
+ if s_aux is not None:
+ op_args["s_aux"] = s_aux

  attn_op_name = self.get_attn_op_name()
  attn_op = getattr(torch.ops.rbln_custom_ops, attn_op_name, None)
@@ -1194,7 +1248,7 @@ class RotaryEmbedding(nn.Module):
  else:
  rope_type = "default"

- inv_freq, attention_scaling = ROPE_INIT_FUNCTIONS[rope_type](config, max_seq_len_cached)
+ inv_freq, attention_scaling = ROPE_INIT_FUNCTIONS[rope_type](config, "cpu", max_seq_len_cached)
  cache_position = torch.arange(0, max_seq_len_cached)
  cache_position_expanded = cache_position[:, None]

@@ -1271,3 +1325,22 @@ def apply_rotary_pos_emb_partial(query_states, key_states, cos, sin, ndim) -> Tu
  query_states = torch.cat((query_rot, query_pass), dim=-1)
  key_states = torch.cat((key_rot, key_pass), dim=-1)
  return query_states, key_states
+
+
+ def _get_attr_from_candidates(
+ src: object,
+ candidates: Optional[List[str]] = None,
+ ):
+ """
+ Get an attribute from a list of candidate names.
+
+ - If `candidates` is None, this attribute is treated as optional and returns None.
+ - Otherwise, returns `getattr(src, name)` for the first `name` in `candidates` that exists on `src`.
+ - Raises AttributeError if `candidates` is provided but none of the names exist on `src`.
+ """
+ if candidates is None:
+ return None
+ for name in candidates:
+ if hasattr(src, name):
+ return getattr(src, name)
+ raise AttributeError(f"None of the attributes {candidates} exist in {src}")
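A self-contained sketch of the candidate-lookup pattern this helper enables across differently named HF modules; the helper body is copied from the hunk above, and the toy modules are hypothetical stand-ins for real model families.

```python
from typing import List, Optional
import torch.nn as nn


def _get_attr_from_candidates(src: object, candidates: Optional[List[str]] = None):
    # Copied from the hunk above.
    if candidates is None:
        return None
    for name in candidates:
        if hasattr(src, name):
            return getattr(src, name)
    raise AttributeError(f"None of the attributes {candidates} exist in {src}")


class LlamaStyle(nn.Module):      # hypothetical: models exposing embed_tokens
    def __init__(self):
        super().__init__()
        self.embed_tokens = nn.Embedding(10, 4)


class Gpt2Style(nn.Module):       # hypothetical: models exposing wte instead
    def __init__(self):
        super().__init__()
        self.wte = nn.Embedding(10, 4)


_EMBEDDING_ATTRS = ["embed_tokens", "wte"]
print(_get_attr_from_candidates(LlamaStyle(), _EMBEDDING_ATTRS))  # resolves embed_tokens
print(_get_attr_from_candidates(Gpt2Style(), _EMBEDDING_ATTRS))   # falls back to wte
print(_get_attr_from_candidates(Gpt2Style(), None))               # optional attribute -> None
```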
@@ -177,7 +177,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
  dec_attn_mask: torch.Tensor,
  page_table_manager: RBLNPageTableManager,
  rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
- config: "PreTrainedConfig" = None,
+ config: Optional["PreTrainedConfig"] = None,
  logits_last_dim: Optional[int] = None,
  **kwargs: Any,
  ) -> None:
@@ -391,16 +391,14 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
  # Initialize attention mask for chunked processing
  if self.rbln_config.use_attention_mask:
  if self.rbln_config.use_position_ids:
- chunked_attention_mask = torch.zeros(
- 1, self.rbln_config.max_seq_len, dtype=self.rbln_config.torch_dtype
- )
+ chunked_attention_mask = torch.zeros(1, self.rbln_config.max_seq_len, dtype=self.rbln_config.dtype)
  else:
  chunked_attention_mask = torch.zeros(
  1,
  1,
  self.rbln_config.prefill_chunk_size,
  self.rbln_config.max_seq_len,
- dtype=self.rbln_config.torch_dtype,
+ dtype=self.rbln_config.dtype,
  )
  else:
  chunked_attention_mask = None
@@ -467,7 +465,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
  1 if self.rbln_config.logits_to_keep == 1 else padded_mask_length,
  logits_last_dim,
  )
- output_logits = torch.full(logits_size, fill_value=1e-10, dtype=self.rbln_config.torch_dtype)
+ output_logits = torch.full(logits_size, fill_value=1e-10, dtype=self.rbln_config.dtype)

  if self.rbln_config.logits_to_keep == 1:
  for i in range(padded_input_length // self.rbln_config.prefill_chunk_size):
@@ -486,7 +484,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
  self.config.hidden_size,
  )
  output_hidden_states = [
- torch.full(hidden_states_size, fill_value=1e-10, dtype=self.rbln_config.torch_dtype)
+ torch.full(hidden_states_size, fill_value=1e-10, dtype=self.rbln_config.dtype)
  for _ in range(self.config.num_hidden_layers + 1)
  ]