optimum-rbln 0.8.1rc0__py3-none-any.whl → 0.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of optimum-rbln might be problematic.

Files changed (120)
  1. optimum/rbln/__init__.py +58 -9
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +24 -5
  4. optimum/rbln/diffusers/configurations/models/__init__.py +1 -1
  5. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +2 -2
  6. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl_cosmos.py +5 -3
  7. optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +2 -2
  8. optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +2 -2
  9. optimum/rbln/diffusers/configurations/models/{configuration_cosmos_transformer.py → configuration_transformer_cosmos.py} +7 -2
  10. optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +2 -2
  11. optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +2 -2
  12. optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +2 -2
  13. optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +3 -3
  14. optimum/rbln/diffusers/configurations/pipelines/configuration_cosmos.py +10 -6
  15. optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +4 -4
  16. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +2 -2
  17. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +2 -2
  18. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +2 -2
  19. optimum/rbln/diffusers/modeling_diffusers.py +4 -5
  20. optimum/rbln/diffusers/models/__init__.py +3 -13
  21. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +1 -0
  22. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +1 -0
  23. optimum/rbln/diffusers/models/autoencoders/vq_model.py +1 -0
  24. optimum/rbln/diffusers/models/transformers/transformer_cosmos.py +1 -1
  25. optimum/rbln/diffusers/pipelines/__init__.py +1 -5
  26. optimum/rbln/diffusers/pipelines/cosmos/configuration_cosmos_guardrail.py +12 -4
  27. optimum/rbln/diffusers/pipelines/cosmos/cosmos_guardrail.py +4 -26
  28. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +2 -2
  29. optimum/rbln/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +2 -2
  30. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +1 -6
  31. optimum/rbln/modeling.py +4 -5
  32. optimum/rbln/modeling_base.py +18 -14
  33. optimum/rbln/ops/kv_cache_update.py +5 -0
  34. optimum/rbln/ops/linear.py +7 -0
  35. optimum/rbln/transformers/__init__.py +60 -0
  36. optimum/rbln/transformers/configuration_generic.py +4 -4
  37. optimum/rbln/transformers/modeling_attention_utils.py +252 -0
  38. optimum/rbln/transformers/modeling_generic.py +1 -4
  39. optimum/rbln/transformers/models/__init__.py +45 -30
  40. optimum/rbln/transformers/models/bart/bart_architecture.py +2 -7
  41. optimum/rbln/transformers/models/blip_2/configuration_blip_2.py +2 -2
  42. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +1 -5
  43. optimum/rbln/transformers/models/clip/configuration_clip.py +14 -3
  44. optimum/rbln/transformers/models/clip/modeling_clip.py +123 -28
  45. optimum/rbln/transformers/models/colpali/colpali_architecture.py +1 -4
  46. optimum/rbln/transformers/models/colpali/configuration_colpali.py +2 -2
  47. optimum/rbln/transformers/models/colpali/modeling_colpali.py +2 -10
  48. optimum/rbln/transformers/models/decoderonly/__init__.py +2 -2
  49. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +214 -45
  50. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +323 -454
  51. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +579 -362
  52. optimum/rbln/transformers/models/exaone/exaone_architecture.py +17 -42
  53. optimum/rbln/transformers/models/gemma/__init__.py +2 -2
  54. optimum/rbln/transformers/models/gemma/configuration_gemma.py +9 -1
  55. optimum/rbln/transformers/models/gemma/gemma_architecture.py +3 -44
  56. optimum/rbln/transformers/models/gemma/modeling_gemma.py +22 -1
  57. optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +21 -9
  58. optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +9 -63
  59. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +200 -292
  60. optimum/rbln/transformers/models/gpt2/__init__.py +2 -2
  61. optimum/rbln/transformers/models/gpt2/configuration_gpt2.py +31 -3
  62. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +19 -24
  63. optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +18 -1
  64. optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +2 -2
  65. optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +3 -9
  66. optimum/rbln/transformers/models/llama/__init__.py +2 -2
  67. optimum/rbln/transformers/models/llama/configuration_llama.py +9 -1
  68. optimum/rbln/transformers/models/llama/modeling_llama.py +22 -1
  69. optimum/rbln/transformers/models/llava/__init__.py +16 -0
  70. optimum/rbln/transformers/models/llava/configuration_llava.py +54 -0
  71. optimum/rbln/transformers/models/llava/modeling_llava.py +419 -0
  72. optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +20 -3
  73. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +6 -16
  74. optimum/rbln/transformers/models/midm/midm_architecture.py +14 -22
  75. optimum/rbln/transformers/models/mistral/__init__.py +2 -2
  76. optimum/rbln/transformers/models/mistral/configuration_mistral.py +9 -1
  77. optimum/rbln/transformers/models/mistral/mistral_architecture.py +1 -1
  78. optimum/rbln/transformers/models/mistral/modeling_mistral.py +26 -3
  79. optimum/rbln/transformers/models/opt/__init__.py +2 -2
  80. optimum/rbln/transformers/models/opt/configuration_opt.py +8 -1
  81. optimum/rbln/transformers/models/opt/modeling_opt.py +41 -1
  82. optimum/rbln/transformers/models/opt/opt_architecture.py +16 -25
  83. optimum/rbln/transformers/models/pegasus/__init__.py +17 -0
  84. optimum/rbln/transformers/models/pegasus/configuration_pegasus.py +34 -0
  85. optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +69 -0
  86. optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +161 -0
  87. optimum/rbln/transformers/models/phi/__init__.py +2 -2
  88. optimum/rbln/transformers/models/phi/configuration_phi.py +9 -1
  89. optimum/rbln/transformers/models/phi/modeling_phi.py +10 -1
  90. optimum/rbln/transformers/models/phi/phi_architecture.py +16 -22
  91. optimum/rbln/transformers/models/pixtral/__init__.py +16 -0
  92. optimum/rbln/transformers/models/pixtral/configuration_pixtral.py +43 -0
  93. optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +315 -0
  94. optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +73 -0
  95. optimum/rbln/transformers/models/qwen2/__init__.py +2 -2
  96. optimum/rbln/transformers/models/qwen2/configuration_qwen2.py +9 -1
  97. optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +27 -1
  98. optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +3 -3
  99. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -15
  100. optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +1 -4
  101. optimum/rbln/transformers/models/qwen3/__init__.py +16 -0
  102. optimum/rbln/transformers/models/qwen3/configuration_qwen3.py +71 -0
  103. optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +133 -0
  104. optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +31 -0
  105. optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +2 -12
  106. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +3 -1
  107. optimum/rbln/transformers/models/siglip/__init__.py +2 -6
  108. optimum/rbln/transformers/models/siglip/modeling_siglip.py +2 -2
  109. optimum/rbln/transformers/models/time_series_transformer/configuration_time_series_transformer.py +2 -2
  110. optimum/rbln/transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -5
  111. optimum/rbln/transformers/models/whisper/configuration_whisper.py +3 -12
  112. optimum/rbln/transformers/models/whisper/modeling_whisper.py +8 -2
  113. optimum/rbln/transformers/models/xlm_roberta/__init__.py +2 -8
  114. optimum/rbln/utils/depreacate_utils.py +16 -0
  115. optimum/rbln/utils/hub.py +8 -47
  116. optimum/rbln/utils/runtime_utils.py +31 -5
  117. {optimum_rbln-0.8.1rc0.dist-info → optimum_rbln-0.8.2.dist-info}/METADATA +1 -1
  118. {optimum_rbln-0.8.1rc0.dist-info → optimum_rbln-0.8.2.dist-info}/RECORD +120 -103
  119. {optimum_rbln-0.8.1rc0.dist-info → optimum_rbln-0.8.2.dist-info}/WHEEL +0 -0
  120. {optimum_rbln-0.8.1rc0.dist-info → optimum_rbln-0.8.2.dist-info}/licenses/LICENSE +0 -0
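The hunks below all come from optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py (entry 50 above, +323 -454). To confirm which side of this diff is installed locally, the distribution version can be read at runtime; a minimal sketch using only the standard library, assuming nothing beyond the package name shown in the wheel filenames:

# Minimal sketch: report the installed optimum-rbln version via importlib.metadata.
from importlib.metadata import PackageNotFoundError, version

try:
    print(version("optimum-rbln"))  # expected to print "0.8.1rc0" or "0.8.2"
except PackageNotFoundError:
    print("optimum-rbln is not installed")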
@@ -20,108 +20,13 @@ from torch import nn
 from transformers import PretrainedConfig, PreTrainedModel

 from ....utils import logging
+from ...modeling_attention_utils import DEFAULT_FLASH_ATTN_PARTITION_LENGTH
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
 from .configuration_decoderonly import CacheImplType


 logger = logging.get_logger(__name__)

-DEFAULT_FLASH_ATTN_PARTITION_LENGTH = 16_384
-DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH = 32_768
-MIN_FLASH_ATTN_MAX_SEQ_LEN = 8_192
-MIN_FLASH_ATTN_PARTITION_LENGTH = 4_096
-MAX_FLASH_ATTN_PARTITION_LENGTH = 32_768
-MAX_SLIDING_WINDOW_SIZE = 32_768
-
-
-def set_default_values(
-    attn_impl: Optional[str] = None,
-    kvcache_partition_len: Optional[int] = None,
-    kvcache_block_size: Optional[int] = None,
-    max_seq_len: Optional[int] = None,
-) -> Tuple[str, int, int]:
-    if attn_impl is None:
-        attn_impl = "eager"
-
-    if kvcache_partition_len is not None:
-        if attn_impl == "eager":
-            attn_impl = "flash_attn"
-            logger.warning(
-                "A non-null `kvcache_partition_len` was provided, but `attn_impl` was not explicitly set or "
-                "set to 'eager'. Since KV cache partitioning is only supported with flash attention, "
-                "`attn_impl` has been automatically switched to 'flash_attn'."
-            )
-
-    if kvcache_partition_len is None and attn_impl == "flash_attn":
-        kvcache_partition_len = DEFAULT_FLASH_ATTN_PARTITION_LENGTH
-
-    if kvcache_block_size is None:
-        if attn_impl == "eager":
-            kvcache_block_size = max_seq_len
-        else:
-            kvcache_block_size = kvcache_partition_len
-
-    return attn_impl, kvcache_partition_len, kvcache_block_size
-
-
-def validate_attention_method(attn_impl: str, kvcache_partition_len: int, kvcache_block_size: int, max_seq_len: int):
-    if attn_impl not in ["eager", "flash_attn"]:
-        raise ValueError(f"Unknown `attn_impl` : {attn_impl}. (Available : 'eager', 'flash_attn`)")
-
-    ## Checking Constraints...
-    # Constraint of eager attention:
-    # - `max_seq_len` <= 32k
-
-    # Constraints of flash attention:
-    # 1. `max_seq_len` should be multiple of `partition_len`.
-    # 2. 4k <= `partition_len` <= 32k.
-    # 3. `max_seq_len` should be larger then 8k.
-    if attn_impl == "eager" and max_seq_len > DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH:
-        raise ValueError(
-            f"`max_seq_len` is set to {max_seq_len}, "
-            f"which exceeds the limit of {DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH} for 'eager' attention. "
-            f"Please reduce the `max_seq_len` to {DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH} or lower,"
-            " or consider switching `attn_impl` to 'flash_attn' for larger sequence lengths."
-        )
-
-    if attn_impl == "flash_attn":
-        if max_seq_len // kvcache_partition_len < 2 or max_seq_len % kvcache_partition_len != 0:
-            raise ValueError(
-                f"`max_seq_len` ({max_seq_len}) must be a multiple of `kvcache_partition_len` ({kvcache_partition_len}) "
-                f"when using 'flash_attn'. Please adjust either value to meet this requirement."
-            )
-        elif not (MIN_FLASH_ATTN_PARTITION_LENGTH <= kvcache_partition_len <= MAX_FLASH_ATTN_PARTITION_LENGTH):
-            raise ValueError(
-                f"`kvcache_partition_len` ({kvcache_partition_len}) is out of the supported range for 'flash_attn' "
-                f"({MIN_FLASH_ATTN_PARTITION_LENGTH} <= `kvcache_partition_len` <= {MAX_FLASH_ATTN_PARTITION_LENGTH}). "
-                f"Please provide a valid value within this range."
-            )
-        elif max_seq_len < MIN_FLASH_ATTN_MAX_SEQ_LEN:
-            raise ValueError(
-                f"`max_seq_len` ({max_seq_len}) is too small for 'flash_attn'. The minimum "
-                f"supported value is {MIN_FLASH_ATTN_MAX_SEQ_LEN}. Please increase `max_seq_len` to meet "
-                "this requirement, or consider switching `attn_impl` to 'eager' for shorter lengths."
-            )
-
-    if kvcache_block_size is not None:
-        if attn_impl == "flash_attn" and kvcache_partition_len != kvcache_block_size:
-            raise ValueError(
-                f" When using 'flash attention', the `kvcache_block_size` ({kvcache_block_size}) "
-                f"must always be set equal to the `kvcache_partition_len` {kvcache_partition_len}."
-            )
-        elif attn_impl == "eager" and kvcache_block_size != max_seq_len:
-            raise ValueError(
-                f" When using 'eager attention', the `kvcache_block_size` ({kvcache_block_size}) "
-                f"must always be set equal to the `max_seq_len` {max_seq_len}."
-            )
-
-
-def validate_sliding_window_size(sliding_window: int, prefill_chunk_size: int):
-    if sliding_window > MAX_SLIDING_WINDOW_SIZE - prefill_chunk_size:
-        raise ValueError(
-            f"Sliding window size ({sliding_window}) must be less than 32768 - prefill_chunk_size ({32768 - prefill_chunk_size})"
-        )
-

 class DecoderOnlyWrapper(nn.Module):
     """A wrapper class for decoder-only language models that handles RBLN-specific optimizations and requirements.
@@ -149,9 +54,11 @@ class DecoderOnlyWrapper(nn.Module):
             This is only relevant if `attn_impl` is set to "flash_attn`
     """

+    _use_learned_pos_emb = False
+
     def __init__(
         self,
-        causal_lm: PreTrainedModel,
+        model: PreTrainedModel,
         max_seq_len: int,
         use_rotary_emb: bool,
         attn_impl: str,
@@ -159,14 +66,14 @@ class DecoderOnlyWrapper(nn.Module):
         use_inputs_embeds: bool,
         use_attention_mask: bool,
         use_position_ids: bool,
-        use_learned_pos_emb: Optional[bool] = None,
         kvcache_partition_len: Optional[int] = None,
         kvcache_block_size: Optional[int] = None,
         sliding_window: Optional[int] = None,
         sliding_window_layers: Optional[List[int]] = None,
     ):
         super().__init__()
-        self.config = causal_lm.config
+        self.config = model.config
+        self.is_causal_lm = getattr(model, "lm_head", None) is not None

         if use_rotary_emb:
             rotary_embs = self.get_rotary_emb(max_seq_len=max_seq_len)
@@ -182,9 +89,10 @@ class DecoderOnlyWrapper(nn.Module):
         self.use_attention_mask = use_attention_mask
         self.use_position_ids = use_position_ids
         self.use_inputs_embeds = use_inputs_embeds
-        self.use_learned_pos_emb = use_learned_pos_emb
         self.sliding_window_layers = sliding_window_layers
         self.cache_impl = cache_impl
+        self.use_global_attention = cache_impl in ["static", "hybrid"]
+        self.use_local_attention = cache_impl in ["hybrid", "sliding_window"]
         self.sliding_window = sliding_window

         if self.attn_impl == "flash_attn":
@@ -200,59 +108,67 @@ class DecoderOnlyWrapper(nn.Module):
                 f" or equal to max_seq_len({max_seq_len})!"
             )

-        self.causal_lm = self.convert_to_rbln_causal_lm(causal_lm, max_seq_len)
+        self.model = self.convert_to_rbln_class(model, max_seq_len)
         self.num_hidden_layers = getattr(self.config, "num_hidden_layers", None) or getattr(self.config, "n_layer")
         self._phase = "prefill"

     def get_rotary_emb(self, max_seq_len):
         return RotaryEmbedding(config=self.config, max_seq_len_cached=max_seq_len)

-    def convert_to_rbln_causal_lm(self, causal_lm: PreTrainedModel, max_seq_len: int):
+    def get_decoder_layers(self, model: PreTrainedModel):
+        return model.model.layers if self.is_causal_lm else model.layers
+
+    def get_attn_layer(self, layer: nn.Module):
+        return layer.self_attn
+
+    def get_model_layer(self, model: PreTrainedModel):
+        return model.model if self.is_causal_lm else model
+
+    def get_rbln_attn_class(self):
+        return DecoderOnlyAttention
+
+    def get_rbln_layer_class(self):
+        return DecoderOnlyLayer
+
+    def get_rbln_model_class(self):
+        return DecoderOnlyModel
+
+    def get_rbln_causal_lm_class(self):
+        return DecoderOnlyForCausalLM
+
+    def convert_to_rbln_class(self, model: PreTrainedModel, max_seq_len: int):
         new_layers = []
-        for layer_idx, layer in enumerate(causal_lm.model.layers):
-            if layer_idx in self.sliding_window_layers:
-                # Flash attention is not yet supported for sliding window attention.
-                new_self_attn = DecoderOnlyAttention(
-                    layer.self_attn,
-                    self.use_attention_mask,
-                    self.use_position_ids,
-                    kvcache_block_size=self.sliding_window,
-                    is_sliding=True,
-                )
-            else:
-                if self.attn_impl == "eager":
-                    new_self_attn = DecoderOnlyAttention(
-                        layer.self_attn,
-                        self.use_attention_mask,
-                        self.use_position_ids,
-                        kvcache_block_size=self.kvcache_block_size,
-                        is_sliding=False,
-                    )
-                elif self.attn_impl == "flash_attn":
-                    new_self_attn = DecoderOnlyFlashAttention(
-                        layer.self_attn,
-                        kvcache_partition_len=self.kvcache_partition_len,
-                        kvcache_block_size=self.kvcache_block_size,
-                        use_attention_mask=self.use_attention_mask,
-                        use_position_ids=self.use_position_ids,
-                    )
-                else:
-                    raise NotImplementedError(f"Unknwon attn : {self.attn_impl}")
-
-            new_layer = DecoderOnlyLayer(layer, new_self_attn)
+        for layer_idx, layer in enumerate(self.get_decoder_layers(model)):
+            is_sliding = layer_idx in self.sliding_window_layers
+            new_self_attn = self.get_rbln_attn_class()(
+                self.get_attn_layer(layer),
+                self.use_attention_mask if not is_sliding else True,
+                self.use_position_ids,
+                kvcache_block_size=self.sliding_window
+                if layer_idx in self.sliding_window_layers
+                else self.kvcache_block_size,
+                is_sliding=is_sliding,
+                attn_impl=self.attn_impl if not is_sliding else "eager",
+                kvcache_partition_len=self.kvcache_partition_len,
+            )
+            new_layer = self.get_rbln_layer_class()(layer, new_self_attn)
             new_layers.append(new_layer)

-        new_model = DecoderOnlyModel(
-            causal_lm.model,
+        new_model = self.get_rbln_model_class()(
+            self.get_model_layer(model),
             new_layers,
             partition_len=self.kvcache_partition_len,
             max_seq_len=max_seq_len,
             kvcache_block_size=self.kvcache_block_size,
-            use_learned_pos_emb=self.use_learned_pos_emb,
+            use_learned_pos_emb=self.__class__._use_learned_pos_emb,
             sliding_window_layers=self.sliding_window_layers,
         )
-        new_causal_lm = DecoderOnlyForCausalLM(causal_lm, new_model)
-        return new_causal_lm
+
+        if self.is_causal_lm:
+            new_model = self.get_rbln_causal_lm_class()(model, new_model)
+            return new_model
+        else:
+            return new_model

     @property
     def phase(self) -> str:
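Note: convert_to_rbln_causal_lm becomes convert_to_rbln_class, and the conversion loop is now parameterized by the small get_* hooks plus the _use_learned_pos_emb class flag, so a model-specific wrapper can swap in its own classes without re-implementing the loop. A hypothetical subclass sketch; MyAttention and MyLayer are placeholders, not classes from the package:

# Hypothetical subclass: override only the hooks whose defaults do not fit the model.
class MyModelWrapper(DecoderOnlyWrapper):
    _use_learned_pos_emb = True  # class-level flag replaces the removed constructor argument

    def get_rbln_attn_class(self):
        return MyAttention  # placeholder for a model-specific attention class

    def get_rbln_layer_class(self):
        return MyLayer  # placeholder for a model-specific layer class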
@@ -261,16 +177,21 @@ class DecoderOnlyWrapper(nn.Module):
     @phase.setter
     def phase(self, phase: str):
         self._phase = phase
-        self.causal_lm.phase = phase
+        self.model.phase = phase

     def prepare_forward_args(self, *args):
         args = list(args)
         input_ids = None if self.use_inputs_embeds else args.pop(0)
         inputs_embeds = args.pop(0) if self.use_inputs_embeds else None
         cache_position = args.pop(0)
-        global_block_tables = args.pop(0) if self.cache_impl in ["hybrid", "static"] else None
-        local_block_tables = args.pop(0) if self.cache_impl in ["hybrid", "sliding_window"] else None
-        query_position = args.pop(0) if "prefill" in self.phase else None
+        global_block_tables = args.pop(0) if self.use_global_attention else None
+        local_block_tables = args.pop(0) if self.use_local_attention else None
+        query_position = (
+            args.pop(0)
+            # query_position usage: 1. causal_lm prefill or 2. sliding_window cache_position
+            if ("prefill" in self.phase and (self.is_causal_lm or self.use_local_attention))
+            else None
+        )
         attention_mask = args.pop(0) if self.use_attention_mask else None
         position_ids = args.pop(0) if self.use_position_ids else None
         past_key_values = args
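Note: prepare_forward_args still pops the flattened runtime inputs positionally, but the optional tensors are now gated on use_global_attention, use_local_attention, and the new query_position rule (prefill with a causal LM head or with local attention). An illustrative helper that mirrors that order when packing inputs; the function and its dictionary keys are assumptions for demonstration, not library API:

# Illustrative packing helper mirroring the order prepare_forward_args pops arguments.
def flatten_forward_args(t, *, use_inputs_embeds, use_global_attention, use_local_attention,
                         is_prefill, is_causal_lm, use_attention_mask, use_position_ids):
    args = [t["inputs_embeds"] if use_inputs_embeds else t["input_ids"], t["cache_position"]]
    if use_global_attention:
        args.append(t["global_block_tables"])
    if use_local_attention:
        args.append(t["local_block_tables"])
    if is_prefill and (is_causal_lm or use_local_attention):
        args.append(t["query_position"])
    if use_attention_mask:
        args.append(t["attention_mask"])
    if use_position_ids:
        args.append(t["position_ids"])
    return args + list(t["past_key_values"])  # remaining positional args are the KV cache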
@@ -322,7 +243,7 @@ class DecoderOnlyWrapper(nn.Module):
             rotary_emb,
         ) = self.prepare_forward_args(*args)

-        logit = self.causal_lm(
+        logit = self.model(
             input_ids=input_ids,
             inputs_embeds=inputs_embeds,
             attention_mask=attention_mask,
@@ -679,9 +600,23 @@ class DecoderOnlyAttention(nn.Module):

     Args:
         self_attn: Original attention module from the base model
+        use_attention_mask: Whether to use attention mask
+        use_position_ids: Whether to use position ids
+        kvcache_block_size: Block size for KV cache
+        is_sliding: Whether this is sliding window attention
+        attn_impl: Attention implementation type ("eager" or "flash_attn")
     """

-    def __init__(self, self_attn, use_attention_mask, use_position_ids, kvcache_block_size, is_sliding=False):
+    def __init__(
+        self,
+        self_attn,
+        use_attention_mask,
+        use_position_ids,
+        kvcache_block_size,
+        is_sliding=False,
+        attn_impl="eager",
+        kvcache_partition_len=None,
+    ):
         super().__init__()
         self._original_mod = self_attn
         self.layer_idx = self_attn.layer_idx
@@ -702,10 +637,24 @@ class DecoderOnlyAttention(nn.Module):
         self.use_attention_mask = use_attention_mask
         self.use_position_ids = use_position_ids
         self.is_sliding = is_sliding
-        self.attention = self.get_attention()
+        self.attn_impl = attn_impl
+        self.kvcache_partition_len = kvcache_partition_len
+
+        setattr(self, self.get_attention_name(), self.create_attention_op())
         self.kvcache_block_size = kvcache_block_size
         self.__post_init__()

+    def get_attention_name(self):
+        if self.is_sliding:
+            return "sliding_window_attention"
+        elif self.attn_impl == "flash_attn":
+            return "flash_attention"
+        else:
+            return "attention"
+
+    def get_attention_op(self):
+        return getattr(self, self.get_attention_name())
+
     @property
     def phase(self):
         return self._phase
@@ -713,17 +662,36 @@ class DecoderOnlyAttention(nn.Module):
     @phase.setter
     def phase(self, phase: str):
         self._phase = phase
-        self.attention.phase = phase
+        getattr(self, self.get_attention_name()).phase = phase

-    def get_attention(self):
+    def create_attention_op(self):
         if self.is_sliding:
             return SlidingWindowAttentionOp(
-                self.num_heads, self.head_dim, self.num_key_value_heads, self.use_attention_mask, self.use_position_ids
+                self.num_heads,
+                self.head_dim,
+                self.num_key_value_heads,
+                self.use_attention_mask,
+                self.use_position_ids,
             )
-        else:
+        elif self.attn_impl == "flash_attn":
+            return FlashAttentionOp(
+                self.num_heads,
+                self.head_dim,
+                self.num_key_value_heads,
+                self.kvcache_partition_len,
+                self.use_attention_mask,
+                self.use_position_ids,
+            )
+        elif self.attn_impl == "eager":
             return AttentionOp(
-                self.num_heads, self.head_dim, self.num_key_value_heads, self.use_attention_mask, self.use_position_ids
+                self.num_heads,
+                self.head_dim,
+                self.num_key_value_heads,
+                self.use_attention_mask,
+                self.use_position_ids,
             )
+        else:
+            raise NotImplementedError(f"Unknown attention implementation: {self.attn_impl}")

     def __post_init__(self):
         self.q_proj = self._original_mod.q_proj
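Note: create_attention_op folds the previous eager/flash/sliding split into one dispatch keyed on is_sliding and attn_impl, and the chosen op is stored under a matching attribute name from get_attention_name. The mapping written out as a standalone function (a summary of the code above, not library API):

# Summary of the dispatch above: which op class is built and under which attribute it is stored.
def resolve_attention(is_sliding: bool, attn_impl: str):
    if is_sliding:
        return "SlidingWindowAttentionOp", "sliding_window_attention"
    if attn_impl == "flash_attn":
        return "FlashAttentionOp", "flash_attention"
    if attn_impl == "eager":
        return "AttentionOp", "attention"
    raise NotImplementedError(f"Unknown attention implementation: {attn_impl}")


print(resolve_attention(False, "flash_attn"))  # ('FlashAttentionOp', 'flash_attention')
print(resolve_attention(True, "eager"))        # ('SlidingWindowAttentionOp', 'sliding_window_attention')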
@@ -780,7 +748,7 @@ class DecoderOnlyAttention(nn.Module):
         if batch_size > 1 and "prefill" in self.phase:
             raise NotImplementedError(f"batch size should be 1 if prefill phase, but got {batch_size}.")

-        attn_output = self.attention(
+        attn_output = self.get_attention_op()(
             query_states,
             key_states,
             value_states,
@@ -797,6 +765,14 @@ class DecoderOnlyAttention(nn.Module):
         return attn_outputs


+class DecoderOnlyFlashAttention(DecoderOnlyAttention):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        logger.warning(
+            "DecoderOnlyFlashAttention is deprecated and may not work as expected. Use DecoderOnlyAttention instead."
+        )
+
+
 class AttentionOp(nn.Module):
     def __init__(
         self, num_heads: int, head_dim: int, num_key_value_heads: int, use_attention_mask: bool, use_position_ids: bool
@@ -809,6 +785,18 @@ class AttentionOp(nn.Module):
         self.use_attention_mask = use_attention_mask
         self.use_position_ids = use_position_ids

+    def get_attn_op_name(self):
+        phase = "decode" if self.phase == "decode" else "prefill"
+
+        if self.use_attention_mask and not self.use_position_ids:
+            attn_op_name = "paged_attn_"
+        else:
+            attn_op_name = "paged_causal_attn_"
+
+        attn_op_name += phase
+
+        return attn_op_name
+
     def forward(
         self,
         query_state: torch.Tensor,
@@ -857,63 +845,31 @@ class AttentionOp(nn.Module):
             self.head_dim,
         )

-        if self.phase == "decode":
-            if self.use_attention_mask and not self.use_position_ids:
-                attn_output = torch.ops.rbln_custom_ops.paged_attn_decode(
-                    q=query_state,
-                    k=key_state,
-                    v=value_state,
-                    mask=attn_mask,
-                    kcache=past_key_state.unsqueeze(2),
-                    vcache=past_value_state.unsqueeze(2),
-                    seq=seq_position,
-                    scale=scale,
-                    block_table=block_tables,
-                    block_size=block_size,
-                )
-            else:
-                attn_output = torch.ops.rbln_custom_ops.paged_causal_attn_decode(
-                    q=query_state,
-                    k=key_state,
-                    v=value_state,
-                    kcache=past_key_state.unsqueeze(2),
-                    vcache=past_value_state.unsqueeze(2),
-                    seq=seq_position,
-                    scale=scale,
-                    block_table=block_tables,
-                    block_size=block_size,
-                    mask=attn_mask if self.use_position_ids else None,
-                )
-
-        else:
-            if self.use_attention_mask and not self.use_position_ids:
-                attn_output = torch.ops.rbln_custom_ops.paged_attn_prefill(
-                    q=query_state,
-                    k=key_state,
-                    v=value_state,
-                    mask=attn_mask,
-                    kcache=past_key_state.unsqueeze(2),
-                    vcache=past_value_state.unsqueeze(2),
-                    seq=seq_position,
-                    scale=scale,
-                    block_table=block_tables,
-                    block_size=block_size,
-                )
-            else:
-                attn_output = torch.ops.rbln_custom_ops.paged_causal_attn_prefill(
-                    q=query_state,
-                    k=key_state,
-                    v=value_state,
-                    kcache=past_key_state.unsqueeze(2),
-                    vcache=past_value_state.unsqueeze(2),
-                    seq=seq_position,
-                    scale=scale,
-                    block_table=block_tables,
-                    block_size=block_size,
-                    is_bidirectional=True if self.phase == "image_prefill" else False,  # FIXME, Hard-coded for Gemma3.
-                    mask=attn_mask if self.use_position_ids else None,
-                )
-
+        op_args = {
+            "q": query_state,
+            "k": key_state,
+            "v": value_state,
+            "kcache": past_key_state.unsqueeze(2),
+            "vcache": past_value_state.unsqueeze(2),
+            "seq": seq_position,
+            "scale": scale,
+            "block_table": block_tables,
+            "block_size": block_size,
+        }
+
+        if self.use_attention_mask:
+            op_args["mask"] = attn_mask
+
+        if self.phase == "prefill" or self.phase == "image_prefill":
+            if not self.use_attention_mask or self.use_position_ids:
+                op_args["is_bidirectional"] = self.phase == "image_prefill"  # FIXME, Hard-coded for Gemma3.
+
+        attn_op_name = self.get_attn_op_name()
+        attn_op = getattr(torch.ops.rbln_custom_ops, attn_op_name, None)
+        if attn_op is None:
+            raise ValueError(f"Attention operator {attn_op_name} not found.")
+
+        attn_output = attn_op(**op_args)
         attn_output = attn_output.view(batch_size, self.num_heads, -1, self.head_dim)
         attn_output = attn_output.transpose(1, 2).contiguous()
         attn_output = attn_output.reshape(batch_size, -1, self.num_heads * self.head_dim)
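Note: AttentionOp.forward now builds a single op_args dict and resolves the kernel through get_attn_op_name, so the decode/prefill and masked/causal variants differ only in the name passed to getattr(torch.ops.rbln_custom_ops, ...). The four names the base class resolves to, enumerated (a sketch mirroring the method above):

# Mirrors AttentionOp.get_attn_op_name: mask-without-position-ids selects the plain paged
# kernels, everything else the causal ones; any non-"decode" phase maps to prefill.
def base_attn_op_name(phase: str, use_attention_mask: bool, use_position_ids: bool) -> str:
    prefix = "paged_attn_" if use_attention_mask and not use_position_ids else "paged_causal_attn_"
    return prefix + ("decode" if phase == "decode" else "prefill")


for phase in ("decode", "prefill"):
    for masked in (True, False):
        print(phase, masked, base_attn_op_name(phase, masked, use_position_ids=False))
# decode True paged_attn_decode / decode False paged_causal_attn_decode
# prefill True paged_attn_prefill / prefill False paged_causal_attn_prefill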
@@ -921,161 +877,6 @@ class AttentionOp(nn.Module):
         return attn_output


-def slice_and_unsqueeze_cos_sin(cos, sin, cache_position, unsqueeze_dim=1):
-    """Slice cos[cache_position], sin[cache_position] vector for the query."""
-    if cache_position.shape[0] > 1:
-        cos_all = []
-        sin_all = []
-        for i in range(cache_position.shape[0]):
-            cos_all.append(cos[cache_position[i : i + 1]].unsqueeze(unsqueeze_dim))
-            sin_all.append(sin[cache_position[i : i + 1]].unsqueeze(unsqueeze_dim))
-        cos = torch.cat(cos_all, dim=0)
-        sin = torch.cat(sin_all, dim=0)
-    else:
-        cos = cos[cache_position].unsqueeze(unsqueeze_dim)
-        sin = sin[cache_position].unsqueeze(unsqueeze_dim)
-
-    return cos, sin
-
-
-def rotate_half(x):
-    """Rotates half the hidden dims of the input."""
-    x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2 :]
-    return torch.cat((-x2, x1), dim=-1)
-
-
-def apply_rotary_pos_emb(q, k, cos, sin):
-    """Applies Rotary Position Embedding to the query and key tensors."""
-    q_embed = (q * cos) + (rotate_half(q) * sin)
-    k_embed = (k * cos) + (rotate_half(k) * sin)
-    return q_embed, k_embed
-
-
-def apply_rotary_pos_emb_partial(query_states, key_states, cos, sin, ndim) -> Tuple[torch.Tensor, torch.Tensor]:
-    # Partial rotary embedding
-    query_rot, query_pass = (
-        query_states[..., :ndim],
-        query_states[..., ndim:],
-    )
-    key_rot, key_pass = (
-        key_states[..., :ndim],
-        key_states[..., ndim:],
-    )
-
-    # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor]
-    query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin)
-
-    # [batch_size, seq_length, num_heads, head_dim]
-    query_states = torch.cat((query_rot, query_pass), dim=-1)
-    key_states = torch.cat((key_rot, key_pass), dim=-1)
-    return query_states, key_states
-
-
-class RotaryEmbedding(nn.Module):
-    """RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        max_seq_len_cached: int,
-    ):
-        super().__init__()
-
-        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
-            rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
-        else:
-            rope_type = "default"
-
-        inv_freq, attention_scaling = ROPE_INIT_FUNCTIONS[rope_type](config, max_seq_len_cached)
-        cache_position = torch.arange(0, max_seq_len_cached, dtype=torch.float32)
-        cache_position_expanded = cache_position[:, None]
-
-        if rope_type == "dynamic":
-            freqs = cache_position_expanded.float() * inv_freq.float()
-        else:
-            inv_freq_expanded = inv_freq[None, :]
-            freqs = cache_position_expanded.float() @ inv_freq_expanded.float()
-
-        emb = torch.cat((freqs, freqs), dim=-1)
-
-        cos = emb.cos() * attention_scaling
-        sin = emb.sin() * attention_scaling
-
-        self.register_buffer("_cos_cached", cos, persistent=False)
-        self.register_buffer("_sin_cached", sin, persistent=False)
-
-    def forward(self, x, seq_len):
-        return (
-            self._cos_cached[:seq_len].to(dtype=x.dtype),
-            self._sin_cached[:seq_len].to(dtype=x.dtype),
-        )
-
-
-class DecoderOnlyFlashAttention(DecoderOnlyAttention):
-    def __init__(self, self_attn, kvcache_partition_len, kvcache_block_size, use_attention_mask, use_position_ids):
-        self.kvcache_partition_size = kvcache_partition_len
-        super().__init__(
-            self_attn=self_attn,
-            use_attention_mask=use_attention_mask,
-            use_position_ids=use_position_ids,
-            kvcache_block_size=kvcache_block_size,
-        )
-
-    def get_attention(self):
-        return FlashAttentionOp(
-            self.num_heads,
-            self.head_dim,
-            self.num_key_value_heads,
-            self.kvcache_partition_size,
-            self.use_attention_mask,
-            self.use_position_ids,
-        )
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: torch.Tensor,
-        seq_positions: torch.LongTensor,
-        past_key_values: Tuple[Tuple[torch.Tensor]],
-        cos: Optional[torch.Tensor] = None,
-        sin: Optional[torch.Tensor] = None,
-        block_tables: Optional[torch.Tensor] = None,
-    ):
-        batch_size, query_length, _ = hidden_states.size()
-
-        query_states, key_states, value_states = self.projection(hidden_states=hidden_states)
-
-        query_states = query_states.view(batch_size, query_length, self.num_heads, self.head_dim).transpose(1, 2)
-        key_states = key_states.view(batch_size, query_length, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-        value_states = value_states.view(batch_size, query_length, self.num_key_value_heads, self.head_dim).transpose(
-            1, 2
-        )
-
-        if hasattr(self, "q_norm") and hasattr(self, "k_norm"):
-            query_states = self.q_norm(query_states)
-            key_states = self.k_norm(key_states)
-
-        if cos is not None and sin is not None:
-            query_states, key_states = self.apply_rotary_pos_embed(query_states, key_states, cos, sin)
-
-        attn_output = self.attention(
-            query_states,
-            key_states,
-            value_states,
-            attention_mask,
-            past_key_state=past_key_values[self.layer_idx][0],
-            past_value_state=past_key_values[self.layer_idx][1],
-            seq_position=seq_positions,
-            scale=self.scale,
-            block_tables=block_tables,
-            kvcache_block_size=self.kvcache_block_size,
-        )
-
-        attn_outputs = self.o_proj(attn_output)
-        return attn_outputs
-
-
 class FlashAttentionOp(AttentionOp):
     def __init__(
         self,
@@ -1095,6 +896,17 @@ class FlashAttentionOp(AttentionOp):
         )
         self.kvcache_partition_size = kvcache_partition_len

+    def get_attn_op_name(self):
+        phase = "decode" if self.phase == "decode" else "prefill"
+        if self.use_attention_mask and not self.use_position_ids:
+            attn_op_name = "paged_flash_attn_"
+        else:
+            attn_op_name = "paged_flash_causal_attn_"
+
+        attn_op_name += phase
+
+        return attn_op_name
+
     def forward(
         self,
         query_state,
@@ -1106,7 +918,7 @@ class FlashAttentionOp(AttentionOp):
         seq_position,
         scale,
         block_tables,
-        kvcache_block_size,
+        block_size,
     ):
         # reshape for removing repeat_kv (batch=1 , num_head, 1, q_len=1, head_dim)
         key_state = key_state.unsqueeze(2)
@@ -1127,67 +939,32 @@ class FlashAttentionOp(AttentionOp):
             self.head_dim,
         )

-        if self.phase == "decode":
-            if self.use_attention_mask and not self.use_position_ids:
-                attn_output = torch.ops.rbln_custom_ops.paged_flash_attn_decode(
-                    q=query_state,
-                    k=key_state,
-                    v=value_state,
-                    mask=attn_mask,
-                    kcache=past_key_state.unsqueeze(2),
-                    vcache=past_value_state.unsqueeze(2),
-                    seq=seq_position,
-                    scale=scale,
-                    block_table=block_tables,
-                    block_size=kvcache_block_size,
-                    partition=self.kvcache_partition_size,
-                )
-            else:
-                attn_output = torch.ops.rbln_custom_ops.paged_flash_causal_attn_decode(
-                    q=query_state,
-                    k=key_state,
-                    v=value_state,
-                    kcache=past_key_state.unsqueeze(2),
-                    vcache=past_value_state.unsqueeze(2),
-                    seq=seq_position,
-                    scale=scale,
-                    block_table=block_tables,
-                    block_size=kvcache_block_size,
-                    partition=self.kvcache_partition_size,
-                    mask=attn_mask if self.use_position_ids else None,
-                )
-        else:
-            if self.use_attention_mask and not self.use_position_ids:
-                attn_output = torch.ops.rbln_custom_ops.paged_flash_attn_prefill(
-                    q=query_state,
-                    k=key_state,
-                    v=value_state,
-                    mask=attn_mask,
-                    kcache=past_key_state.unsqueeze(2),
-                    vcache=past_value_state.unsqueeze(2),
-                    seq=seq_position,
-                    scale=scale,
-                    block_table=block_tables,
-                    block_size=kvcache_block_size,
-                    partition=self.kvcache_partition_size,
-                )
-            else:
-                attn_output = torch.ops.rbln_custom_ops.paged_flash_causal_attn_prefill(
-                    q=query_state,
-                    k=key_state,
-                    v=value_state,
-                    kcache=past_key_state.unsqueeze(2),
-                    vcache=past_value_state.unsqueeze(2),
-                    seq=seq_position,
-                    scale=scale,
-                    block_table=block_tables,
-                    block_size=kvcache_block_size,
-                    partition=self.kvcache_partition_size,
-                    is_bidirectional=True if self.phase == "image_prefill" else False,
-                    mask=attn_mask if self.use_position_ids else None,
-                )
-
-        # reshape for removing repeat_kv
+        op_args = {
+            "q": query_state,
+            "k": key_state,
+            "v": value_state,
+            "kcache": past_key_state.unsqueeze(2),
+            "vcache": past_value_state.unsqueeze(2),
+            "seq": seq_position,
+            "scale": scale,
+            "block_table": block_tables,
+            "block_size": block_size,
+            "partition": self.kvcache_partition_size,
+        }
+
+        if self.use_attention_mask:
+            op_args["mask"] = attn_mask
+
+        if self.phase == "prefill" or self.phase == "image_prefill":
+            if not self.use_attention_mask or self.use_position_ids:
+                op_args["is_bidirectional"] = self.phase == "image_prefill"  # FIXME, Hard-coded for Gemma3.
+
+        attn_op_name = self.get_attn_op_name()
+        attn_op = getattr(torch.ops.rbln_custom_ops, attn_op_name, None)
+        if attn_op is None:
+            raise ValueError(f"Attention operator {attn_op_name} not found.")
+
+        attn_output = attn_op(**op_args)
         attn_output = attn_output.view(batch_size, self.num_heads, -1, self.head_dim)
         attn_output = attn_output.transpose(1, 2).contiguous()
         attn_output = attn_output.reshape(batch_size, -1, self.num_heads * self.head_dim)
@@ -1196,6 +973,14 @@ class FlashAttentionOp(AttentionOp):


 class SlidingWindowAttentionOp(AttentionOp):
+    def get_attn_op_name(self):
+        phase = "decode" if self.phase == "decode" else "prefill"
+        if not self.use_attention_mask:
+            raise NotImplementedError("Attention mask is needed for sliding window attention.")
+
+        attn_op_name = "paged_sliding_window_attn_" + phase
+        return attn_op_name
+
     def forward(
         self,
         query_state: torch.Tensor,
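Note: SlidingWindowAttentionOp keeps the same name-based dispatch but refuses to run without an attention mask, and its kernels take the sequence position as a (cache_seq_len, cache_offset) pair, which is why the forward below indexes seq_position[0] and seq_position[1]. A small sketch of the name resolution, mirroring the method above:

# Mirrors SlidingWindowAttentionOp.get_attn_op_name: one kernel family, decode vs. prefill.
def sliding_window_op_name(phase: str, use_attention_mask: bool = True) -> str:
    if not use_attention_mask:
        raise NotImplementedError("Attention mask is needed for sliding window attention.")
    return "paged_sliding_window_attn_" + ("decode" if phase == "decode" else "prefill")


print(sliding_window_op_name("image_prefill"))  # paged_sliding_window_attn_prefill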
@@ -1226,37 +1011,121 @@ class SlidingWindowAttentionOp(AttentionOp):
             self.head_dim,
         )

-        if self.phase == "decode":
-            attn_output = torch.ops.rbln_custom_ops.paged_sliding_window_attn_decode(
-                q=query_state,
-                k=key_state,
-                v=value_state,
-                kcache=past_key_state.unsqueeze(2),
-                vcache=past_value_state.unsqueeze(2),
-                cache_seq_len=seq_position[0],
-                cache_offset=seq_position[1],
-                scale=scale,
-                block_table=block_tables,
-                block_size=block_size,
-            )
-        else:
-            attn_output = torch.ops.rbln_custom_ops.paged_sliding_window_attn_prefill(
-                q=query_state,
-                k=key_state,
-                v=value_state,
-                kcache=past_key_state.unsqueeze(2),
-                vcache=past_value_state.unsqueeze(2),
-                cache_seq_len=seq_position[0],
-                cache_offset=seq_position[1],
-                scale=scale,
-                block_table=block_tables,
-                block_size=block_size,
-                is_bidirectional=True if self.phase == "image_prefill" else False,
-            )
-
-        # reshape for removing repeat_kv
+        op_args = {
+            "q": query_state,
+            "k": key_state,
+            "v": value_state,
+            "kcache": past_key_state.unsqueeze(2),
+            "vcache": past_value_state.unsqueeze(2),
+            "cache_seq_len": seq_position[0],
+            "cache_offset": seq_position[1],
+            "scale": scale,
+            "block_table": block_tables,
+            "block_size": block_size,
+        }
+
+        if self.phase == "prefill" or self.phase == "image_prefill":
+            op_args["is_bidirectional"] = self.phase == "image_prefill"  # FIXME, Hard-coded for Gemma3.
+
+        attn_op_name = self.get_attn_op_name()
+        attn_op = getattr(torch.ops.rbln_custom_ops, attn_op_name, None)
+        if attn_op is None:
+            raise ValueError(f"Attention operator {attn_op_name} not found.")
+
+        attn_output = attn_op(**op_args)
         attn_output = attn_output.view(batch_size, self.num_heads, -1, self.head_dim)
         attn_output = attn_output.transpose(1, 2).contiguous()
         attn_output = attn_output.reshape(batch_size, -1, self.num_heads * self.head_dim)

         return attn_output
+
+
+class RotaryEmbedding(nn.Module):
+    """RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        max_seq_len_cached: int,
+    ):
+        super().__init__()
+
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            rope_type = "default"
+
+        inv_freq, attention_scaling = ROPE_INIT_FUNCTIONS[rope_type](config, max_seq_len_cached)
+        cache_position = torch.arange(0, max_seq_len_cached, dtype=torch.float32)
+        cache_position_expanded = cache_position[:, None]
+
+        if rope_type == "dynamic":
+            freqs = cache_position_expanded.float() * inv_freq.float()
+        else:
+            inv_freq_expanded = inv_freq[None, :]
+            freqs = cache_position_expanded.float() @ inv_freq_expanded.float()
+
+        emb = torch.cat((freqs, freqs), dim=-1)
+
+        cos = emb.cos() * attention_scaling
+        sin = emb.sin() * attention_scaling
+
+        self.register_buffer("_cos_cached", cos, persistent=False)
+        self.register_buffer("_sin_cached", sin, persistent=False)
+
+    def forward(self, x, seq_len):
+        return (
+            self._cos_cached[:seq_len].to(dtype=x.dtype),
+            self._sin_cached[:seq_len].to(dtype=x.dtype),
+        )
+
+
+def slice_and_unsqueeze_cos_sin(cos, sin, cache_position, unsqueeze_dim=1):
+    """Slice cos[cache_position], sin[cache_position] vector for the query."""
+    if cache_position.shape[0] > 1:
+        cos_all = []
+        sin_all = []
+        for i in range(cache_position.shape[0]):
+            cos_all.append(cos[cache_position[i : i + 1]].unsqueeze(unsqueeze_dim))
+            sin_all.append(sin[cache_position[i : i + 1]].unsqueeze(unsqueeze_dim))
+        cos = torch.cat(cos_all, dim=0)
+        sin = torch.cat(sin_all, dim=0)
+    else:
+        cos = cos[cache_position].unsqueeze(unsqueeze_dim)
+        sin = sin[cache_position].unsqueeze(unsqueeze_dim)
+
+    return cos, sin
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin):
+    """Applies Rotary Position Embedding to the query and key tensors."""
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def apply_rotary_pos_emb_partial(query_states, key_states, cos, sin, ndim) -> Tuple[torch.Tensor, torch.Tensor]:
+    # Partial rotary embedding
+    query_rot, query_pass = (
+        query_states[..., :ndim],
+        query_states[..., ndim:],
+    )
+    key_rot, key_pass = (
+        key_states[..., :ndim],
+        key_states[..., ndim:],
+    )
+
+    # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor]
+    query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin)
+
+    # [batch_size, seq_length, num_heads, head_dim]
+    query_states = torch.cat((query_rot, query_pass), dim=-1)
+    key_states = torch.cat((key_rot, key_pass), dim=-1)
+    return query_states, key_states
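Note: rotate_half, apply_rotary_pos_emb, and the other RoPE helpers are only relocated to the end of the module in this release, not changed. A tiny numeric check of what they compute; the import path is taken from the file list above, and the availability of these module-level helpers is assumed from the diff:

# Tiny check of the relocated RoPE helpers: rotate_half swaps the two halves of the last
# dimension with a sign flip, and a zero rotation angle (cos=1, sin=0) is the identity.
import torch

from optimum.rbln.transformers.models.decoderonly.decoderonly_architecture import (
    apply_rotary_pos_emb,
    rotate_half,
)

q = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
print(rotate_half(q))  # tensor([[-3., -4.,  1.,  2.]])

cos, sin = torch.ones_like(q), torch.zeros_like(q)
q_embed, k_embed = apply_rotary_pos_emb(q, q, cos, sin)
print(torch.equal(q_embed, q))  # True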