optimum-rbln 0.1.15__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. optimum/rbln/__init__.py +26 -33
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/diffusers/__init__.py +4 -0
  4. optimum/rbln/{modeling_diffusers.py → diffusers/modeling_diffusers.py} +66 -24
  5. optimum/rbln/diffusers/models/__init__.py +2 -0
  6. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +38 -12
  7. optimum/rbln/diffusers/models/autoencoders/vae.py +0 -1
  8. optimum/rbln/diffusers/models/controlnet.py +1 -1
  9. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +1 -1
  10. optimum/rbln/diffusers/models/unets/unet_2d_condition.py +5 -7
  11. optimum/rbln/diffusers/pipelines/__init__.py +1 -0
  12. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +8 -7
  13. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +17 -2
  14. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +17 -2
  15. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +17 -2
  16. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +17 -2
  17. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +1 -2
  18. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +1 -2
  19. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +1 -2
  20. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +1 -2
  21. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +1 -2
  22. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +1 -2
  23. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/__init__.py +23 -0
  24. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +1 -2
  25. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +1 -2
  26. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +1 -2
  27. optimum/rbln/modeling.py +13 -347
  28. optimum/rbln/modeling_base.py +24 -4
  29. optimum/rbln/modeling_config.py +31 -7
  30. optimum/rbln/ops/__init__.py +26 -0
  31. optimum/rbln/ops/attn.py +221 -0
  32. optimum/rbln/ops/flash_attn.py +70 -0
  33. optimum/rbln/ops/kv_cache_update.py +69 -0
  34. optimum/rbln/transformers/__init__.py +20 -0
  35. optimum/rbln/{modeling_alias.py → transformers/modeling_alias.py} +5 -1
  36. optimum/rbln/transformers/modeling_generic.py +385 -0
  37. optimum/rbln/transformers/models/auto/__init__.py +23 -0
  38. optimum/rbln/transformers/models/auto/modeling_auto.py +0 -1
  39. optimum/rbln/transformers/models/bart/__init__.py +0 -1
  40. optimum/rbln/transformers/models/bart/bart_architecture.py +107 -464
  41. optimum/rbln/transformers/models/bart/modeling_bart.py +8 -4
  42. optimum/rbln/transformers/models/clip/modeling_clip.py +1 -1
  43. optimum/rbln/transformers/models/decoderonly/__init__.py +0 -7
  44. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +329 -328
  45. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +92 -107
  46. optimum/rbln/transformers/models/exaone/exaone_architecture.py +2 -3
  47. optimum/rbln/transformers/models/gemma/gemma_architecture.py +1 -1
  48. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +10 -10
  49. optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +1 -1
  50. optimum/rbln/transformers/models/llama/llama_architecture.py +0 -1
  51. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +1 -0
  52. optimum/rbln/transformers/models/midm/midm_architecture.py +11 -11
  53. optimum/rbln/transformers/models/midm/modeling_midm.py +0 -1
  54. optimum/rbln/transformers/models/mistral/mistral_architecture.py +0 -1
  55. optimum/rbln/transformers/models/phi/phi_architecture.py +2 -3
  56. optimum/rbln/transformers/models/qwen2/qwen2_architecture.py +0 -1
  57. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +57 -57
  58. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +498 -0
  59. optimum/rbln/transformers/models/t5/__init__.py +0 -1
  60. optimum/rbln/transformers/models/t5/modeling_t5.py +5 -2
  61. optimum/rbln/transformers/models/t5/t5_architecture.py +106 -448
  62. optimum/rbln/transformers/models/whisper/generation_whisper.py +42 -0
  63. optimum/rbln/transformers/models/whisper/modeling_whisper.py +77 -54
  64. optimum/rbln/transformers/models/whisper/whisper_architecture.py +219 -312
  65. optimum/rbln/transformers/utils/rbln_quantization.py +0 -1
  66. optimum/rbln/utils/decorator_utils.py +51 -15
  67. optimum/rbln/utils/import_utils.py +7 -0
  68. optimum/rbln/utils/logging.py +37 -0
  69. optimum/rbln/utils/model_utils.py +0 -1
  70. optimum/rbln/utils/runtime_utils.py +9 -3
  71. optimum/rbln/utils/save_utils.py +17 -0
  72. optimum/rbln/utils/submodule.py +23 -0
  73. {optimum_rbln-0.1.15.dist-info → optimum_rbln-0.2.0.dist-info}/METADATA +37 -26
  74. {optimum_rbln-0.1.15.dist-info → optimum_rbln-0.2.0.dist-info}/RECORD +76 -72
  75. optimum_rbln-0.2.0.dist-info/licenses/LICENSE +288 -0
  76. optimum/rbln/transformers/cache_utils.py +0 -107
  77. optimum/rbln/utils/timer_utils.py +0 -43
  78. optimum_rbln-0.1.15.dist-info/licenses/LICENSE +0 -201
  79. {optimum_rbln-0.1.15.dist-info → optimum_rbln-0.2.0.dist-info}/WHEEL +0 -0
@@ -27,129 +27,82 @@ from typing import List, Optional, Tuple
  import torch
  from torch import nn
  from transformers import PretrainedConfig, PreTrainedModel
- from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_4
 
+ from ....ops import register_rbln_custom_attention, register_rbln_custom_flash_attention
  from ....utils import logging
  from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
 
 
- if is_torch_greater_or_equal_than_2_4:
-     register_fake = torch.library.register_fake
- else:
-     register_fake = torch.library.impl_abstract
-
-
  logger = logging.get_logger(__name__)
- """
- ##############################################################################
- # RBLN custom operation (python interface)
- # torch.compile custom operation
- # torch.library.define - kernel declaration
- # torch.library.impl - kernel implementation
- # torch.library.impl_abstract - symbolic trace
- ##############################################################################
- """
-
- # RBLN custom op(flash attention decode)
- torch.library.define(
-     "rbln_custom_ops::flash_attn_decode",
-     "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d) -> Tensor[]",
- )
-
-
- @torch.library.impl("rbln_custom_ops::flash_attn_decode", "cpu")
- def flash_attn_decode_cpu(q, k, v, mask, kcache, vcache, seq, partition):
-     """
-     WORKAROUND:
-     Partition is declared as an argument to the function, even though it is
-     not actually used in the CPU implementation, this allows the rbln compiler
-     to perform flash attention operations with partition as an argument.
-     """
-     assert kcache.dim() == k.dim()
-     assert vcache.dim() == v.dim()
-     assert k.size(-2) == v.size(-2)
-     assert partition.dim() == 1
-     b = 0
-     if seq.dim() == 1:
-         s = seq[0]
-     elif seq.dim() == 0:
-         s = seq
-     else:
-         assert False
-     e = s + k.size(-2)
-     updated_k = kcache[b].unsqueeze(0).slice_scatter(k, dim=-2, start=s, end=e)
-     updated_v = vcache[b].unsqueeze(0).slice_scatter(v, dim=-2, start=s, end=e)
-     attn_weight = torch.matmul(q, updated_k.transpose(3, 4)) / math.sqrt(128)
-     attn_weight = attn_weight + mask
-     attn_weight = nn.functional.softmax(attn_weight, dim=-1, dtype=torch.float32).to(q.dtype)
-     attn_output = torch.matmul(attn_weight, updated_v)
-     return attn_output, torch.empty_like(kcache), torch.empty_like(vcache)
-
-
- @register_fake("rbln_custom_ops::flash_attn_decode")
- def flash_attn_decode_abstract(q, k, v, m, kcache, vcache, seq, partition):
-     return torch.empty_like(q), torch.empty_like(kcache), torch.empty_like(vcache)
-
-
- # RBLN custom op(flash attention prefill)
- torch.library.define(
-     "rbln_custom_ops::flash_attn_prefill",
-     "(Tensor x, Tensor y, Tensor z, Tensor w, Tensor a, Tensor b, Tensor c, Tensor d, Tensor e) -> Tensor[]",
- )
-
-
- @torch.library.impl("rbln_custom_ops::flash_attn_prefill", "cpu")
- def flash_attn_prefill_cpu(q, k, v, mask, kcache, vcache, batch, seq, partition):
-     """
-     WORKAROUND:
-     Partition is declared as an argument to the function, even though it is
-     not actually used in the CPU implementation, this allows the rbln compiler
-     to perform flash attention operations with partition as an argument.
-     """
-     assert kcache.dim() == k.dim()
-     assert vcache.dim() == v.dim()
-     assert k.size(-2) == v.size(-2)
-     assert partition.dim() == 1
-     if batch.dim() == 1:
-         b = batch[0]
-     elif batch.dim() == 0:
-         b = batch
-     else:
-         assert False
-     if seq.dim() == 1:
-         s = seq[0]
-     elif seq.dim() == 0:
-         s = seq
-     else:
-         assert False
-     e = s + k.size(-2)
-     updated_k = kcache[b].unsqueeze(0).slice_scatter(k, dim=-2, start=s, end=e)
-     updated_v = vcache[b].unsqueeze(0).slice_scatter(v, dim=-2, start=s, end=e)
-     attn_weight = torch.matmul(q, updated_k.transpose(3, 4)) / math.sqrt(128)
-     attn_weight = attn_weight + mask
-     attn_weight = nn.functional.softmax(attn_weight, dim=-1, dtype=torch.float32).to(q.dtype)
-     attn_output = torch.matmul(attn_weight, updated_v)
-     return attn_output, torch.empty_like(kcache), torch.empty_like(vcache)
-
-
- @register_fake("rbln_custom_ops::flash_attn_prefill")
- def flash_attn_prefill_abstract(q, k, v, m, kcache, vcache, batch, seq, partition):
-     return torch.empty_like(q), torch.empty_like(kcache), torch.empty_like(vcache)
 
+ DEFAULT_FLASH_ATTN_PARTITION_LENGTH = 16_384
+ DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH = 32_768
+ MIN_FLASH_ATTN_MAX_SEQ_LEN = 8_192
+ MIN_FLASH_ATTN_PARTITION_LENGTH = 4_096
+ MAX_FLASH_ATTN_PARTITION_LENGTH = 32_768
 
- # RBLN custom op(cache update)
- torch.library.define("rbln_custom_ops::rbln_cache_update", "(Tensor x, Tensor y, Tensor z, Tensor w) -> Tensor")
 
+ def validate_attention_method(
+     rbln_attn_impl: str, rbln_kvcache_partition_len: int, rbln_max_seq_len: int
+ ) -> Tuple[str, int]:
+     if rbln_kvcache_partition_len is not None:
+         if rbln_attn_impl == "eager":
+             raise ValueError(
+                 f"`rbln_kvcache_partition_len` is set to {rbln_kvcache_partition_len}, but KV cache partitioning"
+                 " is not supported with 'eager' attention. Please set `rbln_kvcache_partition_len` to None, "
+                 "or switch `rbln_attn_impl` to 'flash_attn' to use KV cache partitioning."
+             )
+         elif rbln_attn_impl is None:
+             rbln_attn_impl = "flash_attn"
+             logger.warning(
+                 "A non-null `rbln_kvcache_partition_len` was provided, but `rbln_attn_impl` was not explicitly set. "
+                 "Since KV cache partitioning is only supported with flash attention, "
+                 "`rbln_attn_impl` has been automatically switched to 'flash_attn'."
+             )
 
- @torch.library.impl("rbln_custom_ops::rbln_cache_update", "cpu")
- def rbln_cache_update_cpu(cache, value, batch, seq):
-     updated_cache = cache[batch].slice_scatter(value, dim=-2, start=batch[0], end=batch[0] + seq[0])
-     return updated_cache
+     rbln_attn_impl = "eager" if rbln_attn_impl is None else rbln_attn_impl
+     if rbln_attn_impl not in ["eager", "flash_attn"]:
+         raise ValueError(f"Unknown `rbln_attn_impl` : {rbln_attn_impl}. (Available : 'eager', 'flash_attn`)")
+
+     if rbln_kvcache_partition_len is None and rbln_attn_impl == "flash_attn":
+         rbln_kvcache_partition_len = DEFAULT_FLASH_ATTN_PARTITION_LENGTH
+
+     ## Checking Constraints...
+     # Constraint of eager attention:
+     # - `max_seq_len` <= 32k
+
+     # Constraints of flash attention:
+     # 1. `max_seq_len` should be multiple of `partition_len`.
+     # 2. 4k <= `partition_len` <= 32k.
+     # 3. `max_seq_len` should be larger then 8k.
+     if rbln_attn_impl == "eager" and rbln_max_seq_len > DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH:
+         raise ValueError(
+             f"`rbln_max_seq_len` is set to {rbln_max_seq_len}, "
+             f"which exceeds the limit of {DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH} for 'eager' attention. "
+             f"Please reduce the `rbln_max_seq_len` to {DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH} or lower,"
+             " or consider switching `rbln_attn_impl` to 'flash_attn' for larger sequence lengths."
+         )
 
+     if rbln_attn_impl == "flash_attn":
+         if rbln_max_seq_len // rbln_kvcache_partition_len < 2 or rbln_max_seq_len % rbln_kvcache_partition_len != 0:
+             raise ValueError(
+                 f"`rbln_max_seq_len` ({rbln_max_seq_len}) must be a multiple of `rbln_kvcache_partition_len` ({rbln_kvcache_partition_len}) "
+                 f"when using 'flash_attn'. Please adjust either value to meet this requirement."
+             )
+         elif not (MIN_FLASH_ATTN_PARTITION_LENGTH <= rbln_kvcache_partition_len <= MAX_FLASH_ATTN_PARTITION_LENGTH):
+             raise ValueError(
+                 f"`rbln_kvcache_partition_len` ({rbln_kvcache_partition_len}) is out of the supported range for 'flash_attn' "
+                 f"({MIN_FLASH_ATTN_PARTITION_LENGTH} <= `rbln_kvcache_partition_len` <= {MAX_FLASH_ATTN_PARTITION_LENGTH}). "
+                 f"Please provide a valid value within this range."
+             )
+         elif rbln_max_seq_len < MIN_FLASH_ATTN_MAX_SEQ_LEN:
+             raise ValueError(
+                 f"`rbln_max_seq_len` ({rbln_max_seq_len}) is too small for 'flash_attn'. The minimum "
+                 f"supported value is {MIN_FLASH_ATTN_MAX_SEQ_LEN}. Please increase `rbln_max_seq_len` to meet "
+                 "this requirement, or consider switching `rbln_attn_impl` to 'eager' for shorter lengths."
+             )
 
- @register_fake("rbln_custom_ops::rbln_cache_update")
- def rbln_cache_update_abstract(cache, value, batch, seq):
-     return torch.empty_like(cache)
+     return rbln_attn_impl, rbln_kvcache_partition_len
 
 
  class DecoderOnlyWrapper(nn.Module):
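
For reference, the constraint logic added above resolves roughly as follows. This is an illustrative sketch, not part of the package diff; the return value is the resolved (attn_impl, kvcache_partition_len) pair and the import path is taken from the file list (item 44):

from optimum.rbln.transformers.models.decoderonly.decoderonly_architecture import validate_attention_method

validate_attention_method(None, None, 4_096)           # -> ("eager", None)
validate_attention_method(None, 8_192, 32_768)         # -> ("flash_attn", 8_192), after the implicit-switch warning
validate_attention_method("flash_attn", None, 32_768)  # -> ("flash_attn", 16_384), the default partition length
validate_attention_method("eager", None, 65_536)       # raises ValueError: 65_536 exceeds the 32_768 eager limit
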
@@ -169,11 +122,23 @@ class DecoderOnlyWrapper(nn.Module):
          causal_lm (PreTrainedModel): The Huggingface causal language model to wrap
          max_seq_len (int): Maximum sequence length for position embeddings and cache sizes
          use_rotary_emb (bool): Whether to use rotary position embeddings
+         attn_impl (str): The attention implementation to use.
+             - "eager": Uses the standard attention.
+             - "flash_attn": Uses flash attention. When set,
+                 the key/value cache is partitioned into chunks of length
+                 `kvcache_partition_len`.
          kvcache_partition_len (Optional[int]): Length of KV cache partitions for flash attention.
-             If provided, uses flash attention; if None, uses standard attention
+             This is only relevant if `attn_impl` is set to "flash_attn`
      """
 
-     def __init__(self, causal_lm: PreTrainedModel, max_seq_len, use_rotary_emb: bool, kvcache_partition_len=None):
+     def __init__(
+         self,
+         causal_lm: PreTrainedModel,
+         max_seq_len: int,
+         use_rotary_emb: bool,
+         attn_impl: str,
+         kvcache_partition_len: Optional[int] = None,
+     ):
          super().__init__()
          self.config = causal_lm.config
 
@@ -182,14 +147,21 @@ class DecoderOnlyWrapper(nn.Module):
          else:
              self.rotary_emb = None
 
-         if kvcache_partition_len is not None:
-             # WORKAROUND : for passing partition length as a value to the rbln compiler.
-             # What is actually used is the shape of this tensor.
-             self.attn_impl = "flash_attn"
-             logger.info(f"Using flash-attention. (partition length : {kvcache_partition_len})")
+         self.attn_impl = attn_impl
+         if self.attn_impl == "flash_attn":
+             self.kvcache_partition_len = kvcache_partition_len or DEFAULT_FLASH_ATTN_PARTITION_LENGTH
+             register_rbln_custom_flash_attention()
+         elif self.attn_impl == "eager":
+             self.kvcache_partition_len = None
+             register_rbln_custom_attention()
          else:
-             self.attn_impl = "eager"
-         self.kvcache_partition_len = kvcache_partition_len
+             raise ValueError(f"Unknown attn_impl : {self.attn_impl}")
+
+         if kvcache_partition_len and kvcache_partition_len > max_seq_len:
+             raise ValueError(
+                 f"kvcache_partition_len({kvcache_partition_len}) should be lower"
+                 f" or equal to max_seq_len({max_seq_len})!"
+             )
 
          self.causal_lm = self.convert_to_rbln_causal_lm(causal_lm)
 
@@ -213,12 +185,12 @@ class DecoderOnlyWrapper(nn.Module):
 
              new_layer = DecoderOnlyLayer(layer, new_self_attn)
              new_layers.append(new_layer)
-         new_model = DecoderOnlyModel(causal_lm.model, new_layers)
+         new_model = DecoderOnlyModel(causal_lm.model, new_layers, partition_len=self.kvcache_partition_len)
          new_causal_lm = DecoderOnlyForCausalLM(causal_lm, new_model)
          return new_causal_lm
 
      @property
-     def phase(self):
+     def phase(self) -> str:
          return self._phase
 
      @phase.setter
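
Putting the new constructor surface from the hunks above together, a hedged construction sketch (argument names come from the diff; `model` stands for any supported Huggingface causal LM already loaded in memory, and the values are simply chosen to satisfy the validation rules):

wrapper = DecoderOnlyWrapper(
    causal_lm=model,               # a transformers PreTrainedModel causal LM
    max_seq_len=32_768,
    use_rotary_emb=True,
    attn_impl="flash_attn",        # now explicit: "eager" or "flash_attn"
    kvcache_partition_len=16_384,  # only used by the "flash_attn" path
)
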
@@ -226,21 +198,32 @@ class DecoderOnlyWrapper(nn.Module):
          self._phase = phase
          self.causal_lm.phase = phase
 
-     def forward(
-         self,
-         input_ids_or_inputs_embeds,
-         attention_mask,
-         cache_position,
-         batch_position,
-         query_position,
-         *past_key_values,
-     ):
+     def forward(self, *args):
+         if self.phase == "decode":
+             (
+                 input_ids_or_inputs_embeds,
+                 attention_mask,
+                 cache_position,
+                 *past_key_values,
+             ) = args
+             batch_position = torch.tensor(0, dtype=torch.int16)
+             query_position = None
+         elif self.phase == "prefill":
+             (
+                 input_ids_or_inputs_embeds,
+                 attention_mask,
+                 cache_position,
+                 batch_position,
+                 query_position,
+                 *past_key_values,
+             ) = args
+         else:
+             raise ValueError(f"Unknown phase: {self.phase}")
+
          if input_ids_or_inputs_embeds.ndim == 2:
-             # It is input_ids
              input_ids = input_ids_or_inputs_embeds
              inputs_embeds = None
          elif input_ids_or_inputs_embeds.ndim == 3:
-             # It is inputs_embeds
              input_ids = None
              inputs_embeds = input_ids_or_inputs_embeds
          else:
@@ -248,15 +231,9 @@ class DecoderOnlyWrapper(nn.Module):
 
          if len(past_key_values) != 2 * self.num_hidden_layers:
              raise ValueError(
-                 f"Different past_key_values to model's config. {len(past_key_values)} != {self.num_hidden_layers}"
+                 f"Different past_key_values to model's config. {len(past_key_values)} != {2 * self.num_hidden_layers}"
              )
 
-         seq_len = input_ids_or_inputs_embeds.shape[1]
-         if seq_len == 1:
-             self.phase = "decode"
-         else:
-             self.phase = "prefill"
-
          # [key, value] * n_layer -> ( (key, value) ) * n_layer
          # cache shape : batch, n_heads, 1, max_seq_len, head_dim
          _past_key_values = []
@@ -286,8 +263,7 @@ class DecoderOnlyWrapper(nn.Module):
              _present_key_values = _present_key_values + (key_states, value_states)
          present_key_values = _present_key_values
 
-         # batch_position + query_position is dummy output node to keep the number of outputs
-         return logit, present_key_values, batch_position + query_position
+         return logit, present_key_values
 
 
  class DecoderOnlyForCausalLM(nn.Module):
@@ -371,13 +347,12 @@ class DecoderOnlyModel(nn.Module):
          _phase: Current processing phase ("prefill" or "decode")
      """
 
-     mask_fmin = torch.finfo(torch.float16).min
-
-     def __init__(self, model, layers: List["DecoderOnlyLayer"]):
+     def __init__(self, model, layers: List["DecoderOnlyLayer"], partition_len=None):
          super().__init__()
          self._original_mod = model
          self.layers = nn.ModuleList(layers)
          self._phase = "prefill"
+         self.partition_len = partition_len
 
      @property
      def phase(self):
@@ -389,10 +364,26 @@ class DecoderOnlyModel(nn.Module):
          for layer in self.layers:
              layer.phase = phase
 
+     @property
+     def attn_impl(self) -> str:
+         return "eager" if self.partition_len is None else "flash_attn"
+
      @property
      def hidden_multiplier(self):
          return 1
 
+     def convert_sequence_positions_for_flash_attn(self, seq_positions, max_seq_len):
+         if self.attn_impl != "flash_attn":
+             raise NotImplementedError(f"Unknown attn_impl ({self.attn_impl}).")
+
+         partition_len = self.partition_len
+         num_partition = max_seq_len // partition_len
+
+         cs = seq_positions.repeat(num_partition, 1).transpose(0, 1)
+         pidx = torch.arange(num_partition)
+         cache_pos_for_partitions = torch.clamp(cs - pidx * partition_len, 0, partition_len)
+         return cache_pos_for_partitions
+
      def get_last_layernorm(self) -> nn.LayerNorm:
          return self._original_mod.norm
 
@@ -425,7 +416,6 @@ class DecoderOnlyModel(nn.Module):
              inputs_embeds = self.get_embedding()(input_ids)
 
          hidden_states = inputs_embeds * self.hidden_multiplier
-         attention_mask = (1 - attention_mask) * self.mask_fmin
 
          # get cos,sin vector if needed
          if rotary_emb is not None:
@@ -446,14 +436,19 @@ class DecoderOnlyModel(nn.Module):
              cos, sin = None, None
 
          # (batch, seq_len) -> (batch,)
-         current_steps = cache_position[:, 0]
+         seq_positions = cache_position[:, 0]
+         if self.attn_impl == "flash_attn":
+             max_seq_len = past_key_values[0][0].shape[-2]
+             seq_positions = self.convert_sequence_positions_for_flash_attn(
+                 seq_positions=seq_positions, max_seq_len=max_seq_len
+             )
 
          present_key_values = past_key_values
          for layer in self.layers:
              hidden_states, present_key_values = layer(
                  hidden_states=hidden_states,
                  attention_mask=attention_mask,
-                 current_steps=current_steps,
+                 seq_positions=seq_positions,
                  batch_position=batch_position,
                  past_key_values=present_key_values,
                  cos=cos,
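
Concretely, convert_sequence_positions_for_flash_attn (added two hunks above and called here) spreads each sequence position over the KV cache partitions before the layers run. An illustrative evaluation, not part of the diff, assuming partition_len=16_384 and max_seq_len=32_768 (two partitions):

seq_positions = torch.tensor([20_000, 5_000])
# clamp(position - k * partition_len, 0, partition_len) for partition k = 0, 1:
# tensor([[16_384,  3_616],   # sequence 0 fills partition 0 and reaches 3_616 into partition 1
#         [ 5_000,      0]])  # sequence 1 sits entirely inside partition 0
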
@@ -514,20 +509,19 @@ class DecoderOnlyLayer(nn.Module):
          self,
          hidden_states: torch.Tensor,
          attention_mask: torch.Tensor,
-         current_steps: torch.LongTensor,
+         seq_positions: torch.LongTensor,
          batch_position: torch.Tensor,
          past_key_values: Tuple[Tuple[torch.Tensor]],
          cos: Optional[torch.Tensor] = None,
          sin: Optional[torch.Tensor] = None,
      ):
          residual = hidden_states
-
          hidden_states = self.get_pre_attention_layernorm()(hidden_states)
 
          hidden_states, present_key_values = self.self_attn(
              hidden_states=hidden_states,
              attention_mask=attention_mask,
-             current_steps=current_steps,
+             seq_positions=seq_positions,
              batch_position=batch_position,
              past_key_values=past_key_values,
              cos=cos,
@@ -561,15 +555,34 @@ class DecoderOnlyAttention(nn.Module):
          self.layer_idx = self_attn.layer_idx
          self.num_heads = self._original_mod.num_heads
          self.head_dim = self._original_mod.head_dim
-         self.phase = "prefill"
+         self._phase = "prefill"
+         self.scale = torch.tensor(self.get_attn_scale())
+
+         if hasattr(self._original_mod, "num_key_value_heads"):
+             self.num_key_value_heads = self._original_mod.num_key_value_heads
+         else:
+             self.num_key_value_heads = self._original_mod.num_heads
+
+         self.attention = self.get_attention()
          self.__post_init__()
 
+     @property
+     def phase(self):
+         return self._phase
+
+     @phase.setter
+     def phase(self, phase: str):
+         self._phase = phase
+         self.attention.phase = phase
+
+     def get_attention(self):
+         return AttentionOp(self.num_heads, self.head_dim, self.num_key_value_heads)
+
      def __post_init__(self):
          self.q_proj = self._original_mod.q_proj
          self.k_proj = self._original_mod.k_proj
          self.v_proj = self._original_mod.v_proj
          self.o_proj = self._original_mod.o_proj
-         self.num_key_value_heads = self._original_mod.num_key_value_heads
 
      def projection(self, hidden_states) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
          """Projects input hidden states into query, key, and value representations.
@@ -588,97 +601,17 @@ class DecoderOnlyAttention(nn.Module):
      def apply_rotary_pos_embed(self, query_states, key_states, cos, sin):
          return apply_rotary_pos_emb(query_states, key_states, cos, sin)
 
-     def rbln_attention(
-         self,
-         query_state,
-         key_state,
-         value_state,
-         attn_mask,
-         batch_idx,
-         past_key_state,
-         past_value_state,
-         current_step,
-         # below are designed for Midm, GPT which requires to support scaling for attention weights
-         # TODO(jongho): Merge and manage scales generally
-         layer_idx=None,
-         scale_attn_weights: bool = None,
-         scale_attn_by_inverse_layer_idx: bool = None,
-         scale_qk_by_inverse_layer_idx: bool = None,
-     ):
-         """Compute attention with static shapes and explicit cache management.
-
-         Args:
-             query_state: Query tensor [1, num_heads, 1, head_dim]
-             key_state: Key tensor [1, num_heads, seq_len, head_dim]
-             value_state: Value tensor [1, num_heads, seq_len, head_dim]
-             attn_mask: Attention mask tensor
-             batch_idx: Batch index for cache lookup
-             past_key_state: Previous key cache states
-             past_value_state: Previous value cache states
-             current_step: Current position in sequence
-
-         Returns:
-             Tuple of (attention_output, key_state, value_state)
-         """
-         # Implementation details.
-         # reshape for removing repeat_kv (batch=1 , num_head, 1, q_len=1, head_dim)
-         key_state = key_state.unsqueeze(2) # 1, 32, 1, 128, 128
-         value_state = value_state.unsqueeze(2)
-         attn_mask = attn_mask.unsqueeze(2)
-
-         query_state = query_state.view(
-             1,
-             self.num_key_value_heads,
-             self.num_heads // self.num_key_value_heads,
-             -1, # seq len
-             self.head_dim,
-         ) #
-
-         kend = current_step + key_state.shape[-2]
-         vend = current_step + value_state.shape[-2]
-
-         key_state = (
-             past_key_state[batch_idx]
-             .unsqueeze(0)
-             .unsqueeze(2)
-             .slice_scatter(key_state, dim=-2, start=current_step, end=kend)
-         )
-         value_state = (
-             past_value_state[batch_idx]
-             .unsqueeze(0)
-             .unsqueeze(2)
-             .slice_scatter(value_state, dim=-2, start=current_step, end=vend)
-         )
-
-         attn_weight = torch.matmul(query_state, key_state.transpose(3, 4))
-         attn_weight = attn_weight / math.sqrt(self.head_dim)
-
-         if layer_idx is not None and (scale_attn_by_inverse_layer_idx or scale_qk_by_inverse_layer_idx):
-             attn_weight = attn_weight / float(layer_idx + 1)
-
-         attn_weight += attn_mask
-
-         if layer_idx is not None and scale_qk_by_inverse_layer_idx:
-             attn_weight = attn_weight * float(layer_idx + 1)
-
-         attn_weight = nn.functional.softmax(attn_weight, dim=-1)
-
-         attn_output = torch.matmul(attn_weight, value_state)
-
-         attn_output = attn_output.view(1, self.num_heads, -1, self.head_dim)
-         attn_output = attn_output.transpose(1, 2).contiguous()
-         attn_output = attn_output.reshape(1, -1, self.num_heads * self.head_dim)
-
-         return attn_output, key_state, value_state
+     def get_attn_scale(self):
+         return 1 / math.sqrt(self.head_dim)
 
      def forward(
          self,
          hidden_states: torch.Tensor,
          attention_mask: torch.Tensor,
-         current_steps: torch.LongTensor,
+         seq_positions: torch.LongTensor,
          batch_position: torch.Tensor,
          past_key_values: Tuple[Tuple[torch.Tensor]],
-         cos: Optional[torch.Tensor] = None, # (batch, 1, prefill_size, head_dim)
+         cos: Optional[torch.Tensor] = None,
          sin: Optional[torch.Tensor] = None,
      ):
          batch_size, query_length, _ = hidden_states.size()
@@ -698,22 +631,24 @@ class DecoderOnlyAttention(nn.Module):
          if batch_size > 1 and self.phase == "prefill":
              raise NotImplementedError(f"batch size should be 1 if prefill phase, but got {batch_size}.")
 
+         # TODO(jongho): flash attn legacy. (clone)
+         _seq_positions = seq_positions.clone().unsqueeze(1)
+
          _key_states = []
          _value_states = []
          _attn_outputs = []
          for b in range(batch_size):
-             current_step = current_steps[b]
-             attn_output, key_state, value_state = self.rbln_attention(
+             seq_position = _seq_positions[b][0]
+             attn_output, key_state, value_state = self.attention(
                  query_states[b].unsqueeze(0),
                  key_states[b].unsqueeze(0),
                  value_states[b].unsqueeze(0),
-                 attention_mask[b].unsqueeze(0)
-                 if self.phase == "decode"
-                 else attention_mask, # TODO(jongho): fix when msoftmax is supported
+                 attention_mask[b].unsqueeze(0) if self.phase == "decode" else attention_mask,
                  past_key_state=past_key_values[self.layer_idx][0],
                  past_value_state=past_key_values[self.layer_idx][1],
-                 batch_idx=b if self.phase == "decode" else batch_position,
-                 current_step=current_step,
+                 batch_position=b if self.phase == "decode" else batch_position,
+                 seq_position=seq_position,
+                 scale=self.scale,
              )
              _key_states.append(key_state)
              _value_states.append(value_state)
@@ -727,6 +662,87 @@ class DecoderOnlyAttention(nn.Module):
          return attn_outputs, past_key_values
 
 
+ class AttentionOp(nn.Module):
+     def __init__(self, num_heads: int, head_dim: int, num_key_value_heads: int):
+         super().__init__()
+         self.num_heads = num_heads
+         self.head_dim = head_dim
+         self.num_key_value_heads = num_key_value_heads
+         self.phase = "prefill"
+
+     def forward(
+         self,
+         query_state: torch.Tensor,
+         key_state: torch.Tensor,
+         value_state: torch.Tensor,
+         attn_mask: torch.Tensor,
+         batch_position: torch.Tensor,
+         past_key_state: torch.Tensor,
+         past_value_state: torch.Tensor,
+         seq_position: torch.Tensor,
+         scale: torch.Tensor,
+     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+         """Compute attention with static shapes and explicit cache management.
+
+         Args:
+             query_state: Query tensor [1, num_heads, 1, head_dim]
+             key_state: Key tensor [1, num_heads, seq_len, head_dim]
+             value_state: Value tensor [1, num_heads, seq_len, head_dim]
+             attn_mask: Attention mask tensor ∈ {0, 1}
+             batch_position: Batch index for cache lookup
+             past_key_state: Previous key cache states
+             past_value_state: Previous value cache states
+             seq_position: Current position in sequence
+             scale: Scale applied to attn weights
+
+         Returns:
+             Tuple of (attention_output, key_state, value_state)
+         """
+         # reshape for removing repeat_kv (batch=1 , num_head, 1, q_len=1, head_dim)
+         key_state = key_state.unsqueeze(2) # 1, 32, 1, 128, 128
+         value_state = value_state.unsqueeze(2)
+         attn_mask = attn_mask.unsqueeze(2)
+
+         query_state = query_state.view(
+             1,
+             self.num_key_value_heads,
+             self.num_heads // self.num_key_value_heads,
+             -1, # seq len
+             self.head_dim,
+         )
+
+         if self.phase == "decode":
+             attn_output, key_state, value_state = torch.ops.rbln_custom_ops.attn_decode(
+                 query_state,
+                 key_state,
+                 value_state,
+                 attn_mask,
+                 past_key_state.unsqueeze(2),
+                 past_value_state.unsqueeze(2),
+                 seq_position,
+                 scale,
+             )
+
+         else:
+             attn_output, key_state, value_state = torch.ops.rbln_custom_ops.attn_prefill(
+                 query_state,
+                 key_state,
+                 value_state,
+                 attn_mask,
+                 past_key_state.unsqueeze(2),
+                 past_value_state.unsqueeze(2),
+                 batch_position,
+                 seq_position,
+                 scale,
+             )
+
+         attn_output = attn_output.view(1, self.num_heads, -1, self.head_dim)
+         attn_output = attn_output.transpose(1, 2).contiguous()
+         attn_output = attn_output.reshape(1, -1, self.num_heads * self.head_dim)
+
+         return attn_output, key_state.squeeze(2), value_state.squeeze(2)
+
+
  def slice_and_unsqueeze_cos_sin(cos, sin, cache_position, unsqueeze_dim=1):
      """Slice cos[cache_position], sin[cache_position] vector for the query."""
      if cache_position.shape[0] > 1:
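
The rbln_custom_ops symbols called by the new AttentionOp are no longer defined in this file; per the file list they live in the new optimum/rbln/ops package and are registered from DecoderOnlyWrapper.__init__ (the register_* calls shown earlier). A hedged sketch of that wiring, assuming optimum.rbln.ops re-exports the two registration helpers as the relative import above implies:

from optimum.rbln.ops import register_rbln_custom_attention, register_rbln_custom_flash_attention

register_rbln_custom_attention()        # expected to register rbln_custom_ops::attn_decode / attn_prefill (eager path)
register_rbln_custom_flash_attention()  # expected to register rbln_custom_ops::flash_attn_decode / flash_attn_prefill
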
@@ -821,40 +837,83 @@ class RotaryEmbedding(nn.Module):
 
  class DecoderOnlyFlashAttention(DecoderOnlyAttention):
      def __init__(self, self_attn, kvcache_partition_len):
+         self.kvcache_partition_size = kvcache_partition_len
          super().__init__(self_attn=self_attn)
-         self.kvcache_partition_size = torch.zeros(kvcache_partition_len, dtype=torch.int32)
 
-     def get_cache_pos_for_partitions(self, current_steps, batch_size, max_seq_len):
-         partition_len = self.kvcache_partition_size.size()[0]
-         num_partition = max_seq_len // partition_len
-         cache_pos_for_partitions = torch.zeros((batch_size, num_partition), dtype=torch.int32)
-         if self.phase == "decode":
-             for b_idx in range(batch_size):
-                 cache_pos = current_steps[b_idx]
-                 for p_idx in range(num_partition):
-                     cache_pos_for_partitions[b_idx][p_idx] = torch.clamp(
-                         cache_pos - partition_len * p_idx, 0, partition_len
-                     )
-         else: # prefill
-             cache_pos = current_steps[0]
-             for p_idx in range(num_partition):
-                 cache_pos_for_partitions[0][p_idx] = torch.clamp(cache_pos - partition_len * p_idx, 0, partition_len)
+     def get_attention(self):
+         return FlashAttentionOp(self.num_heads, self.head_dim, self.num_key_value_heads, self.kvcache_partition_size)
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: torch.Tensor,
+         seq_positions: torch.LongTensor,
+         batch_position: torch.Tensor,
+         past_key_values: Tuple[Tuple[torch.Tensor]],
+         cos: Optional[torch.Tensor] = None,
+         sin: Optional[torch.Tensor] = None,
+     ):
+         batch_size, query_length, _ = hidden_states.size()
+
+         query_states, key_states, value_states = self.projection(hidden_states=hidden_states)
+
+         query_states = query_states.view(batch_size, query_length, self.num_heads, self.head_dim).transpose(1, 2)
+         key_states = key_states.view(batch_size, query_length, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+         value_states = value_states.view(batch_size, query_length, self.num_key_value_heads, self.head_dim).transpose(
+             1, 2
+         )
+         # b, num_head, query, head_dim
+
+         if cos is not None and sin is not None:
+             query_states, key_states = self.apply_rotary_pos_embed(query_states, key_states, cos, sin)
+
+         _key_states = []
+         _value_states = []
+         _attn_outputs = []
+         for b in range(batch_size):
+             seq_position = seq_positions[b][0] # FIXME: Remove take-take pattern matching
+             attn_output, key_state, value_state = self.attention(
+                 query_states[b].unsqueeze(0),
+                 key_states[b].unsqueeze(0),
+                 value_states[b].unsqueeze(0),
+                 attention_mask[b].unsqueeze(0) if self.phase == "decode" else attention_mask,
+                 past_key_state=past_key_values[self.layer_idx][0],
+                 past_value_state=past_key_values[self.layer_idx][1],
+                 batch_position=b if self.phase == "decode" else batch_position,
+                 seq_position=seq_position,
+                 scale=self.scale,
+             )
+             _key_states.append(key_state)
+             _value_states.append(value_state)
+             _attn_outputs.append(attn_output)
+         key_states = torch.cat(_key_states, dim=0)
+         value_states = torch.cat(_value_states, dim=0)
+         attn_outputs = torch.cat(_attn_outputs, dim=0)
+
+         attn_outputs = self.o_proj(attn_outputs)
+         past_key_values[self.layer_idx] = key_states, value_states
+         return attn_outputs, past_key_values
 
-         return cache_pos_for_partitions
 
-     def rbln_flash_attention(
+ class FlashAttentionOp(AttentionOp):
+     def __init__(self, num_heads: int, head_dim: int, num_key_value_heads: int, kvcache_partition_len: int):
+         super().__init__(num_heads=num_heads, head_dim=head_dim, num_key_value_heads=num_key_value_heads)
+         self.kvcache_partition_size = kvcache_partition_len
+
+     def forward(
          self,
          query_state,
          key_state,
          value_state,
          attn_mask,
-         batch_idx,
+         batch_position,
          past_key_state,
          past_value_state,
-         cache_pos_for_partitions,
+         seq_position,
+         scale,
      ):
          # reshape for removing repeat_kv (batch=1 , num_head, 1, q_len=1, head_dim)
-         key_state = key_state.unsqueeze(2) # 1, 32, 1, 128, 128
+         key_state = key_state.unsqueeze(2)
          value_state = value_state.unsqueeze(2)
          attn_mask = attn_mask.unsqueeze(2)
 
@@ -866,9 +925,7 @@ class DecoderOnlyFlashAttention(DecoderOnlyAttention):
              self.head_dim,
          )
 
-         # RBLN custom flash attention(decode), dummy batch index
          if self.phase == "decode":
-             sidx = cache_pos_for_partitions[batch_idx][0]
              attn_output, key_state, value_state = torch.ops.rbln_custom_ops.flash_attn_decode(
                  query_state,
                  key_state,
@@ -876,11 +933,11 @@ class DecoderOnlyFlashAttention(DecoderOnlyAttention):
                  attn_mask,
                  past_key_state.unsqueeze(2),
                  past_value_state.unsqueeze(2),
-                 sidx,
+                 seq_position,
+                 scale,
                  self.kvcache_partition_size,
              )
          else:
-             sidx = cache_pos_for_partitions[0][0]
              attn_output, key_state, value_state = torch.ops.rbln_custom_ops.flash_attn_prefill(
                  query_state,
                  key_state,
@@ -888,8 +945,9 @@ class DecoderOnlyFlashAttention(DecoderOnlyAttention):
                  attn_mask,
                  past_key_state.unsqueeze(2),
                  past_value_state.unsqueeze(2),
-                 batch_idx,
-                 sidx,
+                 batch_position,
+                 seq_position,
+                 scale,
                  self.kvcache_partition_size,
              )
 
@@ -899,60 +957,3 @@ class DecoderOnlyFlashAttention(DecoderOnlyAttention):
          attn_output = attn_output.reshape(1, -1, self.num_heads * self.head_dim)
 
          return attn_output, key_state, value_state
-
-     def forward(
-         self,
-         hidden_states: torch.Tensor,
-         attention_mask: torch.Tensor,
-         current_steps: torch.LongTensor,
-         batch_position: torch.Tensor,
-         past_key_values: Tuple[Tuple[torch.Tensor]],
-         cos: Optional[torch.Tensor] = None,
-         sin: Optional[torch.Tensor] = None,
-     ):
-         batch_size, query_length, _ = hidden_states.size()
-
-         query_states, key_states, value_states = self.projection(hidden_states=hidden_states)
-
-         query_states = query_states.view(batch_size, query_length, self.num_heads, self.head_dim).transpose(1, 2)
-         key_states = key_states.view(batch_size, query_length, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-         value_states = value_states.view(batch_size, query_length, self.num_key_value_heads, self.head_dim).transpose(
-             1, 2
-         )
-         # b, num_head, query, head_dim
-
-         max_seq_len = past_key_values[self.layer_idx][0].shape[-2]
-
-         if cos is not None and sin is not None:
-             query_states, key_states = self.apply_rotary_pos_embed(query_states, key_states, cos, sin)
-
-         cache_pos_for_partitions = self.get_cache_pos_for_partitions(
-             current_steps, batch_size=batch_size, max_seq_len=max_seq_len
-         ) # batch_size, num_partitions
-
-         _key_states = []
-         _value_states = []
-         _attn_outputs = []
-         for b in range(batch_size):
-             attn_output, key_state, value_state = self.rbln_flash_attention(
-                 query_states[b].unsqueeze(0),
-                 key_states[b].unsqueeze(0),
-                 value_states[b].unsqueeze(0),
-                 attention_mask[b].unsqueeze(0)
-                 if self.phase == "decode"
-                 else attention_mask, # TODO(jongho): fix when msoftmax is supported
-                 past_key_state=past_key_values[self.layer_idx][0],
-                 past_value_state=past_key_values[self.layer_idx][1],
-                 batch_idx=b if self.phase == "decode" else batch_position,
-                 cache_pos_for_partitions=cache_pos_for_partitions,
-             )
-             _key_states.append(key_state)
-             _value_states.append(value_state)
-             _attn_outputs.append(attn_output)
-         key_states = torch.cat(_key_states, dim=0)
-         value_states = torch.cat(_value_states, dim=0)
-         attn_outputs = torch.cat(_attn_outputs, dim=0)
-
-         attn_outputs = self.o_proj(attn_outputs)
-         past_key_values[self.layer_idx] = key_states, value_states
-         return attn_outputs, past_key_values