optimum-rbln 0.8.1a0__py3-none-any.whl → 0.8.1a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130)
  1. optimum/rbln/__init__.py +2 -0
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +53 -33
  4. optimum/rbln/diffusers/configurations/models/configuration_autoencoder_kl.py +9 -2
  5. optimum/rbln/diffusers/configurations/models/configuration_controlnet.py +4 -2
  6. optimum/rbln/diffusers/configurations/models/configuration_prior_transformer.py +9 -2
  7. optimum/rbln/diffusers/configurations/models/configuration_transformer_sd3.py +4 -2
  8. optimum/rbln/diffusers/configurations/models/configuration_unet_2d_condition.py +9 -2
  9. optimum/rbln/diffusers/configurations/models/configuration_vq_model.py +9 -2
  10. optimum/rbln/diffusers/configurations/pipelines/configuration_controlnet.py +33 -9
  11. optimum/rbln/diffusers/configurations/pipelines/configuration_kandinsky2_2.py +30 -12
  12. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion.py +22 -6
  13. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_3.py +16 -6
  14. optimum/rbln/diffusers/configurations/pipelines/configuration_stable_diffusion_xl.py +16 -6
  15. optimum/rbln/diffusers/modeling_diffusers.py +16 -26
  16. optimum/rbln/diffusers/models/autoencoders/autoencoder_kl.py +11 -0
  17. optimum/rbln/diffusers/models/autoencoders/vae.py +1 -8
  18. optimum/rbln/diffusers/models/autoencoders/vq_model.py +11 -0
  19. optimum/rbln/diffusers/models/controlnet.py +13 -7
  20. optimum/rbln/diffusers/models/transformers/prior_transformer.py +10 -0
  21. optimum/rbln/diffusers/models/transformers/transformer_sd3.py +2 -0
  22. optimum/rbln/diffusers/models/unets/unet_2d_condition.py +7 -0
  23. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +1 -4
  24. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +7 -0
  25. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +7 -0
  26. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +7 -0
  27. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +7 -0
  28. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +7 -0
  29. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +48 -27
  30. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +7 -0
  31. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpaint.py +7 -0
  32. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py +7 -0
  33. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +7 -0
  34. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +7 -0
  35. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +7 -0
  36. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +7 -0
  37. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +7 -0
  38. optimum/rbln/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +7 -0
  39. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +7 -0
  40. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +7 -0
  41. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +7 -0
  42. optimum/rbln/modeling.py +33 -35
  43. optimum/rbln/modeling_base.py +45 -107
  44. optimum/rbln/transformers/__init__.py +39 -47
  45. optimum/rbln/transformers/configuration_generic.py +16 -13
  46. optimum/rbln/transformers/modeling_generic.py +18 -19
  47. optimum/rbln/transformers/modeling_rope_utils.py +5 -2
  48. optimum/rbln/transformers/models/__init__.py +46 -4
  49. optimum/rbln/transformers/models/audio_spectrogram_transformer/__init__.py +17 -0
  50. optimum/rbln/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +21 -0
  51. optimum/rbln/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +28 -0
  52. optimum/rbln/transformers/models/auto/auto_factory.py +35 -12
  53. optimum/rbln/transformers/models/bart/bart_architecture.py +14 -1
  54. optimum/rbln/transformers/models/blip_2/modeling_blip_2.py +35 -4
  55. optimum/rbln/transformers/models/clip/configuration_clip.py +3 -3
  56. optimum/rbln/transformers/models/clip/modeling_clip.py +11 -12
  57. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +111 -14
  58. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +102 -35
  59. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +229 -175
  60. optimum/rbln/transformers/models/distilbert/__init__.py +19 -0
  61. optimum/rbln/transformers/models/distilbert/configuration_distilbert.py +19 -0
  62. optimum/rbln/transformers/models/distilbert/modeling_distilbert.py +19 -0
  63. optimum/rbln/transformers/models/exaone/configuration_exaone.py +24 -1
  64. optimum/rbln/transformers/models/exaone/exaone_architecture.py +5 -1
  65. optimum/rbln/transformers/models/exaone/modeling_exaone.py +66 -5
  66. optimum/rbln/transformers/models/gemma/configuration_gemma.py +24 -1
  67. optimum/rbln/transformers/models/gemma/gemma_architecture.py +5 -1
  68. optimum/rbln/transformers/models/gemma/modeling_gemma.py +49 -0
  69. optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +3 -3
  70. optimum/rbln/transformers/models/gemma3/gemma3_architecture.py +18 -250
  71. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +106 -236
  72. optimum/rbln/transformers/models/gpt2/configuration_gpt2.py +4 -1
  73. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +6 -1
  74. optimum/rbln/transformers/models/idefics3/configuration_idefics3.py +12 -2
  75. optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +41 -4
  76. optimum/rbln/transformers/models/llama/configuration_llama.py +24 -1
  77. optimum/rbln/transformers/models/llama/modeling_llama.py +49 -0
  78. optimum/rbln/transformers/models/llava_next/configuration_llava_next.py +2 -2
  79. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +32 -4
  80. optimum/rbln/transformers/models/midm/configuration_midm.py +24 -1
  81. optimum/rbln/transformers/models/midm/midm_architecture.py +6 -1
  82. optimum/rbln/transformers/models/midm/modeling_midm.py +66 -5
  83. optimum/rbln/transformers/models/mistral/configuration_mistral.py +24 -1
  84. optimum/rbln/transformers/models/mistral/modeling_mistral.py +62 -4
  85. optimum/rbln/transformers/models/opt/configuration_opt.py +4 -1
  86. optimum/rbln/transformers/models/opt/modeling_opt.py +10 -0
  87. optimum/rbln/transformers/models/opt/opt_architecture.py +7 -1
  88. optimum/rbln/transformers/models/phi/configuration_phi.py +24 -1
  89. optimum/rbln/transformers/models/phi/modeling_phi.py +49 -0
  90. optimum/rbln/transformers/models/phi/phi_architecture.py +1 -1
  91. optimum/rbln/transformers/models/qwen2/configuration_qwen2.py +24 -1
  92. optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +67 -4
  93. optimum/rbln/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +15 -3
  94. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +58 -27
  95. optimum/rbln/transformers/models/qwen2_5_vl/qwen2_5_vl_architecture.py +47 -2
  96. optimum/rbln/transformers/models/resnet/__init__.py +23 -0
  97. optimum/rbln/transformers/models/resnet/configuration_resnet.py +20 -0
  98. optimum/rbln/transformers/models/resnet/modeling_resnet.py +22 -0
  99. optimum/rbln/transformers/models/roberta/__init__.py +24 -0
  100. optimum/rbln/transformers/{configuration_alias.py → models/roberta/configuration_roberta.py} +4 -30
  101. optimum/rbln/transformers/{modeling_alias.py → models/roberta/modeling_roberta.py} +2 -32
  102. optimum/rbln/transformers/models/seq2seq/__init__.py +1 -1
  103. optimum/rbln/transformers/models/seq2seq/{configuration_seq2seq2.py → configuration_seq2seq.py} +2 -2
  104. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +1 -1
  105. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +41 -3
  106. optimum/rbln/transformers/models/siglip/configuration_siglip.py +3 -0
  107. optimum/rbln/transformers/models/siglip/modeling_siglip.py +62 -21
  108. optimum/rbln/transformers/models/t5/modeling_t5.py +46 -4
  109. optimum/rbln/transformers/models/t5/t5_architecture.py +5 -1
  110. optimum/rbln/transformers/models/{time_series_transformers → time_series_transformer}/__init__.py +1 -1
  111. optimum/rbln/transformers/models/{time_series_transformers → time_series_transformer}/configuration_time_series_transformer.py +2 -2
  112. optimum/rbln/transformers/models/{time_series_transformers/modeling_time_series_transformers.py → time_series_transformer/modeling_time_series_transformer.py} +14 -9
  113. optimum/rbln/transformers/models/vit/__init__.py +19 -0
  114. optimum/rbln/transformers/models/vit/configuration_vit.py +19 -0
  115. optimum/rbln/transformers/models/vit/modeling_vit.py +19 -0
  116. optimum/rbln/transformers/models/wav2vec2/__init__.py +1 -1
  117. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +1 -1
  118. optimum/rbln/transformers/models/whisper/configuration_whisper.py +3 -1
  119. optimum/rbln/transformers/models/whisper/modeling_whisper.py +35 -15
  120. optimum/rbln/transformers/models/xlm_roberta/__init__.py +16 -2
  121. optimum/rbln/transformers/models/xlm_roberta/configuration_xlm_roberta.py +15 -2
  122. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +12 -3
  123. optimum/rbln/utils/model_utils.py +20 -0
  124. optimum/rbln/utils/submodule.py +6 -8
  125. {optimum_rbln-0.8.1a0.dist-info → optimum_rbln-0.8.1a2.dist-info}/METADATA +2 -2
  126. {optimum_rbln-0.8.1a0.dist-info → optimum_rbln-0.8.1a2.dist-info}/RECORD +130 -117
  127. /optimum/rbln/transformers/models/{time_series_transformers → time_series_transformer}/time_series_transformers_architecture.py +0 -0
  128. /optimum/rbln/transformers/models/wav2vec2/{configuration_wav2vec.py → configuration_wav2vec2.py} +0 -0
  129. {optimum_rbln-0.8.1a0.dist-info → optimum_rbln-0.8.1a2.dist-info}/WHEEL +0 -0
  130. {optimum_rbln-0.8.1a0.dist-info → optimum_rbln-0.8.1a2.dist-info}/licenses/LICENSE +0 -0
@@ -12,7 +12,7 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- from typing import Any, Dict, List, Optional, Union
+ from typing import Any, Dict, List, Literal, Optional, Union

  import rebel

@@ -23,8 +23,18 @@ from ...utils.rbln_quantization import RBLNQuantizationConfig

  logger = get_logger()

+ CacheImplType = Literal["static", "sliding_window", "hybrid"]
+

  class RBLNDecoderOnlyModelForCausalLMConfig(RBLNModelConfig):
+ """
+ Configuration class for RBLN decoder-only models for Causal Language Modeling.
+
+ This class extends RBLNModelConfig with parameters specific to decoder-only transformer
+ architectures optimized for RBLN devices. It controls aspects like attention implementation,
+ KV cache management, and batching for inference.
+ """
+
  def __init__(
  self,
  batch_size: Optional[int] = None,
@@ -39,36 +49,119 @@ class RBLNDecoderOnlyModelForCausalLMConfig(RBLNModelConfig):
  prefill_chunk_size: Optional[int] = None,
  kvcache_num_blocks: Optional[int] = None,
  decoder_batch_sizes: Optional[List[int]] = None,
+ cache_impl: Optional[CacheImplType] = None,
+ sliding_window: Optional[int] = None,
+ sliding_window_layers: Optional[List[int]] = None,
  **kwargs,
  ):
  """
  Args:
  batch_size (Optional[int]): The batch size for inference. Defaults to 1.
  max_seq_len (Optional[int]): The maximum sequence length supported by the model.
- use_inputs_embeds (Optional[bool]): Whether to use input embeddings directly. Defaults to False.
- use_attention_mask (Optional[bool]): Whether to use attention masks. This is automatically set to True
- for RBLN-CA02 devices.
+ If not provided, it attempts to infer from the model's configuration
+ (`max_position_embeddings` or `n_positions`). Must be specified if not available
+ in the model config.
+ use_inputs_embeds (Optional[bool]): Whether to use input embeddings (`inputs_embeds`)
+ directly instead of `input_ids`. Defaults to False. Requires the model to be
+ compiled with this option enabled.
+ use_attention_mask (Optional[bool]): Whether the model requires attention masks during
+ inference. This is typically determined based on the target device and model
+ architecture. Defaults are often set automatically based on the model and RBLN NPU.
  use_position_ids (Optional[bool]): Whether to use position IDs. Defaults to False.
- attn_impl (Optional[str]): The attention implementation to use.
- kvcache_partition_len (Optional[int]): The length of each KV cache partition.
- kvcache_block_size (Optional[int]): The block size for KV cache.
- quantization (Optional[Dict[str, Any]]): Configuration for model quantization.
- prefill_chunk_size (Optional[int]): The chunk size for prefilling the KV cache. Defaults to 128,
- and must be a positive integer divisible by 64.
- kvcache_num_blocks (Optional[int]): The number of blocks in the KV cache.
+ attn_impl (Optional[str]): Specifies the attention implementation to use.
+ See the "Attention Implementation (`attn_impl`)" section below for details.
+ kvcache_partition_len (Optional[int]): Defines the partition length for the KV cache
+ when using "flash_attn". See the "KV Cache Partition Length (`kvcache_partition_len`)"
+ section below for details.
+ kvcache_block_size (Optional[int]): Sets the size (in number of tokens) of each block
+ in the PagedAttention KV cache. See the "KV Cache Block Size (`kvcache_block_size`)"
+ section below for details.
+ quantization (Optional[Dict[str, Any]]): Configuration dictionary for applying model
+ quantization. Specifies format, etc.
+ prefill_chunk_size (Optional[int]): The chunk size used during the prefill phase for
+ processing input sequences. Defaults to 128. Must be a positive integer
+ divisible by 64. Affects prefill performance and memory usage.
+ kvcache_num_blocks (Optional[int]): The total number of blocks to allocate for the
+ PagedAttention KV cache. See the "KV Cache Number of Blocks (`kvcache_num_blocks`)"
+ section below for details.
  decoder_batch_sizes (Optional[List[int]]): A list of batch sizes for which separate decoder models will be compiled.
  This allows the model to handle varying batch sizes efficiently during generation. If not specified,
  defaults to a list containing only the model's main batch size. When specifying multiple batch sizes:
  1) All values must be less than or equal to the main batch size.
  2) The list will be sorted in descending order (larger batch sizes first).
  3) If using multiple decoders, at least one batch size should match the main batch size.
-
+ cache_impl (Optional[CacheImplType]): Specifies the KV cache implementation strategy. Defaults to "static".
+ - "static": Uses a fixed-size global KV cache for all layers, suitable for standard attention patterns.
+ - "sliding_window": Implements a sliding window KV cache, where each layer maintains a local cache of recent tokens.
+ - "hybrid": Combines both static and sliding window approaches, allowing different layers to use different cache strategies.
+ The choice affects memory usage and attention patterns. When using "sliding_window" or "hybrid",
+ you must specify the `sliding_window` size and optionally `sliding_window_layers` for hybrid mode.
+ sliding_window (Optional[int]): The size of the sliding window. Defaults to None.
+ sliding_window_layers (Optional[List[int]]): The layers to use for the sliding window used in the hybrid model. Defaults to None.
  **kwargs: Additional arguments passed to the parent RBLNModelConfig.

  Raises:
- ValueError: If batch_size is not a positive integer or if prefill_chunk_size is not
- a positive integer divisible by 64.
+ ValueError: If `batch_size` is not a positive integer.
+ ValueError: If `prefill_chunk_size` is not a positive integer divisible by 64.
+ ValueError: If `max_seq_len` cannot be determined and is required.
+ ValueError: If attention parameter constraints are violated (e.g., `max_seq_len` vs
+ `kvcache_partition_len` for flash attention).
+
+
+ Attention Implementation:
+ `attn_impl` determines the underlying attention mechanism used by the model.
+
+ - **`"eager"`** (Default if `kvcache_partition_len` is not set): Uses the standard PyTorch
+ attention implementation. Suitable for sequences up to a certain limit (e.g., 32,768 tokens).
+ - **`"flash_attn"`**: Utilizes an optimized Flash Attention implementation, beneficial for
+ longer sequences and potentially faster execution. Requires `max_seq_len` to be at least
+ 8,192. If `kvcache_partition_len` is specified, `attn_impl` automatically defaults
+ to `"flash_attn"`. When using `"flash_attn"`, `kvcache_block_size` must equal
+ `kvcache_partition_len`.
+
+ The choice impacts performance and memory usage, especially for long sequences.
+ Constraints related to `max_seq_len` and `kvcache_partition_len` apply when using
+ `"flash_attn"`.
+
+
+ KV Cache Partition Length:
+ `kvcache_partition_len` is relevant **only** when `attn_impl` is `"flash_attn"`.
+
+ - It defines the length (number of tokens) of each partition within the Key-Value (KV) cache.
+ - Must be between 4,096 and 32,768 (inclusive).
+ - When using `"flash_attn"`, `max_seq_len` must be a multiple of `kvcache_partition_len`
+ and at least twice its value (`max_seq_len >= 2 * kvcache_partition_len`).
+ - If `attn_impl` is `"flash_attn"` and `kvcache_partition_len` is `None`, it defaults to
+ 16,384.
+
+
+ KV Cache Number of Blocks:
+ `kvcache_num_blocks` controls the total number of memory blocks allocated for the PagedAttention KV cache.
+ Each block holds `kvcache_block_size` tokens of Key and Value states.
+
+ - **Automatic Estimation (Default)**: If `kvcache_num_blocks` is `None`, the system estimates
+ the maximum number of blocks that can fit into the available RBLN device memory. This
+ calculation considers the model size (kernel memory), required buffer memory, the number
+ of layers and heads, `kvcache_block_size`, tensor parallelism, and available RBLN NPU DRAM.
+ This aims to maximize cache capacity for potentially better performance with long sequences
+ or larger batches without manual tuning.
+ - **Manual Setting**: You can explicitly set the number of blocks. This provides finer control
+ but requires careful consideration of memory limits. Setting it too high may lead to
+ compilation errors if it exceeds available memory. The system will issue warnings if your
+ setting exceeds the estimated maximum.
+ - **Performance Impact**: A larger number of blocks reduces the likelihood of cache eviction,
+ which is beneficial for tasks involving many long sequences or large batch sizes, enabling
+ higher throughput. However, allocating more blocks consumes more memory.
+ - **Minimum Requirement**: The system requires a minimum number of blocks to function,
+ calculated based on `max_seq_len`, `kvcache_block_size`, and `batch_size`. The number of
+ allocated blocks must be sufficient to hold at least one full sequence length per item
+ in the batch concurrently. The system will log warnings or raise errors if constraints
+ are violated (e.g., if `kvcache_num_blocks` is less than `batch_size` when using Flash Attention).
+
+ The optimal value depends on the specific model, task, hardware, and desired trade-off
+ between performance and memory usage. The automatic estimation provides a robust starting point.
  """
+
  super().__init__(**kwargs)
  self.batch_size = batch_size or 1
  if not isinstance(self.batch_size, int) or self.batch_size < 0:
@@ -121,6 +214,10 @@ class RBLNDecoderOnlyModelForCausalLMConfig(RBLNModelConfig):
  # Larger batch size should be at the beginning of the list.
  self.decoder_batch_sizes.sort(reverse=True)

+ self.cache_impl = cache_impl or "static"
+ self.sliding_window = sliding_window
+ self.sliding_window_layers = sliding_window_layers or []
+
  @property
  def use_multiple_decoder(self):
  return isinstance(self.decoder_batch_sizes, list) and len(self.decoder_batch_sizes) > 1
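For orientation, and not part of the diff itself: a minimal sketch of how the new cache options documented above might be passed when building this config. The deep import path follows this wheel's module layout, and the values are illustrative assumptions chosen to satisfy the documented constraints, not recommended defaults.

# Illustrative sketch only (not from the package): exercising the new
# cache_impl / sliding_window arguments added in 0.8.1a2.
from optimum.rbln.transformers.models.decoderonly.configuration_decoderonly import (
    RBLNDecoderOnlyModelForCausalLMConfig,
)

cfg = RBLNDecoderOnlyModelForCausalLMConfig(
    batch_size=1,
    max_seq_len=4096,
    cache_impl="hybrid",              # new: "static" (default), "sliding_window", or "hybrid"
    sliding_window=1024,              # must stay below 32768 - prefill_chunk_size (default 128)
    sliding_window_layers=[0, 2, 4],  # layers that keep a local (sliding-window) cache in hybrid mode
)
print(cfg.cache_impl, cfg.sliding_window, cfg.sliding_window_layers)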
@@ -21,6 +21,7 @@ from transformers import PretrainedConfig, PreTrainedModel

  from ....utils import logging
  from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
+ from .configuration_decoderonly import CacheImplType


  logger = logging.get_logger(__name__)
@@ -30,6 +31,7 @@ DEFAULT_MAX_EAGER_ATTN_SEQUENCE_LENGTH = 32_768
  MIN_FLASH_ATTN_MAX_SEQ_LEN = 8_192
  MIN_FLASH_ATTN_PARTITION_LENGTH = 4_096
  MAX_FLASH_ATTN_PARTITION_LENGTH = 32_768
+ MAX_SLIDING_WINDOW_SIZE = 32_768


  def set_default_values(
@@ -114,6 +116,13 @@ def validate_attention_method(attn_impl: str, kvcache_partition_len: int, kvcach
  )


+ def validate_sliding_window_size(sliding_window: int, prefill_chunk_size: int):
+ if sliding_window > MAX_SLIDING_WINDOW_SIZE - prefill_chunk_size:
+ raise ValueError(
+ f"Sliding window size ({sliding_window}) must be less than 32768 - prefill_chunk_size ({32768 - prefill_chunk_size})"
+ )
+
+
  class DecoderOnlyWrapper(nn.Module):
  """A wrapper class for decoder-only language models that handles RBLN-specific optimizations and requirements.

@@ -146,12 +155,15 @@ class DecoderOnlyWrapper(nn.Module):
  max_seq_len: int,
  use_rotary_emb: bool,
  attn_impl: str,
+ cache_impl: CacheImplType,
  use_inputs_embeds: bool,
  use_attention_mask: bool,
  use_position_ids: bool,
  use_learned_pos_emb: Optional[bool] = None,
  kvcache_partition_len: Optional[int] = None,
  kvcache_block_size: Optional[int] = None,
+ sliding_window: Optional[int] = None,
+ sliding_window_layers: Optional[List[int]] = None,
  ):
  super().__init__()
  self.config = causal_lm.config
@@ -171,6 +183,9 @@ class DecoderOnlyWrapper(nn.Module):
  self.use_position_ids = use_position_ids
  self.use_inputs_embeds = use_inputs_embeds
  self.use_learned_pos_emb = use_learned_pos_emb
+ self.sliding_window_layers = sliding_window_layers
+ self.cache_impl = cache_impl
+ self.sliding_window = sliding_window

  if self.attn_impl == "flash_attn":
  self.kvcache_partition_len = kvcache_partition_len or DEFAULT_FLASH_ATTN_PARTITION_LENGTH
@@ -186,7 +201,6 @@ class DecoderOnlyWrapper(nn.Module):
  )

  self.causal_lm = self.convert_to_rbln_causal_lm(causal_lm, max_seq_len)
-
  self.num_hidden_layers = getattr(self.config, "num_hidden_layers", None) or getattr(self.config, "n_layer")
  self._phase = "prefill"

@@ -195,25 +209,35 @@ class DecoderOnlyWrapper(nn.Module):

  def convert_to_rbln_causal_lm(self, causal_lm: PreTrainedModel, max_seq_len: int):
  new_layers = []
-
- for layer in causal_lm.model.layers:
- if self.attn_impl == "eager":
+ for layer_idx, layer in enumerate(causal_lm.model.layers):
+ if layer_idx in self.sliding_window_layers:
+ # Flash attention is not yet supported for sliding window attention.
  new_self_attn = DecoderOnlyAttention(
  layer.self_attn,
  self.use_attention_mask,
  self.use_position_ids,
- kvcache_block_size=self.kvcache_block_size,
- )
- elif self.attn_impl == "flash_attn":
- new_self_attn = DecoderOnlyFlashAttention(
- layer.self_attn,
- kvcache_partition_len=self.kvcache_partition_len,
- kvcache_block_size=self.kvcache_block_size,
- use_attention_mask=self.use_attention_mask,
- use_position_ids=self.use_position_ids,
+ kvcache_block_size=self.sliding_window,
+ is_sliding=True,
  )
  else:
- raise NotImplementedError(f"Unknwon attn : {self.attn_impl}")
+ if self.attn_impl == "eager":
+ new_self_attn = DecoderOnlyAttention(
+ layer.self_attn,
+ self.use_attention_mask,
+ self.use_position_ids,
+ kvcache_block_size=self.kvcache_block_size,
+ is_sliding=False,
+ )
+ elif self.attn_impl == "flash_attn":
+ new_self_attn = DecoderOnlyFlashAttention(
+ layer.self_attn,
+ kvcache_partition_len=self.kvcache_partition_len,
+ kvcache_block_size=self.kvcache_block_size,
+ use_attention_mask=self.use_attention_mask,
+ use_position_ids=self.use_position_ids,
+ )
+ else:
+ raise NotImplementedError(f"Unknwon attn : {self.attn_impl}")

  new_layer = DecoderOnlyLayer(layer, new_self_attn)
  new_layers.append(new_layer)
@@ -225,6 +249,7 @@ class DecoderOnlyWrapper(nn.Module):
  max_seq_len=max_seq_len,
  kvcache_block_size=self.kvcache_block_size,
  use_learned_pos_emb=self.use_learned_pos_emb,
+ sliding_window_layers=self.sliding_window_layers,
  )
  new_causal_lm = DecoderOnlyForCausalLM(causal_lm, new_model)
  return new_causal_lm
@@ -243,8 +268,9 @@ class DecoderOnlyWrapper(nn.Module):
  input_ids = None if self.use_inputs_embeds else args.pop(0)
  inputs_embeds = args.pop(0) if self.use_inputs_embeds else None
  cache_position = args.pop(0)
- block_tables = args.pop(0)
- query_position = args.pop(0) if self.phase == "prefill" else None
+ global_block_tables = args.pop(0) if self.cache_impl in ["hybrid", "static"] else None
+ local_block_tables = args.pop(0) if self.cache_impl in ["hybrid", "sliding_window"] else None
+ query_position = args.pop(0) if "prefill" in self.phase else None
  attention_mask = args.pop(0) if self.use_attention_mask else None
  position_ids = args.pop(0) if self.use_position_ids else None
  past_key_values = args
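As a reading aid, not part of the diff: a small self-contained sketch of the positional layout that the reworked unpacking above implies, assuming a hybrid cache in a prefill phase with the mask and position-id flags disabled. All names and placeholder values are hypothetical.

# Hypothetical walk-through of the pops above (cache_impl="hybrid", prefill phase,
# use_inputs_embeds / use_attention_mask / use_position_ids all False).
args = ["input_ids", "cache_position", "global_block_tables",
        "local_block_tables", "query_position", "kv_cache_0", "kv_cache_1"]
cache_impl, phase = "hybrid", "prefill"
use_inputs_embeds = use_attention_mask = use_position_ids = False

input_ids = None if use_inputs_embeds else args.pop(0)
inputs_embeds = args.pop(0) if use_inputs_embeds else None
cache_position = args.pop(0)
global_block_tables = args.pop(0) if cache_impl in ["hybrid", "static"] else None
local_block_tables = args.pop(0) if cache_impl in ["hybrid", "sliding_window"] else None
query_position = args.pop(0) if "prefill" in phase else None
attention_mask = args.pop(0) if use_attention_mask else None
position_ids = args.pop(0) if use_position_ids else None
past_key_values = args  # what remains: ["kv_cache_0", "kv_cache_1"]
print(global_block_tables, local_block_tables, query_position, past_key_values)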
@@ -264,16 +290,22 @@ class DecoderOnlyWrapper(nn.Module):
  _past_key_values.append(past_key_value)
  past_key_values = _past_key_values

+ if hasattr(self, "rotary_emb_global") and hasattr(self, "rotary_emb_local"):
+ rotary_emb = (self.rotary_emb_global, self.rotary_emb_local)
+ else:
+ rotary_emb = self.rotary_emb
+
  return (
  input_ids,
  inputs_embeds,
  cache_position,
- block_tables,
+ global_block_tables,
+ local_block_tables,
  query_position,
  attention_mask,
  position_ids,
  past_key_values,
- self.rotary_emb,
+ rotary_emb,
  )

  def forward(self, *args):
@@ -281,7 +313,8 @@ class DecoderOnlyWrapper(nn.Module):
  input_ids,
  inputs_embeds,
  cache_position,
- block_tables,
+ global_block_tables,
+ local_block_tables,
  query_position,
  attention_mask,
  position_ids,
@@ -298,7 +331,8 @@ class DecoderOnlyWrapper(nn.Module):
  query_position=query_position,
  past_key_values=past_key_values,
  rotary_emb=rotary_emb,
- block_tables=block_tables,
+ global_block_tables=global_block_tables,
+ local_block_tables=local_block_tables,
  )

  return logit
@@ -353,7 +387,8 @@ class DecoderOnlyForCausalLM(nn.Module):
  query_position: torch.Tensor = None,
  past_key_values: Tuple[Tuple[torch.Tensor]] = None,
  rotary_emb: nn.Module = None,
- block_tables: Optional[torch.Tensor] = None,
+ global_block_tables: Optional[torch.Tensor] = None,
+ local_block_tables: Optional[torch.Tensor] = None,
  ):
  # outputs
  hidden_states = self.model(
@@ -362,12 +397,14 @@ class DecoderOnlyForCausalLM(nn.Module):
  attention_mask=attention_mask,
  cache_position=cache_position,
  position_ids=position_ids,
+ query_position=query_position,
  past_key_values=past_key_values,
  rotary_emb=rotary_emb,
- block_tables=block_tables,
+ global_block_tables=global_block_tables,
+ local_block_tables=local_block_tables,
  )

- if self.phase == "prefill":
+ if "prefill" in self.phase:
  hidden_states = hidden_states[:, query_position.to(torch.int).unsqueeze(0)]

  logits = self.lm_head(hidden_states)
@@ -402,6 +439,7 @@ class DecoderOnlyModel(nn.Module):
  max_seq_len=None,
  kvcache_block_size=None,
  use_learned_pos_emb=None,
+ sliding_window_layers=None,
  ):
  super().__init__()
  self._original_mod = model
@@ -411,6 +449,7 @@ class DecoderOnlyModel(nn.Module):
  self.kvcache_block_size = kvcache_block_size
  self.max_seq_len = max_seq_len
  self.use_learned_pos_emb = use_learned_pos_emb
+ self.sliding_window_layers = sliding_window_layers

  @property
  def phase(self):
@@ -441,6 +480,16 @@ class DecoderOnlyModel(nn.Module):
  cache_pos_for_partitions = torch.clamp(cs - pidx * partition_len, 0, partition_len)
  return cache_pos_for_partitions

+ def get_local_cache_positions(self, position_ids, query_position):
+ max_cache_len = self._original_mod.config.sliding_window
+ valid_input_len = 1 if query_position is None else query_position + 1
+ cache_seq_len = torch.clamp(position_ids, max=max_cache_len)[:, :1] # past seen tokens
+ cache_offset = (
+ torch.clamp(position_ids, max=max_cache_len)[:, :1] + valid_input_len
+ ) # cache offset for next steps
+
+ return cache_seq_len, cache_offset
+
  def get_last_layernorm(self) -> nn.LayerNorm:
  return self._original_mod.norm

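To make the new helper concrete, and again not part of the diff: a standalone re-run of the same arithmetic as get_local_cache_positions with made-up numbers, showing how clamped position ids become the local cache's seen-token count and write offset.

import torch

# Same arithmetic as get_local_cache_positions above, with illustrative values only.
sliding_window = 4                        # stands in for config.sliding_window
position_ids = torch.tensor([[6, 7, 8]])  # absolute positions of the current prefill chunk
query_position = torch.tensor(2)          # index of the last valid token in the chunk

valid_input_len = 1 if query_position is None else query_position + 1  # -> 3 tokens this step
cache_seq_len = torch.clamp(position_ids, max=sliding_window)[:, :1]   # -> tensor([[4]]): local cache already full
cache_offset = torch.clamp(position_ids, max=sliding_window)[:, :1] + valid_input_len  # -> tensor([[7]])
print(cache_seq_len, cache_offset)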
@@ -459,9 +508,11 @@ class DecoderOnlyModel(nn.Module):
  attention_mask: torch.Tensor = None,
  cache_position: torch.Tensor = None,
  position_ids: torch.Tensor = None,
+ query_position: torch.Tensor = None,
  past_key_values: Tuple[Tuple[torch.Tensor]] = None,
  rotary_emb: Optional[Union[nn.Module, torch.Tensor]] = None,
- block_tables: Optional[torch.Tensor] = None,
+ global_block_tables: Optional[torch.Tensor] = None,
+ local_block_tables: Optional[torch.Tensor] = None,
  ):
  # retrieve input_ids and inputs_embeds
  if (input_ids is None) ^ (inputs_embeds is not None):
@@ -511,7 +562,7 @@ class DecoderOnlyModel(nn.Module):
  hidden_states = hidden_states + position_embeds
  cos, sin = None, None

- # (batch, seq_len) -> (batch,)
+ # Get sequence positions for flash attention
  if self.attn_impl == "flash_attn":
  seq_positions = cache_position[:, 0]
  seq_positions = self.convert_sequence_positions_for_flash_attn(
@@ -520,15 +571,20 @@ class DecoderOnlyModel(nn.Module):
  else:
  seq_positions = cache_position[:, :1]

- for layer in self.layers:
+ # Get local cache positions for sliding window layers
+ if len(self.sliding_window_layers) > 0:
+ sliding_cache_pos = self.get_local_cache_positions(position_ids, query_position)
+
+ for layer_idx, layer in enumerate(self.layers):
+ is_sliding = True if layer_idx in self.sliding_window_layers else False
  hidden_states = layer(
  hidden_states=hidden_states,
  attention_mask=attention_mask,
- seq_positions=seq_positions,
+ seq_positions=sliding_cache_pos if is_sliding else seq_positions,
  past_key_values=past_key_values,
  cos=cos,
  sin=sin,
- block_tables=block_tables,
+ block_tables=local_block_tables if is_sliding else global_block_tables,
  )

  hidden_states = self.get_last_layernorm()(hidden_states)
@@ -625,7 +681,7 @@ class DecoderOnlyAttention(nn.Module):
  self_attn: Original attention module from the base model
  """

- def __init__(self, self_attn, use_attention_mask, use_position_ids, kvcache_block_size):
+ def __init__(self, self_attn, use_attention_mask, use_position_ids, kvcache_block_size, is_sliding=False):
  super().__init__()
  self._original_mod = self_attn
  self.layer_idx = self_attn.layer_idx
@@ -645,6 +701,7 @@ class DecoderOnlyAttention(nn.Module):

  self.use_attention_mask = use_attention_mask
  self.use_position_ids = use_position_ids
+ self.is_sliding = is_sliding
  self.attention = self.get_attention()
  self.kvcache_block_size = kvcache_block_size
  self.__post_init__()
@@ -659,9 +716,14 @@ class DecoderOnlyAttention(nn.Module):
  self.attention.phase = phase

  def get_attention(self):
- return AttentionOp(
- self.num_heads, self.head_dim, self.num_key_value_heads, self.use_attention_mask, self.use_position_ids
- )
+ if self.is_sliding:
+ return SlidingWindowAttentionOp(
+ self.num_heads, self.head_dim, self.num_key_value_heads, self.use_attention_mask, self.use_position_ids
+ )
+ else:
+ return AttentionOp(
+ self.num_heads, self.head_dim, self.num_key_value_heads, self.use_attention_mask, self.use_position_ids
+ )

  def __post_init__(self):
  self.q_proj = self._original_mod.q_proj
@@ -708,12 +770,14 @@ class DecoderOnlyAttention(nn.Module):
  value_states = value_states.view(batch_size, query_length, self.num_key_value_heads, self.head_dim).transpose(
  1, 2
  )
- # b, num_head, query, head_dim
+ if hasattr(self, "q_norm") and hasattr(self, "k_norm"):
+ query_states = self.q_norm(query_states)
+ key_states = self.k_norm(key_states)

  if cos is not None and sin is not None:
  query_states, key_states = self.apply_rotary_pos_embed(query_states, key_states, cos, sin)

- if batch_size > 1 and self.phase == "prefill":
+ if batch_size > 1 and "prefill" in self.phase:
  raise NotImplementedError(f"batch size should be 1 if prefill phase, but got {batch_size}.")

  attn_output = self.attention(
@@ -987,7 +1051,10 @@ class DecoderOnlyFlashAttention(DecoderOnlyAttention):
  value_states = value_states.view(batch_size, query_length, self.num_key_value_heads, self.head_dim).transpose(
  1, 2
  )
- # b, num_head, query, head_dim
+
+ if hasattr(self, "q_norm") and hasattr(self, "k_norm"):
+ query_states = self.q_norm(query_states)
+ key_states = self.k_norm(key_states)

  if cos is not None and sin is not None:
  query_states, key_states = self.apply_rotary_pos_embed(query_states, key_states, cos, sin)