optimum-rbln 0.7.2rc2__py3-none-any.whl → 0.7.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. optimum/rbln/__init__.py +8 -0
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/diffusers/__init__.py +8 -0
  4. optimum/rbln/diffusers/modeling_diffusers.py +103 -117
  5. optimum/rbln/diffusers/models/autoencoders/vq_model.py +11 -3
  6. optimum/rbln/diffusers/models/unets/unet_2d_condition.py +15 -8
  7. optimum/rbln/diffusers/pipelines/__init__.py +8 -0
  8. optimum/rbln/diffusers/pipelines/kandinsky2_2/__init__.py +7 -1
  9. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py +25 -0
  10. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py +107 -1
  11. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py +25 -0
  12. optimum/rbln/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpaint.py +3 -0
  13. optimum/rbln/modeling.py +4 -1
  14. optimum/rbln/modeling_base.py +16 -3
  15. optimum/rbln/ops/__init__.py +6 -2
  16. optimum/rbln/ops/attn.py +94 -85
  17. optimum/rbln/ops/flash_attn.py +46 -25
  18. optimum/rbln/ops/kv_cache_update.py +4 -4
  19. optimum/rbln/transformers/modeling_generic.py +3 -3
  20. optimum/rbln/transformers/models/bart/bart_architecture.py +10 -6
  21. optimum/rbln/transformers/models/bart/modeling_bart.py +6 -2
  22. optimum/rbln/transformers/models/bert/modeling_bert.py +1 -1
  23. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +264 -133
  24. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +276 -29
  25. optimum/rbln/transformers/models/exaone/exaone_architecture.py +11 -4
  26. optimum/rbln/transformers/models/gemma/gemma_architecture.py +11 -4
  27. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +5 -3
  28. optimum/rbln/transformers/models/midm/midm_architecture.py +5 -3
  29. optimum/rbln/transformers/models/phi/phi_architecture.py +9 -7
  30. optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +50 -13
  31. optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +60 -36
  32. optimum/rbln/transformers/models/t5/modeling_t5.py +3 -1
  33. optimum/rbln/transformers/models/t5/t5_architecture.py +65 -3
  34. optimum/rbln/transformers/models/whisper/whisper_architecture.py +26 -36
  35. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +1 -14
  36. optimum/rbln/utils/import_utils.py +7 -0
  37. {optimum_rbln-0.7.2rc2.dist-info → optimum_rbln-0.7.3.dist-info}/METADATA +1 -1
  38. {optimum_rbln-0.7.2rc2.dist-info → optimum_rbln-0.7.3.dist-info}/RECORD +40 -38
  39. {optimum_rbln-0.7.2rc2.dist-info → optimum_rbln-0.7.3.dist-info}/WHEEL +0 -0
  40. {optimum_rbln-0.7.2rc2.dist-info → optimum_rbln-0.7.3.dist-info}/licenses/LICENSE +0 -0
@@ -13,9 +13,11 @@
 # limitations under the License.
 
 import inspect
+import math
+from collections import deque
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Deque, Dict, List, Optional, Tuple, Union
 
 import rebel
 import torch
@@ -50,14 +52,28 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
         phase: str,
         batch_size: int,
         dec_attn_mask: torch.Tensor,
+        block_tables: torch.Tensor,
+        free_block_pool: Deque,
+        kvcache_block_size: int,
+        use_attention_mask: bool,
+        attn_impl: str,
         **kwargs: Any,
     ) -> None:
         super().__init__(runtime, **kwargs)
         self.phase = phase
         self.batch_size = batch_size
 
+        # shared data structures between prefill and decode phase
+        self.use_attention_mask = use_attention_mask
+
         # shared tensor between prefill and decode phase
         self.dec_attn_mask = dec_attn_mask
+        self.block_tables = block_tables
+        self.free_block_pool = free_block_pool
+
+        self.kvcache_block_size = kvcache_block_size
+        self.empty_block = -1
+        self.attn_impl = attn_impl
 
         if self.phase == "prefill":
             vocab_size = kwargs.pop("vocab_size")
@@ -68,6 +84,75 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
             torch.ones(1, 1, self.prefill_chunk_size, self.prefill_chunk_size), diagonal=1
         )
 
+    def get_block_tables(self, cache_position: torch.Tensor, batch_idx: int = None):
+        """
+        Manages and returns the KV cache block tables.
+        Updates the block tables based on the given cache_position, allocating new blocks or reusing existing ones as needed.
+
+        Args:
+            cache_position (torch.Tensor): Tensor containing cache position information, indicating positions within the cache for each batch item.
+            batch_idx (int, optional): Specific batch index, used when phase is 'prefill'.
+
+        Returns:
+            torch.Tensor: Updated block tables.
+        """
+
+        NO_BLOCKS_ERROR = (
+            "No memory blocks are available for allocation. "
+            "The generate() API cannot complete this inference task because Paged Attention is not fully supported by optimum-rbln. "
+            "This is supported by vllm-rbln (see: https://docs.rbln.ai/software/model_serving/vllm_support/vllm-rbln.html). "
+            "Using vllm-rbln should fix this issue and enhance inference performance."
+        )
+
+        def update_block(batch_idx: int, block_idx: int):
+            """
+            If the block is empty (empty_block), allocates a block from the free_block_pool.
+            """
+            if self.block_tables[batch_idx][block_idx] == self.empty_block:
+                if self.free_block_pool:
+                    block = self.free_block_pool.popleft()
+                    self.block_tables[batch_idx][block_idx] = block
+                else:
+                    raise RuntimeError(NO_BLOCKS_ERROR)
+
+        def replace_empty_block(block_tables: torch.Tensor):
+            """
+            Replaces all occurrences of `self.empty_block` in `block_tables` with a dummy block from `self.free_block_pool`.
+            """
+            if not torch.any(block_tables == self.empty_block):
+                return block_tables.clone()
+            elif self.free_block_pool:
+                _free_block = self.free_block_pool[0]
+                return torch.where(block_tables == self.empty_block, _free_block, block_tables)
+            else:
+                raise RuntimeError(NO_BLOCKS_ERROR)
+
+        if self.phase == "prefill":
+            # Track previously used blocks and return them to the free_block_pool and
+            # reset the current batch's block table to empty blocks
+            prev_blocks = self.block_tables[batch_idx][self.block_tables[batch_idx] != self.empty_block].tolist()
+            self.free_block_pool.extend(prev_blocks)
+            self.block_tables[batch_idx].fill_(self.empty_block)
+
+            # Get the start (s) and end (e) positions from cache_position and
+            # iterate over the cache positions to allocate necessary blocks
+            s, e = cache_position[0][0].item(), cache_position[0][-1].item()
+            for position in range(s, e + 1, self.kvcache_block_size):
+                block_idx = position // self.kvcache_block_size
+                if batch_idx >= len(self.block_tables) or block_idx >= len(self.block_tables[batch_idx]):
+                    raise IndexError(f"Invalid index: batch_idx={batch_idx}, block_idx={block_idx}")
+                update_block(batch_idx, block_idx)
+
+            return replace_empty_block(self.block_tables[batch_idx])
+        # Case for 'decoder' phase, iterate over the cache positions to allocate necessary blocks
+        else:
+            for b_idx in range(self.batch_size):
+                position = cache_position[b_idx][0].item()
+                block_idx = position // self.kvcache_block_size
+                update_block(b_idx, block_idx)
+
+            return replace_empty_block(self.block_tables)
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
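The get_block_tables logic added above is the core of the paged KV-cache bookkeeping: each batch entry owns a logical block table initialized to -1, and physical blocks are drawn from a shared free pool on demand. The sketch below is not part of the diff; all names and sizes are illustrative, but it reproduces the same pattern in isolation.

from collections import deque

import torch

# Toy sizes; the real values come from the RBLN config.
EMPTY_BLOCK = -1
kvcache_block_size = 4
batch_size, max_seq_len, num_blocks = 2, 16, 8

block_tables = torch.full((batch_size, max_seq_len // kvcache_block_size), EMPTY_BLOCK, dtype=torch.int16)
free_block_pool = deque(range(num_blocks))

def allocate(batch_idx: int, position: int) -> None:
    # Allocate a physical block for `position` only if its logical slot is still empty.
    block_idx = position // kvcache_block_size
    if block_tables[batch_idx][block_idx] == EMPTY_BLOCK:
        block_tables[batch_idx][block_idx] = free_block_pool.popleft()

# Prefill of a 10-token prompt for batch 0 touches logical blocks 0..2.
for pos in range(0, 10, kvcache_block_size):
    allocate(0, pos)
print(block_tables[0].tolist())  # [0, 1, 2, -1]

# A later decode step at cache position 10 is already covered by block 2, so nothing changes.
allocate(0, 10)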
@@ -75,6 +160,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
         cache_position: torch.Tensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         batch_idx: Optional[int] = None,
+        block_tables: Optional[torch.Tensor] = None,
     ):
         if input_ids is None and inputs_embeds is None:
             raise ValueError("Either `input_ids` or `inputs_embeds` must be provided.")
@@ -86,19 +172,29 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
         else:
             inputs = inputs_embeds
 
+        if block_tables is None:
+            block_tables = self.get_block_tables(cache_position, batch_idx=batch_idx)
+            is_external_block_tables = False
+        else:
+            is_external_block_tables = True
+
         if self.phase == "decode":
             return self.decode_forward(
                 inputs,
                 cache_position,
+                block_tables,
+                is_external_block_tables,
                 attention_mask=attention_mask,
             )
         else:
-            return self.prefill_forward(inputs, cache_position, attention_mask, batch_idx)
+            return self.prefill_forward(inputs, cache_position, attention_mask, batch_idx, block_tables)
 
     def decode_forward(
         self,
         inputs: torch.Tensor,
         cache_position: torch.Tensor = None,
+        block_tables: torch.Tensor = None,
+        is_external_block_tables: bool = None,
         attention_mask: Optional[torch.Tensor] = None,
     ) -> torch.FloatTensor:
         batch_size = inputs.shape[0]
@@ -110,19 +206,29 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
         if batch_size != cache_position.shape[0]:
             raise RuntimeError(f"Cache position size mismatch: got {cache_position.shape[0]}, expected {batch_size}.")
 
-        if attention_mask is None:
+        if self.use_attention_mask and attention_mask is None:
             for b_idx in range(batch_size):
                 decoding_step = cache_position[b_idx].item()
                 if not (0 <= decoding_step < self.dec_attn_mask.shape[-1]):
                     raise ValueError(
                         f"Decoding step {decoding_step} out of bounds for attention mask with shape {self.dec_attn_mask.shape}."
                     )
-                self.dec_attn_mask[b_idx, :, :, decoding_step] = 1
+
+                if is_external_block_tables:
+                    self.dec_attn_mask[b_idx].fill_(0)
+                    self.dec_attn_mask[b_idx, :, :, : decoding_step + 1] = 1
+                else:
+                    self.dec_attn_mask[b_idx, :, :, decoding_step] = 1
+
+                attention_mask = self.dec_attn_mask
+
+            attention_mask = self.dec_attn_mask
 
         logits = super().forward(
             inputs,
-            self.dec_attn_mask if attention_mask is None else attention_mask,
             cache_position,
+            attention_mask if self.use_attention_mask else None,
+            block_tables,
         )
 
         return logits
@@ -133,6 +239,8 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
         cache_position: torch.Tensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         batch_idx: int = None,
+        block_tables: torch.Tensor = None,
+        is_external_block_tables: bool = None,
     ) -> torch.FloatTensor:
         """
         Performs chunked prefill for efficient KV-cache updates and memory optimization.
@@ -140,11 +248,6 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
         and each chunk is processed sequentially. This allows for better memory utilization and compatibility with continuous batching.
         """
 
-        if batch_idx is None or batch_idx >= self.batch_size:
-            raise RuntimeError(
-                f"Invalid batch_idx ({batch_idx}). It must be a non-null value less than the batch size ({self.batch_size})."
-            )
-
         # Handle continuous batching in a compiled graph by extracting valid inputs
         # If an attention mask is provided, select only the valid (non-masked) inputs
         inputs = inputs[:, attention_mask.bool()] if attention_mask is not None else inputs
@@ -156,7 +259,8 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
         )
 
         # Initialize attention mask for chunked processing
-        chunked_attention_mask = torch.zeros(1, 1, self.prefill_chunk_size, self.max_seq_len, dtype=torch.float32)
+        if self.use_attention_mask:
+            chunked_attention_mask = torch.zeros(1, 1, self.prefill_chunk_size, self.max_seq_len, dtype=torch.float32)
 
         # Buffer for storing output logits
         out_buffers = [
@@ -195,28 +299,29 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
             input_chunk = inputs[:, step : step + self.prefill_chunk_size]
             cache_pos_chunk = cache_position[:, step : step + self.prefill_chunk_size]
 
-            # Update attention mask to ensure proper causal behavior
-            if step >= self.prefill_chunk_size:
-                chunked_attention_mask[:, :, :, step - self.prefill_chunk_size : step] = 1
-            chunked_attention_mask[:, :, :, step : step + self.prefill_chunk_size] = self.causal_mask
+            if self.use_attention_mask:
+                # Update attention mask to ensure proper causal behavior
+                if step >= self.prefill_chunk_size:
+                    chunked_attention_mask[:, :, :, step - self.prefill_chunk_size : step] = 1
+                chunked_attention_mask[:, :, :, step : step + self.prefill_chunk_size] = self.causal_mask
 
-            # Define batch position and query position
-            batch_position = torch.tensor(batch_idx, dtype=torch.int16)
+            # Define query position
             query_position = torch.tensor((query_length - 1) % self.prefill_chunk_size, dtype=torch.int16)
 
             # Forward pass for the current chunk
             logits = super().forward(
                 input_chunk,
-                chunked_attention_mask,
                 cache_pos_chunk,
-                batch_position,
+                chunked_attention_mask if self.use_attention_mask else None,
                 query_position,
+                block_tables,
                 out=out_buffers,
             )
 
         # Update decoder attention mask with processed KV-cache length from prefill phase
-        self.dec_attn_mask[batch_idx].fill_(0)
-        self.dec_attn_mask[batch_idx, :, :, :query_length] = 1
+        if not is_external_block_tables and self.use_attention_mask:
+            self.dec_attn_mask[batch_idx].fill_(0)
+            self.dec_attn_mask[batch_idx, :, :, :query_length] = 1
 
         return logits
 
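Putting the prefill changes above together: the prompt is still processed in prefill_chunk_size windows, but the causal mask is only maintained when use_attention_mask is set, and the compiled graph now receives block_tables instead of batch_position. Below is a rough, self-contained sketch of the chunking loop itself (illustrative names only; run_chunk stands in for the compiled forward call).

import torch

prefill_chunk_size = 4
input_ids = torch.arange(1, 11).unsqueeze(0)  # a 10-token prompt
query_length = input_ids.shape[-1]

# Pad the prompt to a multiple of the chunk size, as the real prefill path does.
pad = -query_length % prefill_chunk_size
inputs = torch.nn.functional.pad(input_ids, (0, pad))
cache_position = torch.arange(inputs.shape[-1]).unsqueeze(0)

def run_chunk(chunk, cache_pos):
    print(f"tokens {chunk.tolist()} at cache positions {cache_pos.tolist()}")

for step in range(0, inputs.shape[-1], prefill_chunk_size):
    run_chunk(
        inputs[:, step : step + prefill_chunk_size],
        cache_position[:, step : step + prefill_chunk_size],
    )

# Only the logits at index (query_length - 1) % prefill_chunk_size of the final
# chunk are used to sample the next token, which is what query_position encodes.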
@@ -256,8 +361,13 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         self.batch_size = self.rbln_config.model_cfg["batch_size"]
         self.max_seq_len = self.rbln_config.model_cfg["max_seq_len"]
         self.prefill_chunk_size = self.rbln_config.model_cfg["prefill_chunk_size"]
-
+        self.kvcache_block_size = self.rbln_config.model_cfg["kvcache_block_size"]
+        # FIXME get kvcache_num_blocks from compiled results.
+        self.kvcache_num_blocks = self.rbln_config.model_cfg["kvcache_num_blocks"]
+        self.use_attention_mask = self.rbln_config.model_cfg["use_attention_mask"]
+        attn_impl = self.rbln_config.model_cfg["attn_impl"]
         main_input_name = self.main_input_name
+
         if self.rbln_config.model_cfg["use_inputs_embeds"]:
             main_input_name = "inputs_embeds"
         artifacts = torch.load(self.model_save_dir / self.subfolder / "torch_artifacts.pth", weights_only=False)
@@ -271,7 +381,13 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         else:
             self.embed_tokens = None
 
+        # Initialize shared resources to be used across Runtime instances (prefill and decode phases)
         dec_attn_mask = torch.zeros(self.batch_size, 1, 1, self.max_seq_len, dtype=torch.float32)
+        block_tables = torch.zeros(
+            self.batch_size, self.max_seq_len // self.kvcache_block_size, dtype=torch.int16
+        ).fill_(-1)
+        free_block_pool = deque(x for x in range(self.kvcache_num_blocks))
+
         self.prefill_decoder = RBLNRuntimeModel(
             runtime=self.model[0],
             main_input_name=main_input_name,
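For a concrete picture of the shared state created here (toy numbers, not taken from the diff): with a batch of 2, a maximum sequence length of 8192, and 2048-token blocks, the runtimes start from a 2 x 4 table of -1 entries and a pool of free physical block indices.

from collections import deque

import torch

batch_size, max_seq_len, kvcache_block_size, kvcache_num_blocks = 2, 8192, 2048, 8

block_tables = torch.zeros(batch_size, max_seq_len // kvcache_block_size, dtype=torch.int16).fill_(-1)
free_block_pool = deque(range(kvcache_num_blocks))

print(block_tables.tolist())   # [[-1, -1, -1, -1], [-1, -1, -1, -1]]
print(list(free_block_pool))   # [0, 1, 2, 3, 4, 5, 6, 7]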
@@ -279,9 +395,14 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
             phase="prefill",
             batch_size=self.batch_size,
             dec_attn_mask=dec_attn_mask,
+            block_tables=block_tables,
+            free_block_pool=free_block_pool,
+            kvcache_block_size=self.kvcache_block_size,
             vocab_size=self.config.vocab_size,
-            max_seq_len=self.max_seq_len,
             prefill_chunk_size=self.prefill_chunk_size,
+            max_seq_len=self.max_seq_len,
+            use_attention_mask=self.use_attention_mask,
+            attn_impl=attn_impl,
         )
         self.decoder = RBLNRuntimeModel(
             runtime=self.model[1],
@@ -290,6 +411,11 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
             phase="decode",
             batch_size=self.batch_size,
             dec_attn_mask=dec_attn_mask,
+            block_tables=block_tables,
+            free_block_pool=free_block_pool,
+            kvcache_block_size=self.kvcache_block_size,
+            use_attention_mask=self.use_attention_mask,
+            attn_impl=attn_impl,
         )
 
     @classmethod
@@ -363,7 +489,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         def redirect(func):
             return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
 
-        val = getattr(self.hf_class, __name, None) or getattr(PreTrainedModel, __name)
+        val = getattr(self.get_hf_class(), __name, None) or getattr(PreTrainedModel, __name)
         if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
             return redirect(val)
         return val
@@ -387,7 +513,9 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         wrapper_cfg = {"max_seq_len": rbln_config.model_cfg["max_seq_len"]}
         wrapper_cfg["attn_impl"] = rbln_config.model_cfg.get("attn_impl")
         wrapper_cfg["kvcache_partition_len"] = rbln_config.model_cfg.get("kvcache_partition_len")
+        wrapper_cfg["kvcache_block_size"] = rbln_config.model_cfg.get("kvcache_block_size")
         wrapper_cfg["use_rotary_emb"] = cls._use_rotary_emb
+        wrapper_cfg["use_attention_mask"] = rbln_config.model_cfg.get("use_attention_mask")
 
         return cls._decoder_wrapper_cls(model, **wrapper_cfg).eval()
 
@@ -438,6 +566,71 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
 
         return compile_model(quantize_config=quantize_config)
 
+    @classmethod
+    def get_maximum_num_blocks(
+        cls,
+        config: PretrainedConfig,
+        tensor_parallel_size: int,
+        kvcache_block_size: int,
+        nbits_per_param: int,
+        n_model_params: int,
+    ) -> int:
+        def align(x: int, nbytes: int) -> int:
+            return int(math.ceil(x / nbytes) * nbytes)
+
+        def align_2MB(x: int) -> int:
+            return align(x, 2 * 1024 * 1024)
+
+        num_attention_heads = getattr(config, "n_head", None) or getattr(config, "num_attention_heads")
+        num_layers = getattr(config, "n_layer", None) or getattr(config, "num_hidden_layers")
+        head_dim = getattr(config, "head_dim", None) or config.hidden_size // num_attention_heads
+        vocab_size = config.vocab_size
+        hidden_size = getattr(config, "n_embd", None) or getattr(config, "hidden_size")
+        num_key_value_heads = getattr(config, "num_key_value_heads", None) or num_attention_heads
+
+        # TODO(jongho): Update if target npu is REBEL.
+        ATOM_DRAM_NBYTES = 16 * 2**30
+        ATOM_SYS_DRAM_NBYTES = 288 * 2**20
+        available_dram = tensor_parallel_size * (ATOM_DRAM_NBYTES - ATOM_SYS_DRAM_NBYTES)
+
+        # Get estimated kernel size (approximated)
+        lm_heads_params = align(vocab_size, 64) * hidden_size
+        lm_heads_nbytes = (
+            align_2MB(lm_heads_params * nbits_per_param // 8 / tensor_parallel_size) * tensor_parallel_size
+        )
+        params = n_model_params - lm_heads_params
+        layer_nbytes = (
+            align_2MB(params * nbits_per_param // 8 / num_layers / tensor_parallel_size)
+            * num_layers
+            * tensor_parallel_size
+        )
+        kernel_size = layer_nbytes + lm_heads_nbytes
+
+        available_dram -= kernel_size
+
+        # TODO: Accurate buffer estimation
+        buffer = 2**30  # 1GB Buffer
+        if tensor_parallel_size <= 4:
+            buffer /= 4
+
+        available_dram -= buffer
+
+        # Estimate nbytes per a single kvcache block
+        nbytes_per_block = (
+            align_2MB(
+                kvcache_block_size
+                * head_dim
+                * math.ceil(num_key_value_heads / tensor_parallel_size)  # Shard
+                * 2  # (fp16)
+            )
+            * num_layers
+            * 2  # (k, v)
+            * tensor_parallel_size
+        )
+        n_blocks = available_dram // nbytes_per_block
+
+        return n_blocks, nbytes_per_block
+
     @classmethod
     def _get_rbln_config(
         cls,
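As a rough worked example of the estimate above (all model numbers below are made up, and the kernel-size and buffer subtractions are skipped for brevity), the per-block footprint and the resulting upper bound on block count can be computed like this:

import math

def align(x, nbytes):
    return int(math.ceil(x / nbytes) * nbytes)

def align_2MB(x):
    return align(x, 2 * 1024 * 1024)

tensor_parallel_size = 4
kvcache_block_size = 4096
num_layers, num_key_value_heads, head_dim = 32, 8, 128  # illustrative GQA config

available_dram = tensor_parallel_size * (16 * 2**30 - 288 * 2**20)

nbytes_per_block = (
    align_2MB(
        kvcache_block_size
        * head_dim
        * math.ceil(num_key_value_heads / tensor_parallel_size)  # heads per shard
        * 2  # fp16
    )
    * num_layers
    * 2  # key and value caches
    * tensor_parallel_size
)
print(available_dram // nbytes_per_block)  # upper bound before kernel/buffer deductions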
@@ -448,11 +641,19 @@
         rbln_max_seq_len = rbln_kwargs.get("max_seq_len", None)
         rbln_batch_size = rbln_kwargs.get("batch_size", None)
         rbln_use_inputs_embeds = rbln_kwargs.get("use_inputs_embeds", None)
+        rbln_use_attention_mask = rbln_kwargs.get("use_attention_mask", None)
         rbln_attn_impl = rbln_kwargs.get("attn_impl", None)
         rbln_kvcache_partition_len = rbln_kwargs.get("kvcache_partition_len", None)
+        rbln_kvcache_block_size = rbln_kwargs.get("kvcache_block_size", None)
         rbln_quantization = QuantizationManager.validate_quantization_config(rbln_kwargs.get("quantization", None))
         rbln_prefill_chunk_size = rbln_kwargs.get("prefill_chunk_size", None)
 
+        if rbln_use_attention_mask is None:
+            rbln_use_attention_mask = False
+            rbln_npu = rbln_kwargs.get("npu", None) or rebel.get_npu_name()
+            if rbln_npu == "RBLN-CA02":
+                rbln_use_attention_mask = True
+
         if rbln_prefill_chunk_size is None:
             rbln_prefill_chunk_size = 128
         elif rbln_prefill_chunk_size % 64 != 0 or rbln_prefill_chunk_size == 0:
@@ -470,12 +671,42 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         rbln_batch_size = 1 if rbln_batch_size is None else rbln_batch_size
         rbln_use_inputs_embeds = False if rbln_use_inputs_embeds is None else rbln_use_inputs_embeds
 
-        rbln_attn_impl, rbln_kvcache_partition_len = validate_attention_method(
+        rbln_attn_impl, rbln_kvcache_partition_len, rbln_kvcache_block_size = validate_attention_method(
             rbln_attn_impl=rbln_attn_impl,
             rbln_kvcache_partition_len=rbln_kvcache_partition_len,
+            rbln_kvcache_block_size=rbln_kvcache_block_size,
             rbln_max_seq_len=rbln_max_seq_len,
         )
 
+        if rbln_kvcache_block_size is None:
+            if rbln_attn_impl == "eager":
+                rbln_kvcache_block_size = rbln_max_seq_len
+            else:
+                rbln_kvcache_block_size = rbln_kvcache_partition_len
+
+        rbln_kvcache_num_blocks = (rbln_max_seq_len // rbln_kvcache_block_size) * rbln_batch_size
+        if rbln_attn_impl == "flash_attn":
+            max_num_blocks, _ = cls.get_maximum_num_blocks(
+                config=model_config,
+                tensor_parallel_size=rbln_kwargs.get("tensor_parallel_size", 1),
+                kvcache_block_size=rbln_kvcache_block_size,
+                nbits_per_param=16 if rbln_quantization is None else 4,  # TODO(jongho): FIX Ad-hoc
+                n_model_params=rbln_kwargs["n_model_params"],
+            )
+            rbln_kvcache_num_blocks = min(rbln_kvcache_num_blocks, max_num_blocks)
+
+            required_blocks = rbln_max_seq_len // rbln_kvcache_block_size + 1
+            if rbln_kvcache_num_blocks < required_blocks:
+                rbln_kvcache_num_blocks = required_blocks
+
+        logger.info(f"[KVCache] Compiling with num_blocks: {rbln_kvcache_num_blocks}")
+
+        if rbln_kvcache_num_blocks < rbln_batch_size:
+            raise RuntimeError(
+                f"Batch size ({rbln_batch_size}) exceeds available KV cache blocks ({rbln_kvcache_num_blocks}). "
+                "Ensure the number of blocks is at least equal to the batch size."
+            )
+
         num_attention_heads = getattr(model_config, "n_head", None) or getattr(model_config, "num_attention_heads")
         num_key_value_heads = getattr(model_config, "num_key_value_heads", None) or num_attention_heads
         num_hidden_layers = getattr(model_config, "n_layer", None) or getattr(model_config, "num_hidden_layers")
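To illustrate the defaulting above with toy values (the flash-attention branch is additionally capped by get_maximum_num_blocks and floored at max_seq_len // block_size + 1, which is omitted here):

rbln_max_seq_len, rbln_batch_size = 8192, 2
rbln_attn_impl, rbln_kvcache_partition_len = "flash_attn", 4096

# Block size defaults to the whole sequence for eager attention,
# or to one partition for flash attention.
rbln_kvcache_block_size = rbln_max_seq_len if rbln_attn_impl == "eager" else rbln_kvcache_partition_len

# One full sequence per batch entry, expressed in blocks.
rbln_kvcache_num_blocks = (rbln_max_seq_len // rbln_kvcache_block_size) * rbln_batch_size
print(rbln_kvcache_block_size, rbln_kvcache_num_blocks)  # 4096 4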
@@ -495,29 +726,42 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
 
             input_info = [
                 main_input,
-                ("attention_mask", [batch_size, 1, query_length, rbln_max_seq_len], "float32"),
                 (
                     "cache_position",
                     [batch_size, query_length],
                     "int32",
                 ),
             ]
+
+            if rbln_use_attention_mask:
+                input_info.extend(
+                    [
+                        ("attention_mask", [batch_size, 1, query_length, rbln_max_seq_len], "float32"),
+                    ]
+                )
+
             if query_length > 1:
                 input_info.extend(
                     [
-                        ("batch_position", [], "int16"),
                         ("query_position", [], "int16"),
                     ]
                 )
 
+            max_block_cnt = rbln_max_seq_len // rbln_kvcache_block_size
+
+            if query_length > 1:
+                input_info.extend([("block_tables", [max_block_cnt], "int16")])
+            else:
+                input_info.extend([("block_tables", [batch_size, max_block_cnt], "int16")])
+
             input_info.extend(
                 [
                     (
                         f"past_key_values_{i}",
                         [
-                            rbln_batch_size,
+                            rbln_kvcache_num_blocks,
                             num_key_value_heads,
-                            rbln_max_seq_len,
+                            rbln_kvcache_block_size,
                             head_dim,
                         ],
                         "float32",
@@ -555,9 +799,12 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
                 "max_seq_len": rbln_max_seq_len,
                 "batch_size": rbln_batch_size,
                 "prefill_chunk_size": rbln_prefill_chunk_size,
+                "use_attention_mask": rbln_use_attention_mask,
                 "use_inputs_embeds": rbln_use_inputs_embeds,
                 "kvcache_partition_len": rbln_kvcache_partition_len,
+                "kvcache_block_size": rbln_kvcache_block_size,
                 "attn_impl": rbln_attn_impl,
+                "kvcache_num_blocks": rbln_kvcache_num_blocks,
             }
         )
 
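These new model_cfg entries are derived from the rbln_-prefixed keyword arguments read earlier in _get_rbln_config. A call that exercises them might look like the sketch below; the argument names mirror the rbln_kwargs keys above and should be treated as assumptions rather than a documented example.

from optimum.rbln import RBLNLlamaForCausalLM

# Hypothetical export call; kwargs correspond to the rbln_kwargs keys read above.
model = RBLNLlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    export=True,
    rbln_batch_size=1,
    rbln_max_seq_len=8192,
    rbln_attn_impl="flash_attn",
    rbln_kvcache_partition_len=4096,
)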
@@ -36,21 +36,28 @@ logger = logging.get_logger(__name__)
 class ExaoneForCausalLMWrapper(DecoderOnlyWrapper):
     """A wrapper class for the Exaone model with a language modeling head."""
 
-    def convert_to_rbln_causal_lm(self, causal_lm: "ExaoneForCausalLM"):
+    def convert_to_rbln_causal_lm(self, causal_lm: "ExaoneForCausalLM", max_seq_len: int):
         new_layers = []
         for layer in causal_lm.transformer.h:
             if self.attn_impl == "eager":
-                new_self_attn = ExaoneAttention(layer.attn.attention)
+                new_self_attn = ExaoneAttention(
+                    layer.attn.attention, self.use_attention_mask, kvcache_block_size=self.kvcache_block_size
+                )
             elif self.attn_impl == "flash_attn":
                 new_self_attn = ExaoneFlashAttention(
-                    layer.attn.attention, kvcache_partition_len=self.kvcache_partition_len
+                    layer.attn.attention,
+                    kvcache_partition_len=self.kvcache_partition_len,
+                    use_attention_mask=self.use_attention_mask,
+                    kvcache_block_size=self.kvcache_block_size,
                 )
             else:
                 raise NotImplementedError(f"Unknwon attn : {self.attn_impl}")
 
             new_layer = ExaoneLayer(layer, new_self_attn)
             new_layers.append(new_layer)
-        new_model = ExaoneModel(causal_lm.transformer, new_layers, partition_len=self.kvcache_partition_len)
+        new_model = ExaoneModel(
+            causal_lm.transformer, new_layers, partition_len=self.kvcache_partition_len, max_seq_len=max_seq_len
+        )
         new_causal_lm = DecoderOnlyForCausalLM(causal_lm, new_model)
         return new_causal_lm
 
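The Exaone change above, and the Gemma, GPT-2, and Midm changes below, all follow the same conversion pattern: each Hugging Face decoder layer keeps its weights, but its attention module is swapped for an RBLN-specific one that now also carries the attention-mask policy and the KV-cache block size. A toy illustration of the pattern follows; the class and function names here are made up.

class ToyEagerAttention:
    def __init__(self, hf_attn, use_attention_mask, kvcache_block_size):
        self.hf_attn = hf_attn
        self.use_attention_mask = use_attention_mask
        self.kvcache_block_size = kvcache_block_size

class ToyFlashAttention(ToyEagerAttention):
    def __init__(self, hf_attn, kvcache_partition_len, use_attention_mask, kvcache_block_size):
        super().__init__(hf_attn, use_attention_mask, kvcache_block_size)
        self.kvcache_partition_len = kvcache_partition_len

def convert_attentions(hf_attns, attn_impl, use_attention_mask, kvcache_block_size, kvcache_partition_len=None):
    converted = []
    for hf_attn in hf_attns:
        if attn_impl == "eager":
            converted.append(ToyEagerAttention(hf_attn, use_attention_mask, kvcache_block_size))
        elif attn_impl == "flash_attn":
            converted.append(
                ToyFlashAttention(hf_attn, kvcache_partition_len, use_attention_mask, kvcache_block_size)
            )
        else:
            raise NotImplementedError(f"Unknown attn: {attn_impl}")
    return converted

print(len(convert_attentions(["attn0", "attn1"], "eager", True, 4096)))  # 2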
@@ -29,20 +29,27 @@ if TYPE_CHECKING:
 
 
 class GemmaWrapper(DecoderOnlyWrapper):
-    def convert_to_rbln_causal_lm(self, causal_lm: "GemmaForCausalLM"):
+    def convert_to_rbln_causal_lm(self, causal_lm: "GemmaForCausalLM", max_seq_len: int):
         new_layers = []
         for layer in causal_lm.model.layers:
             if self.attn_impl == "eager":
-                new_self_attn = DecoderOnlyAttention(layer.self_attn)
+                new_self_attn = DecoderOnlyAttention(
+                    layer.self_attn, self.use_attention_mask, kvcache_block_size=self.kvcache_block_size
+                )
             elif self.attn_impl == "flash_attn":
                 new_self_attn = DecoderOnlyFlashAttention(
-                    layer.self_attn, kvcache_partition_len=self.kvcache_partition_len
+                    layer.self_attn,
+                    kvcache_partition_len=self.kvcache_partition_len,
+                    use_attention_mask=self.use_attention_mask,
+                    kvcache_block_size=self.kvcache_block_size,
                 )
             else:
                 raise NotImplementedError(f"Unknwon attn : {self.attn_impl}")
             new_layer = DecoderOnlyLayer(layer, new_self_attn)
             new_layers.append(new_layer)
-        new_model = GemmaModel(causal_lm.model, new_layers, partition_len=self.kvcache_partition_len)
+        new_model = GemmaModel(
+            causal_lm.model, new_layers, partition_len=self.kvcache_partition_len, max_seq_len=max_seq_len
+        )
         new_causal_lm = DecoderOnlyForCausalLM(causal_lm, new_model)
         return new_causal_lm
 
@@ -32,15 +32,17 @@ if TYPE_CHECKING:
 
 
 class GPT2Wrapper(DecoderOnlyWrapper):
-    def convert_to_rbln_causal_lm(self, causal_lm: "GPT2LMHeadModel"):
+    def convert_to_rbln_causal_lm(self, causal_lm: "GPT2LMHeadModel", max_seq_len: int):
         if self.attn_impl != "eager":
             raise NotImplementedError(f"flash attention ({self.attn_impl}) is not implemented for {self.__class__}")
         new_layers = []
         for layer in causal_lm.transformer.h:
-            new_self_attn = GPT2Attention(layer.attn)
+            new_self_attn = GPT2Attention(
+                layer.attn, self.use_attention_mask, kvcache_block_size=self.kvcache_block_size
+            )
             new_layer = GPT2Layer(layer, new_self_attn)
             new_layers.append(new_layer)
-        new_model = GPT2Model(causal_lm.transformer, new_layers)
+        new_model = GPT2Model(causal_lm.transformer, new_layers, max_seq_len=max_seq_len)
         new_causal_lm = DecoderOnlyForCausalLM(causal_lm, new_model)
         return new_causal_lm
 
@@ -55,15 +55,17 @@ class MidmLMHeadModelWrapper(DecoderOnlyWrapper):
         self.config.partial_rotary_factor = self.config.rotary_percentage
         return super().get_rotary_emb(max_seq_len=max_seq_len)
 
-    def convert_to_rbln_causal_lm(self, causal_lm: "MidmLMHeadModel"):
+    def convert_to_rbln_causal_lm(self, causal_lm: "MidmLMHeadModel", max_seq_len: int):
         if self.attn_impl != "eager":
             raise NotImplementedError(f"flash attention ({self.attn_impl}) is not implemented for {self.__class__}")
         new_layers = []
         for layer in causal_lm.transformer.h:
-            new_self_attn = MidmAttention(layer.attn)
+            new_self_attn = MidmAttention(
+                layer.attn, self.use_attention_mask, kvcache_block_size=self.kvcache_block_size
+            )
            new_layer = MidmLayer(layer, new_self_attn)
             new_layers.append(new_layer)
-        new_model = MidmModel(causal_lm.transformer, new_layers)
+        new_model = MidmModel(causal_lm.transformer, new_layers, max_seq_len=max_seq_len)
         new_causal_lm = DecoderOnlyForCausalLM(causal_lm, new_model)
         return new_causal_lm