optimum-rbln 0.7.3a1__py3-none-any.whl → 0.7.3a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimum/rbln/__version__.py +2 -2
- optimum/rbln/ops/__init__.py +4 -4
- optimum/rbln/ops/attn.py +44 -84
- optimum/rbln/ops/flash_attn.py +25 -25
- optimum/rbln/transformers/models/bart/bart_architecture.py +10 -6
- optimum/rbln/transformers/models/bart/modeling_bart.py +3 -1
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +79 -51
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +157 -34
- optimum/rbln/transformers/models/exaone/exaone_architecture.py +7 -2
- optimum/rbln/transformers/models/gemma/gemma_architecture.py +7 -2
- optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +3 -1
- optimum/rbln/transformers/models/midm/midm_architecture.py +3 -1
- optimum/rbln/transformers/models/phi/phi_architecture.py +5 -3
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py +44 -13
- optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py +50 -19
- optimum/rbln/transformers/models/t5/modeling_t5.py +211 -2
- optimum/rbln/transformers/models/t5/t5_architecture.py +69 -3
- optimum/rbln/transformers/models/whisper/whisper_architecture.py +19 -24
- {optimum_rbln-0.7.3a1.dist-info → optimum_rbln-0.7.3a3.dist-info}/METADATA +1 -1
- {optimum_rbln-0.7.3a1.dist-info → optimum_rbln-0.7.3a3.dist-info}/RECORD +22 -22
- {optimum_rbln-0.7.3a1.dist-info → optimum_rbln-0.7.3a3.dist-info}/WHEEL +0 -0
- {optimum_rbln-0.7.3a1.dist-info → optimum_rbln-0.7.3a3.dist-info}/licenses/LICENSE +0 -0
@@ -13,9 +13,10 @@
 # limitations under the License.
 
 import inspect
+from collections import deque
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Deque, Dict, List, Optional, Tuple, Union
 
 import rebel
 import torch
@@ -50,17 +51,28 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
         phase: str,
         batch_size: int,
         dec_attn_mask: torch.Tensor,
+        block_tables: torch.Tensor,
+        free_block_pool: Deque,
+        kvcache_block_size: int,
         use_attention_mask: bool,
+        attn_impl: str,
         **kwargs: Any,
     ) -> None:
         super().__init__(runtime, **kwargs)
         self.phase = phase
         self.batch_size = batch_size
 
+        # shared data structures between prefill and decode phase
        self.use_attention_mask = use_attention_mask
 
         # shared tensor between prefill and decode phase
         self.dec_attn_mask = dec_attn_mask
+        self.block_tables = block_tables
+        self.free_block_pool = free_block_pool
+
+        self.kvcache_block_size = kvcache_block_size
+        self.empty_block = -1
+        self.attn_impl = attn_impl
 
         if self.phase == "prefill":
             vocab_size = kwargs.pop("vocab_size")
@@ -71,6 +83,75 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
                 torch.ones(1, 1, self.prefill_chunk_size, self.prefill_chunk_size), diagonal=1
             )
 
+    def get_block_tables(self, cache_position: torch.Tensor, batch_idx: int = None):
+        """
+        Manages and returns the KV cache block tables.
+        Updates the block tables based on the given cache_position, allocating new blocks or reusing existing ones as needed.
+
+        Args:
+            cache_position (torch.Tensor): Tensor containing cache position information, indicating positions within the cache for each batch item.
+            batch_idx (int, optional): Specific batch index, used when phase is 'prefill'.
+
+        Returns:
+            torch.Tensor: Updated block tables.
+        """
+
+        NO_BLOCKS_ERROR = (
+            "No memory blocks are available for allocation."
+            "The generate() API cannot complete this inference task because Paged Attention is not fully supported by optimum-rbln."
+            "This is supported by vllm-rbln (see: https://docs.rbln.ai/software/model_serving/vllm_support/vllm-rbln.html)."
+            "Using vllm-rbln should fix this issue and enhance inference performance."
+        )
+
+        def update_block(batch_idx: int, block_idx: int):
+            """
+            If the block is empty (empty_block), allocates a block from the free_block_pool.
+            """
+            if self.block_tables[batch_idx][block_idx] == self.empty_block:
+                if self.free_block_pool:
+                    block = self.free_block_pool.popleft()
+                    self.block_tables[batch_idx][block_idx] = block
+                else:
+                    raise RuntimeError(NO_BLOCKS_ERROR)
+
+        def replace_empty_block(block_tables: torch.Tensor):
+            """
+            Replaces all occurrences of `self.empty_block` in `block_tables` with a dummy block from `self.free_block_pool`.
+            """
+            if not torch.any(block_tables == self.empty_block):
+                return block_tables.clone()
+            elif self.free_block_pool:
+                _free_block = self.free_block_pool[0]
+                return torch.where(block_tables == self.empty_block, _free_block, block_tables)
+            else:
+                raise RuntimeError(NO_BLOCKS_ERROR)
+
+        if self.phase == "prefill":
+            # Track previously used blocks and return them to the free_block_pool and
+            # reset the current batch's block table to empty blocks
+            prev_blocks = self.block_tables[batch_idx][self.block_tables[batch_idx] != self.empty_block].tolist()
+            self.free_block_pool.extend(prev_blocks)
+            self.block_tables[batch_idx].fill_(self.empty_block)
+
+            # Get the start (s) and end (e) positions from cache_position and
+            # iterate over the cache positions to allocate necessary blocks
+            s, e = cache_position[0][0].item(), cache_position[0][-1].item()
+            for position in range(s, e + 1, self.kvcache_block_size):
+                block_idx = position // self.kvcache_block_size
+                if batch_idx >= len(self.block_tables) or block_idx >= len(self.block_tables[batch_idx]):
+                    raise IndexError(f"Invalid index: batch_idx={batch_idx}, block_idx={block_idx}")
+                update_block(batch_idx, block_idx)
+
+            return replace_empty_block(self.block_tables[batch_idx])
+        # Case for 'decoder' phase, iterate over the cache positions to allocate necessary blocks
+        else:
+            for b_idx in range(self.batch_size):
+                position = cache_position[b_idx][0].item()
+                block_idx = position // self.kvcache_block_size
+                update_block(b_idx, block_idx)
+
+            return replace_empty_block(self.block_tables)
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
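
The added get_block_tables() method is the bookkeeping half of paged attention: every sequence owns one row of a shared block table, -1 marks an unallocated logical block, and physical blocks are handed out from a shared free pool. A minimal standalone sketch of the same idea follows (the BlockTableSketch class, its method names, and its sizes are illustrative assumptions, not optimum-rbln API):

from collections import deque

import torch


class BlockTableSketch:
    def __init__(self, batch_size: int, max_seq_len: int, block_size: int, num_blocks: int):
        self.block_size = block_size
        # One row per sequence; -1 means "no physical block assigned to this logical block yet".
        self.block_tables = torch.full((batch_size, max_seq_len // block_size), -1, dtype=torch.int16)
        self.free_block_pool = deque(range(num_blocks))

    def allocate_prefill(self, batch_idx: int, start: int, end: int) -> torch.Tensor:
        # Return any blocks this sequence held before, then allocate blocks covering [start, end].
        prev = self.block_tables[batch_idx][self.block_tables[batch_idx] != -1].tolist()
        self.free_block_pool.extend(prev)
        self.block_tables[batch_idx].fill_(-1)
        for pos in range(start, end + 1, self.block_size):
            self._ensure_block(batch_idx, pos // self.block_size)
        return self.block_tables[batch_idx]

    def allocate_decode(self, cache_positions: torch.Tensor) -> torch.Tensor:
        # One new token per sequence; a block is only needed when a sequence crosses a block boundary.
        for b_idx, pos in enumerate(cache_positions.tolist()):
            self._ensure_block(b_idx, pos // self.block_size)
        return self.block_tables

    def _ensure_block(self, batch_idx: int, block_idx: int) -> None:
        if self.block_tables[batch_idx][block_idx] == -1:
            if not self.free_block_pool:
                raise RuntimeError("no free KV-cache blocks left")
            self.block_tables[batch_idx][block_idx] = self.free_block_pool.popleft()


# Example: two sequences, logical blocks of 128 tokens, a pool of 4 physical blocks.
tables = BlockTableSketch(batch_size=2, max_seq_len=1024, block_size=128, num_blocks=4)
print(tables.allocate_prefill(batch_idx=0, start=0, end=300))  # seq 0 gets physical blocks 0, 1, 2
print(tables.allocate_decode(torch.tensor([301, 0])))          # seq 1 gets its first block (3)
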
@@ -78,6 +159,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
         cache_position: torch.Tensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         batch_idx: Optional[int] = None,
+        block_tables: Optional[torch.Tensor] = None,
     ):
         if input_ids is None and inputs_embeds is None:
             raise ValueError("Either `input_ids` or `inputs_embeds` must be provided.")
@@ -89,19 +171,29 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
         else:
             inputs = inputs_embeds
 
+        if block_tables is None:
+            block_tables = self.get_block_tables(cache_position, batch_idx=batch_idx)
+            is_external_block_tables = False
+        else:
+            is_external_block_tables = True
+
         if self.phase == "decode":
             return self.decode_forward(
                 inputs,
                 cache_position,
+                block_tables,
+                is_external_block_tables,
                 attention_mask=attention_mask,
             )
         else:
-            return self.prefill_forward(inputs, cache_position, attention_mask, batch_idx)
+            return self.prefill_forward(inputs, cache_position, attention_mask, batch_idx, block_tables)
 
     def decode_forward(
         self,
         inputs: torch.Tensor,
         cache_position: torch.Tensor = None,
+        block_tables: torch.Tensor = None,
+        is_external_block_tables: bool = None,
         attention_mask: Optional[torch.Tensor] = None,
     ) -> torch.FloatTensor:
         batch_size = inputs.shape[0]
@@ -120,7 +212,14 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
                     raise ValueError(
                         f"Decoding step {decoding_step} out of bounds for attention mask with shape {self.dec_attn_mask.shape}."
                     )
-
+
+                if is_external_block_tables:
+                    self.dec_attn_mask[b_idx].fill_(0)
+                    self.dec_attn_mask[b_idx, :, :, : decoding_step + 1] = 1
+                else:
+                    self.dec_attn_mask[b_idx, :, :, decoding_step] = 1
+
+            attention_mask = self.dec_attn_mask
 
         attention_mask = self.dec_attn_mask
 
@@ -128,6 +227,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
             inputs,
             cache_position,
            attention_mask if self.use_attention_mask else None,
+            block_tables,
        )
 
        return logits
@@ -138,6 +238,8 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
         cache_position: torch.Tensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         batch_idx: int = None,
+        block_tables: torch.Tensor = None,
+        is_external_block_tables: bool = None,
     ) -> torch.FloatTensor:
         """
         Performs chunked prefill for efficient KV-cache updates and memory optimization.
@@ -145,11 +247,6 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
        and each chunk is processed sequentially. This allows for better memory utilization and compatibility with continuous batching.
        """
 
-        if batch_idx is None or batch_idx >= self.batch_size:
-            raise RuntimeError(
-                f"Invalid batch_idx ({batch_idx}). It must be a non-null value less than the batch size ({self.batch_size})."
-            )
-
         # Handle continuous batching in a compiled graph by extracting valid inputs
         # If an attention mask is provided, select only the valid (non-masked) inputs
         inputs = inputs[:, attention_mask.bool()] if attention_mask is not None else inputs
@@ -207,33 +304,21 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
                 chunked_attention_mask[:, :, :, step - self.prefill_chunk_size : step] = 1
                 chunked_attention_mask[:, :, :, step : step + self.prefill_chunk_size] = self.causal_mask
 
-            # Define
-            batch_position = torch.tensor(batch_idx, dtype=torch.int16)
+            # Define query position
             query_position = torch.tensor((query_length - 1) % self.prefill_chunk_size, dtype=torch.int16)
 
-            if self.use_attention_mask:
-                args = (
-                    input_chunk,
-                    cache_pos_chunk,
-                    chunked_attention_mask,
-                    batch_position,
-                    query_position,
-                )
-            else:
-                args = (
-                    input_chunk,
-                    cache_pos_chunk,
-                    batch_position,
-                    query_position,
-                )
             # Forward pass for the current chunk
             logits = super().forward(
-
+                input_chunk,
+                cache_pos_chunk,
+                chunked_attention_mask if self.use_attention_mask else None,
+                query_position,
+                block_tables,
                 out=out_buffers,
             )
 
-
-
+        # Update decoder attention mask with processed KV-cache length from prefill phase
+        if not is_external_block_tables and self.use_attention_mask:
            self.dec_attn_mask[batch_idx].fill_(0)
            self.dec_attn_mask[batch_idx, :, :, :query_length] = 1
 
@@ -275,9 +360,13 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         self.batch_size = self.rbln_config.model_cfg["batch_size"]
         self.max_seq_len = self.rbln_config.model_cfg["max_seq_len"]
         self.prefill_chunk_size = self.rbln_config.model_cfg["prefill_chunk_size"]
+        self.kvcache_block_size = self.rbln_config.model_cfg["kvcache_block_size"]
+        # FIXME get kvcache_num_blocks from compiled results.
+        self.kvcache_num_blocks = self.rbln_config.model_cfg["kvcache_num_blocks"]
         self.use_attention_mask = self.rbln_config.model_cfg["use_attention_mask"]
-
+        attn_impl = self.rbln_config.model_cfg["attn_impl"]
         main_input_name = self.main_input_name
+
         if self.rbln_config.model_cfg["use_inputs_embeds"]:
             main_input_name = "inputs_embeds"
             artifacts = torch.load(self.model_save_dir / self.subfolder / "torch_artifacts.pth", weights_only=False)
@@ -291,7 +380,13 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         else:
             self.embed_tokens = None
 
+        # Initialize shared resources to be used across Runtime instances (prefill and decode phases)
         dec_attn_mask = torch.zeros(self.batch_size, 1, 1, self.max_seq_len, dtype=torch.float32)
+        block_tables = torch.zeros(
+            self.batch_size, self.max_seq_len // self.kvcache_block_size, dtype=torch.int16
+        ).fill_(-1)
+        free_block_pool = deque(x for x in range(self.kvcache_num_blocks))
+
         self.prefill_decoder = RBLNRuntimeModel(
             runtime=self.model[0],
             main_input_name=main_input_name,
@@ -299,10 +394,14 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
             phase="prefill",
             batch_size=self.batch_size,
             dec_attn_mask=dec_attn_mask,
+            block_tables=block_tables,
+            free_block_pool=free_block_pool,
+            kvcache_block_size=self.kvcache_block_size,
             vocab_size=self.config.vocab_size,
-            max_seq_len=self.max_seq_len,
             prefill_chunk_size=self.prefill_chunk_size,
+            max_seq_len=self.max_seq_len,
             use_attention_mask=self.use_attention_mask,
+            attn_impl=attn_impl,
         )
         self.decoder = RBLNRuntimeModel(
             runtime=self.model[1],
@@ -311,7 +410,11 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
             phase="decode",
             batch_size=self.batch_size,
             dec_attn_mask=dec_attn_mask,
+            block_tables=block_tables,
+            free_block_pool=free_block_pool,
+            kvcache_block_size=self.kvcache_block_size,
             use_attention_mask=self.use_attention_mask,
+            attn_impl=attn_impl,
         )
 
     @classmethod
@@ -409,6 +512,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         wrapper_cfg = {"max_seq_len": rbln_config.model_cfg["max_seq_len"]}
         wrapper_cfg["attn_impl"] = rbln_config.model_cfg.get("attn_impl")
         wrapper_cfg["kvcache_partition_len"] = rbln_config.model_cfg.get("kvcache_partition_len")
+        wrapper_cfg["kvcache_block_size"] = rbln_config.model_cfg.get("kvcache_block_size")
         wrapper_cfg["use_rotary_emb"] = cls._use_rotary_emb
         wrapper_cfg["use_attention_mask"] = rbln_config.model_cfg.get("use_attention_mask")
 
@@ -474,6 +578,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         rbln_use_attention_mask = rbln_kwargs.get("use_attention_mask", None)
         rbln_attn_impl = rbln_kwargs.get("attn_impl", None)
         rbln_kvcache_partition_len = rbln_kwargs.get("kvcache_partition_len", None)
+        rbln_kvcache_block_size = rbln_kwargs.get("kvcache_block_size", None)
         rbln_quantization = QuantizationManager.validate_quantization_config(rbln_kwargs.get("quantization", None))
         rbln_prefill_chunk_size = rbln_kwargs.get("prefill_chunk_size", None)
 
@@ -500,12 +605,22 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         rbln_batch_size = 1 if rbln_batch_size is None else rbln_batch_size
         rbln_use_inputs_embeds = False if rbln_use_inputs_embeds is None else rbln_use_inputs_embeds
 
-        rbln_attn_impl, rbln_kvcache_partition_len = validate_attention_method(
+        rbln_attn_impl, rbln_kvcache_partition_len, rbln_kvcache_block_size = validate_attention_method(
             rbln_attn_impl=rbln_attn_impl,
             rbln_kvcache_partition_len=rbln_kvcache_partition_len,
+            rbln_kvcache_block_size=rbln_kvcache_block_size,
             rbln_max_seq_len=rbln_max_seq_len,
         )
 
+        if rbln_kvcache_block_size is None:
+            if rbln_attn_impl == "eager":
+                rbln_kvcache_block_size = rbln_max_seq_len
+            else:
+                rbln_kvcache_block_size = rbln_kvcache_partition_len
+
+        # FIXME temporal num_blocks
+        rbln_kvcache_num_blocks = (rbln_max_seq_len // rbln_kvcache_block_size) * rbln_batch_size
+
         num_attention_heads = getattr(model_config, "n_head", None) or getattr(model_config, "num_attention_heads")
         num_key_value_heads = getattr(model_config, "num_key_value_heads", None) or num_attention_heads
         num_hidden_layers = getattr(model_config, "n_layer", None) or getattr(model_config, "num_hidden_layers")
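
The block-size default above and the temporary num_blocks formula together reserve exactly enough physical blocks for every sequence in the batch to reach rbln_max_seq_len. A quick sanity check of the arithmetic, with hypothetical values:

# Hypothetical configuration, only to illustrate the formula above.
rbln_max_seq_len = 8192
rbln_kvcache_partition_len = 4096  # flash_attn partition length
rbln_batch_size = 2

rbln_kvcache_block_size = rbln_kvcache_partition_len  # flash_attn default; "eager" would use rbln_max_seq_len
rbln_kvcache_num_blocks = (rbln_max_seq_len // rbln_kvcache_block_size) * rbln_batch_size
assert rbln_kvcache_num_blocks == 4  # 2 blocks per sequence x 2 sequences
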
@@ -542,19 +657,25 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
         if query_length > 1:
             input_info.extend(
                 [
-                    ("batch_position", [], "int16"),
                     ("query_position", [], "int16"),
                 ]
             )
 
+        max_block_cnt = rbln_max_seq_len // rbln_kvcache_block_size
+
+        if query_length > 1:
+            input_info.extend([("block_tables", [max_block_cnt], "int16")])
+        else:
+            input_info.extend([("block_tables", [batch_size, max_block_cnt], "int16")])
+
         input_info.extend(
             [
                 (
                     f"past_key_values_{i}",
                     [
-
+                        rbln_kvcache_num_blocks,
                         num_key_value_heads,
-
+                        rbln_kvcache_block_size,
                         head_dim,
                     ],
                     "float32",
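
With the paged layout, each past_key_values entry is no longer shaped per batch and full sequence; it is a pool of blocks indexed through block_tables. A sketch of the resulting input_info entries for a hypothetical model (batch_size=2, max_seq_len=8192, kvcache_block_size=4096, 8 KV heads, head_dim=128; the concrete numbers are assumptions):

max_block_cnt = 8192 // 4096      # 2 logical blocks per sequence
num_blocks = max_block_cnt * 2    # temporary formula: enough blocks for every sequence

prefill_block_tables = ("block_tables", [max_block_cnt], "int16")        # one sequence per prefill call
decode_block_tables = ("block_tables", [2, max_block_cnt], "int16")      # whole batch per decode step
kv_cache = ("past_key_values_0", [num_blocks, 8, 4096, 128], "float32")  # blocks x kv_heads x block_size x head_dim
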
@@ -595,7 +716,9 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
                 "use_attention_mask": rbln_use_attention_mask,
                 "use_inputs_embeds": rbln_use_inputs_embeds,
                 "kvcache_partition_len": rbln_kvcache_partition_len,
+                "kvcache_block_size": rbln_kvcache_block_size,
                 "attn_impl": rbln_attn_impl,
+                "kvcache_num_blocks": rbln_kvcache_num_blocks,
             }
         )
 
@@ -40,10 +40,15 @@ class ExaoneForCausalLMWrapper(DecoderOnlyWrapper):
         new_layers = []
         for layer in causal_lm.transformer.h:
             if self.attn_impl == "eager":
-                new_self_attn = ExaoneAttention(
+                new_self_attn = ExaoneAttention(
+                    layer.attn.attention, self.use_attention_mask, kvcache_block_size=self.kvcache_block_size
+                )
             elif self.attn_impl == "flash_attn":
                 new_self_attn = ExaoneFlashAttention(
-                    layer.attn.attention,
+                    layer.attn.attention,
+                    kvcache_partition_len=self.kvcache_partition_len,
+                    use_attention_mask=self.use_attention_mask,
+                    kvcache_block_size=self.kvcache_block_size,
                 )
             else:
                 raise NotImplementedError(f"Unknwon attn : {self.attn_impl}")
@@ -33,10 +33,15 @@ class GemmaWrapper(DecoderOnlyWrapper):
         new_layers = []
         for layer in causal_lm.model.layers:
             if self.attn_impl == "eager":
-                new_self_attn = DecoderOnlyAttention(
+                new_self_attn = DecoderOnlyAttention(
+                    layer.self_attn, self.use_attention_mask, kvcache_block_size=self.kvcache_block_size
+                )
             elif self.attn_impl == "flash_attn":
                 new_self_attn = DecoderOnlyFlashAttention(
-                    layer.self_attn,
+                    layer.self_attn,
+                    kvcache_partition_len=self.kvcache_partition_len,
+                    use_attention_mask=self.use_attention_mask,
+                    kvcache_block_size=self.kvcache_block_size,
                 )
             else:
                 raise NotImplementedError(f"Unknwon attn : {self.attn_impl}")
@@ -37,7 +37,9 @@ class GPT2Wrapper(DecoderOnlyWrapper):
             raise NotImplementedError(f"flash attention ({self.attn_impl}) is not implemented for {self.__class__}")
         new_layers = []
         for layer in causal_lm.transformer.h:
-            new_self_attn = GPT2Attention(
+            new_self_attn = GPT2Attention(
+                layer.attn, self.use_attention_mask, kvcache_block_size=self.kvcache_block_size
+            )
             new_layer = GPT2Layer(layer, new_self_attn)
             new_layers.append(new_layer)
         new_model = GPT2Model(causal_lm.transformer, new_layers, max_seq_len=max_seq_len)
@@ -60,7 +60,9 @@ class MidmLMHeadModelWrapper(DecoderOnlyWrapper):
             raise NotImplementedError(f"flash attention ({self.attn_impl}) is not implemented for {self.__class__}")
         new_layers = []
         for layer in causal_lm.transformer.h:
-            new_self_attn = MidmAttention(
+            new_self_attn = MidmAttention(
+                layer.attn, self.use_attention_mask, kvcache_block_size=self.kvcache_block_size
+            )
             new_layer = MidmLayer(layer, new_self_attn)
             new_layers.append(new_layer)
         new_model = MidmModel(causal_lm.transformer, new_layers, max_seq_len=max_seq_len)
@@ -36,7 +36,9 @@ class PhiWrapper(DecoderOnlyWrapper):
         new_layers = []
         for layer in causal_lm.model.layers:
             if self.attn_impl == "eager":
-                new_self_attn = PhiAttention(
+                new_self_attn = PhiAttention(
+                    layer.self_attn, self.use_attention_mask, kvcache_block_size=self.kvcache_block_size
+                )
             elif self.attn_impl == "flash_attn":
                 raise NotImplementedError(f"flash attn for {self.__class__} is not implemented yet.")
             else:
@@ -81,10 +83,10 @@ class PhiLayer(DecoderOnlyLayer):
         hidden_states: torch.Tensor,
         attention_mask: torch.Tensor,
         seq_positions: torch.LongTensor,
-        batch_position: torch.Tensor,
         past_key_values: Tuple[Tuple[torch.Tensor]],
         cos: Optional[torch.Tensor] = None,
         sin: Optional[torch.Tensor] = None,
+        block_tables: Optional[torch.Tensor] = None,
     ):
         residual = hidden_states
 
@@ -94,10 +96,10 @@ class PhiLayer(DecoderOnlyLayer):
             hidden_states=hidden_states,
             attention_mask=attention_mask,
             seq_positions=seq_positions,
-            batch_position=batch_position,
             past_key_values=past_key_values,
             cos=cos,
             sin=sin,
+            block_tables=block_tables,
         )
 
         feed_forward_hidden_states = self._original_mod.mlp(hidden_states)
@@ -50,11 +50,14 @@ class RBLNRuntimeDecoder(RBLNPytorchRuntime):
         runtime: rebel.Runtime,
         batch_size: int,
         dec_max_seq_len: int,
+        use_attention_mask: Optional[bool] = None,
         **kwargs: Any,
     ) -> None:
         super().__init__(runtime, **kwargs)
         self.batch_size = batch_size
         self.dec_max_seq_len = dec_max_seq_len
+        self.use_attention_mask = use_attention_mask
+        self.default_block_tables = torch.arange(0, self.batch_size, dtype=torch.int16).view(self.batch_size, 1)
 
     def forward(
         self,
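
When no block tables are passed in, the seq2seq decoder falls back to an identity mapping: batch entry b reads and writes block b. A small illustration of that default (the batch size is an arbitrary example value):

import torch

batch_size = 4
default_block_tables = torch.arange(0, batch_size, dtype=torch.int16).view(batch_size, 1)
# tensor([[0], [1], [2], [3]], dtype=torch.int16) -- one dedicated block per batch entry,
# matching the ("block_tables", [rbln_batch_size, 1], "int16") decoder input declared later in this diff.
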
@@ -62,6 +65,7 @@ class RBLNRuntimeDecoder(RBLNPytorchRuntime):
         attention_mask: Optional[torch.FloatTensor] = None,
         decoder_attention_mask: Optional[torch.BoolTensor] = None,
         cache_position: Optional[torch.Tensor] = None,
+        block_tables: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> Tuple[torch.FloatTensor]:
         batch_size = decoder_input_ids.shape[0]
@@ -73,19 +77,24 @@ class RBLNRuntimeDecoder(RBLNPytorchRuntime):
         if batch_size != cache_position.shape[0]:
             raise RuntimeError(f"Cache position size mismatch: got {cache_position.shape[0]}, expected {batch_size}.")
 
-
-
-
-
-
-
-
+        if self.use_attention_mask:
+            for b_idx in range(self.batch_size):
+                decoding_step = cache_position[b_idx].item()
+                if not (0 <= decoding_step < self.dec_max_seq_len):
+                    raise ValueError(
+                        f"Decoding step {decoding_step} out of bounds for attention mask with shape {self.dec_attn_mask.shape}."
+                    )
+                decoder_attention_mask[b_idx, : decoding_step + 1] = 1
+
+        if block_tables is None:
+            block_tables = self.default_block_tables
 
         lm_logits = super().forward(
             decoder_input_ids,
-            decoder_attention_mask,
+            decoder_attention_mask if self.use_attention_mask else None,
             attention_mask,
             cache_position,
+            block_tables,
         )
 
         return Seq2SeqLMOutput(logits=lm_logits)
@@ -110,12 +119,18 @@ class RBLNModelForSeq2SeqLM(RBLNModel, ABC):
     def __post_init__(self, **kwargs):
         batch_size = self.rbln_config.model_cfg["batch_size"]
         dec_max_seq_len = self.rbln_config.model_cfg["dec_max_seq_len"]
+        self.use_attention_mask = self.rbln_config.model_cfg.get("use_attention_mask", None)
+
         self.encoder = RBLNRuntimeEncoder(
             runtime=self.model[0],
             main_input_name="input_ids",
         )
         self.decoder = RBLNRuntimeDecoder(
-            runtime=self.model[1],
+            runtime=self.model[1],
+            main_input_name="input_ids",
+            batch_size=batch_size,
+            dec_max_seq_len=dec_max_seq_len,
+            use_attention_mask=self.use_attention_mask,
         )
 
     @classmethod
@@ -171,6 +186,13 @@ class RBLNModelForSeq2SeqLM(RBLNModel, ABC):
         rbln_dec_max_seq_len = rbln_kwargs.get("dec_max_seq_len", None)
         rbln_batch_size = rbln_kwargs.get("batch_size", None)
         rbln_batch_size = 1 if rbln_batch_size is None else rbln_batch_size
+        rbln_use_attention_mask = rbln_kwargs.get("use_attention_mask", None)
+
+        if rbln_use_attention_mask is None:
+            rbln_use_attention_mask = False
+            rbln_npu = rbln_kwargs.get("npu", None) or rebel.get_npu_name()
+            if rbln_npu == "RBLN-CA02":
+                rbln_use_attention_mask = True
 
         n_layer = getattr(model_config, "decoder_layers", None) or getattr(model_config, "num_layers")
         n_head = getattr(model_config, "decoder_attention_heads", None) or getattr(model_config, "num_heads")
@@ -232,18 +254,22 @@ class RBLNModelForSeq2SeqLM(RBLNModel, ABC):
                 ],
                 "float32",
             ),
-            ("
+            ("block_tables", [1], "int16"),
         ]
 
         dec_input_info = [
             ("input_ids", [rbln_batch_size, 1], "int64"),
-            ("attention_mask", [rbln_batch_size, rbln_dec_max_seq_len], "float32"),
             ("encoder_attention_mask", [rbln_batch_size, rbln_enc_max_seq_len], "float32"),
             (
                 "cache_position",
                 [rbln_batch_size, 1],
                 "int32",
             ),
+            (
+                "block_tables",
+                [rbln_batch_size, 1],
+                "int16",
+            ),
         ]
         dec_input_info.extend(
             [
@@ -275,6 +301,10 @@ class RBLNModelForSeq2SeqLM(RBLNModel, ABC):
                 for i in range(n_layer * 2)
             ]
         )
+
+        if rbln_use_attention_mask:
+            dec_input_info.insert(1, ("attention_mask", [rbln_batch_size, rbln_dec_max_seq_len], "float32"))
+
         enc_compile_config = RBLNCompileConfig(compiled_model_name="encoder", input_info=enc_input_info)
         dec_compile_config = RBLNCompileConfig(compiled_model_name="decoder", input_info=dec_input_info)
 
@@ -290,6 +320,7 @@ class RBLNModelForSeq2SeqLM(RBLNModel, ABC):
                 "dec_max_seq_len": rbln_dec_max_seq_len,
                 "batch_size": rbln_batch_size,
                 "pad_token_id": rbln_pad_token_id,
+                "use_attention_mask": rbln_use_attention_mask,
             }
         )
 
@@ -400,9 +431,9 @@ class RBLNModelForSeq2SeqLM(RBLNModel, ABC):
         encoder_kwargs["output_attentions"] = False
 
         for b in range(batch_size):
-
+            block_tables = torch.tensor([b], dtype=torch.int16)
             encoder_kwargs["input_ids"] = inputs_tensor[b].unsqueeze(0)
             encoder_kwargs["attention_mask"] = model_kwargs["attention_mask"][b].unsqueeze(0).to(torch.float32)
-            model_kwargs["encoder_outputs"] = encoder(**encoder_kwargs,
+            model_kwargs["encoder_outputs"] = encoder(**encoder_kwargs, block_tables=block_tables)
 
         return model_kwargs