optimum-rbln 0.2.1a3__py3-none-any.whl → 0.2.1a5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,5 +12,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE
 
- __version__ = version = '0.2.1a3'
+ __version__ = version = '0.2.1a5'
  __version_tuple__ = version_tuple = (0, 2, 1)
@@ -442,8 +442,9 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
  logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
  return
 
- real_save_dir = self.model_save_dir / self.subfolder
- save_directory_path = Path(save_directory)
+ # Normalize paths to handle relative paths and symlinks
+ real_save_dir = Path(self.model_save_dir).resolve() / self.subfolder
+ save_directory_path = Path(save_directory).resolve()
 
  if not os.path.exists(real_save_dir) or not os.path.isdir(real_save_dir):
  raise FileNotFoundError(
@@ -452,13 +453,13 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
  f"Please ensure the model directory exists and you have the necessary permissions to access it."
  )
 
- if save_directory_path.absolute() == real_save_dir.absolute():
+ if save_directory_path == real_save_dir:
  raise FileExistsError(
  f"Cannot save model to '{save_directory}'. This directory already exists and contains the model files."
  )
 
- # Create a temporary directory next to the target directory
- tmp_dir = save_directory + ".tmp"
+ # Create a temporary directory with normalized path
+ tmp_dir = str(save_directory_path) + ".tmp"
  try:
  # Remove temporary directory if it exists from a previous failed attempt
  if os.path.exists(tmp_dir):
@@ -473,9 +474,9 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
  self.generation_config.save_pretrained(tmp_dir)
 
  # If everything succeeded, atomically replace the target directory
- if os.path.exists(save_directory):
- shutil.rmtree(save_directory)
- os.rename(tmp_dir, save_directory)
+ if os.path.exists(save_directory_path):
+ shutil.rmtree(save_directory_path)
+ os.rename(tmp_dir, save_directory_path)
 
  except Exception as e:
  # Clean up the temporary directory if anything fails
@@ -484,7 +485,7 @@ class RBLNBaseModel(SubModulesMixin, PushToHubMixin, PreTrainedModel):
  raise e # Re-raise the exception after cleanup
 
  if push_to_hub:
- return super().push_to_hub(save_directory, **kwargs)
+ return super().push_to_hub(str(save_directory_path), **kwargs)
 
  @staticmethod
  def _raise_missing_compiled_file_error(missing_files: List[str]):
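
The save_pretrained changes above boil down to one pattern: resolve both paths up front so relative paths and symlinks compare correctly, stage everything into a sibling ".tmp" directory, and only then swap it into place. A minimal standalone sketch of that pattern (function and argument names are illustrative, not the library's API):

    import os
    import shutil
    from pathlib import Path

    def replace_directory_safely(save_directory: str, write_files) -> None:
        # Resolve first so later comparisons and renames use canonical paths.
        target = Path(save_directory).resolve()
        tmp_dir = str(target) + ".tmp"
        if os.path.exists(tmp_dir):
            shutil.rmtree(tmp_dir)  # leftover staging dir from a previous failed attempt
        try:
            write_files(tmp_dir)  # caller writes model/config files into the staging dir
            if os.path.exists(target):
                shutil.rmtree(target)
            os.rename(tmp_dir, target)  # move the staged directory into place
        except Exception:
            shutil.rmtree(tmp_dir, ignore_errors=True)
            raise
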
@@ -427,12 +427,14 @@ class DecoderOnlyModel(nn.Module):
  cos, sin = None, None
 
  # (batch, seq_len) -> (batch,)
- seq_positions = cache_position[:, 0]
  if self.attn_impl == "flash_attn":
+ seq_positions = cache_position[:, 0]
  max_seq_len = past_key_values[0][0].shape[-2]
  seq_positions = self.convert_sequence_positions_for_flash_attn(
  seq_positions=seq_positions, max_seq_len=max_seq_len
  )
+ else:
+ seq_positions = cache_position[:, :1]
 
  present_key_values = past_key_values
  for layer in self.layers:
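
The relocated assignment changes what the non-flash path receives: indexing with [:, 0] drops the sequence axis, while slicing with [:, :1] keeps it as a length-1 dimension. A quick shape check in plain PyTorch (independent of the model code):

    import torch

    cache_position = torch.zeros(4, 128, dtype=torch.int32)  # (batch, seq_len)
    print(cache_position[:, 0].shape)   # torch.Size([4])    -- flash_attn branch
    print(cache_position[:, :1].shape)  # torch.Size([4, 1]) -- default branch
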
@@ -38,34 +38,188 @@ from .decoderonly_architecture import (
  logger = get_logger()
 
  if TYPE_CHECKING:
- from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, PretrainedConfig
+ from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer
 
 
  class RBLNRuntimeModel(RBLNPytorchRuntime):
  mandatory_members = ["main_input_name", "embed_tokens"]
 
+ def __init__(
+ self,
+ runtime: rebel.Runtime,
+ phase: str,
+ batch_size: int,
+ dec_attn_mask: torch.Tensor,
+ **kwargs: Any,
+ ) -> None:
+ super().__init__(runtime, **kwargs)
+ self.phase = phase
+ self.batch_size = batch_size
+
+ # shared tensor between prefill and decode phase
+ self.dec_attn_mask = dec_attn_mask
+
+ if self.phase == "prefill":
+ vocab_size = kwargs.pop("vocab_size")
+ self.max_seq_len = kwargs.pop("max_seq_len")
+ self.prefill_chunk_size = kwargs.pop("prefill_chunk_size")
+ self.output_size = [1, 1, vocab_size]
+ self.causal_mask = 1 - torch.triu(
+ torch.ones(1, 1, self.prefill_chunk_size, self.prefill_chunk_size), diagonal=1
+ )
+
  def forward(
  self,
- input_ids: torch.LongTensor,
- inputs_embeds: torch.Tensor,
- attention_mask: torch.Tensor,
- cache_position: torch.Tensor,
- **kwargs,
+ input_ids: Optional[torch.LongTensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ cache_position: torch.Tensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ batch_idx: Optional[int] = None,
  ):
+ if input_ids is None and inputs_embeds is None:
+ raise ValueError("Either `input_ids` or `inputs_embeds` must be provided.")
+
  if inputs_embeds is None:
- inp = input_ids
+ inputs = input_ids
  if self.embed_tokens is not None:
- inp = self.embed_tokens(inp)
+ inputs = self.embed_tokens(inputs)
  else:
- inp = inputs_embeds
+ inputs = inputs_embeds
 
- return super().forward(
- inp,
- attention_mask,
+ if self.phase == "decode":
+ return self.decode_forward(
+ inputs,
+ cache_position,
+ attention_mask=attention_mask,
+ )
+ else:
+ return self.prefill_forward(inputs, cache_position, attention_mask, batch_idx)
+
+ def decode_forward(
+ self,
+ inputs: torch.Tensor,
+ cache_position: torch.Tensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ ) -> torch.FloatTensor:
+ batch_size = inputs.shape[0]
+ if batch_size != self.batch_size:
+ raise RuntimeError(
+ f"Batch size mismatch: got {batch_size}, expected {self.batch_size} (compiled batch size)."
+ )
+
+ if batch_size != cache_position.shape[0]:
+ raise RuntimeError(f"Cache position size mismatch: got {cache_position.shape[0]}, expected {batch_size}.")
+
+ if attention_mask is None:
+ for b_idx in range(batch_size):
+ decoding_step = cache_position[b_idx].item()
+ if not (0 <= decoding_step < self.dec_attn_mask.shape[-1]):
+ raise ValueError(
+ f"Decoding step {decoding_step} out of bounds for attention mask with shape {self.dec_attn_mask.shape}."
+ )
+ self.dec_attn_mask[b_idx, :, :, decoding_step] = 1
+
+ logits = super().forward(
+ inputs,
+ self.dec_attn_mask if attention_mask is None else attention_mask,
  cache_position,
- **kwargs,
  )
 
+ return logits
+
+ def prefill_forward(
+ self,
+ inputs: torch.Tensor,
+ cache_position: torch.Tensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ batch_idx: int = None,
+ ) -> torch.FloatTensor:
+ """
+ Performs chunked prefill for efficient KV-cache updates and memory optimization.
+ Instead of processing the entire sequence at once, the input is divided into chunks of size `prefill_chunk_size`,
+ and each chunk is processed sequentially. This allows for better memory utilization and compatibility with continuous batching.
+ """
+
+ if batch_idx is None or batch_idx >= self.batch_size:
+ raise RuntimeError(
+ f"Invalid batch_idx ({batch_idx}). It must be a non-null value less than the batch size ({self.batch_size})."
+ )
+
+ # Handle continuous batching in a compiled graph by extracting valid inputs
+ # If an attention mask is provided, select only the valid (non-masked) inputs
+ inputs = inputs[:, attention_mask.bool()] if attention_mask is not None else inputs
+
+ query_length = inputs.shape[1]
+ if query_length > self.max_seq_len:
+ raise ValueError(
+ f"Input length ({query_length}) exceeds the maximum allowed sequence length ({self.max_seq_len})."
+ )
+
+ # Initialize attention mask for chunked processing
+ chunked_attention_mask = torch.zeros(1, 1, self.prefill_chunk_size, self.max_seq_len, dtype=torch.float32)
+
+ # Buffer for storing output logits
+ out_buffers = [
+ torch.empty(
+ size=self.output_size,
+ dtype=torch.float32,
+ device="cpu",
+ )
+ ]
+
+ # Process input in chunks of size `prefill_chunk_size`
+ for step in range(0, query_length, self.prefill_chunk_size):
+ # Pad input and cache_position if the last chunk is smaller than `prefill_chunk_size`
+ if (step + self.prefill_chunk_size) > query_length:
+ padding_size = step + self.prefill_chunk_size - query_length
+ # inputs_embeds
+ if inputs.dim() == 3:
+ inputs = torch.nn.functional.pad(inputs, (0, 0, 0, padding_size))
+ # input_ids
+ else:
+ inputs = torch.nn.functional.pad(inputs, (0, padding_size))
+
+ cache_position = torch.cat(
+ [
+ cache_position,
+ torch.arange(
+ query_length,
+ step + self.prefill_chunk_size,
+ dtype=torch.int32,
+ ).unsqueeze(0),
+ ],
+ dim=-1,
+ )
+
+ # Extract the current chunk of inputs and cache positions
+ input_chunk = inputs[:, step : step + self.prefill_chunk_size]
+ cache_pos_chunk = cache_position[:, step : step + self.prefill_chunk_size]
+
+ # Update attention mask to ensure proper causal behavior
+ if step >= self.prefill_chunk_size:
+ chunked_attention_mask[:, :, :, step - self.prefill_chunk_size : step] = 1
+ chunked_attention_mask[:, :, :, step : step + self.prefill_chunk_size] = self.causal_mask
+
+ # Define batch position and query position
+ batch_position = torch.tensor(batch_idx, dtype=torch.int16)
+ query_position = torch.tensor((query_length - 1) % self.prefill_chunk_size, dtype=torch.int16)
+
+ # Forward pass for the current chunk
+ logits = super().forward(
+ input_chunk,
+ chunked_attention_mask,
+ cache_pos_chunk,
+ batch_position,
+ query_position,
+ out=out_buffers,
+ )
+
+ # Update decoder attention mask with processed KV-cache length from prefill phase
+ self.dec_attn_mask[batch_idx].fill_(0)
+ self.dec_attn_mask[batch_idx, :, :, :query_length] = 1
+
+ return logits
+
 
  @dataclass
  class RBLNDecoderOnlyOutput(ModelOutput):
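
The chunked prefill added above walks the prompt in prefill_chunk_size steps, exposing previously processed chunks fully and applying a causal block to the current chunk. A minimal sketch of just that masking arithmetic (plain PyTorch, stub values; not the RBLN runtime call):

    import torch

    def chunked_prefill_masks(query_length: int, chunk_size: int, max_seq_len: int):
        # Lower-triangular block reused for the chunk currently being processed.
        causal_mask = 1 - torch.triu(torch.ones(1, 1, chunk_size, chunk_size), diagonal=1)
        mask = torch.zeros(1, 1, chunk_size, max_seq_len)
        for step in range(0, query_length, chunk_size):
            if step >= chunk_size:
                # Chunks processed in earlier iterations become fully visible.
                mask[:, :, :, step - chunk_size : step] = 1
            # The current chunk sees itself causally.
            mask[:, :, :, step : step + chunk_size] = causal_mask
            yield step, mask.clone()

    # Example: a 10-token prompt, chunk size 4, max sequence length 16.
    for step, mask in chunked_prefill_masks(10, 4, 16):
        print(step, mask[0, 0, -1, :12].tolist())
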
@@ -103,13 +257,6 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
  self.max_seq_len = self.rbln_config.model_cfg["max_seq_len"]
  self.prefill_chunk_size = self.rbln_config.model_cfg["prefill_chunk_size"]
 
- self.prefill_attention_mask = torch.zeros(1, 1, self.prefill_chunk_size, self.max_seq_len, dtype=torch.float32)
- self.causal_mask = 1 - torch.triu(
- torch.ones(1, 1, self.prefill_chunk_size, self.prefill_chunk_size), diagonal=1
- )
- self.dec_attn_mask_init = torch.zeros(1, 1, 1, self.max_seq_len, dtype=torch.float32)
- self.dec_attn_mask = torch.zeros(self.batch_size, 1, 1, self.max_seq_len, dtype=torch.float32)
-
  main_input_name = self.main_input_name
  if self.rbln_config.model_cfg["use_inputs_embeds"]:
  main_input_name = "inputs_embeds"
@@ -124,11 +271,25 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
  else:
  self.embed_tokens = None
 
+ dec_attn_mask = torch.zeros(self.batch_size, 1, 1, self.max_seq_len, dtype=torch.float32)
  self.prefill_decoder = RBLNRuntimeModel(
- runtime=self.model[0], main_input_name=main_input_name, embed_tokens=self.embed_tokens
+ runtime=self.model[0],
+ main_input_name=main_input_name,
+ embed_tokens=self.embed_tokens,
+ phase="prefill",
+ batch_size=self.batch_size,
+ dec_attn_mask=dec_attn_mask,
+ vocab_size=self.config.vocab_size,
+ max_seq_len=self.max_seq_len,
+ prefill_chunk_size=self.prefill_chunk_size,
  )
  self.decoder = RBLNRuntimeModel(
- runtime=self.model[1], main_input_name=main_input_name, embed_tokens=self.embed_tokens
+ runtime=self.model[1],
+ main_input_name=main_input_name,
+ embed_tokens=self.embed_tokens,
+ phase="decode",
+ batch_size=self.batch_size,
+ dec_attn_mask=dec_attn_mask,
  )
 
  @classmethod
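
Note that both runtime wrappers are constructed with the same dec_attn_mask tensor, so the in-place updates made at the end of prefill_forward are what decode_forward later reads. A toy illustration of that sharing (no RBLN dependency; Wrapper is a stand-in):

    import torch

    class Wrapper:
        def __init__(self, dec_attn_mask: torch.Tensor) -> None:
            self.dec_attn_mask = dec_attn_mask  # stored by reference, not copied

    shared = torch.zeros(2, 1, 1, 8)
    prefill, decode = Wrapper(shared), Wrapper(shared)

    # Prefill marks the first three cache positions of batch slot 0 as filled...
    prefill.dec_attn_mask[0, :, :, :3] = 1
    # ...and the decode wrapper sees the same values.
    print(decode.dec_attn_mask[0, 0, 0].tolist())  # [1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
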
@@ -155,7 +316,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
  def get_quantized_model(
  cls,
  model_id: str,
- config: Optional[PretrainedConfig] = None,
+ config: Optional["PretrainedConfig"] = None,
  use_auth_token: Optional[Union[bool, str]] = None,
  revision: Optional[str] = None,
  force_download: bool = False,
@@ -496,32 +657,33 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
  generate_idx: Optional[torch.Tensor] = None,
  **kwargs,
  ) -> Tuple[torch.FloatTensor]:
- # prefll
+ """
+ Forward method for the RBLN-optimized model, designed for integration with the HuggingFace generate API.
+ For continuous batching, the prefill stage processes one batch at a time and updates the KV cache using batch_idx.
+ A for-loop ensures synchronization with the HuggingFace generate API.
+ The decoder stage operates as usual, processing inputs in batch mode.
+ """
+ # Prefill
  if cache_position is None:
  logits = []
- input_tensors = inputs_embeds if inputs_embeds is not None else input_ids
- batch_size = input_tensors.shape[0]
+ inputs = inputs_embeds if inputs_embeds is not None else input_ids
+ batch_size = inputs.shape[0]
 
  for b_idx in range(batch_size):
- # Transform inputs as vllm format
- if attention_mask is not None:
- input_tensor = input_tensors[b_idx : b_idx + 1, attention_mask[b_idx].bool()]
- else:
- input_tensor = input_tensors[b_idx : b_idx + 1]
-
  cache_position = torch.arange(0, generate_idx[b_idx].item(), dtype=torch.int32).unsqueeze(0)
-
- logit = self._forward_prefill(
- input_ids=input_tensor if inputs_embeds is None else None,
- inputs_embeds=input_tensor if inputs_embeds is not None else None,
+ logit = self.prefill_decoder(
+ input_ids=inputs[b_idx : b_idx + 1] if inputs_embeds is None else None,
+ inputs_embeds=inputs[b_idx : b_idx + 1] if inputs_embeds is not None else None,
+ attention_mask=attention_mask[b_idx] if attention_mask is not None else None,
  cache_position=cache_position,
  batch_idx=b_idx,
  )
  logits.append(logit)
+
  logits = torch.cat(logits, dim=0)
- # decoder
+ # Decoder
  else:
- logits = self._forward_decoder(
+ logits = self.decoder(
  input_ids=input_ids,
  inputs_embeds=inputs_embeds,
  cache_position=cache_position,
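
Condensed, the dispatch described in the docstring is: no cache_position means a prefill pass executed once per batch slot, otherwise a single batched decode step. A stripped-down sketch of that control flow (illustrative names, not the full signature):

    import torch

    def causal_lm_forward(model, inputs, generate_idx, cache_position=None):
        if cache_position is None:
            # Prefill: one sequence at a time, so each slot fills its own KV cache.
            logits = []
            for b_idx in range(inputs.shape[0]):
                pos = torch.arange(0, generate_idx[b_idx].item(), dtype=torch.int32).unsqueeze(0)
                logits.append(model.prefill_decoder(input_ids=inputs[b_idx : b_idx + 1],
                                                    cache_position=pos, batch_idx=b_idx))
            return torch.cat(logits, dim=0)
        # Decode: all sequences advance together at the compiled batch size.
        return model.decoder(input_ids=inputs, cache_position=cache_position)
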
@@ -531,119 +693,3 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
  logits=logits,
  generate_idx=generate_idx,
  )
-
- def _forward_prefill(
- self,
- input_ids: torch.LongTensor = None,
- inputs_embeds: torch.Tensor = None,
- cache_position: torch.Tensor = None,
- batch_idx: int = None,
- ) -> torch.FloatTensor:
- if batch_idx is None or batch_idx >= self.batch_size:
- raise RuntimeError(
- f"Invalid batch_idx ({batch_idx}). It must be a non-null value less than the batch size ({self.batch_size})."
- )
-
- out_buffers = [
- torch.empty(
- size=[
- 1,
- 1,
- self.config.vocab_size,
- ],
- dtype=torch.float32,
- device="cpu",
- )
- ]
-
- input_tensors = inputs_embeds if inputs_embeds is not None else input_ids
- query_length = input_tensors.shape[1]
- if query_length > self.max_seq_len:
- raise ValueError(
- f"Input length ({query_length}) exceeds the maximum allowed sequence length ({self.max_seq_len})."
- )
-
- _attention_mask = self.prefill_attention_mask.clone()
-
- for step in range(0, query_length, self.prefill_chunk_size):
- # pad input_tensors & cache_position for prefill_chunk
- if (step + self.prefill_chunk_size) > query_length:
- pad_to_chunk = step + self.prefill_chunk_size - query_length
- if inputs_embeds is not None:
- input_tensors = torch.nn.functional.pad(input_tensors, (0, 0, 0, pad_to_chunk))
- else:
- input_tensors = torch.nn.functional.pad(input_tensors, (0, pad_to_chunk))
-
- cache_position = torch.cat(
- [
- cache_position,
- torch.arange(
- query_length,
- step + self.prefill_chunk_size,
- dtype=torch.int32,
- ).unsqueeze(0),
- ],
- dim=-1,
- )
-
- # slice input_tensor & cache_position with prefill_chunk_size
- _input_tensors = input_tensors[:, step : step + self.prefill_chunk_size]
- _cache_position = cache_position[:, step : step + self.prefill_chunk_size]
-
- # update attention_mask
- if step >= self.prefill_chunk_size:
- _attention_mask[:, :, :, step - self.prefill_chunk_size : step] = 1
- _attention_mask[:, :, :, step : step + self.prefill_chunk_size] = self.causal_mask
-
- query_position = (query_length - 1) % self.prefill_chunk_size
-
- logits = self.prefill_decoder(
- input_ids=_input_tensors.contiguous() if inputs_embeds is None else None,
- inputs_embeds=_input_tensors.contiguous() if inputs_embeds is not None else None,
- attention_mask=_attention_mask.contiguous(),
- cache_position=_cache_position.contiguous(),
- batch_position=torch.tensor(batch_idx, dtype=torch.int16),
- query_position=torch.tensor(query_position, dtype=torch.int16),
- out=out_buffers,
- )
-
- # update decoder_attn_mask with preprocessed kv-cache length in prefill phase
- self.dec_attn_mask[batch_idx] = self.dec_attn_mask_init.clone()
- self.dec_attn_mask[batch_idx, :, :, :query_length] = 1
-
- return logits
-
- def _forward_decoder(
- self,
- input_ids: torch.LongTensor = None,
- inputs_embeds: torch.Tensor = None,
- cache_position: torch.Tensor = None,
- ) -> torch.FloatTensor:
- input_tensors = inputs_embeds if inputs_embeds is not None else input_ids
- if input_tensors is None:
- raise ValueError("Either `input_ids` or `inputs_embeds` must be provided.")
-
- batch_size = input_tensors.shape[0]
- if batch_size != self.batch_size:
- raise RuntimeError(
- f"Batch size mismatch: got {batch_size}, expected {self.batch_size} (compiled batch size)."
- )
-
- if batch_size != cache_position.shape[0]:
- raise RuntimeError(f"Cache position size mismatch: got {cache_position.shape[0]}, expected {batch_size}.")
-
- for b_idx in range(batch_size):
- decoding_step = cache_position[b_idx].item()
- if not (0 <= decoding_step < self.dec_attn_mask.shape[-1]):
- raise ValueError(
- f"Decoding step {decoding_step} out of bounds for attention mask with shape {self.dec_attn_mask.shape}."
- )
- self.dec_attn_mask[b_idx, :, :, decoding_step] = 1
- logits = self.decoder(
- input_ids=input_tensors.contiguous() if inputs_embeds is None else None,
- inputs_embeds=input_tensors.contiguous() if inputs_embeds is not None else None,
- attention_mask=self.dec_attn_mask.contiguous(),
- cache_position=cache_position.contiguous(),
- )
-
- return logits
@@ -25,7 +25,6 @@ from transformers import (
  PreTrainedModel,
  )
  from transformers.modeling_outputs import BaseModelOutputWithPooling
- from transformers.models.llava_next.modeling_llava_next import LlavaNextCausalLMOutputWithPast
 
  from ....modeling import RBLNModel
  from ....modeling_config import RBLNCompileConfig, RBLNConfig
@@ -337,7 +336,7 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
  generate_idx: Optional[torch.Tensor] = None,
  batch_idx: Optional[int] = None,
  **kwargs,
- ) -> Union[Tuple, LlavaNextCausalLMOutputWithPast]:
+ ) -> Union[Tuple, RBLNDecoderOnlyOutput]:
  vision_feature_layer = (
  vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
  )
@@ -378,7 +377,7 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
  inputs_embeds = [inputs_embeds[i : i + 1, attention_mask[i].bool()] for i in range(batch_size)]
  for batch_idx in range(batch_size):
  generate_idx[batch_idx] = inputs_embeds[batch_idx].shape[-2]
- logit = self.language_model._forward_prefill(
+ logit = self.language_model.prefill_decoder(
  inputs_embeds=inputs_embeds[batch_idx],
  batch_idx=batch_idx,
  cache_position=torch.arange(
@@ -390,15 +389,13 @@ class RBLNLlavaNextForConditionalGeneration(RBLNModel):
 
  logits.append(logit)
  logits = torch.cat(logits, dim=0)
- outputs = RBLNDecoderOnlyOutput(logits=logits, generate_idx=generate_idx)
  else:
- outputs: RBLNDecoderOnlyOutput = self.language_model(
+ logits = self.language_model.decoder(
  inputs_embeds=inputs_embeds,
  cache_position=cache_position,
- generate_idx=generate_idx,
  )
 
- return outputs
+ return RBLNDecoderOnlyOutput(logits=logits, generate_idx=generate_idx)
 
  # Almost copied from : https://github.com/huggingface/transformers/blob/6b550462139655d488d4c663086a63e98713c6b9/src/transformers/models/llava_next/modeling_llava_next.py
  def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
@@ -19,7 +19,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
  import rebel
  import torch
  from rebel.compile_context import CompileContext
- from transformers import AutoModelForSeq2SeqLM, GenerationConfig, PretrainedConfig, PreTrainedModel
+ from transformers import AutoModelForSeq2SeqLM, PretrainedConfig, PreTrainedModel
  from transformers.modeling_outputs import BaseModelOutput, Seq2SeqLMOutput
 
  from ....modeling import RBLNModel
@@ -31,12 +31,7 @@ from ....utils.runtime_utils import RBLNPytorchRuntime
  logger = get_logger(__name__)
 
  if TYPE_CHECKING:
- from transformers import (
- AutoFeatureExtractor,
- AutoProcessor,
- AutoTokenizer,
- PretrainedConfig,
- )
+ from transformers import AutoFeatureExtractor, AutoProcessor, AutoTokenizer, GenerationConfig, PretrainedConfig
 
 
  class RBLNRuntimeEncoder(RBLNPytorchRuntime):
@@ -50,9 +45,50 @@ class RBLNRuntimeEncoder(RBLNPytorchRuntime):
  class RBLNRuntimeDecoder(RBLNPytorchRuntime):
  mandatory_members = ["main_input_name"]
 
- def forward(self, *args: List[torch.Tensor], **kwargs: Dict[str, torch.Tensor]):
- outputs = super().forward(*args, **kwargs)
- return Seq2SeqLMOutput(logits=outputs)
+ def __init__(
+ self,
+ runtime: rebel.Runtime,
+ batch_size: int,
+ dec_max_seq_len: int,
+ **kwargs: Any,
+ ) -> None:
+ super().__init__(runtime, **kwargs)
+ self.batch_size = batch_size
+ self.dec_max_seq_len = dec_max_seq_len
+
+ def forward(
+ self,
+ decoder_input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ decoder_attention_mask: Optional[torch.BoolTensor] = None,
+ cache_position: Optional[torch.Tensor] = None,
+ **kwargs,
+ ) -> Tuple[torch.FloatTensor]:
+ batch_size = decoder_input_ids.shape[0]
+ if batch_size != self.batch_size:
+ raise RuntimeError(
+ f"Batch size mismatch: got {batch_size}, expected {self.batch_size} (compiled batch size)."
+ )
+
+ if batch_size != cache_position.shape[0]:
+ raise RuntimeError(f"Cache position size mismatch: got {cache_position.shape[0]}, expected {batch_size}.")
+
+ for b_idx in range(self.batch_size):
+ decoding_step = cache_position[b_idx].item()
+ if not (0 <= decoding_step < self.dec_max_seq_len):
+ raise ValueError(
+ f"Decoding step {decoding_step} out of bounds for attention mask with shape {self.dec_attn_mask.shape}."
81
+ )
+ decoder_attention_mask[b_idx, : decoding_step + 1] = 1
+
+ lm_logits = super().forward(
+ decoder_input_ids,
+ decoder_attention_mask,
+ attention_mask,
+ cache_position,
+ )
+
+ return Seq2SeqLMOutput(logits=lm_logits)
 
 
  class RBLNModelForSeq2SeqLM(RBLNModel, ABC):
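
The loop in the new RBLNRuntimeDecoder.forward simply opens positions 0..decoding_step of the decoder mask for each batch entry before invoking the compiled decoder. The same update in isolation (plain PyTorch, illustrative sizes):

    import torch

    batch_size, dec_max_seq_len = 2, 8
    decoder_attention_mask = torch.zeros(batch_size, dec_max_seq_len)
    cache_position = torch.tensor([[3], [5]])  # current decoding step per sequence

    for b_idx in range(batch_size):
        decoding_step = cache_position[b_idx].item()
        decoder_attention_mask[b_idx, : decoding_step + 1] = 1  # attend to all tokens generated so far

    print(decoder_attention_mask.tolist())
    # [[1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
    #  [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]]
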
@@ -72,8 +108,15 @@ class RBLNModelForSeq2SeqLM(RBLNModel, ABC):
  auto_model_class = AutoModelForSeq2SeqLM
 
  def __post_init__(self, **kwargs):
- self.encoder = RBLNRuntimeEncoder(runtime=self.model[0], main_input_name="input_ids")
- self.decoder = RBLNRuntimeDecoder(runtime=self.model[1], main_input_name="input_ids")
+ batch_size = self.rbln_config.model_cfg["batch_size"]
+ dec_max_seq_len = self.rbln_config.model_cfg["dec_max_seq_len"]
+ self.encoder = RBLNRuntimeEncoder(
+ runtime=self.model[0],
+ main_input_name="input_ids",
+ )
+ self.decoder = RBLNRuntimeDecoder(
+ runtime=self.model[1], main_input_name="input_ids", batch_size=batch_size, dec_max_seq_len=dec_max_seq_len
+ )
 
  @classmethod
  @torch.inference_mode()
@@ -304,46 +347,24 @@ class RBLNModelForSeq2SeqLM(RBLNModel, ABC):
 
  def forward(
  self,
- input_ids: torch.LongTensor = None,
+ decoder_input_ids: torch.LongTensor = None,
  cache_position: Union[List[torch.Tensor], torch.Tensor] = None,
  **kwargs,
  ) -> Tuple[torch.FloatTensor]:
  # common decoder
  cache_position = torch.full((self.rbln_config.model_cfg["batch_size"], 1), cache_position, dtype=torch.int32)
- logits = self._forward_decoder(input_ids=input_ids, cache_position=cache_position, **kwargs).logits
+ logits = self.decoder(decoder_input_ids=decoder_input_ids, cache_position=cache_position, **kwargs).logits
 
  return Seq2SeqLMOutput(
  logits=logits,
  )
 
- def _forward_decoder(
- self,
- attention_mask: Optional[torch.FloatTensor] = None,
- decoder_input_ids: Optional[torch.LongTensor] = None,
- decoder_attention_mask: Optional[torch.BoolTensor] = None,
- cache_position: Optional[torch.Tensor] = None,
- **kwargs,
- ) -> Tuple[torch.FloatTensor]:
- dec_attention_mask = decoder_attention_mask.clone()
- for b_idx in range(self.rbln_config.model_cfg["batch_size"]):
- dec_attention_mask[b_idx, : cache_position[b_idx] + 1] = 1
-
- decoder_output = self.decoder(
- input_ids=decoder_input_ids,
- attention_mask=dec_attention_mask,
- encoder_attention_mask=attention_mask,
- cache_position=cache_position,
- )
- lm_logits = decoder_output.logits
-
- return Seq2SeqLMOutput(logits=lm_logits)
-
  def _prepare_encoder_decoder_kwargs_for_generation(
  self,
  inputs_tensor: torch.Tensor,
  model_kwargs,
  model_input_name: Optional[str] = None,
- generation_config: Optional[GenerationConfig] = None,
+ generation_config: Optional["GenerationConfig"] = None,
  ) -> Dict[str, Any]:
  # 1. get encoder
  encoder = self.get_encoder()
@@ -373,6 +394,7 @@ class RBLNModelForSeq2SeqLM(RBLNModel, ABC):
  )
 
  # 3. make sure that encoder returns `ModelOutput`
+ model_input_name = model_input_name if model_input_name is not None else self.main_input_name
  encoder_kwargs["return_dict"] = True
  encoder_kwargs["output_hidden_states"] = False
  encoder_kwargs["output_attentions"] = False
@@ -459,7 +459,7 @@ class Seq2SeqSelfAttention(nn.Module):
  ), # Unsqueeze group axis since CustomKernel expects it for group query attention
  past_key_value[0].view(bsz, self.num_heads, 1, -1, self.head_dim),
  past_key_value[1].view(bsz, self.num_heads, 1, -1, self.head_dim),
- cache_position.squeeze(1),
+ cache_position,
  torch.tensor(1.0, dtype=torch.float32), # scale
  )
 
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: optimum-rbln
- Version: 0.2.1a3
+ Version: 0.2.1a5
  Summary: Optimum RBLN is the interface between the Hugging Face Transformers and Diffusers libraries and RBLN accelerators. It provides a set of tools enabling easy model loading and inference on single and multiple rbln device settings for different downstream tasks.
  Project-URL: Homepage, https://rebellions.ai
  Project-URL: Documentation, https://docs.rbln.ai
@@ -1,7 +1,7 @@
  optimum/rbln/__init__.py,sha256=sLCjJu_MLZEKDOwHIlJP4u4GzGZx-1kqHTYGw5B4xDg,6096
- optimum/rbln/__version__.py,sha256=Qa8tLTuiehljsgp_ibSY6aee43cZYh5J_fQ5zMTZ6SA,413
+ optimum/rbln/__version__.py,sha256=J4Eyn4HLzB0UpyosVo-P3LCDkB5knEOS6Nu24mnl5NA,413
  optimum/rbln/modeling.py,sha256=REImAAKO82CqSNABR-9E1jJEsWch9amSOwOOQhFEYLY,8283
- optimum/rbln/modeling_base.py,sha256=_5M8hVySDwCJ6qfeku2_nJAPu_5JLfAUu3HO1bc3ALM,21098
+ optimum/rbln/modeling_base.py,sha256=fQ0bI1Bb6GJquRXftmSSN9K-TXLhFltZJ6C-2w43xMg,21193
  optimum/rbln/modeling_config.py,sha256=7104bxmrvKW4Q6XTruQayiIGl8GHDFmPkJ3cknMIInE,11335
  optimum/rbln/diffusers/__init__.py,sha256=68FTAMpbbMflm8qiSqfM5J2_gFb3iU3fng6AL0TG47A,2913
  optimum/rbln/diffusers/modeling_diffusers.py,sha256=E1x-iOKEJCUB6ml0RgtFEVPPk6J6pqEF-JTEyOZzOyc,14928
@@ -53,8 +53,8 @@ optimum/rbln/transformers/models/bert/modeling_bert.py,sha256=-nv-sgmHkyHQIoQvF8
  optimum/rbln/transformers/models/clip/__init__.py,sha256=ssJqlEt318ti2QaEakGh_tO3Ap1VSPCVF-ymUuvjAJs,698
  optimum/rbln/transformers/models/clip/modeling_clip.py,sha256=E1QfVNq1sTCp7uvuha1ZPfXMwvMTkGV9L4oFdmy1w4g,5724
  optimum/rbln/transformers/models/decoderonly/__init__.py,sha256=pDogsdpJKKB5rqnVFrRjwfhUvOSV-jZ3oARMsqSvOOQ,665
- optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py,sha256=BjQHwoPZfM-KUQzxm4AU-PdmoMgLxnCG6kfSpGjUvrk,36578
- optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py,sha256=mAgRRMGVHvTUjJBDlmUOjNhSNjprKSD7tLeFknrx0Rw,25810
+ optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py,sha256=eT1fbKDL92yGBXtUKA_JibD4kiRPdf3tAFJHP5nlfH4,36646
+ optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py,sha256=2OO8MEgFgcl1VPrQXxqkvmRJJEuFdexwu8XqbHDbR6Y,27609
  optimum/rbln/transformers/models/dpt/__init__.py,sha256=gP1tkR3XMNlHq1GT87ugIVvb2o_1eAUg1JaniXjy1Lw,651
  optimum/rbln/transformers/models/dpt/modeling_dpt.py,sha256=ZsS2SOiqcA4azULB-WFEMQZbgIoOyVUKqVKqrw_tWzA,3430
  optimum/rbln/transformers/models/exaone/__init__.py,sha256=zYH_5tVa8-juEdsOIky7I33WSC3Zuhoq1upI0OHYeVw,859
@@ -70,7 +70,7 @@ optimum/rbln/transformers/models/llama/__init__.py,sha256=jo_j_eIrHYGNEhR5lb6g3r
  optimum/rbln/transformers/models/llama/llama_architecture.py,sha256=S7MCPfyjG5eUqgaS-QNBB0ApUD6wnb5fR0RHq7k7-pA,728
  optimum/rbln/transformers/models/llama/modeling_llama.py,sha256=Z3iony7icoFhRQ11MAuFx9UF03uJCsvJQZ6bxHXlrgk,1530
  optimum/rbln/transformers/models/llava_next/__init__.py,sha256=VLieyWm-UgvuNxw9B38wrL1Jsa09NBDX_ebABmdpTbs,670
- optimum/rbln/transformers/models/llava_next/modeling_llava_next.py,sha256=_8zKsI-Kj4bbsPLnERJqg-0oC6EyAWrmnxvszsAtRaA,26398
+ optimum/rbln/transformers/models/llava_next/modeling_llava_next.py,sha256=w_plsUOzxnhkQBhQeUqW9aJqGCvCvLtsx0XNKYjOprU,26203
  optimum/rbln/transformers/models/midm/__init__.py,sha256=UJSaErsF-z6dZERIS143WTaygffZyzEGqoQ2ZPDiM-c,855
  optimum/rbln/transformers/models/midm/midm_architecture.py,sha256=mueRmMGX6UplZb0C0RFdUOa9lsNH8YJHV6rYrDLOdlQ,5302
  optimum/rbln/transformers/models/midm/modeling_midm.py,sha256=GG25BozEZriAL-OPFGpzOjyDtSFB-NfeiLJTDAqxe20,1734
@@ -84,8 +84,8 @@ optimum/rbln/transformers/models/qwen2/__init__.py,sha256=RAMWc21W_2I6DH9xBjeNxP
  optimum/rbln/transformers/models/qwen2/modeling_qwen2.py,sha256=9-aFDvjMzPNUyGOz0qo33RE18bUFGYZ3Wt_68zb5uJY,1530
  optimum/rbln/transformers/models/qwen2/qwen2_architecture.py,sha256=XlNAMYAcDLohnSAhIFGKOPuCB5XLgzYs5ABWdeQSaZs,720
  optimum/rbln/transformers/models/seq2seq/__init__.py,sha256=EmEMV4rOYqKyruX85d0fR73-b8N6BSD6CPcbpYdBuVk,651
- optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py,sha256=2hkCPvaiyS16zdtUiJKhvpk1qJfsXVLrAQPgAtixCg0,15426
- optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py,sha256=15yoF-wyhcLcK-Z2MOUmyPlkOMNTVOJ013uBepqtpxA,18387
+ optimum/rbln/transformers/models/seq2seq/modeling_seq2seq.py,sha256=HG_-8ufRWIls67imU1547V0bk9FUWC0haOBL7eyRV6k,16365
+ optimum/rbln/transformers/models/seq2seq/seq2seq_architecture.py,sha256=_TL4-vpjM9lfRnQUXRFm3mtVdz_h5B23k01uc_XnW5I,18376
  optimum/rbln/transformers/models/t5/__init__.py,sha256=1skR1RmnG62WTAP3-F5P1x-V_ReFhMyirH3u56vWwvc,675
  optimum/rbln/transformers/models/t5/modeling_t5.py,sha256=MFs-3yYviV1QqSpsTB2GarTEs9wGH5AYofksLQLMBXg,8043
  optimum/rbln/transformers/models/t5/t5_architecture.py,sha256=kkjErS42mW2jv5O_xL7BaKobvvqy7BGmYOowKyHakvI,7189
@@ -108,7 +108,7 @@ optimum/rbln/utils/model_utils.py,sha256=DfD_Z2qvZHqcddXqnzTM1AN8khanj3-DXK2lJvV
  optimum/rbln/utils/runtime_utils.py,sha256=5-DYniyP59nx-mrrbi7AqA77L85b4Cm5oLpaxidSyss,3699
  optimum/rbln/utils/save_utils.py,sha256=hG5uOtYmecSXZuGTvCXsTM-SiyZpr5q3InUGCCq_jzQ,3619
  optimum/rbln/utils/submodule.py,sha256=oZoGrItB8WqY4i-K9WJPlLlcLohc1YGB9OHB8_XZw3A,4071
- optimum_rbln-0.2.1a3.dist-info/METADATA,sha256=umGg7JkKhTcNc5AOyzubqzpoPXnGY1WosDi48dfAROw,5300
- optimum_rbln-0.2.1a3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- optimum_rbln-0.2.1a3.dist-info/licenses/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
- optimum_rbln-0.2.1a3.dist-info/RECORD,,
+ optimum_rbln-0.2.1a5.dist-info/METADATA,sha256=WSMoEbo3z3TMFB1lqbdJsu4ZeVI9AtewXktRjMk6WQw,5300
+ optimum_rbln-0.2.1a5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ optimum_rbln-0.2.1a5.dist-info/licenses/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
+ optimum_rbln-0.2.1a5.dist-info/RECORD,,