optimum-rbln 0.8.2a4__py3-none-any.whl → 0.8.2a6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64)
  1. optimum/rbln/__init__.py +44 -0
  2. optimum/rbln/__version__.py +2 -2
  3. optimum/rbln/configuration_utils.py +4 -0
  4. optimum/rbln/ops/kv_cache_update.py +5 -0
  5. optimum/rbln/ops/linear.py +7 -0
  6. optimum/rbln/transformers/__init__.py +48 -0
  7. optimum/rbln/transformers/modeling_attention_utils.py +252 -0
  8. optimum/rbln/transformers/models/__init__.py +35 -14
  9. optimum/rbln/transformers/models/decoderonly/__init__.py +2 -2
  10. optimum/rbln/transformers/models/decoderonly/configuration_decoderonly.py +214 -45
  11. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +122 -205
  12. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +569 -366
  13. optimum/rbln/transformers/models/gemma/__init__.py +2 -2
  14. optimum/rbln/transformers/models/gemma/configuration_gemma.py +9 -1
  15. optimum/rbln/transformers/models/gemma/modeling_gemma.py +13 -1
  16. optimum/rbln/transformers/models/gemma3/configuration_gemma3.py +7 -5
  17. optimum/rbln/transformers/models/gemma3/modeling_gemma3.py +82 -59
  18. optimum/rbln/transformers/models/gpt2/__init__.py +2 -2
  19. optimum/rbln/transformers/models/gpt2/configuration_gpt2.py +31 -3
  20. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +6 -7
  21. optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +16 -1
  22. optimum/rbln/transformers/models/idefics3/modeling_idefics3.py +2 -2
  23. optimum/rbln/transformers/models/llama/__init__.py +2 -2
  24. optimum/rbln/transformers/models/llama/configuration_llama.py +9 -1
  25. optimum/rbln/transformers/models/llama/modeling_llama.py +13 -1
  26. optimum/rbln/transformers/models/llava/__init__.py +16 -0
  27. optimum/rbln/transformers/models/llava/configuration_llava.py +54 -0
  28. optimum/rbln/transformers/models/llava/modeling_llava.py +379 -0
  29. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +4 -4
  30. optimum/rbln/transformers/models/mistral/__init__.py +2 -2
  31. optimum/rbln/transformers/models/mistral/configuration_mistral.py +9 -1
  32. optimum/rbln/transformers/models/mistral/mistral_architecture.py +1 -1
  33. optimum/rbln/transformers/models/mistral/modeling_mistral.py +26 -3
  34. optimum/rbln/transformers/models/opt/__init__.py +2 -2
  35. optimum/rbln/transformers/models/opt/configuration_opt.py +8 -1
  36. optimum/rbln/transformers/models/opt/modeling_opt.py +41 -1
  37. optimum/rbln/transformers/models/opt/opt_architecture.py +4 -4
  38. optimum/rbln/transformers/models/pegasus/__init__.py +17 -0
  39. optimum/rbln/transformers/models/pegasus/configuration_pegasus.py +34 -0
  40. optimum/rbln/transformers/models/pegasus/modeling_pegasus.py +69 -0
  41. optimum/rbln/transformers/models/pegasus/pegasus_architecture.py +163 -0
  42. optimum/rbln/transformers/models/phi/__init__.py +2 -2
  43. optimum/rbln/transformers/models/phi/configuration_phi.py +9 -1
  44. optimum/rbln/transformers/models/phi/modeling_phi.py +10 -1
  45. optimum/rbln/transformers/models/phi/phi_architecture.py +6 -6
  46. optimum/rbln/transformers/models/pixtral/__init__.py +16 -0
  47. optimum/rbln/transformers/models/pixtral/configuration_pixtral.py +43 -0
  48. optimum/rbln/transformers/models/pixtral/modeling_pixtral.py +318 -0
  49. optimum/rbln/transformers/models/pixtral/pixtral_architecture.py +73 -0
  50. optimum/rbln/transformers/models/qwen2/__init__.py +2 -2
  51. optimum/rbln/transformers/models/qwen2/configuration_qwen2.py +9 -1
  52. optimum/rbln/transformers/models/qwen2/modeling_qwen2.py +27 -1
  53. optimum/rbln/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +3 -3
  54. optimum/rbln/transformers/models/qwen3/configuration_qwen3.py +2 -2
  55. optimum/rbln/transformers/models/qwen3/modeling_qwen3.py +10 -328
  56. optimum/rbln/transformers/models/qwen3/qwen3_architecture.py +0 -241
  57. optimum/rbln/transformers/models/seq2seq/configuration_seq2seq.py +0 -10
  58. optimum/rbln/transformers/models/whisper/configuration_whisper.py +1 -10
  59. optimum/rbln/transformers/models/whisper/modeling_whisper.py +5 -1
  60. optimum/rbln/utils/depreacate_utils.py +16 -0
  61. {optimum_rbln-0.8.2a4.dist-info → optimum_rbln-0.8.2a6.dist-info}/METADATA +1 -1
  62. {optimum_rbln-0.8.2a4.dist-info → optimum_rbln-0.8.2a6.dist-info}/RECORD +64 -51
  63. {optimum_rbln-0.8.2a4.dist-info → optimum_rbln-0.8.2a6.dist-info}/WHEEL +0 -0
  64. {optimum_rbln-0.8.2a4.dist-info → optimum_rbln-0.8.2a6.dist-info}/licenses/LICENSE +0 -0
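
The hunks below appear to come from optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py (entry 12 above): the former RBLNDecoderOnlyModelForCausalLM is split into a new RBLNDecoderOnlyModel base class that runs chunked prefill only and returns raw hidden states, with the causal-LM class layered on top of it. End-user usage of the causal-LM classes stays the same; as a minimal sketch, assuming the usual optimum-style from_pretrained(..., export=True) entry point and rbln_* compile-time keyword arguments (the checkpoint id is only an example, not anything this release ships):

# Hedged usage sketch; exact kwargs may differ between releases.
from optimum.rbln import RBLNLlamaForCausalLM
from transformers import AutoTokenizer

model_id = "meta-llama/Llama-2-7b-hf"  # hypothetical example checkpoint

# Compile for RBLN NPUs; prefill and decode graphs are built at export time.
model = RBLNLlamaForCausalLM.from_pretrained(
    model_id,
    export=True,
    rbln_batch_size=1,
    rbln_max_seq_len=4096,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer("Hello, RBLN!", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
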
@@ -13,7 +13,6 @@
  # limitations under the License.

  import inspect
- import math
  from collections import deque
  from dataclasses import dataclass
  from pathlib import Path
@@ -22,7 +21,8 @@ from typing import TYPE_CHECKING, Any, Callable, Deque, Dict, List, Optional, Tu
  import rebel
  import torch
  from rebel.compile_context import CompileContext
- from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel
+ from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel
+ from transformers.modeling_outputs import BaseModelOutputWithPast
  from transformers.modeling_utils import no_init_weights
  from transformers.utils import ModelOutput

@@ -30,14 +30,15 @@ from ....configuration_utils import RBLNCompileConfig
  from ....modeling import RBLNModel
  from ....utils.logging import get_logger
  from ....utils.runtime_utils import RBLNPytorchRuntime
- from ...utils.rbln_quantization import prepare_model_for_quantization
- from .configuration_decoderonly import RBLNDecoderOnlyModelForCausalLMConfig
- from .decoderonly_architecture import (
- DecoderOnlyWrapper,
+ from ...modeling_attention_utils import (
+ RBLNDecoderOnlyFlashAttentionMixin,
  set_default_values,
  validate_attention_method,
- validate_sliding_window_size,
+ validate_sliding_window,
  )
+ from ...utils.rbln_quantization import prepare_model_for_quantization
+ from .configuration_decoderonly import RBLNDecoderOnlyModelConfig, RBLNDecoderOnlyModelForCausalLMConfig
+ from .decoderonly_architecture import DecoderOnlyWrapper


  logger = get_logger()
@@ -267,7 +268,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):

  attention_mask = self.dec_attn_mask

- if self.rbln_config.cache_impl in ["hybrid", "static"] and self.batch_size < block_tables.shape[0]:
+ if self.rbln_config.use_global_attention and self.batch_size < block_tables.shape[0]:
  block_tables = block_tables[: self.batch_size]

  if attention_mask is not None and self.batch_size < attention_mask.shape[0]:
@@ -283,7 +284,7 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
  position_ids if self.rbln_config.use_position_ids else None,
  )

- return RBLNDecoderOnlyOutput(logits=logits)
+ return RBLNDecoderOnlyForCausalLMOutput(logits=logits)

  def _prepare_prefill_inputs(
  self,
@@ -449,94 +450,64 @@ class RBLNRuntimeModel(RBLNPytorchRuntime):
  self.dec_attn_mask[batch_idx].fill_(0)
  self.dec_attn_mask[batch_idx, :, :, :query_length] = 1

- return RBLNDecoderOnlyOutput(logits=logits, padded_cache_lengths=padded_cache_lengths)
+ return RBLNDecoderOnlyForCausalLMOutput(logits=logits, padded_cache_lengths=padded_cache_lengths)


  @dataclass
- class RBLNDecoderOnlyOutput(ModelOutput):
+ class RBLNDecoderOnlyForCausalLMOutput(ModelOutput):
  logits: torch.FloatTensor = None
  generate_idx: torch.Tensor = None
  padded_cache_lengths: int = None


- class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
+ class RBLNDecoderOnlyModel(RBLNModel, RBLNDecoderOnlyFlashAttentionMixin):
  """
- A base class for decoder-only transformer models optimized for causal language modeling tasks on RBLN devices.
+ A base class for decoder-only transformer models outputting raw hidden-states without any specific head on top.
+ This class is used for RBLN-optimized models that are not causal language models.
  This class serves as the foundation for various decoder-only architectures like GPT, LLaMA, etc.

  The class provides core functionality for:

  1. Converting pre-trained transformer models to RBLN-optimized format
  2. Handling the compilation process for RBLN devices
- 3. Managing inference operations for causal language modeling
+ 3. Managing inference operations for decoder-only architectures

  This class inherits from RBLNModel and implements specific methods required for
- decoder-only architectures and causal language modeling tasks.
+ decoder-only architectures.

  Note:
  - This class is designed to be subclassed by specific model implementations
- (e.g., RBLNLlamaForCausalLM, RBLNGPT2LMHeadModel)
+ (e.g., RBLNLlamaModel, RBLNQwen2Model)
  - Subclasses should implement model-specific conversion logic.
  - The class handles RBLN-specific optimizations automatically during compilation
  """

  main_input_name = "input_ids"
- auto_model_class = AutoModelForCausalLM
+ auto_model_class = AutoModel
  _decoder_wrapper_cls = DecoderOnlyWrapper
  _use_rotary_emb = True

  def __post_init__(self, **kwargs):
- main_input_name = self.main_input_name
-
  if self.rbln_config.use_inputs_embeds:
- main_input_name = "inputs_embeds"
  artifacts = torch.load(self.model_save_dir / self.subfolder / "torch_artifacts.pth", weights_only=False)
  self.embed_tokens = self._create_embedding_layer()
  self.embed_tokens.load_state_dict(artifacts["embed_tokens"])
  else:
  self.embed_tokens = None

- # Initialize shared resources to be used across Runtime instances (prefill and decode phases)
- dec_attn_mask = torch.zeros(
- self.rbln_config.batch_size, 1, 1, self.rbln_config.max_seq_len, dtype=torch.float32
- )
- block_tables = torch.zeros(
- self.rbln_config.batch_size,
- self.rbln_config.max_seq_len // self.rbln_config.kvcache_block_size,
- dtype=torch.int16,
- ).fill_(-1)
- free_block_pool = deque(x for x in range(self.rbln_config.kvcache_num_blocks))
-
- self.prefill_decoder = RBLNRuntimeModel(
- runtime=self.model[0],
- main_input_name=main_input_name,
- embed_tokens=self.embed_tokens,
- phase="prefill",
- batch_size=self.rbln_config.batch_size,
- dec_attn_mask=dec_attn_mask,
- block_tables=block_tables,
- free_block_pool=free_block_pool,
- rbln_config=self.rbln_config,
- vocab_size=self.config.vocab_size,
- )
+ # TODO: add prefill runtime class.
+ self.prefill_decoder = RBLNPytorchRuntime(runtime=self.model[0])

- self.decoders = {}
- for i, batch_size in enumerate(self.rbln_config.decoder_batch_sizes):
- self.decoders[batch_size] = RBLNRuntimeModel(
- runtime=self.model[i + 1],
- main_input_name=main_input_name,
- embed_tokens=self.embed_tokens,
- phase="decode",
- batch_size=batch_size,
- dec_attn_mask=dec_attn_mask,
- block_tables=block_tables,
- free_block_pool=free_block_pool,
- rbln_config=self.rbln_config,
+ # attributes for prefill
+ if self.rbln_config.use_global_attention:
+ self.block_tables = torch.arange(self.rbln_config.kvcache_num_blocks, dtype=torch.int16)
+ if self.rbln_config.use_local_attention:
+ self.local_block_tables = torch.tensor([0], dtype=torch.int16)
+ if self.rbln_config.use_attention_mask:
+ self.causal_mask = 1 - torch.triu(
+ torch.ones(1, 1, self.rbln_config.prefill_chunk_size, self.rbln_config.prefill_chunk_size), diagonal=1
  )

- # NOTE(eunji): Use a decoder whose batch size matches the model's main batch size for compatibility.
- self.decoder = self.decoders[self.rbln_config.batch_size]
-
  @classmethod
  def save_torch_artifacts(
  cls,
@@ -571,79 +542,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
  return self.rbln_config.kvcache_num_blocks

  @classmethod
- def get_quantized_model(
- cls,
- model_id: str,
- config: Optional[PretrainedConfig] = None,
- use_auth_token: Optional[Union[bool, str]] = None,
- revision: Optional[str] = None,
- force_download: bool = False,
- cache_dir: Optional[str] = None,
- subfolder: str = "",
- local_files_only: bool = False,
- trust_remote_code: bool = False,
- **kwargs,
- ):
- kwargs = cls.update_kwargs(kwargs)
-
- if config is None:
- config = AutoConfig.from_pretrained(
- model_id,
- use_auth_token=use_auth_token,
- revision=revision,
- force_download=force_download,
- cache_dir=cache_dir,
- trust_remote_code=trust_remote_code,
- **kwargs,
- )
-
- with no_init_weights():
- model = AutoModelForCausalLM.from_config(config)
-
- model = prepare_model_for_quantization(
- model,
- model_id,
- kwargs.get("num_hidden_layers"),
- use_auth_token=use_auth_token,
- revision=revision,
- cache_dir=cache_dir,
- force_download=force_download,
- local_files_only=local_files_only,
- )
- return model
-
- def __getattr__(self, __name: str) -> Any:
- # Special method to delegate attribute access to the original Huggingface LM class.
- # This method is called when an attribute is not found in the current instance's dictionary.
- # It enables transparent access to the original model's attributes and methods while maintaining
- # proper method binding.
-
- # The method implements a delegation pattern that:
-
- # 1. For methods: Creates a wrapper that properly binds 'self' to method calls
- # 2. For other attributes: Returns them directly from the original class
-
- def redirect(func):
- return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
-
- val = getattr(self.get_hf_class(), __name, None) or getattr(PreTrainedModel, __name)
- if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
- return redirect(val)
- return val
-
- @classmethod
- def get_pytorch_model(
- cls, *args, rbln_config: Optional[RBLNDecoderOnlyModelForCausalLMConfig] = None, **kwargs
- ) -> PreTrainedModel:
- if rbln_config and rbln_config.quantization:
- model = cls.get_quantized_model(*args, **kwargs)
- else:
- model = super().get_pytorch_model(*args, **kwargs)
-
- return model
-
- @classmethod
- def wrap_model_if_needed(cls, model: PreTrainedModel, rbln_config: "RBLNDecoderOnlyModelForCausalLMConfig"):
+ def wrap_model_if_needed(cls, model: PreTrainedModel, rbln_config: "RBLNDecoderOnlyModelConfig"):
  wrapper_cfg = {
  "max_seq_len": rbln_config.max_seq_len,
  "attn_impl": rbln_config.attn_impl,
@@ -660,205 +559,95 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
  return cls._decoder_wrapper_cls(model, **wrapper_cfg).eval()

  @classmethod
- @torch.inference_mode()
- def get_compiled_model(cls, model: PreTrainedModel, rbln_config: RBLNDecoderOnlyModelForCausalLMConfig):
- wrapped_model = cls.wrap_model_if_needed(model, rbln_config)
-
- rbln_compile_configs = rbln_config.compile_cfgs
- prefill_compile_config = rbln_compile_configs[0]
+ def _compile_model(
+ cls,
+ wrapped_model,
+ compile_config,
+ example_inputs,
+ compile_context,
+ rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
+ quantization=None,
+ phase: str = "prefill",
+ ):
+ try:
+ wrapped_model.phase = phase
+ if quantization:
+ quantization.maybe_set_quantization_env()
+ original_linear = torch.nn.functional.linear
+ torch.nn.functional.linear = torch.ops.rbln_custom_ops.linear
+ compiled_model = cls.compile(
+ wrapped_model,
+ compile_config,
+ create_runtimes=rbln_config.create_runtimes,
+ device=rbln_config.device,
+ example_inputs=example_inputs,
+ compile_context=compile_context,
+ )
+ return compiled_model
+ finally:
+ torch.nn.functional.linear = original_linear
+ if quantization:
+ quantization.maybe_reset_quantization_env()

+ @classmethod
+ def _get_compile_context(
+ cls,
+ compile_config: RBLNCompileConfig,
+ example_inputs: List[torch.Tensor],
+ ):
  context = CompileContext(use_weight_sharing=True)

- # Here we use meta tensor, for the memory efficiency.
- meta_tensor_names = [name for name, _, _ in prefill_compile_config.input_info if "past_key_values" in name]
- prefill_example_inputs = prefill_compile_config.get_dummy_inputs(fill=0, meta_tensor_names=meta_tensor_names)
-
  # Mark static tensors (self kv states)
  static_tensors = {}
- for (name, _, _), tensor in zip(prefill_compile_config.input_info, prefill_example_inputs):
+ for (name, _, _), tensor in zip(compile_config.input_info, example_inputs):
  if "past_key_values" in name:
  static_tensors[name] = tensor
  context.mark_static_address(tensor)

- def compile_model(wrapped_model, compile_config, example_inputs, compile_context, quantization):
- try:
- if quantization:
- quantization.maybe_set_quantization_env()
- original_linear = torch.nn.functional.linear
- torch.nn.functional.linear = torch.ops.rbln_custom_ops.linear
- compiled_model = cls.compile(
- wrapped_model,
- compile_config,
- create_runtimes=rbln_config.create_runtimes,
- device=rbln_config.device,
- example_inputs=example_inputs,
- compile_context=compile_context,
- )
- return compiled_model
- finally:
- torch.nn.functional.linear = original_linear
- if quantization:
- quantization.maybe_reset_quantization_env()
-
- wrapped_model.phase = "prefill"
- compiled_prefill = compile_model(
- wrapped_model, prefill_compile_config, prefill_example_inputs, context, rbln_config.quantization
- )
-
- wrapped_model.phase = "decode"
- compiled_models = {"prefill": compiled_prefill}
- for batch_size, dec_compile_config in zip(rbln_config.decoder_batch_sizes, rbln_compile_configs[1:]):
- dec_example_inputs = dec_compile_config.get_dummy_inputs(fill=0, static_tensors=static_tensors)
- compiled_decoder = compile_model(
- wrapped_model, dec_compile_config, dec_example_inputs, context, rbln_config.quantization
- )
- compiled_models[f"decoder_batch_{batch_size}"] = compiled_decoder
-
- # check if the memory is enough to have additional blocks
- required_num_blocks = (rbln_config.max_seq_len // rbln_config.kvcache_block_size) * rbln_config.batch_size
- if rbln_config.kvcache_num_blocks < required_num_blocks:
- cls.maybe_suggest_kvcache_num_blocks(
- compiled_models=compiled_models,
- model_config=model.config,
- rbln_config=rbln_config,
- )
-
- return compiled_models
+ return context, static_tensors

  @classmethod
- def maybe_suggest_kvcache_num_blocks(
+ @torch.inference_mode()
+ def get_compiled_model(
  cls,
- compiled_models: Dict[str, rebel.RBLNCompiledModel],
- model_config: PretrainedConfig,
- rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
- ) -> None:
- # Get the actual memory allocation of each node by key
- alloc_memory_per_node_by_key: Dict[str, List[int]] = compiled_models["prefill"].get_alloc_per_node_by_key()
- alloc_memory_by_key: Dict[str, int] = {
- key: sum(memory_per_node) for key, memory_per_node in alloc_memory_per_node_by_key.items()
- }
- for batch_size in rbln_config.decoder_batch_sizes:
- for key, memory_per_node in (
- compiled_models[f"decoder_batch_{batch_size}"].get_alloc_per_node_by_key().items()
- ):
- alloc_memory_by_key[key] += sum(memory_per_node)
- alloc_memory_by_key.pop("PortRecur", None) # Old compiler's kv-cache Key
- alloc_memory_by_key.pop("DramTensor", None) # kv-cache
- kernel_size = alloc_memory_by_key.pop("Kernel") # model weight
-
- # Get the maximum number of blocks that can be allocated
- buffer = sum(alloc_memory_by_key.values())
- max_num_blocks = cls.get_maximum_num_blocks(
- config=model_config,
- tensor_parallel_size=rbln_config.tensor_parallel_size,
- kvcache_block_size=rbln_config.kvcache_block_size,
- kernel_size=kernel_size,
- buffer=buffer,
+ model: PreTrainedModel,
+ rbln_config: RBLNDecoderOnlyModelConfig,
+ ):
+ wrapped_model = cls.wrap_model_if_needed(model, rbln_config)
+ compile_config = rbln_config.compile_cfgs[0]
+
+ # Here we use meta tensor, for the memory efficiency.
+ meta_tensor_names = [name for name, _, _ in compile_config.input_info if "past_key_values" in name]
+ example_inputs = compile_config.get_dummy_inputs(fill=0, meta_tensor_names=meta_tensor_names)
+ context, _ = cls._get_compile_context(compile_config, example_inputs)
+
+ compiled_model = cls._compile_model(
+ wrapped_model, compile_config, example_inputs, context, rbln_config, rbln_config.quantization, "prefill"
  )
+ compiled_models = {"prefill": compiled_model}

- # Since our estimation logic is not always accurate,
- # users can set `kvcache_num_blocks` to `max_num_blocks`.
- # If the memory is not enough, the model will fail to compile.
- if rbln_config.kvcache_num_blocks < max_num_blocks:
- logger.warning(
- f"Current `kvcache_num_blocks` setting is {rbln_config.kvcache_num_blocks}. "
- "Our analysis indicates that additional memory is available for more blocks. "
- f"Consider increasing `kvcache_num_blocks` to {max_num_blocks} for potentially improved performance. "
- "Please be advised that our memory estimation algorithm has limitations, "
- "and increasing this value may not guarantee successful model compilation."
- )
+ return compiled_models

  @classmethod
- def get_maximum_num_blocks(
- cls,
- config: PretrainedConfig,
- tensor_parallel_size: int,
- kvcache_block_size: int,
- nbits_per_param: Optional[int] = None,
- n_model_params: Optional[int] = None,
- kernel_size: Optional[int] = None,
- buffer: Optional[int] = None,
- num_runtimes: int = 2,
- ) -> int:
- # We are finding max_n_blocks(x) that satisfies the following equation:
-
- # available_dram - kernel_size - buffer
- # - num_layers * 2 * tensor_parallel_size
- # * align_2MB(
- # x
- # * block_size
- # * align_64(head_dim)
- # * math.ceil(num_key_value_heads / tensor_parallel_size)
- # * 2
- # ) > 0
-
- # This inequality can be rewritten as follows:
-
- # a - c * align_2MB(b * x) > 0
- # where
- # a = available_dram - kernel_size - buffer
- # b = block_size * align_64(head_dim) * math.ceil(num_key_value_heads / tensor_parallel_size) * 2
- # c = num_layers * 2 * tensor_parallel_size
-
- # We can rewrite the inequality as follows:
- # k > align_2MB(b*x)
- # where
- # k = a / c
-
- # After that, we can derive the following equation:
- # x = floor(2**21 / b * floor((k - 1) / 2**21))
-
- def align(x: int, nbytes: int) -> int:
- return int(math.ceil(x / nbytes) * nbytes)
-
- def align_2MB(x: int) -> int:
- return align(x, 2**21)
-
- num_attention_heads = getattr(config, "n_head", None) or getattr(config, "num_attention_heads")
- num_layers = getattr(config, "n_layer", None) or getattr(config, "num_hidden_layers")
- head_dim = getattr(config, "head_dim", None) or config.hidden_size // num_attention_heads
- vocab_size = config.vocab_size
- hidden_size = getattr(config, "n_embd", None) or getattr(config, "hidden_size")
- num_key_value_heads = getattr(config, "num_key_value_heads", None) or num_attention_heads
-
- # TODO(jongho): Update if target npu is REBEL.
- ATOM_DRAM_NBYTES = 16 * 2**30
- ATOM_SYS_DRAM_NBYTES = 288 * 2**20
- available_dram = tensor_parallel_size * (ATOM_DRAM_NBYTES - ATOM_SYS_DRAM_NBYTES)
-
- if kernel_size is None:
- if n_model_params is None:
- raise ValueError("`n_model_params` should be specified to estimate the kernel memory.")
- # Get estimated kernel size (approximated)
- lm_heads_params = align(vocab_size, 64) * hidden_size
- lm_heads_nbytes = (
- align_2MB(lm_heads_params * nbits_per_param // 8 / tensor_parallel_size) * tensor_parallel_size
- )
- params = n_model_params - lm_heads_params
- layer_nbytes = (
- align_2MB(params * nbits_per_param // 8 / num_layers / tensor_parallel_size)
- * num_layers
- * tensor_parallel_size
- )
- kernel_size = layer_nbytes + lm_heads_nbytes
- elif n_model_params is not None:
- raise ValueError("Both `n_model_params` and `kernel_size` cannot be specified.")
-
- available_dram -= kernel_size
+ def get_quantized_model(
+ cls, *args, rbln_config: Optional[RBLNDecoderOnlyModelConfig] = None, **kwargs
+ ) -> PreTrainedModel:
+ raise NotImplementedError

- if buffer is None:
- # TODO: Accurate buffer estimation
- buffer_per_runtime_per_core = 2**28 # 256MB per runtime
- buffer_per_core = buffer_per_runtime_per_core * num_runtimes # 1 for prefill, 1 for decoder
- buffer = buffer_per_core * tensor_parallel_size
- available_dram -= buffer
+ @classmethod
+ def get_pytorch_model(
+ cls, *args, rbln_config: Optional[RBLNDecoderOnlyModelConfig] = None, **kwargs
+ ) -> PreTrainedModel:
+ if rbln_config and rbln_config.quantization:
+ model = cls.get_quantized_model(*args, **kwargs)
+ else:
+ model = super().get_pytorch_model(*args, **kwargs)

- b = kvcache_block_size * align(head_dim, 64) * math.ceil(num_key_value_heads / tensor_parallel_size) * 2
- c = num_layers * 2 * tensor_parallel_size
- k = available_dram / c
- max_n_blocks = math.floor(2**21 / b * math.floor((k - 1) / 2**21))
+ return model

- return max_n_blocks
+ @classmethod
+ def use_query_position(cls, use_local_attention: bool, is_prefill: bool = True):
+ return use_local_attention

  @classmethod
  def get_input_info(
@@ -868,13 +657,12 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
  rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
  model_config: PretrainedConfig,
  ):
- is_prefill: bool = query_length > 1
  num_attention_heads = getattr(model_config, "n_head", None) or getattr(model_config, "num_attention_heads")
  num_key_value_heads = getattr(model_config, "num_key_value_heads", None) or num_attention_heads
  num_hidden_layers = getattr(model_config, "n_layer", None) or getattr(model_config, "num_hidden_layers")
  hidden_size = getattr(model_config, "n_embd", None) or getattr(model_config, "hidden_size")
  head_dim = getattr(model_config, "head_dim", None) or hidden_size // num_attention_heads
- local_kvcache_num_blocks = max(rbln_config.decoder_batch_sizes)
+ is_prefill = query_length > 1

  # 1. main input
  if rbln_config.use_inputs_embeds:
@@ -893,16 +681,16 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
  ]

  # 3. block_tables
- if rbln_config.cache_impl in ["static", "hybrid"]:
+ if rbln_config.use_global_attention:
  max_block_cnt = rbln_config.max_seq_len // rbln_config.kvcache_block_size
  input_info.extend(
  [("block_tables", [max_block_cnt] if is_prefill else [batch_size, max_block_cnt], "int16")]
  )
- if rbln_config.cache_impl in ["hybrid", "sliding_window"]:
+ if rbln_config.use_local_attention:
  input_info.extend([("local_block_tables", [1] if is_prefill else [batch_size, 1], "int16")])

- # 4. query_position
- if is_prefill:
+ # 4. query_position for sliding window attention
+ if cls.use_query_position(rbln_config.use_local_attention, is_prefill):
  input_info.extend([("query_position", [], "int16")])

  # 5. attention_mask & position_ids
@@ -924,7 +712,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
  rbln_config.kvcache_block_size,
  head_dim,
  ]
- local_kvcache_shape = [local_kvcache_num_blocks, num_key_value_heads, rbln_config.sliding_window, head_dim]
+ local_kvcache_shape = [rbln_config.batch_size, num_key_value_heads, rbln_config.sliding_window, head_dim]
  input_info.extend(
  [
  (
@@ -971,13 +759,38 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
  # ```

  # Returns:
- # RBLNDecoderOnlyModelForCausalLMConfig: The updated RBLN model configuration.
+ # RBLNDecoderOnlyModelConfig: The updated RBLN model configuration.

  raise NotImplementedError(
  "Subclasses must implement _update_sliding_window_config to configure sliding window attention settings. "
  "See method docstring for required configuration details."
  )

+ @classmethod
+ def _update_attention_config(
+ cls, model: PreTrainedModel, model_config: PretrainedConfig, rbln_config: RBLNDecoderOnlyModelForCausalLMConfig
+ ):
+ rbln_config.attn_impl, rbln_config.kvcache_partition_len, rbln_config.kvcache_block_size = set_default_values(
+ attn_impl=rbln_config.attn_impl,
+ kvcache_partition_len=rbln_config.kvcache_partition_len,
+ kvcache_block_size=rbln_config.kvcache_block_size,
+ max_seq_len=rbln_config.max_seq_len,
+ )
+
+ validate_attention_method(
+ attn_impl=rbln_config.attn_impl,
+ kvcache_partition_len=rbln_config.kvcache_partition_len,
+ kvcache_block_size=rbln_config.kvcache_block_size,
+ max_seq_len=rbln_config.max_seq_len,
+ )
+
+ if rbln_config.kvcache_num_blocks is None:
+ rbln_config.kvcache_num_blocks = (
+ rbln_config.max_seq_len // rbln_config.kvcache_block_size
+ ) * rbln_config.batch_size
+
+ return rbln_config
+
  @classmethod
  def _update_rbln_config(
  cls,
@@ -998,8 +811,384 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
  ):
  rbln_config = cls._update_sliding_window_config(model_config, rbln_config)
  if rbln_config.sliding_window is not None:
- validate_sliding_window_size(rbln_config.sliding_window, rbln_config.prefill_chunk_size)
+ validate_sliding_window(rbln_config)
+
+ rbln_config = cls._update_attention_config(model, model_config, rbln_config)
+
+ prefill_input_info = cls.get_input_info(
+ batch_size=1,
+ query_length=rbln_config.prefill_chunk_size,
+ rbln_config=rbln_config,
+ model_config=model_config,
+ )
+
+ prefill_compile_config = RBLNCompileConfig(compiled_model_name="prefill", input_info=prefill_input_info)
+ rbln_config.set_compile_cfgs([prefill_compile_config])

+ return rbln_config
+
+ @classmethod
+ def _create_runtimes(
+ cls,
+ compiled_models: List[rebel.RBLNCompiledModel],
+ rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
+ ) -> List[rebel.Runtime]:
+ expected_model_names = [
+ "prefill",
+ ]
+ if any(model_name not in rbln_config.device_map for model_name in expected_model_names):
+ cls._raise_missing_compiled_file_error(expected_model_names)
+
+ return [
+ rebel.Runtime(
+ compiled_models[0],
+ tensor_type="pt",
+ device=rbln_config.device_map["prefill"],
+ activate_profiler=rbln_config.activate_profiler,
+ ),
+ ]
+
+ def _preprocess_chunked_prefill(
+ self,
+ inputs: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_embed: Optional[torch.Tensor] = None,
+ ):
+ # valid sequence length of inputs_embeds
+ query_length = inputs.shape[1] if attention_mask is None else torch.sum(attention_mask.view(-1)).item()
+
+ # extract valid inputs
+ inputs = inputs[:, attention_mask.bool()] if attention_mask is not None else inputs
+
+ if inputs.dim() == 2 and self.rbln_config.use_inputs_embeds:
+ inputs = self.get_input_embeddings()(inputs)
+
+ if position_embed is not None:
+ position_embed = (
+ position_embed[:, :, :, attention_mask.bool(), :] if attention_mask is not None else position_embed
+ )
+
+ # padding for chunked prefill
+ padding_size = (
+ self.rbln_config.prefill_chunk_size - (query_length % self.rbln_config.prefill_chunk_size)
+ ) % self.rbln_config.prefill_chunk_size
+ padded_len = query_length + padding_size
+
+ inputs = (
+ torch.nn.functional.pad(inputs, (0, padding_size))
+ if not self.rbln_config.use_inputs_embeds
+ else torch.nn.functional.pad(inputs, (0, 0, 0, padding_size))
+ )
+ position_embed = (
+ None if position_embed is None else torch.nn.functional.pad(position_embed, (0, 0, 0, padding_size))
+ )
+ cache_position = torch.arange(padded_len, dtype=torch.int32).unsqueeze(0)
+
+ chunked_attention_mask = (
+ torch.zeros(1, 1, self.rbln_config.prefill_chunk_size, self.rbln_config.max_seq_len, dtype=torch.float32)
+ if self.rbln_config.use_attention_mask
+ else None
+ )
+
+ return inputs, position_embed, cache_position, query_length, chunked_attention_mask
+
+ def _chunked_prefill_forward(
+ self,
+ inputs: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_embed: Optional[torch.Tensor] = None,
+ ):
+ padded_input, padded_position_embed, cache_position, query_length, chunked_attention_mask = (
+ self._preprocess_chunked_prefill(inputs, attention_mask, position_embed)
+ )
+
+ # chunked prefill
+ last_hidden_states = []
+ for step in range(0, query_length, self.rbln_config.prefill_chunk_size):
+ # Extract the current chunk of inputs and cache positions
+ input_chunk = padded_input[:, step : step + self.rbln_config.prefill_chunk_size]
+ cache_pos_chunk = cache_position[:, step : step + self.rbln_config.prefill_chunk_size]
+
+ valid_length = (
+ self.rbln_config.prefill_chunk_size
+ if (step + self.rbln_config.prefill_chunk_size) <= query_length
+ else query_length - step
+ )
+ if self.rbln_config.use_local_attention:
+ query_position = torch.tensor(valid_length - 1, dtype=torch.int16)
+ else:
+ query_position = None
+
+ if self.rbln_config.use_attention_mask:
+ if step > 0:
+ chunked_attention_mask[:, :, :, :step] = 1
+ chunked_attention_mask[:, :, :, step : step + self.rbln_config.prefill_chunk_size] = self.causal_mask
+
+ # Forward pass for the current chunk
+ last_hidden_states_chunk = self.prefill_decoder(
+ input_ids=input_chunk if not self.rbln_config.use_inputs_embeds else None,
+ inputs_embeds=input_chunk if self.rbln_config.use_inputs_embeds else None,
+ cache_position=cache_pos_chunk,
+ block_tables=self.block_tables if self.rbln_config.use_global_attention else None,
+ local_block_tables=self.local_block_tables if self.rbln_config.use_local_attention else None,
+ query_position=query_position,
+ attention_mask=chunked_attention_mask,
+ position_emb=padded_position_embed,
+ )
+ last_hidden_states.append(last_hidden_states_chunk)
+ last_hidden_states = torch.concat(last_hidden_states, dim=-2)[:, :query_length]
+
+ return self._postprocess_chunked_prefill(last_hidden_states, attention_mask)
+
+ def _postprocess_chunked_prefill(
+ self, last_hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None
+ ):
+ # index copy for attention mask
+ if attention_mask is not None:
+ new_last_hidden_states = torch.full(
+ (1, attention_mask.shape[-1], last_hidden_states.shape[-1]),
+ fill_value=1e-10,
+ dtype=last_hidden_states.dtype,
+ )
+ mask_indices = torch.nonzero(attention_mask, as_tuple=True)[0]
+ new_last_hidden_states.index_copy_(dim=-2, index=mask_indices, source=last_hidden_states)
+ else:
+ new_last_hidden_states = last_hidden_states
+ return new_last_hidden_states
+
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.LongTensor] = None,
+ position_embed: Optional[torch.Tensor] = None,
+ **kwargs,
+ ) -> Tuple[torch.FloatTensor]:
+ inputs = inputs_embeds if inputs_embeds is not None else input_ids
+ batch_size = inputs.shape[0]
+ all_last_hidden_states = []
+ for b_idx in range(batch_size):
+ last_hidden_states = self._chunked_prefill_forward(
+ inputs[b_idx : b_idx + 1],
+ attention_mask[b_idx] if attention_mask is not None else None,
+ position_embed[b_idx : b_idx + 1] if position_embed is not None else None,
+ )
+ all_last_hidden_states.append(last_hidden_states)
+
+ last_hidden_states = torch.concat(all_last_hidden_states, dim=0)
+ return BaseModelOutputWithPast(last_hidden_state=last_hidden_states)
+
+
+ class RBLNDecoderOnlyModelForCausalLM(RBLNDecoderOnlyModel):
+ """
+ A base class for decoder-only transformer models optimized for causal language modeling tasks on RBLN devices.
+ This class serves as the foundation for various decoder-only architectures like GPT, LLaMA, etc.
+
+ The class provides core functionality for:
+
+ 1. Converting pre-trained transformer models to RBLN-optimized format
+ 2. Handling the compilation process for RBLN devices
+ 3. Managing inference operations for causal language modeling
+
+ This class inherits from RBLNModel and implements specific methods required for
+ decoder-only architectures and causal language modeling tasks.
+
+ Note:
+ - This class is designed to be subclassed by specific model implementations
+ (e.g., RBLNLlamaForCausalLM, RBLNGPT2LMHeadModel)
+ - Subclasses should implement model-specific conversion logic.
+ - The class handles RBLN-specific optimizations automatically during compilation
+ """
+
+ auto_model_class = AutoModelForCausalLM
+
+ def __post_init__(self, **kwargs):
+ main_input_name = self.main_input_name
+
+ if self.rbln_config.use_inputs_embeds:
+ main_input_name = "inputs_embeds"
+ artifacts = torch.load(self.model_save_dir / self.subfolder / "torch_artifacts.pth", weights_only=False)
+ self.embed_tokens = self._create_embedding_layer()
+ self.embed_tokens.load_state_dict(artifacts["embed_tokens"])
+ else:
+ self.embed_tokens = None
+
+ # Initialize shared resources to be used across Runtime instances (prefill and decode phases)
+ dec_attn_mask = torch.zeros(
+ self.rbln_config.batch_size, 1, 1, self.rbln_config.max_seq_len, dtype=torch.float32
+ )
+ block_tables = torch.zeros(
+ self.rbln_config.batch_size,
+ self.rbln_config.max_seq_len // self.rbln_config.kvcache_block_size,
+ dtype=torch.int16,
+ ).fill_(-1)
+ free_block_pool = deque(x for x in range(self.rbln_config.kvcache_num_blocks))
+
+ self.prefill_decoder = RBLNRuntimeModel(
+ runtime=self.model[0],
+ main_input_name=main_input_name,
+ embed_tokens=self.embed_tokens,
+ phase="prefill",
+ batch_size=self.rbln_config.batch_size,
+ dec_attn_mask=dec_attn_mask,
+ block_tables=block_tables,
+ free_block_pool=free_block_pool,
+ rbln_config=self.rbln_config,
+ vocab_size=self.config.vocab_size,
+ )
+
+ if self.can_generate():
+ self.decoders = {}
+ for i, batch_size in enumerate(self.rbln_config.decoder_batch_sizes):
+ self.decoders[batch_size] = RBLNRuntimeModel(
+ runtime=self.model[i + 1],
+ main_input_name=main_input_name,
+ embed_tokens=self.embed_tokens,
+ phase="decode",
+ batch_size=batch_size,
+ dec_attn_mask=dec_attn_mask,
+ block_tables=block_tables,
+ free_block_pool=free_block_pool,
+ rbln_config=self.rbln_config,
+ )
+
+ # NOTE(eunji): Use a decoder whose batch size matches the model's main batch size for compatibility.
+ self.decoder = self.decoders[self.rbln_config.batch_size]
+
+ @classmethod
+ def get_quantized_model(
+ cls,
+ model_id: str,
+ config: Optional[PretrainedConfig] = None,
+ use_auth_token: Optional[Union[bool, str]] = None,
+ revision: Optional[str] = None,
+ force_download: bool = False,
+ cache_dir: Optional[str] = None,
+ subfolder: str = "",
+ local_files_only: bool = False,
+ trust_remote_code: bool = False,
+ **kwargs,
+ ):
+ kwargs = cls.update_kwargs(kwargs)
+
+ if config is None:
+ config = AutoConfig.from_pretrained(
+ model_id,
+ use_auth_token=use_auth_token,
+ revision=revision,
+ force_download=force_download,
+ cache_dir=cache_dir,
+ trust_remote_code=trust_remote_code,
+ **kwargs,
+ )
+
+ with no_init_weights():
+ model = AutoModelForCausalLM.from_config(config)
+
+ model = prepare_model_for_quantization(
+ model,
+ model_id,
+ kwargs.get("num_hidden_layers"),
+ use_auth_token=use_auth_token,
+ revision=revision,
+ cache_dir=cache_dir,
+ force_download=force_download,
+ local_files_only=local_files_only,
+ )
+ return model
+
+ def __getattr__(self, __name: str) -> Any:
+ # Special method to delegate attribute access to the original Huggingface LM class.
+ # This method is called when an attribute is not found in the current instance's dictionary.
+ # It enables transparent access to the original model's attributes and methods while maintaining
+ # proper method binding.
+
+ # The method implements a delegation pattern that:
+
+ # 1. For methods: Creates a wrapper that properly binds 'self' to method calls
+ # 2. For other attributes: Returns them directly from the original class
+
+ def redirect(func):
+ return lambda *pargs, **kwargs: func(self, *pargs, **kwargs)
+
+ val = getattr(self.get_hf_class(), __name, None) or getattr(PreTrainedModel, __name)
+ if isinstance(val, Callable) and "self" in set(inspect.signature(val).parameters):
+ return redirect(val)
+ return val
+
+ @classmethod
+ def wrap_model_if_needed(cls, model: PreTrainedModel, rbln_config: "RBLNDecoderOnlyModelForCausalLMConfig"):
+ wrapper_cfg = {
+ "max_seq_len": rbln_config.max_seq_len,
+ "attn_impl": rbln_config.attn_impl,
+ "kvcache_partition_len": rbln_config.kvcache_partition_len,
+ "kvcache_block_size": rbln_config.kvcache_block_size,
+ "use_rotary_emb": cls._use_rotary_emb,
+ "use_attention_mask": rbln_config.use_attention_mask,
+ "use_position_ids": rbln_config.use_position_ids,
+ "use_inputs_embeds": rbln_config.use_inputs_embeds,
+ "cache_impl": rbln_config.cache_impl,
+ "sliding_window": rbln_config.sliding_window,
+ "sliding_window_layers": rbln_config.sliding_window_layers,
+ }
+ return cls._decoder_wrapper_cls(model, **wrapper_cfg).eval()
+
+ @classmethod
+ @torch.inference_mode()
+ def get_compiled_model(cls, model: PreTrainedModel, rbln_config: RBLNDecoderOnlyModelForCausalLMConfig):
+ wrapped_model = cls.wrap_model_if_needed(model, rbln_config)
+ prefill_compile_config = rbln_config.compile_cfgs[0]
+
+ # Here we use meta tensor, for the memory efficiency.
+ meta_tensor_names = [name for name, _, _ in prefill_compile_config.input_info if "past_key_values" in name]
+ prefill_example_inputs = prefill_compile_config.get_dummy_inputs(fill=0, meta_tensor_names=meta_tensor_names)
+ context, static_tensors = cls._get_compile_context(prefill_compile_config, prefill_example_inputs)
+
+ compiled_models = {}
+ compiled_models["prefill"] = cls._compile_model(
+ wrapped_model,
+ prefill_compile_config,
+ prefill_example_inputs,
+ context,
+ rbln_config,
+ rbln_config.quantization,
+ phase="prefill",
+ )
+
+ if rbln_config.can_generate:
+ wrapped_model.phase = "decode"
+ for batch_size, dec_compile_config in zip(rbln_config.decoder_batch_sizes, rbln_config.compile_cfgs[1:]):
+ dec_example_inputs = dec_compile_config.get_dummy_inputs(fill=0, static_tensors=static_tensors)
+ compiled_decoder = cls._compile_model(
+ wrapped_model,
+ dec_compile_config,
+ dec_example_inputs,
+ context,
+ rbln_config,
+ rbln_config.quantization,
+ phase="decode",
+ )
+ compiled_models[f"decoder_batch_{batch_size}"] = compiled_decoder
+
+ # check if the memory is enough to have additional blocks
+ required_num_blocks = (rbln_config.max_seq_len // rbln_config.kvcache_block_size) * rbln_config.batch_size
+ if rbln_config.kvcache_num_blocks < required_num_blocks:
+ cls.maybe_suggest_kvcache_num_blocks(
+ compiled_models=compiled_models,
+ model_config=model.config,
+ rbln_config=rbln_config,
+ )
+
+ return compiled_models
+
+ @classmethod
+ def use_query_position(cls, use_local_attention: bool, is_prefill: bool = True):
+ return is_prefill
+
+ @classmethod
+ def _update_attention_config(
+ cls, model: PreTrainedModel, model_config: PretrainedConfig, rbln_config: RBLNDecoderOnlyModelForCausalLMConfig
+ ):
  rbln_config.attn_impl, rbln_config.kvcache_partition_len, rbln_config.kvcache_block_size = set_default_values(
  attn_impl=rbln_config.attn_impl,
  kvcache_partition_len=rbln_config.kvcache_partition_len,
@@ -1024,13 +1213,13 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
  kvcache_block_size=rbln_config.kvcache_block_size,
  nbits_per_param=16 if not rbln_config.quantization else 4, # TODO(jongho): FIX Ad-hoc
  n_model_params=sum(p.numel() for p in model.parameters()),
- num_runtimes=1 + len(rbln_config.decoder_batch_sizes),
+ num_runtimes=1 if not rbln_config.can_generate else 1 + len(rbln_config.decoder_batch_sizes),
  )

  max_num_blocks = min(max_num_blocks, estimated_max_num_blocks)

  flash_min_blocks = rbln_config.max_seq_len // rbln_config.kvcache_block_size + 1
- if max_num_blocks < flash_min_blocks:
+ if rbln_config.batch_size > 1 and max_num_blocks < flash_min_blocks:
  max_num_blocks = flash_min_blocks

  if max_num_blocks < rbln_config.batch_size:
@@ -1049,27 +1238,30 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
  )
  logger.info(f"[KVCache] Compiling with num_blocks: {rbln_config.kvcache_num_blocks}")

- prefill_input_info = cls.get_input_info(
- batch_size=1,
- query_length=rbln_config.prefill_chunk_size,
- rbln_config=rbln_config,
- model_config=model_config,
- )
-
- prefill_compile_config = RBLNCompileConfig(compiled_model_name="prefill", input_info=prefill_input_info)
+ return rbln_config

- dec_compile_configs = []
- for batch_size in rbln_config.decoder_batch_sizes:
- dec_input_info = cls.get_input_info(
- batch_size=batch_size,
- query_length=1,
- rbln_config=rbln_config,
- model_config=model_config,
- )
- dec_compile_configs.append(
- RBLNCompileConfig(compiled_model_name=f"decoder_batch_{batch_size}", input_info=dec_input_info)
- )
- rbln_config.set_compile_cfgs([prefill_compile_config, *dec_compile_configs])
+ @classmethod
+ def _update_rbln_config(
+ cls,
+ preprocessors: Optional[Union["AutoFeatureExtractor", "AutoProcessor", "AutoTokenizer"]] = None,
+ model: Optional[PreTrainedModel] = None,
+ model_config: Optional[PretrainedConfig] = None,
+ rbln_config: Optional[RBLNDecoderOnlyModelForCausalLMConfig] = None,
+ ) -> RBLNDecoderOnlyModelForCausalLMConfig:
+ rbln_config = super()._update_rbln_config(preprocessors, model, model_config, rbln_config)
+ if rbln_config.can_generate:
+ compile_configs = rbln_config.compile_cfgs
+ for batch_size in rbln_config.decoder_batch_sizes:
+ dec_input_info = cls.get_input_info(
+ batch_size=batch_size,
+ query_length=1,
+ rbln_config=rbln_config,
+ model_config=model_config,
+ )
+ compile_configs.append(
+ RBLNCompileConfig(compiled_model_name=f"decoder_batch_{batch_size}", input_info=dec_input_info)
+ )
+ rbln_config.set_compile_cfgs(compile_configs)

  return rbln_config

@@ -1079,38 +1271,45 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
  compiled_models: List[rebel.RBLNCompiledModel],
  rbln_config: RBLNDecoderOnlyModelForCausalLMConfig,
  ) -> List[rebel.Runtime]:
- expected_model_names = [
- "prefill",
- *[f"decoder_batch_{batch_size}" for batch_size in rbln_config.decoder_batch_sizes],
- ]
+ expected_model_names = ["prefill"]
+ if rbln_config.can_generate:
+ expected_model_names.extend(
+ [f"decoder_batch_{batch_size}" for batch_size in rbln_config.decoder_batch_sizes]
+ )
  if any(model_name not in rbln_config.device_map for model_name in expected_model_names):
  cls._raise_missing_compiled_file_error(expected_model_names)

- return [
+ ret_val = [
  rebel.Runtime(
  compiled_models[0],
  tensor_type="pt",
  device=rbln_config.device_map["prefill"],
  activate_profiler=rbln_config.activate_profiler,
  timeout=rbln_config.timeout,
- ),
- *[
- rebel.Runtime(
- compiled_models[i + 1],
- tensor_type="pt",
- device=rbln_config.device_map[f"decoder_batch_{batch_size}"],
- activate_profiler=rbln_config.activate_profiler,
- timeout=rbln_config.timeout,
- )
- for i, batch_size in enumerate(rbln_config.decoder_batch_sizes)
- ],
+ )
  ]
+ if rbln_config.can_generate:
+ ret_val.extend(
+ [
+ rebel.Runtime(
+ compiled_models[i + 1],
+ tensor_type="pt",
+ device=rbln_config.device_map[f"decoder_batch_{batch_size}"],
+ activate_profiler=rbln_config.activate_profiler,
+ timeout=rbln_config.timeout,
+ )
+ for i, batch_size in enumerate(rbln_config.decoder_batch_sizes)
+ ]
+ )
+ return ret_val

  def get_decoder(self):
+ if not self.can_generate():
+ raise ValueError("Decode stage is not supported in this model.")
  return self.decoder

  def can_generate(self):
- return True
+ return self.rbln_config.can_generate

  def _reorder_cache(self, past_key_values, beam_idx):
  raise NotImplementedError
@@ -1167,7 +1366,7 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):

  def _update_model_kwargs_for_generation(
  self,
- outputs: RBLNDecoderOnlyOutput,
+ outputs: RBLNDecoderOnlyForCausalLMOutput,
  model_kwargs: Dict[str, Any],
  **kwargs,
  ) -> Dict[str, Any]:
@@ -1195,15 +1394,19 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
  # A for-loop ensures synchronization with the HuggingFace generate API.
  # The decoder stage operates as usual, processing inputs in batch mode.

+ # for only use forward
+ if generate_idx is None:
+ generate_idx = (
+ attention_mask.sum(dim=-1, keepdim=True).int()
+ if attention_mask is not None
+ else torch.full((input_ids.shape[0], 1), input_ids.shape[1], dtype=torch.int32)
+ )
+ padded_cache_lengths = torch.zeros_like(generate_idx)
+
  # Prefll
  if cache_position is None:
  logits = []
  inputs = inputs_embeds if inputs_embeds is not None else input_ids
- # for only use forward
- if generate_idx is None:
- generate_idx = attention_mask.sum(dim=-1, keepdim=True).int()
- if padded_cache_lengths is None:
- padded_cache_lengths = torch.zeros_like(generate_idx)
  batch_size = inputs.shape[0]
  for b_idx in range(batch_size):
  cache_position = torch.arange(0, generate_idx[b_idx].item(), dtype=torch.int32).unsqueeze(0)
@@ -1238,6 +1441,6 @@ class RBLNDecoderOnlyModelForCausalLM(RBLNModel):
  if not return_dict:
  return logits, generate_idx, padded_cache_lengths
  else:
- return RBLNDecoderOnlyOutput(
+ return RBLNDecoderOnlyForCausalLMOutput(
  logits=logits, generate_idx=generate_idx, padded_cache_lengths=padded_cache_lengths
  )
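
For reference, the chunked-prefill path added in RBLNDecoderOnlyModel pads each sequence up to a multiple of prefill_chunk_size before iterating chunk by chunk, mirroring _preprocess_chunked_prefill above. A standalone sketch of that padding arithmetic (the chunk size and sample lengths are illustrative values, not package defaults):

def padded_prefill_length(query_length: int, prefill_chunk_size: int) -> int:
    # Round the valid query length up to the next multiple of the chunk size,
    # mirroring: (chunk - (q % chunk)) % chunk
    padding_size = (prefill_chunk_size - (query_length % prefill_chunk_size)) % prefill_chunk_size
    return query_length + padding_size

assert padded_prefill_length(100, 128) == 128  # one partially filled chunk
assert padded_prefill_length(128, 128) == 128  # already aligned, no padding
assert padded_prefill_length(300, 128) == 384  # rounded up to three chunks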