optimum-rbln 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to their public registry, and is provided for informational purposes only.
Files changed (61)
  1. optimum/rbln/__init__.py +37 -2
  2. optimum/rbln/__version__.py +1 -1
  3. optimum/rbln/diffusers/models/autoencoder_kl.py +36 -29
  4. optimum/rbln/diffusers/models/controlnet.py +56 -40
  5. optimum/rbln/diffusers/models/unet_2d_condition.py +40 -28
  6. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +22 -15
  7. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +22 -15
  8. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +23 -17
  9. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +24 -18
  10. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +22 -11
  11. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +22 -11
  12. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +24 -14
  13. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +24 -14
  14. optimum/rbln/modeling_alias.py +3 -3
  15. optimum/rbln/modeling_base.py +471 -231
  16. optimum/rbln/modeling_config.py +152 -77
  17. optimum/rbln/modeling_seq2seq.py +166 -77
  18. optimum/rbln/transformers/__init__.py +35 -1
  19. optimum/rbln/transformers/models/__init__.py +20 -1
  20. optimum/rbln/transformers/models/auto/__init__.py +14 -0
  21. optimum/rbln/transformers/models/auto/auto_factory.py +84 -0
  22. optimum/rbln/transformers/models/auto/modeling_auto.py +94 -0
  23. optimum/rbln/transformers/models/bart/__init__.py +1 -0
  24. optimum/rbln/transformers/models/bart/bart_architecture.py +189 -50
  25. optimum/rbln/transformers/models/bart/modeling_bart.py +106 -0
  26. optimum/rbln/transformers/models/bert/__init__.py +24 -0
  27. optimum/rbln/transformers/models/bert/modeling_bert.py +102 -0
  28. optimum/rbln/transformers/models/clip/__init__.py +1 -1
  29. optimum/rbln/transformers/models/clip/modeling_clip.py +127 -25
  30. optimum/rbln/transformers/models/decoderonly/decoderonly_architecture.py +28 -4
  31. optimum/rbln/transformers/models/decoderonly/modeling_decoderonly.py +302 -115
  32. optimum/rbln/transformers/models/dpt/modeling_dpt.py +21 -7
  33. optimum/rbln/transformers/models/gemma/modeling_gemma.py +1 -1
  34. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +4 -1
  35. optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +1 -1
  36. optimum/rbln/transformers/models/llama/modeling_llama.py +1 -1
  37. optimum/rbln/transformers/models/llava_next/__init__.py +24 -0
  38. optimum/rbln/transformers/models/llava_next/modeling_llava_next.py +666 -0
  39. optimum/rbln/transformers/models/midm/midm_architecture.py +5 -1
  40. optimum/rbln/transformers/models/midm/modeling_midm.py +1 -1
  41. optimum/rbln/transformers/models/mistral/modeling_mistral.py +1 -1
  42. optimum/rbln/transformers/models/phi/__init__.py +24 -0
  43. optimum/rbln/transformers/models/phi/modeling_phi.py +69 -0
  44. optimum/rbln/transformers/models/phi/phi_architecture.py +406 -0
  45. optimum/rbln/transformers/models/t5/t5_architecture.py +92 -31
  46. optimum/rbln/transformers/models/wav2vec2/modeling_wav2vec2.py +17 -11
  47. optimum/rbln/transformers/models/whisper/generation_whisper.py +68 -0
  48. optimum/rbln/transformers/models/whisper/modeling_whisper.py +141 -105
  49. optimum/rbln/transformers/models/whisper/whisper_architecture.py +44 -17
  50. optimum/rbln/transformers/models/xlm_roberta/modeling_xlm_roberta.py +17 -14
  51. optimum/rbln/transformers/utils/rbln_quantization.py +48 -60
  52. optimum/rbln/utils/import_utils.py +36 -1
  53. optimum/rbln/utils/logging.py +82 -0
  54. optimum/rbln/utils/runtime_utils.py +33 -0
  55. optimum/rbln/utils/timer_utils.py +19 -0
  56. {optimum_rbln-0.1.9.dist-info → optimum_rbln-0.1.11.dist-info}/METADATA +8 -7
  57. optimum_rbln-0.1.11.dist-info/RECORD +93 -0
  58. {optimum_rbln-0.1.9.dist-info → optimum_rbln-0.1.11.dist-info}/WHEEL +1 -1
  59. optimum_rbln-0.1.11.dist-info/entry_points.txt +4 -0
  60. optimum_rbln-0.1.9.dist-info/RECORD +0 -78
  61. {optimum_rbln-0.1.9.dist-info → optimum_rbln-0.1.11.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/modeling_config.py
@@ -23,24 +23,38 @@
 
 import copy
 import json
-from collections import UserDict
 from dataclasses import asdict, dataclass
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 
+import rebel
 import torch
 
+from .__version__ import __version__
+from .utils.runtime_utils import ContextRblnConfig
+
 
 DEFAULT_COMPILED_MODEL_NAME = "compiled_model"
 DEFAULT_MOD_NAME = "default"
 
 
 @dataclass
-class RBLNRuntimeConfig:
+class RBLNCompileConfig:
+    """
+    Configuration for RBLN compilation.
+
+    Attributes:
+        compiled_model_name (str): Name of the compiled model.
+        mod_name (str): Name of the RBLN module.
+        input_info (List[Tuple[str, Tuple[int], Optional[str]]]): Information about input tensors.
+        fusion (Optional[bool]): Whether to use fusion optimization.
+        npu (Optional[str]): NPU configuration.
+        tensor_parallel_size (Optional[int]): Size for tensor parallelism.
+    """
+
     compiled_model_name: str = DEFAULT_COMPILED_MODEL_NAME
-    rbln_mod_name: str = DEFAULT_MOD_NAME
+    mod_name: str = DEFAULT_MOD_NAME
     input_info: List[Tuple[str, Tuple[int], Optional[str]]] = None
-    batch_size: Optional[int] = None
     fusion: Optional[bool] = None
     npu: Optional[str] = None
     tensor_parallel_size: Optional[int] = None
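
The rename is mechanical but API-breaking: `RBLNRuntimeConfig` becomes `RBLNCompileConfig`, `rbln_mod_name` becomes `mod_name`, and the per-config `batch_size` field is dropped (batch size moves into `RBLNConfig.model_cfg`, as the seq2seq hunks below show). A minimal migration sketch; the import path and input shapes here are illustrative, not taken from the diff:

    from optimum.rbln.modeling_config import RBLNCompileConfig

    # 0.1.11 style: mod_name replaces rbln_mod_name; there is no batch_size field.
    cfg = RBLNCompileConfig(
        mod_name="encoder",
        input_info=[("input_ids", [1, 512], "int64")],  # (name, shape, dtype)
    )
    # 0.1.9 equivalent (no longer valid):
    # RBLNRuntimeConfig(rbln_mod_name="encoder", input_info=..., batch_size=1)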
@@ -48,8 +62,14 @@ class RBLNRuntimeConfig:
     @staticmethod
     def normalize_dtype(dtype):
         """
-        framework's dtype to string.
+        Convert framework-specific dtype to string representation.
         i.e. torch.float32 -> "float32"
+
+        Args:
+            dtype: The input dtype (can be string, torch dtype, or numpy dtype).
+
+        Returns:
+            str: The normalized string representation of the dtype.
         """
         if isinstance(dtype, str):
             return dtype
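
The expanded docstring now spells out the contract: strings pass through unchanged, and framework dtypes are reduced to their bare names. A behavioral sketch, assuming the implementation matches the docstring:

    import torch

    assert RBLNCompileConfig.normalize_dtype("int64") == "int64"
    assert RBLNCompileConfig.normalize_dtype(torch.float32) == "float32"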
@@ -60,13 +80,12 @@ class RBLNRuntimeConfig:
         return dtype
 
     def __post_init__(self):
-        self.input_info = [(i[0], i[1], RBLNRuntimeConfig.normalize_dtype(i[2]) or "float32") for i in self.input_info]
+        self.input_info = [(i[0], i[1], RBLNCompileConfig.normalize_dtype(i[2]) or "float32") for i in self.input_info]
 
-    def update(self, **kwargs):
+    def update(self, kwargs: Dict[str, Any]):
         self.compiled_model_name = kwargs.get("compiled_model_name", self.compiled_model_name)
-        self.rbln_mod_name = kwargs.get("rbln_mod_name", self.rbln_mod_name)
+        self.mod_name = kwargs.get("mod_name", self.mod_name)
         self.input_info = kwargs.get("input_info", self.input_info)
-        self.batch_size = kwargs.get("batch_size", self.batch_size)
         self.fusion = kwargs.get("fusion", self.fusion)
         self.npu = kwargs.get("npu", self.npu)
         self.tensor_parallel_size = kwargs.get("tensor_parallel_size", self.tensor_parallel_size)
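
Note the signature change: `update` now takes one dict positionally instead of `**kwargs`, which lets `RBLNConfig.__init__` (below) pass its whole `rbln_kwargs` mapping through unmodified, with unknown keys simply ignored by the `.get()` fallbacks. Continuing the sketch above:

    # 0.1.11: pass a dict (keys mirror COMPILE_KEYWORDS); values are illustrative.
    cfg.update({"mod_name": "decoder", "tensor_parallel_size": 4})
    # 0.1.9 form (removed): cfg.update(rbln_mod_name="decoder")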
@@ -86,84 +105,140 @@ class RBLNRuntimeConfig:
         return asdict(self)
 
 
-class RBLNConfig(UserDict):
-    def __init__(self, runtime_cfgs: Dict[str, List[RBLNRuntimeConfig]], _rbln_meta: Dict[str, Any] = None):
-        """Configurations for RBLN model compilation and inference.
+RUNTIME_KEYWORDS = ["create_runtimes", "optimize_host_memory", "device", "device_map"]
+COMPILE_KEYWORDS = ["compiled_model_name", "mod_name", "input_info", "fusion", "npu", "tensor_parallel_size"]
 
-        Args:
-            _rbln_meta (Dict[str, Any], optional):
-                Any rbln-specific configurations.
-                (i.e. max_seq_len for language models, image_size for image models).
-                Defaults to None.
-        """
-        super().__init__(runtime_cfgs)
-        if _rbln_meta:
-            self.meta = _rbln_meta
+
+class RBLNConfig:
+    """
+    Configuration for single RBLN OptimizedModel, representing multiple compiled models.
+
+    Attributes:
+        compile_cfgs (List[RBLNCompileConfig]): Compilation configurations.
+        meta (dict): Metadata including version and class information.
+        runtime_cfg (dict): Runtime-specific configuration.
+    """
+
+    # It represents multiple compiled model, one of each can have multiple runtimes.
+    def __init__(
+        self,
+        rbln_cls,
+        compile_cfgs: List[RBLNCompileConfig],
+        rbln_kwargs=None,
+        meta=None,
+    ) -> None:
+        if rbln_kwargs is None:
+            rbln_kwargs = {}
         else:
-            self.meta: Dict[str, Any] = {}
+            rbln_kwargs = copy.deepcopy(rbln_kwargs)
 
-    @staticmethod
-    def from_rbln_configs(rbln_configs: List["RBLNConfig"], names: Optional[List[str]] = None) -> "RBLNConfig":
-        # assume each rbln_config has exact one rbln_runtime_config
-        names = [None] * len(rbln_configs) if names is None else names
-        runtime_cfgs = []
-        for name, cfg in zip(names, rbln_configs):
-            if len(cfg) > 1:
-                msg = (
-                    "`from_rbln_configs` requires exact one `RBLNRuntimeConfig` for each `RBLNConfig`."
-                    f"But got {len(cfg)} `RBLNRuntimeConfig`."
-                )
-                raise RuntimeError(msg)
-
-            runtime_cfg = cfg[list(cfg.keys())[0]][0]
-            runtime_cfg = copy.deepcopy(runtime_cfg)
-            if name is not None:
-                runtime_cfg.compiled_model_name = name
-            runtime_cfgs.append(runtime_cfg)
-
-        metas = [cfg.meta for cfg in rbln_configs]
-        merged_meta = {k: v for meta in metas for k, v in meta.items()}
-
-        return RBLNConfig.from_rbln_runtime_configs(runtime_cfgs, _rbln_meta=merged_meta)
+        # meta : class, version and other informations.
+        if meta is None:
+            self.meta = {"version": __version__, "cls": rbln_cls}
+        else:
+            self.meta = meta
 
-    @staticmethod
-    def from_rbln_runtime_configs(
-        rbln_runtime_configs: List[RBLNRuntimeConfig],
-        _rbln_meta: Dict[str, Any] = None,
-    ) -> "RBLNConfig":
-        cfgs: Dict[str, List[RBLNRuntimeConfig]] = {}
-        for rbln_runtime_config in rbln_runtime_configs:
-            if rbln_runtime_config.compiled_model_name in cfgs:
-                cfgs[rbln_runtime_config.compiled_model_name].append(rbln_runtime_config)
-            else:
-                cfgs[rbln_runtime_config.compiled_model_name] = [rbln_runtime_config]
-        return RBLNConfig(cfgs, _rbln_meta=_rbln_meta)
+        # compile_cfgs : compile args for each runtime
+        self.compile_cfgs = compile_cfgs
+        for compile_cfg in self.compile_cfgs:
+            compile_cfg.update(rbln_kwargs)
+        for K in COMPILE_KEYWORDS:
+            rbln_kwargs.pop(K, None)
+
+        # runtime_cfg : Values that don't be saved / loaded.
+        self.runtime_cfg = {}
+        for runtime_key in RUNTIME_KEYWORDS:
+            if runtime_key in rbln_kwargs:
+                self.runtime_cfg[runtime_key] = rbln_kwargs.pop(runtime_key)
+
+        # model_cfg : All user-provided values such as "max_seq_len".
+        self.model_cfg: Dict[str, Any] = rbln_kwargs
 
     def save(self, dir_path: str):
         dir_path = Path(dir_path)
-        data = self.asdict()
-        data.update({"rbln_config_meta": self.meta})
+
+        s_json = {}
+        compile_cfgs = [asdict(cfg) for cfg in self.compile_cfgs]
+        s_json["_compile_cfgs"] = compile_cfgs
+        s_json["_meta"] = self.meta
+        s_json.update(self.model_cfg)
+
         with open(dir_path / "rbln_config.json", "w") as jsonf:
-            json.dump(data, jsonf, indent=2)
+            json.dump(s_json, jsonf, indent=2)
 
-    @staticmethod
-    def load(dir_path: str) -> "RBLNConfig":
+    @classmethod
+    def load(cls, dir_path: str) -> "RBLNConfig":
         dir_path = Path(dir_path)
         with open(dir_path / "rbln_config.json", "r") as jsonf:
             config_file = json.load(jsonf)
-        return RBLNConfig.fromdict(config_file)
-
-    def asdict(self):
-        dic = {k: [asdict(cfg) for cfg in cfgs] for k, cfgs in self.data.items()}
-        return dic
 
-    @staticmethod
-    def fromdict(dic: dict):
-        runtime_cfgs = {
-            k: [RBLNRuntimeConfig(**cfg) for cfg in cfgs] for k, cfgs in dic.items() if k != "rbln_config_meta"
-        }
-        if "rbln_config_meta" in dic:
-            meta = dic["rbln_config_meta"]
-        else:
-            meta = None
-        return RBLNConfig(runtime_cfgs, _rbln_meta=meta)
+        return cls.fromdict(config_file)
+
+    @classmethod
+    def fromdict(cls, dic: dict):
+        compile_cfgs = dic.pop("_compile_cfgs")
+        compile_cfgs = [RBLNCompileConfig(**cfg) for cfg in compile_cfgs]
+
+        meta = dic.pop("_meta")
+        rbln_cls = meta["cls"]
+
+        rbln_kwargs = dic
+        return cls(rbln_cls=rbln_cls, compile_cfgs=compile_cfgs, rbln_kwargs=rbln_kwargs, meta=meta)
+
+    def update_runtime_cfg(self, rbln_kwargs: Dict[str, Any]):
+        keys = list(rbln_kwargs.keys())
+        for key in keys:
+            if key in RUNTIME_KEYWORDS:
+                self.runtime_cfg[key] = rbln_kwargs[key]
+
+    def __repr__(self):
+        compile_cfgs_repr = [f"\n {cfg!r}" for cfg in self.compile_cfgs]
+        return (
+            f"RBLNConfig(\n"
+            f" rbln_cls={self.meta['cls']},\n"
+            f" version='{self.meta['version']}',\n"
+            f" compile_cfgs=[{''.join(compile_cfgs_repr)}\n ],\n"
+            f" model_cfg={self.model_cfg},\n"
+            f" runtime_cfg={self.runtime_cfg}\n"
+            f")"
+        )
+
+    @property
+    def create_runtimes(self):
+        context = ContextRblnConfig.get_current_context()["create_runtimes"]
+        if context is not None:
+            return context
+        elif self.runtime_cfg.get("create_runtimes", None) is None:
+            return rebel.npu_is_available()
+        return self.runtime_cfg["create_runtimes"]
+
+    @property
+    def optimize_host_memory(self):
+        context = ContextRblnConfig.get_current_context()["optimize_host_memory"]
+        if context is not None:
+            return context
+        elif self.runtime_cfg.get("optimize_host_memory", None) is None:
+            return True
+        return self.runtime_cfg["optimize_host_memory"]
+
+    @property
+    def device(self):
+        context = ContextRblnConfig.get_current_context()["device"]
+        if context:
+            return context
+        elif self.runtime_cfg.get("device", None) is None:
+            return 0
+        return self.runtime_cfg["device"]
+
+    @property
+    def device_map(self):
+        context = ContextRblnConfig.get_current_context()["device_map"]
+        if context:
+            return context
+        elif self.runtime_cfg.get("device_map", None) is None:
+            rbln_device_map = {}
+            device_val = self.device
+            for cfg in self.compile_cfgs:
+                rbln_device_map[cfg.compiled_model_name] = device_val
+            return rbln_device_map
+        return self.runtime_cfg["device_map"]
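
`save` and `load` now round-trip a flat rbln_config.json: compile configs under the reserved `_compile_cfgs` key, metadata under `_meta`, and every `model_cfg` entry at the top level, while runtime keywords are deliberately never persisted. A hedged round-trip sketch continuing from `cfg` above (paths are illustrative, and the target directory must already exist):

    from optimum.rbln.modeling_config import RBLNConfig

    config = RBLNConfig(
        rbln_cls="RBLNModelForSeq2SeqLM",            # recorded in meta["cls"]
        compile_cfgs=[cfg],
        rbln_kwargs={"batch_size": 1, "device": 0},  # "device" is a RUNTIME_KEYWORD
    )
    config.save("exported")                          # writes exported/rbln_config.json
    restored = RBLNConfig.load("exported")
    assert restored.model_cfg["batch_size"] == 1     # model_cfg survives the round trip
    assert "device" not in restored.runtime_cfg      # runtime values are not saved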
optimum/rbln/modeling_seq2seq.py
@@ -37,7 +37,7 @@ from transformers import (
 from transformers.modeling_outputs import BaseModelOutput, Seq2SeqLMOutput
 
 from .modeling_base import RBLNModel
-from .modeling_config import DEFAULT_COMPILED_MODEL_NAME, RBLNConfig, RBLNRuntimeConfig
+from .modeling_config import DEFAULT_COMPILED_MODEL_NAME, RBLNCompileConfig, RBLNConfig
 from .transformers.models.bart import BartDecoderWrapper, BartEncoderWrapper
 from .transformers.models.t5 import T5DecoderWrapper, T5EncoderWrapper
 from .utils.runtime_utils import RBLNPytorchRuntime
@@ -88,12 +88,14 @@ class RBLNModelForSeq2SeqLM(RBLNModel):
 
     def __post_init__(self, **kwargs):
         self.model_dim = self.config.d_model
-        self.batch_size = self.rbln_config[DEFAULT_COMPILED_MODEL_NAME][0].batch_size
-        self.enc_max_seq_len = self.rbln_config.meta["rbln_enc_max_seq_len"]
-        self.dec_max_seq_len = self.rbln_config.meta["rbln_dec_max_seq_len"]
-        self.pad_token_id = self.rbln_config.meta["rbln_pad_token_id"]
+        self.batch_size = self.rbln_config.model_cfg["batch_size"]
+        self.enc_max_seq_len = self.rbln_config.model_cfg["enc_max_seq_len"]
+        self.dec_max_seq_len = self.rbln_config.model_cfg["dec_max_seq_len"]
+        self.pad_token_id = self.rbln_config.model_cfg["pad_token_id"]
         self.encoder = RBLNRuntimeEncoder(runtime=self.model[0], main_input_name="input_ids")
         self.decoder = RBLNRuntimeDecoder(runtime=self.model[1], main_input_name="input_ids")
+        self.enc_attention_mask = torch.zeros(1, self.enc_max_seq_len, dtype=torch.float32)
+        self.dec_enc_attention_mask = torch.zeros(self.batch_size, self.enc_max_seq_len, dtype=torch.float32)
 
     def can_generate(self):
         return True
@@ -117,32 +119,6 @@ class RBLNModelForSeq2SeqLM(RBLNModel):
                 return redirect(val)
             return val
 
-    def prepare_inputs_for_generation(
-        self,
-        input_ids,
-        past_key_values=None,
-        attention_mask=None,
-        decoder_attention_mask=None,
-        **kwargs,
-    ):
-        max_seq_len = self.dec_max_seq_len
-        cur_seq_len = input_ids.shape[-1]
-        decoder_batch_size = input_ids.shape[0]
-        input_ids = input_ids[:, cur_seq_len - 1 : cur_seq_len].contiguous()
-
-        # In greedy decoding
-        decoder_attention_mask = torch.zeros(decoder_batch_size, max_seq_len, dtype=torch.int64)
-        decoder_attention_mask[:, :cur_seq_len] = 1
-        cache_position = torch.tensor(cur_seq_len - 1, dtype=torch.int32)
-
-        return {
-            "decoder_input_ids": input_ids,
-            "past_key_values": past_key_values,
-            "attention_mask": attention_mask,
-            "decoder_attention_mask": decoder_attention_mask,
-            "cache_position": cache_position,
-        }
-
     @classmethod
     def update_kwargs(cls, kwargs):
         kwargs.update(
@@ -170,50 +146,54 @@ class RBLNModelForSeq2SeqLM(RBLNModel):
 
         wrapped_encoder, wrapped_decoder = optimized_models(model)
 
-        wrapped_encoder.encoder_max_length = rbln_config.meta["rbln_enc_max_seq_len"]
-        wrapped_encoder.decoder_max_length = rbln_config.meta["rbln_dec_max_seq_len"]
-        wrapped_encoder.decoder_batch_size = rbln_config.meta["rbln_batch_size"]
+        wrapped_encoder.encoder_max_length = rbln_config.model_cfg["enc_max_seq_len"]
+        wrapped_encoder.decoder_max_length = rbln_config.model_cfg["dec_max_seq_len"]
+        wrapped_encoder.decoder_batch_size = rbln_config.model_cfg["batch_size"]
 
-        wrapped_decoder.encoder_max_length = rbln_config.meta["rbln_enc_max_seq_len"]
-        wrapped_decoder.decoder_max_length = rbln_config.meta["rbln_dec_max_seq_len"]
-        wrapped_decoder.decoder_batch_size = rbln_config.meta["rbln_batch_size"]
+        wrapped_decoder.encoder_max_length = rbln_config.model_cfg["enc_max_seq_len"]
+        wrapped_decoder.decoder_max_length = rbln_config.model_cfg["dec_max_seq_len"]
+        wrapped_decoder.decoder_batch_size = rbln_config.model_cfg["batch_size"]
 
-        enc_rbln_runtime_config = rbln_config[DEFAULT_COMPILED_MODEL_NAME][0]
-        dec_rbln_runtime_config = rbln_config[DEFAULT_COMPILED_MODEL_NAME][1]
+        enc_rbln_compile_config = rbln_config.compile_cfgs[0]
+        dec_rbln_compile_config = rbln_config.compile_cfgs[1]
 
         if isinstance(model, T5ForConditionalGeneration):
-            enc_example_inputs = enc_rbln_runtime_config.get_dummy_inputs(fill=1)
-            dec_example_inputs = dec_rbln_runtime_config.get_dummy_inputs(fill=1)
+            enc_example_inputs = enc_rbln_compile_config.get_dummy_inputs(fill=1)
+            dec_example_inputs = dec_rbln_compile_config.get_dummy_inputs(fill=1)
         else:
-            enc_example_inputs = enc_rbln_runtime_config.get_dummy_inputs(fill=0)
-            dec_example_inputs = dec_rbln_runtime_config.get_dummy_inputs(fill=0)
+            enc_example_inputs = enc_rbln_compile_config.get_dummy_inputs(fill=0)
+            dec_example_inputs = dec_rbln_compile_config.get_dummy_inputs(fill=0)
+
+        enc_example_inputs[3].fill_(0)
+        dec_example_inputs[4].fill_(-1)
 
         enc_scripted_model = torch.jit.trace(wrapped_encoder, enc_example_inputs, check_trace=False)
         dec_scripted_model = torch.jit.trace(wrapped_decoder, dec_example_inputs, check_trace=False)
 
         enc_ir = rebel.torchscript_to_ir(
             enc_scripted_model,
-            input_names=[v[0] for v in enc_rbln_runtime_config.input_info],
-            name=enc_rbln_runtime_config.rbln_mod_name,
+            input_names=[v[0] for v in enc_rbln_compile_config.input_info],
+            name=enc_rbln_compile_config.mod_name,
         )
         dec_ir = rebel.torchscript_to_ir(
             dec_scripted_model,
-            input_names=[v[0] for v in dec_rbln_runtime_config.input_info],
-            name=dec_rbln_runtime_config.rbln_mod_name,
+            input_names=[v[0] for v in dec_rbln_compile_config.input_info],
+            name=dec_rbln_compile_config.mod_name,
         )
-        dec_ir.decoder_batch_size = rbln_config.meta["rbln_batch_size"]
+        dec_ir.decoder_batch_size = rbln_config.model_cfg["batch_size"]
 
         connections = [
-            (enc_ir.outputs[0], dec_ir.inputs[5]),
-            (dec_ir.outputs[1], dec_ir.inputs[4]),
+            (enc_ir.outputs[0], enc_ir.inputs[2], dec_ir.inputs[6]),
+            # (enc_ir.outputs[0], enc_ir.inputs[2]),
+            (dec_ir.outputs[1], dec_ir.inputs[5]),
         ]
         compiled_model = rebel.compile(
             enc_ir,
             dec_ir,
             connections=connections,
-            fusion=enc_rbln_runtime_config.fusion,
-            npu=enc_rbln_runtime_config.npu,
-            tensor_parallel_size=enc_rbln_runtime_config.tensor_parallel_size,
+            fusion=enc_rbln_compile_config.fusion,
+            npu=enc_rbln_compile_config.npu,
+            tensor_parallel_size=enc_rbln_compile_config.tensor_parallel_size,
        )
        return compiled_model
 
@@ -222,11 +202,12 @@ class RBLNModelForSeq2SeqLM(RBLNModel):
         cls,
         preprocessors: Union["AutoFeatureExtractor", "AutoProcessor", "AutoTokenizer"],
         model_config: "PretrainedConfig",
-        rbln_enc_max_seq_len: Optional[int] = None,
-        rbln_dec_max_seq_len: Optional[int] = None,
-        rbln_batch_size: Optional[int] = 1,
+        rbln_kwargs: Dict[str, Any] = {},
     ) -> RBLNConfig:
-        meta = {}
+        rbln_enc_max_seq_len = rbln_kwargs.get("enc_max_seq_len", None)
+        rbln_dec_max_seq_len = rbln_kwargs.get("dec_max_seq_len", None)
+        rbln_batch_size = rbln_kwargs.get("batch_size", None)
+        rbln_batch_size = 1 if rbln_batch_size is None else rbln_batch_size
 
         if isinstance(model_config, BartConfig):
             n_layer = model_config.decoder_layers
@@ -274,28 +255,36 @@ class RBLNModelForSeq2SeqLM(RBLNModel):
         if max_position_embeddings is not None and rbln_dec_max_seq_len > max_position_embeddings:
             raise ValueError("`rbln_dec_max_seq_len` should be less or equal than max_position_embeddings!")
 
-        rbln_batch_size = 1 if rbln_batch_size is None else rbln_batch_size
-
-        meta["rbln_enc_max_seq_len"] = rbln_enc_max_seq_len
-        meta["rbln_dec_max_seq_len"] = rbln_dec_max_seq_len
-        meta["rbln_batch_size"] = rbln_batch_size
-        meta["rbln_pad_token_id"] = rbln_pad_token_id
-
         # model input info
         enc_input_info = [
-            ("input_ids", [rbln_batch_size, rbln_enc_max_seq_len], "int64"),
-            ("attention_mask", [rbln_batch_size, rbln_enc_max_seq_len], "int64"),
+            ("input_ids", [1, rbln_enc_max_seq_len], "int64"),
+            ("attention_mask", [1, rbln_enc_max_seq_len], "float32"),
+            (
+                "cross_key_value_states",
+                [
+                    n_layer * 2,
+                    rbln_batch_size,
+                    n_head,
+                    rbln_enc_max_seq_len,
+                    d_kv,
+                ],
+                "float32",
+            ),
+            # int16 available?
+            ("batch_idx", [], "int32"),
         ]
 
         dec_input_info = [
             ("input_ids", [rbln_batch_size, 1], "int64"),
-            ("attention_mask", [rbln_batch_size, rbln_dec_max_seq_len], "int64"),
-            ("encoder_attention_mask", [rbln_batch_size, rbln_enc_max_seq_len], "int64"),
+            ("attention_mask", [rbln_batch_size, rbln_dec_max_seq_len], "float32"),
+            ("encoder_attention_mask", [rbln_batch_size, rbln_enc_max_seq_len], "float32"),
             (
                 "cache_position",
-                [],
+                [rbln_batch_size, 1],
+                # [],
                 "int32",
             ),
+            ("batch_position", [], "int32"),
         ]
         dec_input_info.extend(
             [
@@ -327,12 +316,22 @@ class RBLNModelForSeq2SeqLM(RBLNModel):
                 )
             ]
         )
-        enc_rbln_runtime_config = RBLNRuntimeConfig(rbln_mod_name="encoder", input_info=enc_input_info)
-        dec_rbln_runtime_config = RBLNRuntimeConfig(rbln_mod_name="decoder", input_info=dec_input_info)
+        enc_rbln_compile_config = RBLNCompileConfig(mod_name="encoder", input_info=enc_input_info)
+        dec_rbln_compile_config = RBLNCompileConfig(mod_name="decoder", input_info=dec_input_info)
 
-        rbln_config = RBLNConfig.from_rbln_runtime_configs(
-            [enc_rbln_runtime_config, dec_rbln_runtime_config],
-            _rbln_meta=meta,
+        rbln_config = RBLNConfig(
+            rbln_cls=cls.__name__,
+            compile_cfgs=[enc_rbln_compile_config, dec_rbln_compile_config],
+            rbln_kwargs=rbln_kwargs,
+        )
+
+        rbln_config.model_cfg.update(
+            {
+                "enc_max_seq_len": rbln_enc_max_seq_len,
+                "dec_max_seq_len": rbln_dec_max_seq_len,
+                "batch_size": rbln_batch_size,
+                "pad_token_id": rbln_pad_token_id,
+            }
         )
 
         return rbln_config
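
`_get_rbln_config` now receives one `rbln_kwargs` dict instead of discrete `rbln_*` parameters, and reads unprefixed keys (`enc_max_seq_len`, `batch_size`, ...). A hedged end-user sketch; the class name, checkpoint, and the convention that `modeling_base` strips the `rbln_` prefix off `from_pretrained` kwargs are inferred, not shown in this diff:

    from optimum.rbln import RBLNT5ForConditionalGeneration

    model = RBLNT5ForConditionalGeneration.from_pretrained(
        "t5-small",                # illustrative checkpoint
        export=True,               # compile rather than load a prebuilt model
        rbln_batch_size=1,         # -> rbln_kwargs["batch_size"]
        rbln_enc_max_seq_len=512,  # -> rbln_kwargs["enc_max_seq_len"]
        rbln_dec_max_seq_len=512,  # -> rbln_kwargs["dec_max_seq_len"]
    )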
@@ -347,7 +346,84 @@ class RBLNModelForSeq2SeqLM(RBLNModel):
             compiled_models[0].create_runtime("decoder", tensor_type="pt", device=device_val),
         ]
 
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        decoder_attention_mask=None,
+        **kwargs,
+    ):
+        past_cache_length = past_key_values
+        if past_cache_length == 0:
+            cache_pos = []
+            for i in range(input_ids.shape[0]):
+                cache_pos.append([0])
+            cache_position = torch.tensor(cache_pos, dtype=torch.int32)
+
+        max_seq_len = self.dec_max_seq_len
+        cur_seq_len = input_ids.shape[-1]
+        decoder_batch_size = input_ids.shape[0]
+        input_ids = input_ids[:, cur_seq_len - 1 : cur_seq_len].contiguous()
+        # In greedy decoding
+        decoder_attention_mask = torch.zeros(decoder_batch_size, max_seq_len, dtype=torch.float32)
+        decoder_attention_mask[:, :cur_seq_len] = 1
+        cache_pos = []
+        for i in range(input_ids.shape[0]):
+            cache_pos.append([cur_seq_len - 1])
+        cache_position = torch.tensor(cache_pos, dtype=torch.int32)
+        return {
+            "decoder_input_ids": input_ids,
+            "past_key_values": past_key_values,
+            "attention_mask": attention_mask.to(torch.float32),
+            "decoder_attention_mask": decoder_attention_mask,
+            "cache_position": cache_position,
+        }
+
     def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        cache_position: Union[List[torch.Tensor], torch.Tensor] = None,  # vllm keyword argument
+        batch_idx: Optional[torch.LongTensor] = None,
+        enc_lengths: List[int] = None,
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor]:
+        # common decoder
+        if enc_lengths is None:
+            output = self._forward_decoder(input_ids=input_ids, cache_position=cache_position, **kwargs)
+            return output
+
+        # vllm & encoder
+        if batch_idx is not None:
+            enc_attention_mask = self.enc_attention_mask.clone()
+            enc_attention_mask[0][: enc_lengths[batch_idx] + 1] = 1
+            padding_need = self.enc_max_seq_len - input_ids.shape[-1]
+            input_ids = torch.nn.functional.pad(input_ids, (0, padding_need))
+            _ = self.encoder(input_ids, enc_attention_mask, batch_idx=batch_idx.to(torch.int32))
+            logits = torch.zeros(1, 1, self.config.vocab_size + 100)
+            logits[0][0][-1] = 1
+        # vllm & decoder
+        else:
+            input_ids[input_ids == (self.config.vocab_size + 99)] = self.config.decoder_start_token_id
+            cache_position[cache_position != 0] = cache_position[cache_position != 0] - 2
+
+            enc_attention_mask = self.dec_enc_attention_mask.clone()
+            dec_attention_mask = torch.zeros(self.batch_size, self.dec_max_seq_len, dtype=torch.float32)
+            for batch_idx in range(self.batch_size):
+                enc_attention_mask[batch_idx, : enc_lengths[batch_idx] + 1] = 1
+
+            logits = self._forward_decoder(
+                attention_mask=enc_attention_mask,
+                decoder_input_ids=input_ids,
+                decoder_attention_mask=dec_attention_mask,
+                cache_position=cache_position,
+            ).logits
+
+        return Seq2SeqLMOutput(
+            logits=logits,
+        )
+
+    def _forward_decoder(
         self,
         attention_mask: Optional[torch.FloatTensor] = None,
         decoder_input_ids: Optional[torch.LongTensor] = None,
@@ -355,13 +431,18 @@ class RBLNModelForSeq2SeqLM(RBLNModel):
         cache_position: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> Tuple[torch.FloatTensor]:
+        dec_attention_mask = decoder_attention_mask.clone()
+        for b_idx in range(self.rbln_config.model_cfg["batch_size"]):
+            dec_attention_mask[b_idx, : cache_position[b_idx] + 1] = 1
+
         decoder_output = self.decoder(
             input_ids=decoder_input_ids,
-            attention_mask=decoder_attention_mask,
+            attention_mask=dec_attention_mask,
             encoder_attention_mask=attention_mask,
             cache_position=cache_position,
+            batch_position=torch.tensor(0, dtype=torch.int32),
         )
-        lm_logits = decoder_output.logits
+        lm_logits = decoder_output.logits[0]
 
         return Seq2SeqLMOutput(logits=lm_logits)
 
@@ -405,6 +486,14 @@ class RBLNModelForSeq2SeqLM(RBLNModel):
         model_input_name = model_input_name if model_input_name is not None else self.main_input_name
         encoder_kwargs["return_dict"] = True
         encoder_kwargs[model_input_name] = inputs_tensor
-        model_kwargs["encoder_outputs"] = encoder(**encoder_kwargs)
+        for b in range(batch_size):
+            batch_idx = torch.tensor(b, dtype=torch.int32)
+            cb_inputs = {}
+            cb_inputs["return_dict"] = True
+            cb_inputs["output_hidden_states"] = False
+            cb_inputs["output_attentions"] = False
+            cb_inputs["input_ids"] = encoder_kwargs["input_ids"][b].unsqueeze(0)
+            cb_inputs["attention_mask"] = encoder_kwargs["attention_mask"][b].unsqueeze(0).to(torch.float32)
+            model_kwargs["encoder_outputs"] = encoder(**cb_inputs, batch_idx=batch_idx)
 
         return model_kwargs
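
With this change the encoder runs once per batch element with an explicit `batch_idx`, so each sequence's cross-attention KV cache is filled independently through the compiled-model connections set up in the compile hunk above; only the last `encoder_outputs` is kept in `model_kwargs`, which appears sufficient because the real encoder products live in the device-side cache. A hedged sketch of ordinary (non-vLLM) generation with the model from the earlier sketch:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("t5-small")  # illustrative
    batch = tokenizer("translate English to German: Hello", return_tensors="pt")
    output_ids = model.generate(**batch)                   # triggers the per-batch encoder prefill
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))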