optimum-rbln 0.1.1__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. optimum/rbln/__init__.py +9 -0
  2. optimum/rbln/__version__.py +1 -1
  3. optimum/rbln/diffusers/models/autoencoder_kl.py +16 -98
  4. optimum/rbln/diffusers/models/unet_2d_condition.py +1 -1
  5. optimum/rbln/diffusers/pipelines/controlnet/multicontrolnet.py +9 -11
  6. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +8 -0
  7. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +9 -0
  8. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +3 -0
  9. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +8 -0
  10. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +8 -0
  11. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +9 -0
  12. optimum/rbln/modeling_base.py +175 -103
  13. optimum/rbln/modeling_seq2seq.py +58 -132
  14. optimum/rbln/transformers/__init__.py +4 -0
  15. optimum/rbln/transformers/models/__init__.py +2 -0
  16. optimum/rbln/transformers/models/clip/modeling_clip.py +0 -1
  17. optimum/rbln/transformers/models/dpt/__init__.py +24 -0
  18. optimum/rbln/transformers/models/dpt/modeling_dpt.py +89 -0
  19. optimum/rbln/transformers/models/gpt2/gpt2_architecture.py +24 -33
  20. optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +52 -124
  21. optimum/rbln/transformers/models/llama/llama_architecture.py +62 -33
  22. optimum/rbln/transformers/models/llama/llama_architecture_cb.py +764 -0
  23. optimum/rbln/transformers/models/llama/modeling_llama.py +208 -140
  24. optimum/rbln/transformers/models/midm/__init__.py +32 -0
  25. optimum/rbln/transformers/models/midm/hf_hub_cached/configuration_midm.py +22 -0
  26. optimum/rbln/transformers/models/midm/hf_hub_cached/midm_bitext_tokenization.py +303 -0
  27. optimum/rbln/transformers/models/midm/hf_hub_cached/modeling_midm.py +1473 -0
  28. optimum/rbln/transformers/models/midm/hf_hub_cached/rotary_position_embedding.py +98 -0
  29. optimum/rbln/transformers/models/midm/midm_architecture.py +506 -0
  30. optimum/rbln/transformers/models/midm/modeling_midm.py +390 -0
  31. optimum/rbln/transformers/models/whisper/modeling_whisper.py +53 -123
  32. optimum/rbln/utils/__init__.py +1 -1
  33. optimum/rbln/utils/import_utils.py +46 -0
  34. {optimum_rbln-0.1.1.dist-info → optimum_rbln-0.1.7.dist-info}/METADATA +17 -50
  35. {optimum_rbln-0.1.1.dist-info → optimum_rbln-0.1.7.dist-info}/RECORD +37 -27
  36. {optimum_rbln-0.1.1.dist-info → optimum_rbln-0.1.7.dist-info}/WHEEL +1 -1
  37. {optimum_rbln-0.1.1.dist-info → optimum_rbln-0.1.7.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/llama/modeling_llama.py
@@ -23,28 +23,32 @@

  import inspect # noqa: I001
  import logging
- from pathlib import Path
- from tempfile import TemporaryDirectory
  from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

  import torch # noqa: F401
  import rebel # noqa: F401

- from optimum.exporters import TasksManager
- from transformers import AutoModelForCausalLM, LlamaForCausalLM, PretrainedConfig, AutoConfig
+ from transformers import AutoModelForCausalLM, LlamaForCausalLM, PreTrainedModel, PretrainedConfig, AutoConfig
  from transformers.modeling_outputs import CausalLMOutputWithPast

  from ...generation.utils import RBLNGenerationMixin
- from ....modeling_base import RBLNBaseModel
+ from ....modeling_base import RBLNModel
  from ....modeling_config import DEFAULT_COMPILED_MODEL_NAME, RBLNConfig, RBLNRuntimeConfig
  from ....utils.runtime_utils import RBLNPytorchRuntime
- from ....utils.save_utils import maybe_save_preprocessors
+
+
+ # FIXME:: Merge Two architecture Codes
  from .llama_architecture import (
      LlamaWrapper,
      wrap_llama,
      unwrap_llama,
  )

+ from .llama_architecture_cb import (
+     LlamaDynamicBatchWrapper as LlamaWrapper_cb,
+     wrap_llama as wrap_llama_cb,
+ )
+

  logger = logging.getLogger(__name__)

@@ -57,29 +61,17 @@ if TYPE_CHECKING:
  )


+ SUPPORTED_BATCHING_MODES = ["static", "vllm"]
+
+
  class RBLNRuntimeModel(RBLNPytorchRuntime):
      mandatory_members = ["main_input_name"]

-     # RBLN_Runtimemodule
-     def forward(
-         self,
-         input_ids: torch.LongTensor = None,
-         attention_mask: torch.LongTensor = None,
-         cache_position: torch.Tensor = None,
-         **kwargs: Dict[str, Any],
-     ):
-         logits = super().forward(
-             input_ids=input_ids,
-             attention_mask=attention_mask,
-             cache_position=cache_position,
-         )
-         return logits
-

- class RBLNLlamaForCausalLM(RBLNBaseModel, RBLNGenerationMixin):
+ class RBLNLlamaForCausalLM(RBLNModel, RBLNGenerationMixin):
      """
      The Llama Model transformer with a language modeling head (linear layer) on top.
-     This model inherits from [`RBLNBaseModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
+     This model inherits from [`RBLNMultiModel`]. Check the superclass documentation for the generic methods the library implements for all its models.

      A class to convert and run pre-trained transformers based LlamaForCausalLM model on RBLN devices.
      It implements the methods to convert a pre-trained transformers LlamaForCausalLM model into a RBLN transformer model by:
@@ -87,7 +79,6 @@ class RBLNLlamaForCausalLM(RBLNBaseModel, RBLNGenerationMixin):
      - compiling the resulting graph using the RBLN compiler.
      """

-     model_type = "rbln_model"
      main_input_name = "input_ids"
      auto_model_class = AutoModelForCausalLM

@@ -95,25 +86,45 @@ class RBLNLlamaForCausalLM(RBLNBaseModel, RBLNGenerationMixin):
          self.batch_size = self.rbln_config.meta["rbln_batch_size"]
          self.max_seq_len = self.rbln_config.meta["rbln_max_seq_len"]
          self.prefill_chunk_size = self.rbln_config.meta["rbln_prefill_chunk_size"]
+         self.use_continuous_batch = self.rbln_config.meta["rbln_batching"] == "vllm"

+         prefill_batch_size = self.batch_size if not self.use_continuous_batch else 1
          self.prefill_attention_mask = torch.zeros(
-             self.batch_size, 1, self.prefill_chunk_size, self.max_seq_len, dtype=torch.int64
+             prefill_batch_size, 1, self.prefill_chunk_size, self.max_seq_len, dtype=torch.int64
          )
          self.causal_mask = 1 - torch.triu(
-             torch.ones(self.batch_size, 1, self.prefill_chunk_size, self.prefill_chunk_size), diagonal=1
+             torch.ones(prefill_batch_size, 1, self.prefill_chunk_size, self.prefill_chunk_size), diagonal=1
          )
+         self.decoder_attention_mask = torch.zeros(self.batch_size, 1, 1, self.max_seq_len, dtype=torch.int64)

-         self.prefill_decoder = RBLNRuntimeModel(runtime=self.runtimes[0], main_input_name="input_ids")
-         self.decoder = RBLNRuntimeModel(runtime=self.runtimes[1], main_input_name="input_ids")
+         self.prefill_decoder = RBLNRuntimeModel(runtime=self.model[0], main_input_name="input_ids")
+         self.decoder = RBLNRuntimeModel(runtime=self.model[1], main_input_name="input_ids")
          self.past_cached_length = 0
          self.right_padding = True

      @classmethod
-     @torch.no_grad()
-     def _export(
+     def update_kwargs(cls, kwargs):
+         """
+         Update user-given kwargs to get proper pytorch model.
+
+         For example, `torchscript`=True should be set because torch.jit
+         does not support `transformers` output instances as module output;
+         """
+         kwargs.update(
+             {
+                 "torchscript": True,
+                 "return_dict": False,
+                 "use_cache": True,
+                 "torch_dtype": torch.float32,
+                 "_attn_implementation": "eager",
+             }
+         )
+         return kwargs
+
+     @classmethod
+     def get_pytorch_model(
          cls,
          model_id: str,
-         config: "PretrainedConfig",
          use_auth_token: Optional[Union[bool, str]] = None,
          revision: Optional[str] = None,
          force_download: bool = False,
@@ -121,126 +132,94 @@ class RBLNLlamaForCausalLM(RBLNBaseModel, RBLNGenerationMixin):
          subfolder: str = "",
          local_files_only: bool = False,
          trust_remote_code: bool = False,
-         model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+         rbln_config_kwargs: Optional[Dict[str, Any]] = None,
+         rbln_constructor_kwargs: Optional[Dict[str, Any]] = None,
          **kwargs,
-     ) -> "RBLNLlamaForCausalLM":
-         task = kwargs.pop("task", None)
-         if task is None:
-             task = TasksManager.infer_task_from_model(cls.auto_model_class)
-
-         if model_save_dir is None:
-             save_dir = TemporaryDirectory()
-             save_dir_path = Path(save_dir.name)
-         else:
-             save_dir = model_save_dir
-             if isinstance(save_dir, TemporaryDirectory):
-                 save_dir_path = Path(model_save_dir.name)
-             else:
-                 save_dir_path = Path(model_save_dir)
-                 save_dir_path.mkdir(exist_ok=True)
-
-         def update_configs(kwargs):
-             hf_max_position_embeddings = getattr(AutoConfig.from_pretrained(model_id), "max_position_embeddings", None)
-             max_seq_len = kwargs.get("rbln_max_seq_len", None)
-             if max_seq_len is not None:
-                 if max_seq_len <= hf_max_position_embeddings:
-                     kwargs.update({"max_position_embeddings": max_seq_len})
-                 else:
-                     raise ValueError("`max_seq_len` should be less or equal than max_position_embeddings!")
-
-             kwargs.update(
-                 {
-                     "torchscript": True,
-                     "return_dict": False,
-                     "use_cache": True,
-                     "torch_dtype": torch.float32,
-                     "_attn_implementation": "eager",
-                 }
-             )
-
-             return kwargs
-
-         kwargs = update_configs(kwargs)
+     ) -> PreTrainedModel:
+         if rbln_max_seq_len := rbln_config_kwargs.get("rbln_max_seq_len", None):
+             config = AutoConfig.from_pretrained(model_id)
+             if hf_position_embedding := getattr(config, "max_position_embeddings", None):
+                 if hf_position_embedding < rbln_max_seq_len:
+                     logger.warning(
+                         f"`rbln_max_seq_len` is larger than original config({hf_position_embedding})."
+                         "This may lead to incorrect inferences of the model."
+                     )
+             kwargs.update({"max_position_embeddings": rbln_max_seq_len})

-         rbln_config_kwargs, rbln_constructor_kwargs = cls.pop_rbln_kwargs_from_kwargs(kwargs)
+         # FIXME :: This should be moved when wrapping removed.
+         use_continuous_batch = rbln_config_kwargs.get("rbln_batching", "static") == "vllm"
+         wrap_llama_cb() if use_continuous_batch else wrap_llama()

-         origin_mehtods = wrap_llama()
-         model: LlamaForCausalLM = TasksManager.get_model_from_task(
-             task=task,
-             model_name_or_path=model_id,
-             subfolder=subfolder,
+         model = super().get_pytorch_model(
+             model_id=model_id,
+             use_auth_token=use_auth_token,
              revision=revision,
-             framework="pt",
+             force_download=force_download,
              cache_dir=cache_dir,
-             use_auth_token=use_auth_token,
+             subfolder=subfolder,
              local_files_only=local_files_only,
-             force_download=force_download,
              trust_remote_code=trust_remote_code,
+             rbln_config_kwargs=rbln_config_kwargs,
+             rbln_constructor_kwargs=rbln_constructor_kwargs,
              **kwargs,
          )

-         if config is None:
-             config = model.config
+         unwrap_llama()

-         config.save_pretrained(save_dir_path)
-         preprocessors = maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder)
+         return model

-         # Get compilation arguments
-         if rbln_config_kwargs.get("rbln_config", None) is None:
-             rbln_config = cls.get_rbln_config(
-                 preprocessors=preprocessors, model_config=model.config, **rbln_config_kwargs
-             )
+     @classmethod
+     @torch.inference_mode()
+     def get_compiled_model(cls, model: "PreTrainedModel", rbln_config: RBLNConfig):
+         use_continuous_batch = rbln_config.meta["rbln_batching"] == "vllm"

-         def compile_llama():
-             wrapped_model = LlamaWrapper(model).eval()
+         wrapper_cls = LlamaWrapper_cb if use_continuous_batch else LlamaWrapper

-             prefill_rbln_runtime_config = rbln_config[DEFAULT_COMPILED_MODEL_NAME][0]
-             dec_rbln_runtime_config = rbln_config[DEFAULT_COMPILED_MODEL_NAME][1]
+         wrapped_model = wrapper_cls(model).eval()

-             prefill_example_inputs = prefill_rbln_runtime_config.get_dummy_inputs(fill=0)
-             dec_example_inputs = dec_rbln_runtime_config.get_dummy_inputs(fill=0)
+         prefill_rbln_runtime_config = rbln_config[DEFAULT_COMPILED_MODEL_NAME][0]
+         dec_rbln_runtime_config = rbln_config[DEFAULT_COMPILED_MODEL_NAME][1]

-             prefill_scripted_model = torch.jit.trace(wrapped_model, prefill_example_inputs)
-             dec_scripted_model = torch.jit.trace(wrapped_model, dec_example_inputs)
+         prefill_example_inputs = prefill_rbln_runtime_config.get_dummy_inputs(fill=0)
+         dec_example_inputs = dec_rbln_runtime_config.get_dummy_inputs(fill=4)

-             prefill_ir = rebel.torchscript_to_ir(
-                 prefill_scripted_model,
-                 input_names=[v[0] for v in prefill_rbln_runtime_config.input_info],
-             )
-             dec_ir = rebel.torchscript_to_ir(
-                 dec_scripted_model,
-                 input_names=[v[0] for v in dec_rbln_runtime_config.input_info],
-             )
+         if use_continuous_batch:
+             batch_index_index = 3
+             dec_example_inputs[batch_index_index].fill_(-1) # fill batch_position -1 to indicate it is decoder.

-             # Caching prefill_decoder/decoder I/O
-             connections = [
-                 (prefill_ir.outputs[1 + i], prefill_ir.inputs[3 + i])
-                 for i in range(model.config.num_hidden_layers * 2)
-             ]
+         wrap_llama_cb() if use_continuous_batch else wrap_llama()

-             compiled_model = rebel.compile(
-                 prefill_ir,
-                 dec_ir,
-                 connections=connections,
-                 fusion=prefill_rbln_runtime_config.fusion,
-                 npu=prefill_rbln_runtime_config.npu,
-                 tensor_parallel_size=prefill_rbln_runtime_config.tensor_parallel_size,
-                 use_weight_sharing=True,
-             )
-             compiled_model.save(save_dir_path / f"{DEFAULT_COMPILED_MODEL_NAME}.rbln")
+         prefill_scripted_model = torch.jit.trace(wrapped_model, prefill_example_inputs, check_trace=False)
+         dec_scripted_model = torch.jit.trace(wrapped_model, dec_example_inputs, check_trace=False)

-         compile_llama()
-         unwrap_llama(origin_mehtods)
+         unwrap_llama()

-         rbln_config.save(save_dir_path)
+         prefill_ir = rebel.torchscript_to_ir(
+             prefill_scripted_model,
+             input_names=[v[0] for v in prefill_rbln_runtime_config.input_info],
+         )
+         dec_ir = rebel.torchscript_to_ir(
+             dec_scripted_model,
+             input_names=[v[0] for v in dec_rbln_runtime_config.input_info],
+         )

-         return cls._from_pretrained(
-             model_id=save_dir_path,
-             config=config,
-             model_save_dir=save_dir,
-             **rbln_constructor_kwargs,
-             **kwargs,
+         # Caching prefill_decoder/decoder I/O
+         cache_index_offset = 4 if use_continuous_batch else 3
+         connections = [
+             (prefill_ir.outputs[1 + i], prefill_ir.inputs[cache_index_offset + i])
+             for i in range(model.config.num_hidden_layers * 2)
+         ]
+
+         compiled_model = rebel.compile(
+             prefill_ir,
+             dec_ir,
+             connections=connections,
+             fusion=prefill_rbln_runtime_config.fusion,
+             npu=prefill_rbln_runtime_config.npu,
+             tensor_parallel_size=prefill_rbln_runtime_config.tensor_parallel_size,
+             use_weight_sharing=True,
          )
+         return compiled_model

      @classmethod
      def _get_rbln_config(
@@ -249,6 +228,7 @@ class RBLNLlamaForCausalLM(RBLNBaseModel, RBLNGenerationMixin):
          model_config: "PretrainedConfig",
          rbln_max_seq_len: Optional[int] = None,
          rbln_batch_size: Optional[int] = None,
+         rbln_batching: Optional[str] = None,
      ) -> RBLNConfig:
          meta = {}

@@ -256,21 +236,38 @@ class RBLNLlamaForCausalLM(RBLNBaseModel, RBLNGenerationMixin):
          if rbln_max_seq_len is None:
              rbln_max_seq_len = getattr(model_config, "max_position_embeddings", None)
          rbln_batch_size = 1 if rbln_batch_size is None else rbln_batch_size
+         rbln_batching = "static" if rbln_batching is None else rbln_batching

          meta["rbln_max_seq_len"] = rbln_max_seq_len
          meta["rbln_batch_size"] = rbln_batch_size
          meta["rbln_prefill_chunk_size"] = prefill_chunk_size
+         meta["rbln_batching"] = rbln_batching
+         use_continuous_batching = meta["rbln_batching"] == "vllm"
+
+         if rbln_batching not in SUPPORTED_BATCHING_MODES:
+             raise ValueError(
+                 f'rbln_batching="{rbln_batching}" is not a supported batch mode, '
+                 f"Possible: {SUPPORTED_BATCHING_MODES}"
+             )

-         def get_input_info(query_length):
+         def get_input_info(
+             batch_size, # should be 1 if continous batch prefill
+             query_length,
+             continuous_batch=False, # determines the shape of `cache position`
+         ):
              input_info = [
-                 ("input_ids", [rbln_batch_size, query_length], "int64"),
-                 ("attention_mask", [rbln_batch_size, 1, query_length, rbln_max_seq_len], "int64"),
+                 ("input_ids", [batch_size, query_length], "int64"),
+                 ("attention_mask", [batch_size, 1, query_length, rbln_max_seq_len], "int64"),
                  (
                      "cache_position",
-                     [],
+                     [batch_size, query_length] if continuous_batch else [],
                      "int32",
                  ),
              ]
+
+             if continuous_batch:
+                 input_info.append(("batch_position", [], "int16"))
+
              input_info.extend(
                  [
                      (
@@ -286,10 +283,19 @@ class RBLNLlamaForCausalLM(RBLNBaseModel, RBLNGenerationMixin):
                      for i in range(model_config.num_hidden_layers * 2)
                  ]
              )
+
              return input_info

-         prefill_input_info = get_input_info(query_length=prefill_chunk_size)
-         dec_input_info = get_input_info(query_length=1)
+         prefill_input_info = get_input_info(
+             batch_size=1 if use_continuous_batching else rbln_batch_size,
+             query_length=prefill_chunk_size,
+             continuous_batch=use_continuous_batching,
+         )
+         dec_input_info = get_input_info(
+             batch_size=rbln_batch_size,
+             query_length=1,
+             continuous_batch=use_continuous_batching,
+         )

          prefill_rbln_runtime_config = RBLNRuntimeConfig(input_info=prefill_input_info)
          dec_rbln_runtime_config = RBLNRuntimeConfig(input_info=dec_input_info)
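For orientation (not part of the diff): with the new `get_input_info` signature, "static" and "vllm" batching compile against different input shapes. A minimal sketch of the shapes implied by the code above, assuming a hypothetical config with `rbln_batch_size=4`, `rbln_max_seq_len=4096`, and a prefill chunk of 128 tokens (placeholder values):

    # Illustrative shapes only; numbers are placeholders, not taken from a real config.
    batch_size, max_seq_len, chunk = 4, 4096, 128

    static_prefill = {
        "input_ids": [batch_size, chunk],
        "attention_mask": [batch_size, 1, chunk, max_seq_len],
        "cache_position": [],  # scalar position
    }
    vllm_prefill = {
        "input_ids": [1, chunk],  # continuous-batching prefill runs one request at a time
        "attention_mask": [1, 1, chunk, max_seq_len],
        "cache_position": [1, chunk],  # per-token positions
        "batch_position": [],  # which KV-cache slot this request fills
    }
    vllm_decode = {
        "input_ids": [batch_size, 1],
        "attention_mask": [batch_size, 1, 1, max_seq_len],
        "cache_position": [batch_size, 1],
        "batch_position": [],  # filled with -1 at compile time to mark the decoder graph
    }

Both modes also append one `float32` entry per key/value cache tensor (`num_hidden_layers * 2` of them), as the unchanged `input_info.extend(...)` block shows.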
@@ -303,11 +309,14 @@ class RBLNLlamaForCausalLM(RBLNBaseModel, RBLNGenerationMixin):

          return rbln_config

-     def _create_runtimes(self, rbln_device_map: Dict[str, int]) -> List[rebel.Runtime]:
+     @classmethod
+     def _create_runtimes(
+         cls, compiled_models: List[rebel.RBLNCompiledModel], rbln_device_map: Dict[str, int]
+     ) -> List[rebel.Runtime]:
          device_val = rbln_device_map[DEFAULT_COMPILED_MODEL_NAME]
          return [
-             self.compiled_models[0].create_runtime(input_info_index=0, tensor_type="pt", device=device_val),
-             self.compiled_models[0].create_runtime(input_info_index=1, tensor_type="pt", device=device_val),
+             compiled_models[0].create_runtime(input_info_index=0, tensor_type="pt", device=device_val),
+             compiled_models[0].create_runtime(input_info_index=1, tensor_type="pt", device=device_val),
          ]

      def get_decoder(self):
@@ -337,7 +346,6 @@ class RBLNLlamaForCausalLM(RBLNBaseModel, RBLNGenerationMixin):

          # In greedy decoding
          if past_cached_length == 0:
-
              # padding with prefill_chunk_size
              # TODO left padding + left padding has issue on stoppingcriteria(max_len)
              if cur_len % self.prefill_chunk_size != 0:
@@ -384,7 +392,13 @@ class RBLNLlamaForCausalLM(RBLNBaseModel, RBLNGenerationMixin):

          return model_inputs

-     def forward(
+     def forward(self, *args, **kwargs):
+         if self.use_continuous_batch:
+             return self.forward_cb(*args, **kwargs)
+         else:
+             return self.forward_static(*args, **kwargs)
+
+     def forward_static(
          self,
          input_ids: torch.LongTensor = None,
          attention_mask: Optional[torch.Tensor] = None,
@@ -393,7 +407,6 @@ class RBLNLlamaForCausalLM(RBLNBaseModel, RBLNGenerationMixin):
          query_length: Optional[torch.Tensor] = None,
          **kwargs,
      ) -> Tuple[torch.FloatTensor]:
-
          if past_key_values is not None:
              past_key_values += query_length

@@ -425,3 +438,58 @@ class RBLNLlamaForCausalLM(RBLNBaseModel, RBLNGenerationMixin):
              logits=outputs,
              past_key_values=past_key_values,
          )
+
+     def forward_cb(
+         self,
+         input_ids: torch.LongTensor = None,
+         cache_position: Optional[torch.Tensor] = None, # torch.tensor(,dtype=int32) (1,64) // (4,1)
+         batch_idx: int = None,
+         **kwargs,
+     ) -> Tuple[torch.FloatTensor]:
+         # prefill_decoder
+         if cache_position.shape[1] > 1:
+             query_length = input_ids.shape[1]
+             attention_mask = self.prefill_attention_mask.clone()
+             for step in range(0, query_length, self.prefill_chunk_size):
+                 if step + self.prefill_chunk_size > query_length:
+                     input_ids = torch.nn.functional.pad(input_ids, (0, step + self.prefill_chunk_size - query_length))
+                     cache_position = torch.cat(
+                         [
+                             cache_position,
+                             torch.arange(
+                                 query_length,
+                                 step + self.prefill_chunk_size,
+                                 dtype=torch.int32,
+                             ).unsqueeze(0),
+                         ],
+                         dim=-1,
+                     )
+
+                 sliced_input_ids = input_ids[:, step : step + self.prefill_chunk_size]
+                 sliced_cache_positions = cache_position[:, step : step + self.prefill_chunk_size]
+                 attention_mask[:, :, :, :step] = 1
+                 attention_mask[:, :, :, step : step + self.prefill_chunk_size] = self.causal_mask
+
+                 outputs, _ = self.prefill_decoder(
+                     sliced_input_ids.contiguous(),
+                     attention_mask.contiguous(),
+                     sliced_cache_positions.contiguous(),
+                     torch.tensor(batch_idx, dtype=torch.int16),
+                 )
+             outputs = outputs[:, query_length % self.prefill_chunk_size - 1].unsqueeze(1)
+         # decoder
+         else:
+             attention_mask = self.decoder_attention_mask.clone()
+             for b_idx in range(self.batch_size):
+                 attention_mask[b_idx, :, :, : cache_position[b_idx].item() + 1] = 1
+
+             outputs = self.decoder(
+                 input_ids.contiguous(),
+                 attention_mask.contiguous(),
+                 cache_position.contiguous(),
+                 torch.tensor(0, dtype=torch.int16),
+             )[0]
+
+         return CausalLMOutputWithPast(
+             logits=outputs,
+         )
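Taken together, these changes replace the monolithic `_export` with the `RBLNModel` hooks (`get_pytorch_model`, `get_compiled_model`, `_create_runtimes`) and add an opt-in continuous-batching ("vllm") path next to the default "static" mode. A minimal usage sketch, assuming the `rbln_*` kwargs are still collected by the base class's `from_pretrained` export flow; the model id and sizes below are placeholders:

    from optimum.rbln import RBLNLlamaForCausalLM

    # Trace and compile a Llama checkpoint for RBLN NPUs.
    # rbln_batching="vllm" selects the continuous-batching graphs added here;
    # omitting it keeps the default "static" mode.
    model = RBLNLlamaForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-hf",  # placeholder model id
        export=True,
        rbln_max_seq_len=4096,
        rbln_batch_size=4,
        rbln_batching="vllm",
    )
    model.save_pretrained("llama-2-7b-rbln")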
optimum/rbln/transformers/models/midm/__init__.py (new file)
@@ -0,0 +1,32 @@
+ # Copyright 2024 Rebellions Inc.
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at:
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # Portions of this software are licensed under the Apache License,
+ # Version 2.0. See the NOTICE file distributed with this work for
+ # additional information regarding copyright ownership.
+
+ # All other portions of this software, including proprietary code,
+ # are the intellectual property of Rebellions Inc. and may not be
+ # copied, modified, or distributed without prior written permission
+ # from Rebellions Inc.
+
+ import os
+ from os import environ
+
+
+ this_path = os.path.abspath(__file__)
+ local_dir = "/" + os.path.join(*this_path.split("/")[:-1]) + "/hf_hub_cached"
+ environ["LOCAL_CACHE_ROOT_CUSTOM_CODE_MIDM"] = local_dir
+
+ from .modeling_midm import RBLNMidmLMHeadModel
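The path juggling above points `LOCAL_CACHE_ROOT_CUSTOM_CODE_MIDM` at the `hf_hub_cached` directory bundled next to this `__init__.py` before `modeling_midm` is imported. On POSIX paths it is equivalent to the following sketch (illustrative only, not part of the package):

    import os

    # Directory containing this __init__.py, plus the bundled "hf_hub_cached" folder.
    local_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "hf_hub_cached")
    os.environ["LOCAL_CACHE_ROOT_CUSTOM_CODE_MIDM"] = local_dir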
optimum/rbln/transformers/models/midm/hf_hub_cached/configuration_midm.py (new file)
@@ -0,0 +1,22 @@
+ from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+
+
+ class MidmBitextConfig(GPT2Config):
+     model_type = "midm-bitext-S"
+
+     def __init__(
+         self,
+         use_absolute_position_embedding: bool = True,
+         use_rotary_position_embedding: bool = False,
+         rotary_percentage: float = 1.0,
+         normalization_type: str = "layernorm",
+         scale_qk_by_inverse_layer_idx: bool = False,
+         *args,
+         **kwargs,
+     ):
+         super().__init__(*args, **kwargs)
+         self.use_absolute_position_embedding = use_absolute_position_embedding
+         self.use_rotary_position_embedding = use_rotary_position_embedding
+         self.rotary_percentage = rotary_percentage
+         self.normalization_type = normalization_type
+         self.scale_qk_by_inverse_layer_idx = scale_qk_by_inverse_layer_idx
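`MidmBitextConfig` only layers the Midm-specific switches (rotary embedding usage and percentage, normalization type, QK scaling) on top of `GPT2Config`. A short illustrative sketch, with the import path inferred from the file location in the list above:

    from optimum.rbln.transformers.models.midm.hf_hub_cached.configuration_midm import MidmBitextConfig

    config = MidmBitextConfig()  # inherits every GPT2Config default
    print(config.model_type)  # "midm-bitext-S"
    print(config.use_rotary_position_embedding)  # False unless overridden

    # Switch from absolute to partial rotary position embeddings.
    config = MidmBitextConfig(
        use_absolute_position_embedding=False,
        use_rotary_position_embedding=True,
        rotary_percentage=0.5,
    )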