optimum-rbln 0.1.0__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. optimum/rbln/__init__.py +8 -0
  2. optimum/rbln/__version__.py +1 -1
  3. optimum/rbln/diffusers/__init__.py +7 -0
  4. optimum/rbln/diffusers/models/autoencoder_kl.py +30 -9
  5. optimum/rbln/diffusers/models/controlnet.py +93 -23
  6. optimum/rbln/diffusers/models/unet_2d_condition.py +78 -61
  7. optimum/rbln/diffusers/pipelines/__init__.py +7 -2
  8. optimum/rbln/diffusers/pipelines/controlnet/__init__.py +4 -0
  9. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet.py +768 -0
  10. optimum/rbln/diffusers/pipelines/{stable_diffusion → controlnet}/pipeline_controlnet_img2img.py +25 -16
  11. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py +942 -0
  12. optimum/rbln/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +955 -0
  13. optimum/rbln/diffusers/pipelines/stable_diffusion/__init__.py +0 -1
  14. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +23 -4
  15. optimum/rbln/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +22 -9
  16. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +19 -3
  17. optimum/rbln/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +19 -3
  18. optimum/rbln/modeling_base.py +39 -6
  19. optimum/rbln/modeling_seq2seq.py +19 -4
  20. optimum/rbln/transformers/__init__.py +2 -0
  21. optimum/rbln/transformers/generation/__init__.py +1 -0
  22. optimum/rbln/transformers/generation/streamers.py +17 -0
  23. optimum/rbln/transformers/generation/utils.py +399 -0
  24. optimum/rbln/transformers/models/__init__.py +1 -0
  25. optimum/rbln/transformers/models/gpt2/modeling_gpt2.py +24 -333
  26. optimum/rbln/transformers/models/llama/llama_architecture.py +49 -17
  27. optimum/rbln/transformers/models/llama/llama_architecture_cb.py +759 -0
  28. optimum/rbln/transformers/models/llama/modeling_llama.py +187 -75
  29. optimum/rbln/transformers/models/midm/__init__.py +32 -0
  30. optimum/rbln/transformers/models/midm/hf_hub_cached/configuration_midm.py +22 -0
  31. optimum/rbln/transformers/models/midm/hf_hub_cached/midm_bitext_tokenization.py +303 -0
  32. optimum/rbln/transformers/models/midm/hf_hub_cached/modeling_midm.py +1473 -0
  33. optimum/rbln/transformers/models/midm/hf_hub_cached/rotary_position_embedding.py +98 -0
  34. optimum/rbln/transformers/models/midm/midm_architecture.py +506 -0
  35. optimum/rbln/transformers/models/midm/modeling_midm.py +426 -0
  36. optimum/rbln/transformers/models/whisper/modeling_whisper.py +13 -3
  37. {optimum_rbln-0.1.0.dist-info → optimum_rbln-0.1.4.dist-info}/METADATA +5 -4
  38. optimum_rbln-0.1.4.dist-info/RECORD +63 -0
  39. {optimum_rbln-0.1.0.dist-info → optimum_rbln-0.1.4.dist-info}/WHEEL +1 -1
  40. optimum_rbln-0.1.0.dist-info/RECORD +0 -51
  41. {optimum_rbln-0.1.0.dist-info → optimum_rbln-0.1.4.dist-info}/licenses/LICENSE +0 -0
optimum/rbln/transformers/models/llama/modeling_llama.py
@@ -34,16 +34,25 @@ from optimum.exporters import TasksManager
 from transformers import AutoModelForCausalLM, LlamaForCausalLM, PretrainedConfig, AutoConfig
 from transformers.modeling_outputs import CausalLMOutputWithPast
 
+from ...generation.utils import RBLNGenerationMixin
 from ....modeling_base import RBLNBaseModel
 from ....modeling_config import DEFAULT_COMPILED_MODEL_NAME, RBLNConfig, RBLNRuntimeConfig
 from ....utils.runtime_utils import RBLNPytorchRuntime
 from ....utils.save_utils import maybe_save_preprocessors
+
+
+# FIXME:: Merge Two architecture Codes
 from .llama_architecture import (
     LlamaWrapper,
     wrap_llama,
     unwrap_llama,
 )
 
+from .llama_architecture_cb import (
+    LlamaDynamicBatchWrapper as LlamaWrapper_cb,
+    wrap_llama as wrap_llama_cb,
+)
+
 
 logger = logging.getLogger(__name__)
 
@@ -56,26 +65,14 @@ if TYPE_CHECKING:
     )
 
 
+SUPPORTED_BATCHING_MODES = ["static", "vllm"]
+
+
 class RBLNRuntimeModel(RBLNPytorchRuntime):
     mandatory_members = ["main_input_name"]
 
-    # RBLN_Runtimemodule
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: torch.LongTensor = None,
-        cache_position: torch.Tensor = None,
-        **kwargs: Dict[str, Any],
-    ):
-        logits = super().forward(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            cache_position=cache_position,
-        )
-        return logits
-
 
-class RBLNLlamaForCausalLM(RBLNBaseModel):
+class RBLNLlamaForCausalLM(RBLNBaseModel, RBLNGenerationMixin):
     """
     The Llama Model transformer with a language modeling head (linear layer) on top.
     This model inherits from [`RBLNBaseModel`]. Check the superclass documentation for the generic methods the library implements for all its models.
@@ -91,21 +88,24 @@ class RBLNLlamaForCausalLM(RBLNBaseModel):
     auto_model_class = AutoModelForCausalLM
 
     def __post_init__(self, **kwargs):
-
         self.batch_size = self.rbln_config.meta["rbln_batch_size"]
         self.max_seq_len = self.rbln_config.meta["rbln_max_seq_len"]
         self.prefill_chunk_size = self.rbln_config.meta["rbln_prefill_chunk_size"]
+        self.use_continuous_batch = self.rbln_config.meta["rbln_batching"] == "vllm"
 
+        prefill_batch_size = self.batch_size if not self.use_continuous_batch else 1
         self.prefill_attention_mask = torch.zeros(
-            self.batch_size, 1, self.prefill_chunk_size, self.max_seq_len, dtype=torch.int64
+            prefill_batch_size, 1, self.prefill_chunk_size, self.max_seq_len, dtype=torch.int64
         )
         self.causal_mask = 1 - torch.triu(
-            torch.ones(self.batch_size, 1, self.prefill_chunk_size, self.prefill_chunk_size), diagonal=1
+            torch.ones(prefill_batch_size, 1, self.prefill_chunk_size, self.prefill_chunk_size), diagonal=1
         )
+        self.decoder_attention_mask = torch.zeros(self.batch_size, 1, 1, self.max_seq_len, dtype=torch.int64)
 
         self.prefill_decoder = RBLNRuntimeModel(runtime=self.runtimes[0], main_input_name="input_ids")
         self.decoder = RBLNRuntimeModel(runtime=self.runtimes[1], main_input_name="input_ids")
         self.past_cached_length = 0
+        self.right_padding = True
 
     @classmethod
     @torch.no_grad()
@@ -120,14 +120,23 @@ class RBLNLlamaForCausalLM(RBLNBaseModel):
         subfolder: str = "",
         local_files_only: bool = False,
         trust_remote_code: bool = False,
+        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
         **kwargs,
     ) -> "RBLNLlamaForCausalLM":
         task = kwargs.pop("task", None)
        if task is None:
             task = TasksManager.infer_task_from_model(cls.auto_model_class)
 
-        save_dir = TemporaryDirectory()
-        save_dir_path = Path(save_dir.name)
+        if model_save_dir is None:
+            save_dir = TemporaryDirectory()
+            save_dir_path = Path(save_dir.name)
+        else:
+            save_dir = model_save_dir
+            if isinstance(save_dir, TemporaryDirectory):
+                save_dir_path = Path(model_save_dir.name)
+            else:
+                save_dir_path = Path(model_save_dir)
+                save_dir_path.mkdir(exist_ok=True)
 
         def update_configs(kwargs):
             hf_max_position_embeddings = getattr(AutoConfig.from_pretrained(model_id), "max_position_embeddings", None)
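
The new model_save_dir parameter keeps the compiled artifacts around instead of discarding them with a TemporaryDirectory. A hedged usage sketch, assuming the kwarg is threaded through from the usual from_pretrained(export=True) entry point (that plumbing is not shown in this hunk); the checkpoint id and directory name are illustrative:

    from optimum.rbln import RBLNLlamaForCausalLM

    # Illustrative: compile once and keep the .rbln artifacts in a named directory.
    model = RBLNLlamaForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-hf",         # illustrative checkpoint
        export=True,                        # compile from the Hugging Face weights
        model_save_dir="./rbln_llama_7b",   # assumed to be forwarded to this export path
    )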
@@ -154,7 +163,10 @@ class RBLNLlamaForCausalLM(RBLNBaseModel):
 
         rbln_config_kwargs, rbln_constructor_kwargs = cls.pop_rbln_kwargs_from_kwargs(kwargs)
 
-        origin_mehtods = wrap_llama()
+        # FIXME :: This should be moved when wrapping removed.
+        use_continuous_batch = rbln_config_kwargs.get("rbln_batching", "static") == "vllm"
+        origin_mehtods = wrap_llama_cb() if use_continuous_batch else wrap_llama()
+
         model: LlamaForCausalLM = TasksManager.get_model_from_task(
             task=task,
             model_name_or_path=model_id,
@@ -181,14 +193,18 @@ class RBLNLlamaForCausalLM(RBLNBaseModel):
             preprocessors=preprocessors, model_config=model.config, **rbln_config_kwargs
         )
 
-        def compile_llama():
-            wrapped_model = LlamaWrapper(model).eval()
+        def compile_llama(use_continuous_batch, wrapper_cls):
+            wrapped_model = wrapper_cls(model).eval()
 
             prefill_rbln_runtime_config = rbln_config[DEFAULT_COMPILED_MODEL_NAME][0]
             dec_rbln_runtime_config = rbln_config[DEFAULT_COMPILED_MODEL_NAME][1]
 
             prefill_example_inputs = prefill_rbln_runtime_config.get_dummy_inputs(fill=0)
-            dec_example_inputs = dec_rbln_runtime_config.get_dummy_inputs(fill=0)
+            dec_example_inputs = dec_rbln_runtime_config.get_dummy_inputs(fill=4)
+
+            if use_continuous_batch:
+                batch_index_index = 3
+                dec_example_inputs[batch_index_index].fill_(-1)  # fill batch_position -1 to indicate it is decoder.
 
             prefill_scripted_model = torch.jit.trace(wrapped_model, prefill_example_inputs)
             dec_scripted_model = torch.jit.trace(wrapped_model, dec_example_inputs)
@@ -203,8 +219,9 @@ class RBLNLlamaForCausalLM(RBLNBaseModel):
             )
 
             # Caching prefill_decoder/decoder I/O
+            cache_index_offset = 4 if use_continuous_batch else 3
             connections = [
-                (prefill_ir.outputs[1 + i], prefill_ir.inputs[3 + i])
+                (prefill_ir.outputs[1 + i], prefill_ir.inputs[cache_index_offset + i])
                 for i in range(model.config.num_hidden_layers * 2)
             ]
 
@@ -219,7 +236,8 @@ class RBLNLlamaForCausalLM(RBLNBaseModel):
             )
             compiled_model.save(save_dir_path / f"{DEFAULT_COMPILED_MODEL_NAME}.rbln")
 
-        compile_llama()
+        wrapper_cls = LlamaWrapper_cb if use_continuous_batch else LlamaWrapper
+        compile_llama(use_continuous_batch=use_continuous_batch, wrapper_cls=wrapper_cls)
         unwrap_llama(origin_mehtods)
 
         rbln_config.save(save_dir_path)
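
For orientation on the connections change two hunks above: output 0 of the prefill graph is the logits and outputs 1..2*num_hidden_layers are the updated key/value caches, and with the extra batch_position input in the vllm path the cache inputs start at index 4 instead of 3. A small sketch of how the pairs line up, with an illustrative layer count:

    num_hidden_layers = 32  # illustrative
    use_continuous_batch = True
    cache_index_offset = 4 if use_continuous_batch else 3  # input_ids, attention_mask, cache_position(, batch_position)

    # Pair each KV-cache output back to its matching cache input so prefill and decode share state.
    connections = [(1 + i, cache_index_offset + i) for i in range(num_hidden_layers * 2)]
    print(connections[:2])  # [(1, 4), (2, 5)]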
@@ -239,27 +257,46 @@ class RBLNLlamaForCausalLM(RBLNBaseModel):
         model_config: "PretrainedConfig",
         rbln_max_seq_len: Optional[int] = None,
         rbln_batch_size: Optional[int] = None,
+        rbln_batching: Optional[str] = None,
     ) -> RBLNConfig:
         meta = {}
 
         prefill_chunk_size = 128
         if rbln_max_seq_len is None:
             rbln_max_seq_len = getattr(model_config, "max_position_embeddings", None)
+        rbln_batch_size = 1 if rbln_batch_size is None else rbln_batch_size
+        rbln_batching = "static" if rbln_batching is None else rbln_batching
 
         meta["rbln_max_seq_len"] = rbln_max_seq_len
         meta["rbln_batch_size"] = rbln_batch_size
         meta["rbln_prefill_chunk_size"] = prefill_chunk_size
+        meta["rbln_batching"] = rbln_batching
+        use_continuous_batching = meta["rbln_batching"] == "vllm"
 
-        def get_input_info(query_length):
+        if rbln_batching not in SUPPORTED_BATCHING_MODES:
+            raise ValueError(
+                f'rbln_batching="{rbln_batching}" is not a supported batch mode, '
+                f"Possible: {SUPPORTED_BATCHING_MODES}"
+            )
+
+        def get_input_info(
+            batch_size,  # should be 1 if continous batch prefill
+            query_length,
+            continuous_batch=False,  # determines the shape of `cache position`
+        ):
             input_info = [
-                ("input_ids", [rbln_batch_size, query_length], "int64"),
-                ("attention_mask", [rbln_batch_size, 1, query_length, rbln_max_seq_len], "int64"),
+                ("input_ids", [batch_size, query_length], "int64"),
+                ("attention_mask", [batch_size, 1, query_length, rbln_max_seq_len], "int64"),
                 (
                     "cache_position",
-                    [],
+                    [batch_size, query_length] if continuous_batch else [],
                     "int32",
                 ),
             ]
+
+            if continuous_batch:
+                input_info.append(("batch_position", [], "int16"))
+
            input_info.extend(
                [
                    (
@@ -275,10 +312,19 @@ class RBLNLlamaForCausalLM(RBLNBaseModel):
                    for i in range(model_config.num_hidden_layers * 2)
                ]
            )
+
            return input_info
 
-        prefill_input_info = get_input_info(query_length=prefill_chunk_size)
-        dec_input_info = get_input_info(query_length=1)
+        prefill_input_info = get_input_info(
+            batch_size=1 if use_continuous_batching else rbln_batch_size,
+            query_length=prefill_chunk_size,
+            continuous_batch=use_continuous_batching,
+        )
+        dec_input_info = get_input_info(
+            batch_size=rbln_batch_size,
+            query_length=1,
+            continuous_batch=use_continuous_batching,
+        )
 
         prefill_rbln_runtime_config = RBLNRuntimeConfig(input_info=prefill_input_info)
         dec_rbln_runtime_config = RBLNRuntimeConfig(input_info=dec_input_info)
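
To make the shapes concrete, this is roughly what the two input_info lists come out as on the vllm (continuous batching) path, assuming an illustrative rbln_batch_size of 4 and rbln_max_seq_len of 4096, and omitting the per-layer KV-cache entries that follow:

    # Prefill graph: single sequence, one cache position per token, plus a batch_position scalar.
    prefill_input_info = [
        ("input_ids", [1, 128], "int64"),
        ("attention_mask", [1, 1, 128, 4096], "int64"),
        ("cache_position", [1, 128], "int32"),
        ("batch_position", [], "int16"),
    ]

    # Decode graph: full batch, a single query token per sequence.
    dec_input_info = [
        ("input_ids", [4, 1], "int64"),
        ("attention_mask", [4, 1, 1, 4096], "int64"),
        ("cache_position", [4, 1], "int32"),
        ("batch_position", [], "int16"),
    ]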
@@ -321,23 +367,46 @@ class RBLNLlamaForCausalLM(RBLNBaseModel):
 
     # args input_ids, past_key_values and attention_mask are updated by _update_model_kwargs_for_generation() in _greedy_search() in GenerationMixin
     def prepare_inputs_for_generation(self, input_ids, past_key_values=0, attention_mask=None, **kwargs):
-        batch_size, hf_input_length = input_ids.shape
+        batch_size, cur_len = input_ids.shape
         past_cached_length = past_key_values
-        query_length = hf_input_length - past_cached_length
 
         # In greedy decoding
-        if past_key_values == 0:
-            self.prompt_length = query_length
-            self.prompt_ids = input_ids
-            self.prompt_attn_mask = attention_mask.unsqueeze(1).unsqueeze(1).contiguous()
-
-            attention_mask = torch.zeros(batch_size, 1, self.prefill_chunk_size, self.max_seq_len, dtype=torch.int64)
+        if past_cached_length == 0:
+            # padding with prefill_chunk_size
+            # TODO left padding + left padding has issue on stoppingcriteria(max_len)
+            if cur_len % self.prefill_chunk_size != 0:
+                pad_len = self.prefill_chunk_size - cur_len % self.prefill_chunk_size
+                input_ids = torch.nn.functional.pad(input_ids, (0, pad_len))
+
+            # padding_side
+            if batch_size > 1 and torch.all(attention_mask[..., -1] == 1):
+                self.right_padding = False
+
+            if self.right_padding:
+                self.rightpad_max_len = cur_len
+                prompt_min_len = torch.min(torch.sum(attention_mask, dim=-1))
+                self.dummy_len = torch.sum(attention_mask, dim=-1) - prompt_min_len  # dummy_decoder generation length
+                query_length = prompt_min_len.item()
+            else:
+                query_length = cur_len - past_cached_length
+                self.prompt_length = query_length
+                self.prompt_attn_mask = attention_mask.unsqueeze(1).unsqueeze(1).contiguous()
+
+            attention_mask = self.prefill_attention_mask.clone()
             cache_position = torch.tensor(0, dtype=torch.int32)
+
         else:
-            attention_mask = torch.nn.functional.pad(attention_mask, (0, self.max_seq_len - hf_input_length))
-            attention_mask = attention_mask.reshape(batch_size, 1, 1, -1).contiguous()
+            if self.right_padding:
+                attention_mask = torch.zeros(batch_size, 1, 1, self.max_seq_len, dtype=torch.int64)
+                attention_mask[:, :, :, : past_cached_length + 1] = 1
+                input_ids = input_ids[:, past_cached_length : past_cached_length + 1].contiguous()
+            else:
+                attention_mask = torch.nn.functional.pad(attention_mask, (0, self.max_seq_len - cur_len))
+                attention_mask = attention_mask.reshape(batch_size, 1, 1, -1).contiguous()
+                input_ids = input_ids[:, -1:]
+
             cache_position = torch.tensor(past_cached_length, dtype=torch.int32)
-            input_ids = input_ids[:, -1:]
+            query_length = 1
 
         model_inputs = {
             "input_ids": input_ids,
@@ -349,7 +418,13 @@ class RBLNLlamaForCausalLM(RBLNBaseModel):
 
         return model_inputs
 
-    def forward(
+    def forward(self, *args, **kwargs):
+        if self.use_continuous_batch:
+            return self.forward_cb(*args, **kwargs)
+        else:
+            return self.forward_static(*args, **kwargs)
+
+    def forward_static(
         self,
         input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
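
The padding-side check in prepare_inputs_for_generation above treats a batch as left-padded only when every sequence still has a real token in the last attention-mask column. A minimal standalone sketch of that test (toy values, not from the library):

    import torch

    def is_left_padded(attention_mask: torch.Tensor) -> bool:
        # Mirrors the check in prepare_inputs_for_generation: with batch_size > 1,
        # an all-ones final column means the pad tokens sit on the left.
        return attention_mask.shape[0] > 1 and bool(torch.all(attention_mask[..., -1] == 1))

    right = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]])  # pads at the end
    left = torch.tensor([[0, 0, 1, 1, 1], [1, 1, 1, 1, 1]])   # pads at the front
    print(is_left_padded(right), is_left_padded(left))  # False True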
@@ -363,38 +438,20 @@ class RBLNLlamaForCausalLM(RBLNBaseModel):
 
         # prefill_decoder
         if cache_position == 0:
-            while query_length > self.prefill_chunk_size:
-                # prepare input_ids & attention_mask
-                sliced_input_ids = input_ids[:, cache_position : cache_position + self.prefill_chunk_size].contiguous()
-                attention_mask[:, :, :, :cache_position] = 1
-                attention_mask[:, :, :, cache_position : cache_position + self.prefill_chunk_size] = self.causal_mask
-                attention_mask[:, :, :, : self.prompt_length] *= self.prompt_attn_mask[:, :, :, :]
-
-                _ = self.prefill_decoder(
-                    sliced_input_ids,
-                    attention_mask,
-                    cache_position,
+            for step in range(0, query_length, self.prefill_chunk_size):
+                sliced_input_ids = input_ids[:, step : step + self.prefill_chunk_size]
+                attention_mask[:, :, :, :step] = 1
+                attention_mask[:, :, :, step : step + self.prefill_chunk_size] = self.causal_mask
+                if not self.right_padding:
+                    attention_mask[:, :, :, : self.prompt_length] &= self.prompt_attn_mask[:, :, :, :]
+
+                outputs = self.prefill_decoder(
+                    input_ids=sliced_input_ids.contiguous(),
+                    attention_mask=attention_mask.contiguous(),
+                    cache_position=cache_position + step,
                 )
-                # update query_length & cache_position
-                query_length -= self.prefill_chunk_size
-                cache_position += self.prefill_chunk_size
-
-            # prepare input_ids & attention_mask
-            last_input_ids = input_ids[:, cache_position : cache_position + query_length]
-            last_input_ids = torch.nn.functional.pad(last_input_ids, (0, self.prefill_chunk_size - query_length))
+            outputs = outputs[:, query_length % self.prefill_chunk_size - 1].unsqueeze(1)
 
-            attention_mask[:, :, :, :cache_position] = 1
-            mask_slice = self.causal_mask[:, :, :query_length, :query_length]
-            attention_mask[:, :, :query_length, cache_position : cache_position + query_length] = mask_slice
-            attention_mask[:, :, :, : self.prompt_length] *= self.prompt_attn_mask[:, :, :, :]
-
-            outputs = self.prefill_decoder(
-                last_input_ids.contiguous(),
-                attention_mask.contiguous(),
-                cache_position,
-            )
-
-            outputs = outputs[:, query_length - 1].unsqueeze(1)
         # decoder
         else:
             outputs = self.decoder(
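
The query_length % self.prefill_chunk_size - 1 indexing picks the logits of the last real prompt token out of the final, zero-padded chunk. A short worked example with the 128-token chunk size used here:

    prefill_chunk_size = 128

    for query_length in (300, 256, 40):
        last_token_offset = query_length % prefill_chunk_size - 1
        # 300 tokens -> chunks start at 0, 128, 256; the last real token sits at offset 43 of the final chunk.
        # 256 tokens -> the expression gives -1, i.e. the final position of a fully used chunk.
        print(query_length, last_token_offset)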
@@ -407,3 +464,58 @@ class RBLNLlamaForCausalLM(RBLNBaseModel):
             logits=outputs,
             past_key_values=past_key_values,
         )
+
+    def forward_cb(
+        self,
+        input_ids: torch.LongTensor = None,
+        cache_position: Optional[torch.Tensor] = None,  # torch.tensor(,dtype=int32) (1,64) // (4,1)
+        batch_idx: int = None,
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor]:
+        # prefill_decoder
+        if cache_position.shape[1] > 1:
+            query_length = input_ids.shape[1]
+            attention_mask = self.prefill_attention_mask.clone()
+            for step in range(0, query_length, self.prefill_chunk_size):
+                if step + self.prefill_chunk_size > query_length:
+                    input_ids = torch.nn.functional.pad(input_ids, (0, step + self.prefill_chunk_size - query_length))
+                    cache_position = torch.cat(
+                        [
+                            cache_position,
+                            torch.arange(
+                                query_length,
+                                step + self.prefill_chunk_size,
+                                dtype=torch.int32,
+                            ).unsqueeze(0),
+                        ],
+                        dim=-1,
+                    )
+
+                sliced_input_ids = input_ids[:, step : step + self.prefill_chunk_size]
+                sliced_cache_positions = cache_position[:, step : step + self.prefill_chunk_size]
+                attention_mask[:, :, :, :step] = 1
+                attention_mask[:, :, :, step : step + self.prefill_chunk_size] = self.causal_mask
+
+                outputs, _ = self.prefill_decoder(
+                    sliced_input_ids.contiguous(),
+                    attention_mask.contiguous(),
+                    sliced_cache_positions.contiguous(),
+                    torch.tensor(batch_idx, dtype=torch.int16),
+                )
+            outputs = outputs[:, query_length % self.prefill_chunk_size - 1].unsqueeze(1)
+        # decoder
+        else:
+            attention_mask = self.decoder_attention_mask.clone()
+            for b_idx in range(self.batch_size):
+                attention_mask[b_idx, :, :, : cache_position[b_idx].item() + 1] = 1
+
+            outputs = self.decoder(
+                input_ids.contiguous(),
+                attention_mask.contiguous(),
+                cache_position.contiguous(),
+                torch.tensor(0, dtype=torch.int16),
+            )[0]
+
+        return CausalLMOutputWithPast(
+            logits=outputs,
+        )
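
The shape comment on cache_position ((1,64) // (4,1)) is the convention forward_cb keys off: more than one column means prefill of a single request, while one column per batch slot means a decode step. A sketch of the two call shapes, with illustrative values:

    import torch

    # Prefill: one new request with 64 prompt tokens, to be written into batch slot 2.
    prefill_cache_position = torch.arange(64, dtype=torch.int32).unsqueeze(0)  # shape (1, 64)
    prefill_batch_idx = 2  # becomes the int16 batch_position tensor passed to the prefill runtime

    # Decode: four running sequences, each advancing by one token from its own length.
    decode_cache_position = torch.tensor([[17], [5], [63], [30]], dtype=torch.int32)  # shape (4, 1)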
optimum/rbln/transformers/models/midm/__init__.py (new file)
@@ -0,0 +1,32 @@
+# Copyright 2024 Rebellions Inc.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Portions of this software are licensed under the Apache License,
+# Version 2.0. See the NOTICE file distributed with this work for
+# additional information regarding copyright ownership.
+
+# All other portions of this software, including proprietary code,
+# are the intellectual property of Rebellions Inc. and may not be
+# copied, modified, or distributed without prior written permission
+# from Rebellions Inc.
+
+import os
+from os import environ
+
+
+this_path = os.path.abspath(__file__)
+local_dir = "/" + os.path.join(*this_path.split("/")[:-1]) + "/hf_hub_cached"
+environ["LOCAL_CACHE_ROOT_CUSTOM_CODE_MIDM"] = local_dir
+
+from .modeling_midm import RBLNMidmLMHeadModel
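
The environment variable points the loader at the remote-code files bundled under hf_hub_cached/. A hedged usage sketch, assuming RBLNMidmLMHeadModel follows the same from_pretrained/export pattern as the other RBLN causal-LM classes; the checkpoint id and rbln_* values are illustrative:

    from optimum.rbln import RBLNMidmLMHeadModel

    # Hypothetical usage; values below are illustrative.
    model = RBLNMidmLMHeadModel.from_pretrained(
        "KT-AI/midm-bitext-S-7B-inst-v1",
        export=True,
        trust_remote_code=True,   # Mi:dm ships custom modeling code
        rbln_max_seq_len=4096,
        rbln_batch_size=1,
    )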
optimum/rbln/transformers/models/midm/hf_hub_cached/configuration_midm.py (new file)
@@ -0,0 +1,22 @@
+from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+
+
+class MidmBitextConfig(GPT2Config):
+    model_type = "midm-bitext-S"
+
+    def __init__(
+        self,
+        use_absolute_position_embedding: bool = True,
+        use_rotary_position_embedding: bool = False,
+        rotary_percentage: float = 1.0,
+        normalization_type: str = "layernorm",
+        scale_qk_by_inverse_layer_idx: bool = False,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.use_absolute_position_embedding = use_absolute_position_embedding
+        self.use_rotary_position_embedding = use_rotary_position_embedding
+        self.rotary_percentage = rotary_percentage
+        self.normalization_type = normalization_type
+        self.scale_qk_by_inverse_layer_idx = scale_qk_by_inverse_layer_idx
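
Since MidmBitextConfig only adds a handful of flags on top of GPT2Config, it can be instantiated directly; a small illustrative example (the field values are made up):

    config = MidmBitextConfig(
        use_absolute_position_embedding=False,
        use_rotary_position_embedding=True,
        rotary_percentage=0.5,
        scale_qk_by_inverse_layer_idx=True,
        n_layer=32, n_head=32, n_embd=4096,  # inherited GPT2Config fields
    )
    print(config.model_type)  # "midm-bitext-S"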